Compare commits

..

4 commits

Author SHA1 Message Date
Stefano Brivio
0c6c20dee5 udp, udp_flow: Add instrumentation, handle EPOLLERR without queued errors
Link: https://github.com/containers/podman/issues/23686#issuecomment-2324945010
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2024-09-04 18:36:54 +02:00
David Gibson
d098e0527a additional debug 2024-09-04 18:36:54 +02:00
David Gibson
026fb71d1d tcp: Attempt to mitigate EPOLLRDHUP storms with half-closed connections
Link: https://github.com/containers/podman/issues/23686
2024-09-04 18:36:54 +02:00
Stefano Brivio
232e12529e log: Don't prefix log file messages with time and severity if they're continuations
In fecb1b65b1 ("log: Don't prefix message with timestamp on --debug
if it's a continuation"), I fixed this for --debug on standard error,
but not for log files: if messages are continuations, they shouldn't
be prefixed by timestamp and severity.

Otherwise, we'll print stuff like this:

  0.0028: ERROR:   Receive error on guest connection, reset0.0028:  ERROR:   : Bad file descriptor

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2024-09-04 16:22:27 +02:00
64 changed files with 1186 additions and 1762 deletions

View file

@ -1,126 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
#
# clang-format configuration file. Intended for clang-format >= 11.
#
# For more information, see:
#
# Documentation/dev-tools/clang-format.rst
# https://clang.llvm.org/docs/ClangFormat.html
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
#
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: None
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: false
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: false
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeComma
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 8
ContinuationIndentWidth: 8
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: false
# Taken from:
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
# | LC_ALL=C sort -u
ForEachMacros:
- 'for_each_nst'
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '.*'
Priority: 1
IncludeIsMainRegex: '(Test)?$'
IndentCaseLabels: false
IndentGotoLabels: false
IndentPPDirectives: None
IndentWidth: 8
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 8
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
# Taken from git's rules
PenaltyBreakAssignment: 10
PenaltyBreakBeforeFirstCallParameter: 30
PenaltyBreakComment: 10
PenaltyBreakFirstLessLess: 0
PenaltyBreakString: 10
PenaltyExcessCharacter: 100
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Right
ReflowComments: false
SortIncludes: false
SortUsingDeclarations: false
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatementsExceptForEachMacros
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp03
TabWidth: 8
UseTab: Always
...

View file

@ -1,93 +0,0 @@
---
Checks:
- "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*"
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
- "-clang-analyzer-valist.Uninitialized"
# Dubious value, would kill readability
- "-cppcoreguidelines-init-variables"
# Dubious value over the compiler's built-in warning. Would
# increase verbosity.
- "-bugprone-assignment-in-if-condition"
# Debatable whether these improve readability, right now it would look
# like a mess
- "-google-readability-braces-around-statements"
- "-hicpp-braces-around-statements"
- "-readability-braces-around-statements"
# TODO: in most cases they are justified, but probably not everywhere
#
- "-readability-magic-numbers"
- "-cppcoreguidelines-avoid-magic-numbers"
# TODO: this is Linux-only for the moment, nice to fix eventually
- "-llvmlibc-restrict-system-libc-headers"
# Those are needed for syscalls, epoll_wait flags, etc.
- "-hicpp-signed-bitwise"
# Probably not doable to implement this without plain memcpy(), memset()
- "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
# TODO: not really important, but nice to fix eventually
- "-llvm-include-order"
# Dubious value, would kill readability
- "-readability-isolate-declaration"
# TODO: nice to fix eventually
- "-bugprone-narrowing-conversions"
- "-cppcoreguidelines-narrowing-conversions"
# TODO: check, fix, and more in general constify wherever possible
- "-cppcoreguidelines-avoid-non-const-global-variables"
# TODO: check paths where it might make sense to improve performance
- "-altera-unroll-loops"
- "-altera-id-dependent-backward-branch"
# Not much can be done about them other than being careful
- "-bugprone-easily-swappable-parameters"
# TODO: split reported functions
- "-readability-function-cognitive-complexity"
# "Poor" alignment needed for structs reflecting message formats/headers
- "-altera-struct-pack-align"
# TODO: check again if multithreading is implemented
- "-concurrency-mt-unsafe"
# Complains about any identifier <3 characters, reasonable for
# globals, pointlessly verbose for locals and parameters.
- "-readability-identifier-length"
# Wants to include headers which *directly* provide the things
# we use. That sounds nice, but means it will often want an
# OS-specific header instead of a mostly standard one, such as
# <linux/limits.h> instead of <limits.h>.
- "-misc-include-cleaner"
# Wants to replace all #defines of integers with enums. Kind of
# makes sense when those defines form an enum-like set, but
# weird for cases like standalone constants, and causes other
# awkwardness for a bunch of cases we use
- "-cppcoreguidelines-macro-to-enum"
# It's been a couple of centuries since multiplication has been granted
# precedence over addition in modern mathematical notation. Adding
# parentheses to reinforce that certainly won't improve readability.
- "-readability-math-missing-parentheses"
WarningsAsErrors: "*"
HeaderFileExtensions:
- h
ImplementationFileExtensions:
- c
HeaderFilterRegex: ""
FormatStyle: none
CheckOptions:
bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false"
SystemHeaders: false

View file

@ -1,3 +0,0 @@
CompileFlags:
# Don't try to interpret our headers as C++
Add: [-xc, -Wall]

161
Makefile
View file

@ -15,11 +15,24 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
# the IPv6 socket API? (Linux does) # the IPv6 socket API? (Linux does)
DUAL_STACK_SOCKETS := 1 DUAL_STACK_SOCKETS := 1
RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s')
ifeq ($(RLIMIT_STACK_VAL),unlimited)
RLIMIT_STACK_VAL := 1024
endif
TARGET ?= $(shell $(CC) -dumpmachine) TARGET ?= $(shell $(CC) -dumpmachine)
# Get 'uname -m'-like architecture description for target # Get 'uname -m'-like architecture description for target
TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z]) TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/') TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
# On some systems enabling optimization also enables source fortification, # On some systems enabling optimization also enables source fortification,
# automagically. Do not override it. # automagically. Do not override it.
FORTIFY_FLAG := FORTIFY_FLAG :=
@ -31,6 +44,10 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
FLAGS += -DARCH=\"$(TARGET_ARCH)\"
FLAGS += -DVERSION=\"$(VERSION)\" FLAGS += -DVERSION=\"$(VERSION)\"
FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
@ -50,6 +67,21 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
udp.h udp_flow.h util.h udp.h udp_flow.h util.h
HEADERS = $(PASST_HEADERS) seccomp.h HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_SND_WND
endif
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_BYTES_ACKED
endif
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_MIN_RTT
endif
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_GETRANDOM FLAGS += -DHAS_GETRANDOM
@ -59,6 +91,11 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec
FLAGS += -fstack-protector-strong FLAGS += -fstack-protector-strong
endif endif
C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE;
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
EXTRA_SYSCALLS += fallocate
endif
prefix ?= /usr/local prefix ?= /usr/local
exec_prefix ?= $(prefix) exec_prefix ?= $(prefix)
bindir ?= $(exec_prefix)/bin bindir ?= $(exec_prefix)/bin
@ -95,7 +132,7 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
ln -sf $< $@ ln -sf $< $@
qrap: $(QRAP_SRCS) passt.h qrap: $(QRAP_SRCS) passt.h
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS) $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
rt_sigreturn getpid gettid kill clock_gettime mmap \ rt_sigreturn getpid gettid kill clock_gettime mmap \
@ -159,11 +196,116 @@ docs: README.md
done < README.md; \ done < README.md; \
) > README.plain.md ) > README.plain.md
clang-tidy: $(PASST_SRCS) $(HEADERS) # Checkers currently disabled for clang-tidy:
clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \ # - llvmlibc-restrict-system-libc-headers
-DCLANG_TIDY_58992 # TODO: this is Linux-only for the moment, nice to fix eventually
#
# - google-readability-braces-around-statements
# - hicpp-braces-around-statements
# - readability-braces-around-statements
# Debatable whether that improves readability, right now it would look
# like a mess
#
# - readability-magic-numbers
# - cppcoreguidelines-avoid-magic-numbers
# TODO: in most cases they are justified, but probably not everywhere
#
# - clang-analyzer-valist.Uninitialized
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
#
# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling
# Probably not doable to implement this without plain memcpy(), memset()
#
# - cppcoreguidelines-init-variables
# Dubious value, would kill readability
#
# - hicpp-signed-bitwise
# Those are needed for syscalls, epoll_wait flags, etc.
#
# - llvm-include-order
# TODO: not really important, but nice to fix eventually
#
# - readability-isolate-declaration
# Dubious value, would kill readability
#
# - bugprone-narrowing-conversions
# - cppcoreguidelines-narrowing-conversions
# TODO: nice to fix eventually
#
# - cppcoreguidelines-avoid-non-const-global-variables
# TODO: check, fix, and more in general constify wherever possible
#
# - altera-unroll-loops
# - altera-id-dependent-backward-branch
# TODO: check paths where it might make sense to improve performance
#
# - bugprone-easily-swappable-parameters
# Not much can be done about them other than being careful
#
# - readability-function-cognitive-complexity
# TODO: split reported functions
#
# - altera-struct-pack-align
# "Poor" alignment needed for structs reflecting message formats/headers
#
# - concurrency-mt-unsafe
# TODO: check again if multithreading is implemented
#
# - readability-identifier-length
# Complains about any identifier <3 characters, reasonable for
# globals, pointlessly verbose for locals and parameters.
#
# - bugprone-assignment-in-if-condition
# Dubious value over the compiler's built-in warning. Would
# increase verbosity.
#
# - misc-include-cleaner
# Wants to include headers which *directly* provide the things
# we use. That sounds nice, but means it will often want an
# OS-specific header instead of a mostly standard one, such as
# <linux/limits.h> instead of <limits.h>.
#
# - cppcoreguidelines-macro-to-enum
# Wants to replace all #defines of integers with enums. Kind of
# makes sense when those defines form an enum-like set, but
# weird for cases like standalone constants, and causes other
# awkwardness for a bunch of cases we use
cppcheck: $(PASST_SRCS) $(HEADERS) clang-tidy: $(SRCS) $(HEADERS)
clang-tidy -checks=*,-modernize-*,\
-clang-analyzer-valist.Uninitialized,\
-cppcoreguidelines-init-variables,\
-bugprone-assignment-in-if-condition,\
-google-readability-braces-around-statements,\
-hicpp-braces-around-statements,\
-readability-braces-around-statements,\
-readability-magic-numbers,\
-llvmlibc-restrict-system-libc-headers,\
-hicpp-signed-bitwise,\
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
-llvm-include-order,\
-cppcoreguidelines-avoid-magic-numbers,\
-readability-isolate-declaration,\
-bugprone-narrowing-conversions,\
-cppcoreguidelines-narrowing-conversions,\
-cppcoreguidelines-avoid-non-const-global-variables,\
-altera-unroll-loops,-altera-id-dependent-backward-branch,\
-bugprone-easily-swappable-parameters,\
-readability-function-cognitive-complexity,\
-altera-struct-pack-align,\
-concurrency-mt-unsafe,\
-readability-identifier-length,\
-misc-include-cleaner,\
-cppcoreguidelines-macro-to-enum \
-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
--warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
VER := $(shell $(CC) -dumpversion)
SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
endif
cppcheck: $(SRCS) $(HEADERS)
if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \ if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \ CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \
else \ else \
@ -172,8 +314,11 @@ cppcheck: $(PASST_SRCS) $(HEADERS)
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \ cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
--inconclusive --library=posix --quiet \ --inconclusive --library=posix --quiet \
$${CPPCHECK_EXHAUSTIVE} \ $${CPPCHECK_EXHAUSTIVE} \
$(SYSTEM_INCLUDES:%=-I%) \
$(SYSTEM_INCLUDES:%=--config-exclude=%) \
$(SYSTEM_INCLUDES:%=--suppress=*:%/*) \
$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \
--inline-suppr \ --inline-suppr \
--suppress=missingIncludeSystem \
--suppress=unusedStructMember \ --suppress=unusedStructMember \
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \ $(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
$(PASST_SRCS) $(HEADERS) $(SRCS) $(HEADERS)

8
arch.c
View file

@ -19,7 +19,6 @@
#include <unistd.h> #include <unistd.h>
#include "log.h" #include "log.h"
#include "util.h"
/** /**
* arch_avx2_exec() - Switch to AVX2 build if supported * arch_avx2_exec() - Switch to AVX2 build if supported
@ -41,11 +40,8 @@ void arch_avx2_exec(char **argv)
if (__builtin_cpu_supports("avx2")) { if (__builtin_cpu_supports("avx2")) {
char new_path[PATH_MAX + sizeof(".avx2")]; char new_path[PATH_MAX + sizeof(".avx2")];
if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"), snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
"%s.avx2", exe)) execve(new_path, argv, environ);
die_perror("Can't build AVX2 executable path");
execv(new_path, argv);
warn_perror("Can't run AVX2 build, using non-AVX2 version"); warn_perror("Can't run AVX2 build, using non-AVX2 version");
} }
} }

8
arp.c
View file

@ -59,12 +59,14 @@ int arp(const struct ctx *c, const struct pool *p)
ah->ar_op != htons(ARPOP_REQUEST)) ah->ar_op != htons(ARPOP_REQUEST))
return 1; return 1;
/* Discard announcements, but not 0.0.0.0 "probes" */ /* Discard announcements (but not 0.0.0.0 "probes"): we might have the
if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) && * same IP address, hide that.
*/
if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
!memcmp(am->sip, am->tip, sizeof(am->sip))) !memcmp(am->sip, am->tip, sizeof(am->sip)))
return 1; return 1;
/* Don't resolve the guest's assigned address, either. */ /* Don't resolve our own address, either. */
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip))) if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
return 1; return 1;

View file

@ -59,7 +59,6 @@
#include "util.h" #include "util.h"
#include "ip.h" #include "ip.h"
#include "checksum.h" #include "checksum.h"
#include "iov.h"
/* Checksums are optional for UDP over IPv4, so we usually just set /* Checksums are optional for UDP over IPv4, so we usually just set
* them to 0. Change this to 1 to calculate real UDP over IPv4 * them to 0. Change this to 1 to calculate real UDP over IPv4
@ -166,24 +165,22 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
* @udp4hr: UDP header, initialised apart from checksum * @udp4hr: UDP header, initialised apart from checksum
* @saddr: IPv4 source address * @saddr: IPv4 source address
* @daddr: IPv4 destination address * @daddr: IPv4 destination address
* @iov: Pointer to the array of IO vectors * @payload: UDP packet payload
* @iov_cnt: Length of the array * @dlen: Length of @payload (not including UDP header)
* @offset: UDP payload offset in the iovec array
*/ */
void csum_udp4(struct udphdr *udp4hr, void csum_udp4(struct udphdr *udp4hr,
struct in_addr saddr, struct in_addr daddr, struct in_addr saddr, struct in_addr daddr,
const struct iovec *iov, int iov_cnt, size_t offset) const void *payload, size_t dlen)
{ {
/* UDP checksums are optional, so don't bother */ /* UDP checksums are optional, so don't bother */
udp4hr->check = 0; udp4hr->check = 0;
if (UDP4_REAL_CHECKSUMS) { if (UDP4_REAL_CHECKSUMS) {
uint16_t l4len = iov_size(iov, iov_cnt) - offset + uint16_t l4len = dlen + sizeof(struct udphdr);
sizeof(struct udphdr);
uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP, uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
saddr, daddr); saddr, daddr);
psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum); psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
udp4hr->check = csum_iov(iov, iov_cnt, offset, psum); udp4hr->check = csum(payload, dlen, psum);
} }
} }
@ -229,24 +226,19 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
/** /**
* csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet * csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
* @udp6hr: UDP header, initialised apart from checksum * @udp6hr: UDP header, initialised apart from checksum
* @saddr: Source address * @payload: UDP packet payload
* @daddr: Destination address * @dlen: Length of @payload (not including UDP header)
* @iov: Pointer to the array of IO vectors
* @iov_cnt: Length of the array
* @offset: UDP payload offset in the iovec array
*/ */
void csum_udp6(struct udphdr *udp6hr, void csum_udp6(struct udphdr *udp6hr,
const struct in6_addr *saddr, const struct in6_addr *daddr, const struct in6_addr *saddr, const struct in6_addr *daddr,
const struct iovec *iov, int iov_cnt, size_t offset) const void *payload, size_t dlen)
{ {
uint16_t l4len = iov_size(iov, iov_cnt) - offset + uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(struct udphdr),
sizeof(struct udphdr); IPPROTO_UDP, saddr, daddr);
uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
saddr, daddr);
udp6hr->check = 0; udp6hr->check = 0;
psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum); psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
udp6hr->check = csum_iov(iov, iov_cnt, offset, psum); udp6hr->check = csum(payload, dlen, psum);
} }
/** /**
@ -505,26 +497,16 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
* *
* @iov Pointer to the array of IO vectors * @iov Pointer to the array of IO vectors
* @n Length of the array * @n Length of the array
* @offset: Offset of the data to checksum within the full data length
* @init Initial 32-bit checksum, 0 for no pre-computed checksum * @init Initial 32-bit checksum, 0 for no pre-computed checksum
* *
* Return: 16-bit folded, complemented checksum * Return: 16-bit folded, complemented checksum
*/ */
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, /* cppcheck-suppress unusedFunction */
uint32_t init) uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
{ {
unsigned int i; unsigned int i;
size_t first;
i = iov_skip_bytes(iov, n, offset, &first); for (i = 0; i < n; i++)
if (i >= n)
return (uint16_t)~csum_fold(init);
init = csum_unfolded((char *)iov[i].iov_base + first,
iov[i].iov_len - first, init);
i++;
for (; i < n; i++)
init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init); init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
return (uint16_t)~csum_fold(init); return (uint16_t)~csum_fold(init);

View file

@ -19,20 +19,19 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
struct in_addr saddr, struct in_addr daddr); struct in_addr saddr, struct in_addr daddr);
void csum_udp4(struct udphdr *udp4hr, void csum_udp4(struct udphdr *udp4hr,
struct in_addr saddr, struct in_addr daddr, struct in_addr saddr, struct in_addr daddr,
const struct iovec *iov, int iov_cnt, size_t offset); const void *payload, size_t dlen);
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen); void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol, uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
const struct in6_addr *saddr, const struct in6_addr *saddr,
const struct in6_addr *daddr); const struct in6_addr *daddr);
void csum_udp6(struct udphdr *udp6hr, void csum_udp6(struct udphdr *udp6hr,
const struct in6_addr *saddr, const struct in6_addr *daddr, const struct in6_addr *saddr, const struct in6_addr *daddr,
const struct iovec *iov, int iov_cnt, size_t offset); const void *payload, size_t dlen);
void csum_icmp6(struct icmp6hdr *icmp6hr, void csum_icmp6(struct icmp6hdr *icmp6hr,
const struct in6_addr *saddr, const struct in6_addr *daddr, const struct in6_addr *saddr, const struct in6_addr *daddr,
const void *payload, size_t dlen); const void *payload, size_t dlen);
uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init); uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
uint16_t csum(const void *buf, size_t len, uint32_t init); uint16_t csum(const void *buf, size_t len, uint32_t init);
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init);
uint32_t init);
#endif /* CHECKSUM_H */ #endif /* CHECKSUM_H */

117
conf.c
View file

@ -46,8 +46,6 @@
#include "isolation.h" #include "isolation.h"
#include "log.h" #include "log.h"
#define NETNS_RUN_DIR "/run/netns"
/** /**
* next_chunk - Return the next piece of a string delimited by a character * next_chunk - Return the next piece of a string delimited by a character
* @s: String to search * @s: String to search
@ -118,10 +116,11 @@ static int parse_port_range(const char *s, char **endptr,
static void conf_ports(const struct ctx *c, char optname, const char *optarg, static void conf_ports(const struct ctx *c, char optname, const char *optarg,
struct fwd_ports *fwd) struct fwd_ports *fwd)
{ {
union inany_addr addr_buf = inany_any6, *addr = &addr_buf; char addr_buf[sizeof(struct in6_addr)] = { 0 }, *addr = addr_buf;
char buf[BUFSIZ], *spec, *ifname = NULL, *p; char buf[BUFSIZ], *spec, *ifname = NULL, *p;
bool exclude_only = true, bound_one = false; bool exclude_only = true, bound_one = false;
uint8_t exclude[PORT_BITMAP_SIZE] = { 0 }; uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
sa_family_t af = AF_UNSPEC;
unsigned i; unsigned i;
int ret; int ret;
@ -167,13 +166,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
bitmap_set(fwd->map, i); bitmap_set(fwd->map, i);
if (optname == 't') { if (optname == 't') {
ret = tcp_sock_init(c, NULL, NULL, i); ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL,
i);
if (ret == -ENFILE || ret == -EMFILE) if (ret == -ENFILE || ret == -EMFILE)
goto enfile; goto enfile;
if (!ret) if (!ret)
bound_one = true; bound_one = true;
} else if (optname == 'u') { } else if (optname == 'u') {
ret = udp_sock_init(c, 0, NULL, NULL, i); ret = udp_sock_init(c, 0, AF_UNSPEC, NULL, NULL,
i);
if (ret == -ENFILE || ret == -EMFILE) if (ret == -ENFILE || ret == -EMFILE)
goto enfile; goto enfile;
if (!ret) if (!ret)
@ -225,7 +226,11 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
p++; p++;
} }
if (!inany_pton(p, addr)) if (inet_pton(AF_INET, p, addr))
af = AF_INET;
else if (inet_pton(AF_INET6, p, addr))
af = AF_INET6;
else
goto bad; goto bad;
} }
} else { } else {
@ -271,13 +276,13 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
bitmap_set(fwd->map, i); bitmap_set(fwd->map, i);
if (optname == 't') { if (optname == 't') {
ret = tcp_sock_init(c, addr, ifname, i); ret = tcp_sock_init(c, af, addr, ifname, i);
if (ret == -ENFILE || ret == -EMFILE) if (ret == -ENFILE || ret == -EMFILE)
goto enfile; goto enfile;
if (!ret) if (!ret)
bound_one = true; bound_one = true;
} else if (optname == 'u') { } else if (optname == 'u') {
ret = udp_sock_init(c, 0, addr, ifname, i); ret = udp_sock_init(c, 0, af, addr, ifname, i);
if (ret == -ENFILE || ret == -EMFILE) if (ret == -ENFILE || ret == -EMFILE)
goto enfile; goto enfile;
if (!ret) if (!ret)
@ -333,9 +338,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
ret = 0; ret = 0;
if (optname == 't') if (optname == 't')
ret = tcp_sock_init(c, addr, ifname, i); ret = tcp_sock_init(c, af, addr, ifname, i);
else if (optname == 'u') else if (optname == 'u')
ret = udp_sock_init(c, 0, addr, ifname, i); ret = udp_sock_init(c, 0, af, addr, ifname, i);
if (ret) if (ret)
goto bind_fail; goto bind_fail;
} }
@ -576,15 +581,10 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns,
if (pidval < 0 || pidval > INT_MAX) if (pidval < 0 || pidval > INT_MAX)
die("Invalid PID %s", argv[optind]); die("Invalid PID %s", argv[optind]);
if (snprintf_check(netns, PATH_MAX, snprintf(netns, PATH_MAX, "/proc/%ld/ns/net", pidval);
"/proc/%ld/ns/net", pidval)) if (!*userns)
die_perror("Can't build netns path"); snprintf(userns, PATH_MAX, "/proc/%ld/ns/user",
pidval);
if (!*userns) {
if (snprintf_check(userns, PATH_MAX,
"/proc/%ld/ns/user", pidval))
die_perror("Can't build userns path");
}
} }
} }
@ -735,19 +735,19 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
static void usage(const char *name, FILE *f, int status) static void usage(const char *name, FILE *f, int status)
{ {
if (strstr(name, "pasta")) { if (strstr(name, "pasta")) {
FPRINTF(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name); fprintf(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name);
FPRINTF(f, " %s [OPTION]... PID\n", name); fprintf(f, " %s [OPTION]... PID\n", name);
FPRINTF(f, " %s [OPTION]... --netns [PATH|NAME]\n", name); fprintf(f, " %s [OPTION]... --netns [PATH|NAME]\n", name);
FPRINTF(f, fprintf(f,
"\n" "\n"
"Without PID or --netns, run the given command or a\n" "Without PID or --netns, run the given command or a\n"
"default shell in a new network and user namespace, and\n" "default shell in a new network and user namespace, and\n"
"connect it via pasta.\n"); "connect it via pasta.\n");
} else { } else {
FPRINTF(f, "Usage: %s [OPTION]...\n", name); fprintf(f, "Usage: %s [OPTION]...\n", name);
} }
FPRINTF(f, fprintf(f,
"\n" "\n"
" -d, --debug Be verbose\n" " -d, --debug Be verbose\n"
" --trace Be extra verbose, implies --debug\n" " --trace Be extra verbose, implies --debug\n"
@ -764,17 +764,17 @@ static void usage(const char *name, FILE *f, int status)
" --version Show version and exit\n"); " --version Show version and exit\n");
if (strstr(name, "pasta")) { if (strstr(name, "pasta")) {
FPRINTF(f, fprintf(f,
" -I, --ns-ifname NAME namespace interface name\n" " -I, --ns-ifname NAME namespace interface name\n"
" default: same interface name as external one\n"); " default: same interface name as external one\n");
} else { } else {
FPRINTF(f, fprintf(f,
" -s, --socket PATH UNIX domain socket path\n" " -s, --socket PATH UNIX domain socket path\n"
" default: probe free path starting from " " default: probe free path starting from "
UNIX_SOCK_PATH "\n", 1); UNIX_SOCK_PATH "\n", 1);
} }
FPRINTF(f, fprintf(f,
" -F, --fd FD Use FD as pre-opened connected socket\n" " -F, --fd FD Use FD as pre-opened connected socket\n"
" -p, --pcap FILE Log tap-facing traffic to pcap file\n" " -p, --pcap FILE Log tap-facing traffic to pcap file\n"
" -P, --pid FILE Write own PID to the given file\n" " -P, --pid FILE Write own PID to the given file\n"
@ -805,28 +805,28 @@ static void usage(const char *name, FILE *f, int status)
" can be specified multiple times\n" " can be specified multiple times\n"
" a single, empty option disables DNS information\n"); " a single, empty option disables DNS information\n");
if (strstr(name, "pasta")) if (strstr(name, "pasta"))
FPRINTF(f, " default: don't use any addresses\n"); fprintf(f, " default: don't use any addresses\n");
else else
FPRINTF(f, " default: use addresses from /etc/resolv.conf\n"); fprintf(f, " default: use addresses from /etc/resolv.conf\n");
FPRINTF(f, fprintf(f,
" -S, --search LIST Space-separated list, search domains\n" " -S, --search LIST Space-separated list, search domains\n"
" a single, empty option disables the DNS search list\n"); " a single, empty option disables the DNS search list\n");
if (strstr(name, "pasta")) if (strstr(name, "pasta"))
FPRINTF(f, " default: don't use any search list\n"); fprintf(f, " default: don't use any search list\n");
else else
FPRINTF(f, " default: use search list from /etc/resolv.conf\n"); fprintf(f, " default: use search list from /etc/resolv.conf\n");
if (strstr(name, "pasta")) if (strstr(name, "pasta"))
FPRINTF(f, " --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP\n"); fprintf(f, " --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP\n");
else else
FPRINTF(f, " --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP\n"); fprintf(f, " --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP\n");
if (strstr(name, "pasta")) if (strstr(name, "pasta"))
FPRINTF(f, " --dhcp-search Pass list via DHCP/DHCPv6/NDP\n"); fprintf(f, " --dhcp-search Pass list via DHCP/DHCPv6/NDP\n");
else else
FPRINTF(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n"); fprintf(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n");
FPRINTF(f, fprintf(f,
" --map-host-loopback ADDR Translate ADDR to refer to host\n" " --map-host-loopback ADDR Translate ADDR to refer to host\n"
" can be specified zero to two times (for IPv4 and IPv6)\n" " can be specified zero to two times (for IPv4 and IPv6)\n"
" default: gateway address\n" " default: gateway address\n"
@ -836,9 +836,6 @@ static void usage(const char *name, FILE *f, int status)
" --dns-forward ADDR Forward DNS queries sent to ADDR\n" " --dns-forward ADDR Forward DNS queries sent to ADDR\n"
" can be specified zero to two times (for IPv4 and IPv6)\n" " can be specified zero to two times (for IPv4 and IPv6)\n"
" default: don't forward DNS queries\n" " default: don't forward DNS queries\n"
" --dns-host ADDR Host nameserver to direct queries to\n"
" can be specified zero to two times (for IPv4 and IPv6)\n"
" default: first nameserver from host's /etc/resolv.conf\n"
" --no-tcp Disable TCP protocol handler\n" " --no-tcp Disable TCP protocol handler\n"
" --no-udp Disable UDP protocol handler\n" " --no-udp Disable UDP protocol handler\n"
" --no-icmp Disable ICMP/ICMPv6 protocol handler\n" " --no-icmp Disable ICMP/ICMPv6 protocol handler\n"
@ -846,7 +843,6 @@ static void usage(const char *name, FILE *f, int status)
" --no-ndp Disable NDP responses\n" " --no-ndp Disable NDP responses\n"
" --no-dhcpv6 Disable DHCPv6 server\n" " --no-dhcpv6 Disable DHCPv6 server\n"
" --no-ra Disable router advertisements\n" " --no-ra Disable router advertisements\n"
" --freebind Bind to any address for forwarding\n"
" --no-map-gw Don't map gateway address to host\n" " --no-map-gw Don't map gateway address to host\n"
" -4, --ipv4-only Enable IPv4 operation only\n" " -4, --ipv4-only Enable IPv4 operation only\n"
" -6, --ipv6-only Enable IPv6 operation only\n"); " -6, --ipv6-only Enable IPv6 operation only\n");
@ -854,7 +850,7 @@ static void usage(const char *name, FILE *f, int status)
if (strstr(name, "pasta")) if (strstr(name, "pasta"))
goto pasta_opts; goto pasta_opts;
FPRINTF(f, fprintf(f,
" -1, --one-off Quit after handling one single client\n" " -1, --one-off Quit after handling one single client\n"
" -t, --tcp-ports SPEC TCP port forwarding to guest\n" " -t, --tcp-ports SPEC TCP port forwarding to guest\n"
" can be specified multiple times\n" " can be specified multiple times\n"
@ -885,7 +881,7 @@ static void usage(const char *name, FILE *f, int status)
pasta_opts: pasta_opts:
FPRINTF(f, fprintf(f,
" -t, --tcp-ports SPEC TCP port forwarding to namespace\n" " -t, --tcp-ports SPEC TCP port forwarding to namespace\n"
" can be specified multiple times\n" " can be specified multiple times\n"
" SPEC can be:\n" " SPEC can be:\n"
@ -919,9 +915,6 @@ pasta_opts:
" -U, --udp-ns SPEC UDP port forwarding to init namespace\n" " -U, --udp-ns SPEC UDP port forwarding to init namespace\n"
" SPEC is as described above\n" " SPEC is as described above\n"
" default: auto\n" " default: auto\n"
" --host-lo-to-ns-lo DEPRECATED:\n"
" Translate host-loopback forwards to\n"
" namespace loopback\n"
" --userns NSPATH Target user namespace to join\n" " --userns NSPATH Target user namespace to join\n"
" --netns PATH|NAME Target network namespace to join\n" " --netns PATH|NAME Target network namespace to join\n"
" --netns-only Don't join existing user namespace\n" " --netns-only Don't join existing user namespace\n"
@ -1196,11 +1189,7 @@ static void conf_open_files(struct ctx *c)
if (c->mode != MODE_PASTA && c->fd_tap == -1) if (c->mode != MODE_PASTA && c->fd_tap == -1)
c->fd_tap_listen = tap_sock_unix_open(c->sock_path); c->fd_tap_listen = tap_sock_unix_open(c->sock_path);
if (*c->pidfile) { c->pidfile_fd = pidfile_open(c->pidfile);
c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
if (c->pidfile_fd < 0)
die_perror("Couldn't open PID file %s", c->pidfile);
}
} }
/** /**
@ -1273,7 +1262,6 @@ void conf(struct ctx *c, int argc, char **argv)
{"no-dhcpv6", no_argument, &c->no_dhcpv6, 1 }, {"no-dhcpv6", no_argument, &c->no_dhcpv6, 1 },
{"no-ndp", no_argument, &c->no_ndp, 1 }, {"no-ndp", no_argument, &c->no_ndp, 1 },
{"no-ra", no_argument, &c->no_ra, 1 }, {"no-ra", no_argument, &c->no_ra, 1 },
{"freebind", no_argument, &c->freebind, 1 },
{"no-map-gw", no_argument, &no_map_gw, 1 }, {"no-map-gw", no_argument, &no_map_gw, 1 },
{"ipv4-only", no_argument, NULL, '4' }, {"ipv4-only", no_argument, NULL, '4' },
{"ipv6-only", no_argument, NULL, '6' }, {"ipv6-only", no_argument, NULL, '6' },
@ -1303,8 +1291,6 @@ void conf(struct ctx *c, int argc, char **argv)
{"netns-only", no_argument, NULL, 20 }, {"netns-only", no_argument, NULL, 20 },
{"map-host-loopback", required_argument, NULL, 21 }, {"map-host-loopback", required_argument, NULL, 21 },
{"map-guest-addr", required_argument, NULL, 22 }, {"map-guest-addr", required_argument, NULL, 22 },
{"host-lo-to-ns-lo", no_argument, NULL, 23 },
{"dns-host", required_argument, NULL, 24 },
{ 0 }, { 0 },
}; };
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@ -1427,9 +1413,9 @@ void conf(struct ctx *c, int argc, char **argv)
break; break;
case 14: case 14:
FPRINTF(stdout, fprintf(stdout,
c->mode == MODE_PASTA ? "pasta " : "passt "); c->mode == MODE_PASTA ? "pasta " : "passt ");
FPRINTF(stdout, VERSION_BLOB); fprintf(stdout, VERSION_BLOB);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
case 15: case 15:
ret = snprintf(c->ip4.ifname_out, ret = snprintf(c->ip4.ifname_out,
@ -1482,23 +1468,6 @@ void conf(struct ctx *c, int argc, char **argv)
conf_nat(optarg, &c->ip4.map_guest_addr, conf_nat(optarg, &c->ip4.map_guest_addr,
&c->ip6.map_guest_addr, NULL); &c->ip6.map_guest_addr, NULL);
break; break;
case 23:
if (c->mode != MODE_PASTA)
die("--host-lo-to-ns-lo is for pasta mode only");
c->host_lo_to_ns_lo = 1;
break;
case 24:
if (inet_pton(AF_INET6, optarg, &c->ip6.dns_host) &&
!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
break;
if (inet_pton(AF_INET, optarg, &c->ip4.dns_host) &&
!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host) &&
!IN4_IS_ADDR_BROADCAST(&c->ip4.dns_host))
break;
die("Invalid host nameserver address: %s", optarg);
break;
case 'd': case 'd':
c->debug = 1; c->debug = 1;
c->quiet = 0; c->quiet = 0;

View file

@ -34,8 +34,6 @@
owner @{PROC}/@{pid}/uid_map r, # conf_ugid() owner @{PROC}/@{pid}/uid_map r, # conf_ugid()
@{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral()
network netlink raw, # nl_sock_init_do(), netlink.c network netlink raw, # nl_sock_init_do(), netlink.c
network inet stream, # tcp.c network inet stream, # tcp.c

View file

@ -50,7 +50,6 @@ require {
type passwd_file_t; type passwd_file_t;
class netlink_route_socket { bind create nlmsg_read }; class netlink_route_socket { bind create nlmsg_read };
type sysctl_net_t;
class capability { sys_tty_config setuid setgid }; class capability { sys_tty_config setuid setgid };
class cap_userns { setpcap sys_admin sys_ptrace }; class cap_userns { setpcap sys_admin sys_ptrace };
@ -105,8 +104,6 @@ allow passt_t net_conf_t:lnk_file read;
allow passt_t tmp_t:sock_file { create unlink write }; allow passt_t tmp_t:sock_file { create unlink write };
allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt }; allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
kernel_search_network_sysctl(passt_t) kernel_search_network_sysctl(passt_t)
allow passt_t sysctl_net_t:dir search;
allow passt_t sysctl_net_t:file { open read };
corenet_tcp_bind_all_nodes(passt_t) corenet_tcp_bind_all_nodes(passt_t)
corenet_udp_bind_all_nodes(passt_t) corenet_udp_bind_all_nodes(passt_t)

View file

@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch };
allow pasta_t self:tun_socket create; allow pasta_t self:tun_socket create;
allow pasta_t tun_tap_device_t:chr_file { ioctl open read write }; allow pasta_t tun_tap_device_t:chr_file { ioctl open read write };
allow pasta_t sysctl_net_t:dir search; allow pasta_t sysctl_net_t:dir search;
allow pasta_t sysctl_net_t:file { open read write }; allow pasta_t sysctl_net_t:file { open write };
allow pasta_t kernel_t:system module_request; allow pasta_t kernel_t:system module_request;
allow pasta_t nsfs_t:file read; allow pasta_t nsfs_t:file read;

View file

@ -296,42 +296,47 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p, static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
struct in6_addr *la) struct in6_addr *la)
{ {
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
const struct opt_ia_addr *opt_addr;
char buf[INET6_ADDRSTRLEN]; char buf[INET6_ADDRSTRLEN];
struct in6_addr req_addr; struct in6_addr req_addr;
const struct opt_hdr *h; const struct opt_hdr *h;
struct opt_hdr *ia; struct opt_hdr *ia;
size_t offset; size_t offset;
int ia_type;
foreach(ia_type, ia_types) { ia_type = OPT_IA_NA;
ia_ta:
offset = 0; offset = 0;
while ((ia = dhcpv6_opt(p, &offset, *ia_type))) { while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
if (ntohs(ia->l) < OPT_VSIZE(ia_na)) if (ntohs(ia->l) < OPT_VSIZE(ia_na))
return NULL; return NULL;
offset += sizeof(struct opt_ia_na); offset += sizeof(struct opt_ia_na);
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) { while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
const struct opt_ia_addr *opt_addr;
if (ntohs(h->l) != OPT_VSIZE(ia_addr)) if (ntohs(h->l) != OPT_VSIZE(ia_addr))
return NULL; return NULL;
opt_addr = (const struct opt_ia_addr *)h; opt_addr = (const struct opt_ia_addr *)h;
req_addr = opt_addr->addr; req_addr = opt_addr->addr;
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
goto err; info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, &req_addr,
buf, sizeof(buf)));
return ia;
}
offset += sizeof(struct opt_ia_addr); offset += sizeof(struct opt_ia_addr);
} }
} }
if (ia_type == OPT_IA_NA) {
ia_type = OPT_IA_TA;
goto ia_ta;
} }
return NULL; return NULL;
err:
info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
return ia;
} }
/** /**
@ -423,11 +428,11 @@ search:
int dhcpv6(struct ctx *c, const struct pool *p, int dhcpv6(struct ctx *c, const struct pool *p,
const struct in6_addr *saddr, const struct in6_addr *daddr) const struct in6_addr *saddr, const struct in6_addr *daddr)
{ {
const struct opt_hdr *client_id, *server_id, *ia; struct opt_hdr *ia, *bad_ia, *client_id;
const struct opt_hdr *server_id;
const struct in6_addr *src; const struct in6_addr *src;
const struct msg_hdr *mh; const struct msg_hdr *mh;
const struct udphdr *uh; const struct udphdr *uh;
struct opt_hdr *bad_ia;
size_t mlen, n; size_t mlen, n;
uh = packet_get(p, 0, 0, sizeof(*uh), &mlen); uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);

53
flow.c
View file

@ -283,23 +283,28 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg); "Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
} }
/** flow_log_details_() - Log the details of a flow /**
* @f: flow to log * flow_set_state() - Change flow's state
* @pri: Log priority * @f: Flow changing state
* @state: State to log details according to * @state: New state
*
* Logs the details of the flow: endpoints, interfaces, type etc.
*/ */
void flow_log_details_(const struct flow_common *f, int pri, static void flow_set_state(struct flow_common *f, enum flow_state state)
enum flow_state state)
{ {
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN]; char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN]; char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
const struct flowside *ini = &f->side[INISIDE]; const struct flowside *ini = &f->side[INISIDE];
const struct flowside *tgt = &f->side[TGTSIDE]; const struct flowside *tgt = &f->side[TGTSIDE];
uint8_t oldstate = f->state;
if (state >= FLOW_STATE_TGT) ASSERT(state < FLOW_NUM_STATES);
flow_log_(f, pri, ASSERT(oldstate < FLOW_NUM_STATES);
f->state = state;
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
FLOW_STATE(f));
if (MAX(state, oldstate) >= FLOW_STATE_TGT)
flow_log_(f, LOG_DEBUG,
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu", "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
pif_name(f->pif[INISIDE]), pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
@ -311,8 +316,8 @@ void flow_log_details_(const struct flow_common *f, int pri,
tgt->oport, tgt->oport,
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)), inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
tgt->eport); tgt->eport);
else if (state >= FLOW_STATE_INI) else if (MAX(state, oldstate) >= FLOW_STATE_INI)
flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?", flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => ?",
pif_name(f->pif[INISIDE]), pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
ini->eport, ini->eport,
@ -320,25 +325,6 @@ void flow_log_details_(const struct flow_common *f, int pri,
ini->oport); ini->oport);
} }
/**
* flow_set_state() - Change flow's state
* @f: Flow changing state
* @state: New state
*/
static void flow_set_state(struct flow_common *f, enum flow_state state)
{
uint8_t oldstate = f->state;
ASSERT(state < FLOW_NUM_STATES);
ASSERT(oldstate < FLOW_NUM_STATES);
f->state = state;
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
FLOW_STATE(f));
flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
}
/** /**
* flow_initiate_() - Move flow to INI, setting pif[INISIDE] * flow_initiate_() - Move flow to INI, setting pif[INISIDE]
* @flow: Flow to change state * @flow: Flow to change state
@ -711,7 +697,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
!(FLOW_PROTO(&flow->f) == proto && !(FLOW_PROTO(&flow->f) == proto &&
flow->f.pif[sidx.sidei] == pif && flow->f.pif[sidx.sidei] == pif &&
flowside_eq(&flow->f.side[sidx.sidei], side))) flowside_eq(&flow->f.side[sidx.sidei], side)))
b = mod_sub(b, 1, FLOW_HASH_SIZE); b = (b + 1) % FLOW_HASH_SIZE;
return flow_hashtab[b]; return flow_hashtab[b];
} }
@ -846,8 +832,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
closed = icmp_ping_timer(c, &flow->ping, now); closed = icmp_ping_timer(c, &flow->ping, now);
break; break;
case FLOW_UDP: case FLOW_UDP:
closed = udp_flow_defer(&flow->udp); if (timer)
if (!closed && timer)
closed = udp_flow_timer(c, &flow->udp, now); closed = udp_flow_timer(c, &flow->udp, now);
break; break;
default: default:

7
flow.h
View file

@ -264,11 +264,4 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
flow_dbg((f), __VA_ARGS__); \ flow_dbg((f), __VA_ARGS__); \
} while (0) } while (0)
void flow_log_details_(const struct flow_common *f, int pri,
enum flow_state state);
#define flow_log_details(f_, pri) \
flow_log_details_(&((f_)->f), (pri), (f_)->f.state)
#define flow_dbg_details(f_) flow_log_details((f_), LOG_DEBUG)
#define flow_err_details(f_) flow_log_details((f_), LOG_ERR)
#endif /* FLOW_H */ #endif /* FLOW_H */

View file

@ -110,7 +110,7 @@ static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
const union flow *flow = flow_at_sidx(sidx); const union flow *flow = flow_at_sidx(sidx);
if (!flow) if (!flow)
return NULL; return PIF_NONE;
return &flow->f.side[sidx.sidei]; return &flow->f.side[sidx.sidei];
} }

35
fwd.c
View file

@ -75,8 +75,8 @@ void fwd_probe_ephemeral(void)
if (*end || errno) if (*end || errno)
goto parse_err; goto parse_err;
if (min < 0 || min >= (long)NUM_PORTS || if (min < 0 || min >= NUM_PORTS ||
max < 0 || max >= (long)NUM_PORTS) max < 0 || max >= NUM_PORTS)
goto parse_err; goto parse_err;
fwd_ephemeral_min = min; fwd_ephemeral_min = min;
@ -447,35 +447,20 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
(proto == IPPROTO_TCP || proto == IPPROTO_UDP)) { (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
/* spliceable */ /* spliceable */
/* The traffic will go over the guest's 'lo' interface, but by /* Preserve the specific loopback address used, but let the
* default use its external address, so we don't inadvertently * kernel pick a source port on the target side
* expose services that listen only on the guest's loopback
* address. That can be overridden by --host-lo-to-ns-lo which
* will instead forward to the loopback address in the guest.
*
* In either case, let the kernel pick the source address to
* match.
*/ */
if (inany_v4(&ini->eaddr)) { tgt->oaddr = ini->eaddr;
if (c->host_lo_to_ns_lo)
tgt->eaddr = inany_loopback4;
else
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
tgt->oaddr = inany_any4;
} else {
if (c->host_lo_to_ns_lo)
tgt->eaddr = inany_loopback6;
else
tgt->eaddr.a6 = c->ip6.addr_seen;
tgt->oaddr = inany_any6;
}
/* Let the kernel pick source port */
tgt->oport = 0; tgt->oport = 0;
if (proto == IPPROTO_UDP) if (proto == IPPROTO_UDP)
/* But for UDP preserve the source port */ /* But for UDP preserve the source port */
tgt->oport = ini->eport; tgt->oport = ini->eport;
if (inany_v4(&ini->eaddr))
tgt->eaddr = inany_loopback4;
else
tgt->eaddr = inany_loopback6;
return PIF_SPLICE; return PIF_SPLICE;
} }

20
inany.c
View file

@ -36,23 +36,3 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
return inet_ntop(AF_INET6, &src->a6, dst, size); return inet_ntop(AF_INET6, &src->a6, dst, size);
} }
/**
 * inany_pton() - Convert IPv4 or IPv6 address text into an inany address
 * @src:	Address in text form, IPv4 or IPv6
 * @dst:	Destination, written with the parsed address
 *
 * IPv4 parsing is attempted first, writing through the v4mapped view of @dst;
 * text that doesn't parse as IPv4 is then attempted as IPv6.
 *
 * Return: 1 if an address was parsed, 0 if @src holds no parseable address
 */
int inany_pton(const char *src, union inany_addr *dst)
{
	/* Not IPv4: the outcome depends on the IPv6 attempt alone */
	if (!inet_pton(AF_INET, src, &dst->v4mapped.a4))
		return inet_pton(AF_INET6, src, &dst->a6) ? 1 : 0;

	/* IPv4 parsed: fill in the remaining parts of the v4mapped view */
	memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero));
	memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one));

	return 1;
}

View file

@ -270,6 +270,5 @@ static inline void inany_siphash_feed(struct siphash_state *state,
#define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN) #define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size); const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
int inany_pton(const char *src, union inany_addr *dst);
#endif /* INANY_H */ #endif /* INANY_H */

View file

@ -1,144 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright Red Hat
*
* Declarations for Linux specific dependencies
*/
#ifndef LINUX_DEP_H
#define LINUX_DEP_H
/* struct tcp_info_linux - Result layout of the Linux TCP_INFO getsockopt()
 *
 * Closely modelled on include/linux/tcp.h from the Linux kernel, as of v6.11.
 *
 * The oldest TCP_INFO fields have been around for ages and are shared with
 * BSD: those are the only ones carried by struct tcp_info from netinet/tcp.h.
 * Linux added many extension fields on top of them, and those are only found
 * in the linux/tcp.h version of struct tcp_info.
 *
 * We want to use some of those extensions whenever they are available. At
 * runtime, the length returned by getsockopt() tells us whether the running
 * kernel provides them. The kernel headers we are built against don't
 * necessarily match the kernel we run on, though: building directly against
 * linux/tcp.h would need an #ifdef around every field access, with an #else
 * branch repeating what we do when the field is missing at runtime, and that
 * gets messy quickly.
 *
 * So we carry our own definition here instead, including all the Linux
 * extensions we want to use.
 */
struct tcp_info_linux {
	uint8_t		tcpi_state;
	uint8_t		tcpi_ca_state;
	uint8_t		tcpi_retransmits;
	uint8_t		tcpi_probes;
	uint8_t		tcpi_backoff;
	uint8_t		tcpi_options;

	/* Two groups of bit-fields, same declaration order as upstream */
	uint8_t		tcpi_snd_wscale:4;
	uint8_t		tcpi_rcv_wscale:4;

	uint8_t		tcpi_delivery_rate_app_limited:1;
	uint8_t		tcpi_fastopen_client_fail:2;

	uint32_t	tcpi_rto;
	uint32_t	tcpi_ato;
	uint32_t	tcpi_snd_mss;
	uint32_t	tcpi_rcv_mss;

	uint32_t	tcpi_unacked;
	uint32_t	tcpi_sacked;
	uint32_t	tcpi_lost;
	uint32_t	tcpi_retrans;
	uint32_t	tcpi_fackets;

	/* Times */
	uint32_t	tcpi_last_data_sent;
	uint32_t	tcpi_last_ack_sent;
	uint32_t	tcpi_last_data_recv;
	uint32_t	tcpi_last_ack_recv;

	/* Metrics */
	uint32_t	tcpi_pmtu;
	uint32_t	tcpi_rcv_ssthresh;
	uint32_t	tcpi_rtt;
	uint32_t	tcpi_rttvar;
	uint32_t	tcpi_snd_ssthresh;
	uint32_t	tcpi_snd_cwnd;
	uint32_t	tcpi_advmss;
	uint32_t	tcpi_reordering;

	uint32_t	tcpi_rcv_rtt;
	uint32_t	tcpi_rcv_space;

	uint32_t	tcpi_total_retrans;

	/* Linux extensions start here */
	uint64_t	tcpi_pacing_rate;
	uint64_t	tcpi_max_pacing_rate;

	/* RFC4898 tcpEStatsAppHCThruOctetsAcked */
	uint64_t	tcpi_bytes_acked;
	/* RFC4898 tcpEStatsAppHCThruOctetsReceived */
	uint64_t	tcpi_bytes_received;
	/* RFC4898 tcpEStatsPerfSegsOut */
	uint32_t	tcpi_segs_out;
	/* RFC4898 tcpEStatsPerfSegsIn */
	uint32_t	tcpi_segs_in;

	uint32_t	tcpi_notsent_bytes;
	uint32_t	tcpi_min_rtt;
	/* RFC4898 tcpEStatsDataSegsIn */
	uint32_t	tcpi_data_segs_in;
	/* RFC4898 tcpEStatsDataSegsOut */
	uint32_t	tcpi_data_segs_out;

	uint64_t	tcpi_delivery_rate;

	/* Time (usec) busy sending data */
	uint64_t	tcpi_busy_time;
	/* Time (usec) limited by receive window */
	uint64_t	tcpi_rwnd_limited;
	/* Time (usec) limited by send buffer */
	uint64_t	tcpi_sndbuf_limited;

	uint32_t	tcpi_delivered;
	uint32_t	tcpi_delivered_ce;

	/* RFC4898 tcpEStatsPerfHCDataOctetsOut */
	uint64_t	tcpi_bytes_sent;
	/* RFC4898 tcpEStatsPerfOctetsRetrans */
	uint64_t	tcpi_bytes_retrans;
	/* RFC4898 tcpEStatsStackDSACKDups */
	uint32_t	tcpi_dsack_dups;
	/* Reordering events seen */
	uint32_t	tcpi_reord_seen;

	/* Out-of-order packets received */
	uint32_t	tcpi_rcv_ooopack;

	/* Peer's advertised receive window after scaling (bytes) */
	uint32_t	tcpi_snd_wnd;
	/* Local advertised receive window after scaling (bytes) */
	uint32_t	tcpi_rcv_wnd;

	/* PLB or timeout triggered rehash attempts */
	uint32_t	tcpi_rehash;

	/* Total number of RTO timeouts, including SYN/SYN-ACK and recurring
	 * timeouts
	 */
	uint16_t	tcpi_total_rto;
	/* Total number of RTO recoveries, including any unfinished recovery */
	uint16_t	tcpi_total_rto_recoveries;
	/* Total time spent in RTO recoveries in milliseconds, including any
	 * unfinished recovery
	 */
	uint32_t	tcpi_total_rto_time;
};
#include <linux/falloc.h>
#ifndef FALLOC_FL_COLLAPSE_RANGE
#define FALLOC_FL_COLLAPSE_RANGE 0x08
#endif
#include <linux/close_range.h>
/* glibc < 2.34 and musl as of 1.2.5 need these */
#ifndef SYS_close_range
#define SYS_close_range 436
#endif
#ifndef CLOSE_RANGE_UNSHARE /* Linux kernel < 5.9 */
#define CLOSE_RANGE_UNSHARE (1U << 1)
#endif
/**
 * close_range() - Issue the close_range system call directly
 * @first:	Passed to the system call as first argument
 * @last:	Passed to the system call as second argument
 * @flags:	Passed to the system call as third argument
 *
 * Defined weak.
 *
 * Return: value returned by syscall(), converted to int
 */
__attribute__ ((weak))
/* cppcheck-suppress funcArgNamesDifferent */
int close_range(unsigned int first, unsigned int last, int flags)
{
	long ret = syscall(SYS_close_range, first, last, flags);

	return (int)ret;
}
#endif /* LINUX_DEP_H */

19
log.c
View file

@ -26,7 +26,6 @@
#include <stdarg.h> #include <stdarg.h>
#include <sys/socket.h> #include <sys/socket.h>
#include "linux_dep.h"
#include "log.h" #include "log.h"
#include "util.h" #include "util.h"
#include "passt.h" #include "passt.h"
@ -93,6 +92,7 @@ const char *logfile_prefix[] = {
" ", /* LOG_DEBUG */ " ", /* LOG_DEBUG */
}; };
#ifdef FALLOC_FL_COLLAPSE_RANGE
/** /**
* logfile_rotate_fallocate() - Write header, set log_written after fallocate() * logfile_rotate_fallocate() - Write header, set log_written after fallocate()
* @fd: Log file descriptor * @fd: Log file descriptor
@ -126,6 +126,7 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
log_written -= log_cut_size; log_written -= log_cut_size;
} }
#endif /* FALLOC_FL_COLLAPSE_RANGE */
/** /**
* logfile_rotate_move() - Fallback: move recent entries toward start, then cut * logfile_rotate_move() - Fallback: move recent entries toward start, then cut
@ -197,17 +198,21 @@ out:
* *
* Return: 0 on success, negative error code on failure * Return: 0 on success, negative error code on failure
* *
* #syscalls fcntl fallocate * #syscalls fcntl
*
* fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there
*/ */
static int logfile_rotate(int fd, const struct timespec *now) static int logfile_rotate(int fd, const struct timespec *now)
{ {
if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */)) if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
return -errno; return -errno;
#ifdef FALLOC_FL_COLLAPSE_RANGE
/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */ /* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size)) if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
logfile_rotate_fallocate(fd, now); logfile_rotate_fallocate(fd, now);
else else
#endif
logfile_rotate_move(fd, now); logfile_rotate_move(fd, now);
if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND)) if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
@ -269,7 +274,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
char timestr[LOGTIME_STRLEN]; char timestr[LOGTIME_STRLEN];
logtime_fmt(timestr, sizeof(timestr), now); logtime_fmt(timestr, sizeof(timestr), now);
FPRINTF(stderr, "%s: ", timestr); fprintf(stderr, "%s: ", timestr);
} }
if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) { if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) {
@ -288,7 +293,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
(log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) { (log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) {
(void)vfprintf(stderr, format, ap); (void)vfprintf(stderr, format, ap);
if (newline && format[strlen(format)] != '\n') if (newline && format[strlen(format)] != '\n')
FPRINTF(stderr, "\n"); fprintf(stderr, "\n");
} }
} }
@ -394,7 +399,7 @@ void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
n += snprintf(buf + n, BUFSIZ - n, "\n"); n += snprintf(buf + n, BUFSIZ - n, "\n");
if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr) if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n); fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
} }
/** /**
@ -411,7 +416,8 @@ void logfile_init(const char *name, const char *path, size_t size)
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
die_perror("Failed to read own /proc/self/exe link"); die_perror("Failed to read own /proc/self/exe link");
log_file = output_file_open(path, O_APPEND | O_RDWR); log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC,
S_IRUSR | S_IWUSR);
if (log_file == -1) if (log_file == -1)
die_perror("Couldn't open log file %s", path); die_perror("Couldn't open log file %s", path);
@ -427,3 +433,4 @@ void logfile_init(const char *name, const char *path, size_t size)
/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */ /* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE); log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
} }

4
ndp.c
View file

@ -234,8 +234,8 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
return 1; return 1;
if (ih->icmp6_type == NS) { if (ih->icmp6_type == NS) {
const struct ndp_ns *ns = struct ndp_ns *ns = packet_get(p, 0, 0, sizeof(struct ndp_ns),
packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL); NULL);
if (!ns) if (!ns)
return -1; return -1;

View file

@ -353,7 +353,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
*/ */
bool nl_route_get_def_multipath(struct rtattr *rta, void *gw) bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
{ {
int nh_len = RTA_PAYLOAD(rta); size_t nh_len = RTA_PAYLOAD(rta);
struct rtnexthop *rtnh; struct rtnexthop *rtnh;
bool found = false; bool found = false;
int hops = -1; int hops = -1;
@ -582,7 +582,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
*(unsigned int *)RTA_DATA(rta) = ifi_dst; *(unsigned int *)RTA_DATA(rta) = ifi_dst;
} else if (rta->rta_type == RTA_MULTIPATH) { } else if (rta->rta_type == RTA_MULTIPATH) {
int nh_len = RTA_PAYLOAD(rta); size_t nh_len = RTA_PAYLOAD(rta);
struct rtnexthop *rtnh; struct rtnexthop *rtnh;
for (rtnh = (struct rtnexthop *)RTA_DATA(rta); for (rtnh = (struct rtnexthop *)RTA_DATA(rta);

96
passt.1
View file

@ -95,7 +95,7 @@ detached PID namespace after starting, because the PID itself cannot change.
Default is to fork into background. Default is to fork into background.
.TP .TP
.BR \-e ", " \-\-stderr " " (DEPRECATED) .BR \-e ", " \-\-stderr
This option has no effect, and is maintained for compatibility purposes only. This option has no effect, and is maintained for compatibility purposes only.
Note that this configuration option is \fBdeprecated\fR and will be removed in a Note that this configuration option is \fBdeprecated\fR and will be removed in a
@ -249,19 +249,10 @@ the host.
.TP .TP
.BR \-\-dns-forward " " \fIaddr .BR \-\-dns-forward " " \fIaddr
Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the
nameserver (with corresponding IP version) specified by the first configured DNS resolver (with corresponding IP version). Maps
\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or only UDP and TCP traffic to port 53 or port 853. Replies are
port 853. Replies are translated back with a reverse mapping. This translated back with a reverse mapping. This option can be specified
option can be specified zero to two times (once for IPv4, once for zero to two times (once for IPv4, once for IPv6).
IPv6).
.TP
.BR \-\-dns-host " " \fIaddr
Configure the host nameserver which guest or namespace queries to the
\fB\-\-dns-forward\fR address will be redirected to. This option can
be specified zero to two times (once for IPv4, once for IPv6).
By default, the first nameserver from the host's
\fI/etc/resolv.conf\fR.
.TP .TP
.BR \-S ", " \-\-search " " \fIlist .BR \-S ", " \-\-search " " \fIlist
@ -336,16 +327,6 @@ namespace will be silently dropped.
Disable Router Advertisements. Router Solicitations coming from guest or target Disable Router Advertisements. Router Solicitations coming from guest or target
namespace will be ignored. namespace will be ignored.
.TP
.BR \-\-freebind
Allow any binding address to be specified for \fB-t\fR and \fB-u\fR
options. Usually binding addresses must be addresses currently
configured on the host. With \fB\-\-freebind\fR, the
\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled
allowing any address to be used. This is typically used to bind
addresses which might be configured on the host in future, at which
point the forwarding will immediately start operating.
.TP .TP
.BR \-\-map-host-loopback " " \fIaddr .BR \-\-map-host-loopback " " \fIaddr
Translate \fIaddr\fR to refer to the host. Packets from the guest to Translate \fIaddr\fR to refer to the host. Packets from the guest to
@ -605,13 +586,6 @@ Configure UDP port forwarding from target namespace to init namespace.
Default is \fBauto\fR. Default is \fBauto\fR.
.TP
.BR \-\-host-lo-to-ns-lo " " (DEPRECATED)
If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
the host's loopback address will appear on the loopback address in the
guest as well. Without this option such forwarded packets will appear
to come from the guest's public address.
.TP .TP
.BR \-\-userns " " \fIspec .BR \-\-userns " " \fIspec
Target user namespace to join, as a path. If PID is given, without this option, Target user namespace to join, as a path. If PID is given, without this option,
@ -889,41 +863,38 @@ root@localhost's password:
.SH NOTES .SH NOTES
.SS Handling of traffic with loopback destination and source addresses .SS Handling of traffic with local destination and source addresses
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address,
address (127.0.0.0/8 or ::1), depending on the configuration. Loopback depending on the configuration. Local destination or source addresses need to be
destination or source addresses need to be changed before packets are changed before packets are delivered to the guest or target namespace: most
delivered to the guest or target namespace: most operating systems operating systems would drop packets received from non-loopback interfaces with
would drop packets received with loopback addresses on non-loopback local addresses, and it would also be impossible for guest or target namespace
interfaces, and it would also be impossible for guest or target to route answers back.
namespace to route answers back.
For convenience, the source address on these packets is translated to For convenience, and somewhat arbitrarily, the source address on these packets
the address specified by the \fB\-\-map-host-loopback\fR option (with is translated to the address of the default IPv4 or IPv6 gateway (if any) --
some exceptions in pasta mode, see next section below). If not this is known to be an existing, valid address on the same subnet.
specified this defaults, somewhat arbitrarily, to the address of
default IPv4 or IPv6 gateway (if any) -- this is known to be an
existing, valid address on the same subnet. If \fB\-\-no-map-gw\fR or
\fB\-\-map-host-loopback none\fR are specified this translation is
disabled and packets with loopback addresses are simply dropped.
Loopback destination addresses are translated to the observed external Loopback destination addresses are instead translated to the observed external
address of the guest or target namespace. For IPv6, the observed address of the guest or target namespace. For IPv6 packets, if usage of a
link-local address is used if the translated source address is link-local address by guest or namespace has ever been observed, and the
link-local, otherwise the observed global address is used. For both original destination address is also a link-local address, the observed
IPv4 and IPv6, if no addresses have been seen yet, the configured link-local address is used. Otherwise, the observed global address is used. For
addresses will be used instead. both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses
will be used instead.
For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1, For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
the last observed source address from guest or namespace is 192.0.2.2, this will the last observed source address from guest or namespace is 192.0.2.2, this will
be translated to a connection from 192.0.2.1 to 192.0.2.2. be translated to a connection from 192.0.2.1 to 192.0.2.2.
Similarly, for traffic coming from guest or namespace, packets with Similarly, for traffic coming from guest or namespace, packets with destination
destination address corresponding to the \fB\-\-map-host-loopback\fR address corresponding to the default gateway will have their destination address
address will have their destination address translated to a loopback translated to a loopback address, if and only if a packet, in the opposite
address. direction, with a loopback destination or source address, port-wise matching for
UDP, or connection-wise for TCP, has been recently forwarded to guest or
namespace. This behaviour can be disabled with \-\-no\-map\-gw.
.SS Handling of local traffic in pasta .SS Handling of local traffic in pasta
@ -939,15 +910,8 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair
of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
transfers. transfers.
Because it's not possible to bind sockets to foreign addresses, this This bypass only applies to local connections and traffic, because it's not
bypass only applies to local connections and traffic. It also means possible to bind sockets to foreign addresses.
that the address translation differs slightly from passt mode.
Connections from loopback to loopback on the host will appear to come
from the target namespace's public address within the guest, unless
\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will
appear to come from loopback in the namespace as well. The latter
behaviour used to be the default, but is usually undesirable, since it
can unintentionally expose namespace local services to the host.
.SS Binding to low numbered ports (well-known or system ports, up to 1023) .SS Binding to low numbered ports (well-known or system ports, up to 1023)

12
passt.c
View file

@ -207,8 +207,7 @@ int main(int argc, char **argv)
struct timespec now; struct timespec now;
struct sigaction sa; struct sigaction sa;
if (clock_gettime(CLOCK_MONOTONIC, &log_start)) clock_gettime(CLOCK_MONOTONIC, &log_start);
die_perror("Failed to get CLOCK_MONOTONIC time");
arch_avx2_exec(argv); arch_avx2_exec(argv);
@ -266,8 +265,7 @@ int main(int argc, char **argv)
secret_init(&c); secret_init(&c);
if (clock_gettime(CLOCK_MONOTONIC, &now)) clock_gettime(CLOCK_MONOTONIC, &now);
die_perror("Failed to get CLOCK_MONOTONIC time");
flow_init(); flow_init();
@ -309,15 +307,13 @@ int main(int argc, char **argv)
timer_init(&c, &now); timer_init(&c, &now);
loop: loop:
/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */ /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */ /* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL); nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
/* NOLINTEND(bugprone-branch-clone) */
if (nfds == -1 && errno != EINTR) if (nfds == -1 && errno != EINTR)
die_perror("epoll_wait() failed in main loop"); die_perror("epoll_wait() failed in main loop");
if (clock_gettime(CLOCK_MONOTONIC, &now)) clock_gettime(CLOCK_MONOTONIC, &now);
err_perror("Failed to get CLOCK_MONOTONIC time");
for (i = 0; i < nfds; i++) { for (i = 0; i < nfds; i++) {
union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64); union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);

View file

@ -225,8 +225,6 @@ struct ip6_ctx {
* @no_dhcpv6: Disable DHCPv6 server * @no_dhcpv6: Disable DHCPv6 server
* @no_ndp: Disable NDP handler altogether * @no_ndp: Disable NDP handler altogether
* @no_ra: Disable router advertisements * @no_ra: Disable router advertisements
* @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses
* @freebind: Allow binding of non-local addresses for forwarding
* @low_wmem: Low probed net.core.wmem_max * @low_wmem: Low probed net.core.wmem_max
* @low_rmem: Low probed net.core.rmem_max * @low_rmem: Low probed net.core.rmem_max
*/ */
@ -286,8 +284,6 @@ struct ctx {
int no_dhcpv6; int no_dhcpv6;
int no_ndp; int no_ndp;
int no_ra; int no_ra;
int host_lo_to_ns_lo;
int freebind;
int low_wmem; int low_wmem;
int low_rmem; int low_rmem;

14
pasta.c
View file

@ -102,9 +102,7 @@ static int pasta_wait_for_ns(void *arg)
int flags = O_RDONLY | O_CLOEXEC; int flags = O_RDONLY | O_CLOEXEC;
char ns[PATH_MAX]; char ns[PATH_MAX];
if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid)) snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
die_perror("Can't build netns path");
do { do {
while ((c->pasta_netns_fd = open(ns, flags)) < 0) { while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
if (errno != ENOENT) if (errno != ENOENT)
@ -241,11 +239,8 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
c->quiet = 1; c->quiet = 1;
/* Configure user and group mappings */ /* Configure user and group mappings */
if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid)) snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
die_perror("Can't build uidmap"); snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid))
die_perror("Can't build gidmap");
if (write_file("/proc/self/uid_map", uidmap) || if (write_file("/proc/self/uid_map", uidmap) ||
write_file("/proc/self/setgroups", "deny") || write_file("/proc/self/setgroups", "deny") ||
@ -432,12 +427,12 @@ static int pasta_netns_quit_timer(void)
*/ */
void pasta_netns_quit_init(const struct ctx *c) void pasta_netns_quit_init(const struct ctx *c)
{ {
union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY };
struct epoll_event ev = { .events = EPOLLIN }; struct epoll_event ev = { .events = EPOLLIN };
int flags = O_NONBLOCK | O_CLOEXEC; int flags = O_NONBLOCK | O_CLOEXEC;
struct statfs s = { 0 }; struct statfs s = { 0 };
bool try_inotify = true; bool try_inotify = true;
int fd = -1, dir_fd; int fd = -1, dir_fd;
union epoll_ref ref;
if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base) if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
return; return;
@ -468,7 +463,6 @@ void pasta_netns_quit_init(const struct ctx *c)
ref.type = EPOLL_TYPE_NSQUIT_TIMER; ref.type = EPOLL_TYPE_NSQUIT_TIMER;
} else { } else {
close(dir_fd); close(dir_fd);
ref.type = EPOLL_TYPE_NSQUIT_INOTIFY;
} }
if (fd > FD_REF_MAX) if (fd > FD_REF_MAX)

32
pcap.c
View file

@ -86,8 +86,9 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
.caplen = l2len, .caplen = l2len,
.len = l2len .len = l2len
}; };
struct iovec hiov = { &h, sizeof(h) };
if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 || if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 ||
write_remainder(pcap_fd, iov, iovcnt, offset) < 0) write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
debug_perror("Cannot log packet, length %zu", l2len); debug_perror("Cannot log packet, length %zu", l2len);
} }
@ -100,14 +101,12 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
void pcap(const char *pkt, size_t l2len) void pcap(const char *pkt, size_t l2len)
{ {
struct iovec iov = { (char *)pkt, l2len }; struct iovec iov = { (char *)pkt, l2len };
struct timespec now = { 0 }; struct timespec now;
if (pcap_fd == -1) if (pcap_fd == -1)
return; return;
if (clock_gettime(CLOCK_REALTIME, &now)) clock_gettime(CLOCK_REALTIME, &now);
err_perror("Failed to get CLOCK_REALTIME time");
pcap_frame(&iov, 1, 0, &now); pcap_frame(&iov, 1, 0, &now);
} }
@ -121,14 +120,13 @@ void pcap(const char *pkt, size_t l2len)
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
size_t offset) size_t offset)
{ {
struct timespec now = { 0 }; struct timespec now;
unsigned int i; unsigned int i;
if (pcap_fd == -1) if (pcap_fd == -1)
return; return;
if (clock_gettime(CLOCK_REALTIME, &now)) clock_gettime(CLOCK_REALTIME, &now);
err_perror("Failed to get CLOCK_REALTIME time");
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
pcap_frame(iov + i * frame_parts, frame_parts, offset, &now); pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
@ -141,20 +139,17 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
* @iov: Pointer to the array of struct iovec describing the I/O vector * @iov: Pointer to the array of struct iovec describing the I/O vector
* containing packet data to write, including L2 header * containing packet data to write, including L2 header
* @iovcnt: Number of buffers (@iov entries) * @iovcnt: Number of buffers (@iov entries)
* @offset: Offset of the L2 frame within the full data length
*/ */
/* cppcheck-suppress unusedFunction */ /* cppcheck-suppress unusedFunction */
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) void pcap_iov(const struct iovec *iov, size_t iovcnt)
{ {
struct timespec now = { 0 }; struct timespec now;
if (pcap_fd == -1) if (pcap_fd == -1)
return; return;
if (clock_gettime(CLOCK_REALTIME, &now)) clock_gettime(CLOCK_REALTIME, &now);
err_perror("Failed to get CLOCK_REALTIME time"); pcap_frame(iov, iovcnt, 0, &now);
pcap_frame(iov, iovcnt, offset, &now);
} }
/** /**
@ -163,15 +158,18 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
*/ */
void pcap_init(struct ctx *c) void pcap_init(struct ctx *c)
{ {
int flags = O_WRONLY | O_CREAT | O_TRUNC;
if (pcap_fd != -1) if (pcap_fd != -1)
return; return;
if (!*c->pcap) if (!*c->pcap)
return; return;
pcap_fd = output_file_open(c->pcap, O_WRONLY); flags |= c->foreground ? O_CLOEXEC : 0;
pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR);
if (pcap_fd == -1) { if (pcap_fd == -1) {
err_perror("Couldn't open pcap file %s", c->pcap); perror("open");
return; return;
} }

2
pcap.h
View file

@ -9,7 +9,7 @@
void pcap(const char *pkt, size_t l2len); void pcap(const char *pkt, size_t l2len);
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
size_t offset); size_t offset);
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset); void pcap_iov(const struct iovec *iov, size_t iovcnt);
void pcap_init(struct ctx *c); void pcap_init(struct ctx *c);
#endif /* PCAP_H */ #endif /* PCAP_H */

42
pif.c
View file

@ -59,45 +59,3 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
*sl = sizeof(sa->sa6); *sl = sizeof(sa->sa6);
} }
} }
/**
 * pif_sock_l4() - Open a socket bound to an address on a specified interface
 * @c:		Execution context
 * @type:	Socket epoll type
 * @pif:	Interface for this socket
 * @addr:	Address to bind to, or NULL for dual-stack any
 * @ifname:	Interface for binding, NULL for any
 * @port:	Port number to bind to (host byte order)
 * @data:	epoll reference portion for protocol handlers
 *
 * NOTE: For namespace pifs, this must be called having already entered the
 * relevant namespace.
 *
 * Return: newly created socket, negative error code on failure
 */
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
		const union inany_addr *addr, const char *ifname,
		in_port_t port, uint32_t data)
{
	/* Default: IPv6 wildcard address, used as is when @addr is NULL */
	union sockaddr_inany sa = {
		.sa6.sin6_family = AF_INET6,
		.sa6.sin6_addr = in6addr_any,
		.sa6.sin6_port = htons(port),
	};
	socklen_t sl = sizeof(sa.sa6);
	bool v6only = false;

	ASSERT(pif_is_socket(pif));

	if (pif == PIF_SPLICE) {
		/* Sanity checks */
		ASSERT(!ifname);
		ASSERT(addr && inany_is_loopback(addr));
	}

	if (addr) {
		/* Specific address: let pif_sockaddr() fill in @sa and @sl,
		 * and ask for a v6-only socket if it ends up being AF_INET6
		 */
		pif_sockaddr(c, &sa, &sl, pif, addr, port);
		v6only = sa.sa_family == AF_INET6;
	}

	return sock_l4_sa(c, type, &sa, sl, ifname, v6only, data);
}

3
pif.h
View file

@ -59,8 +59,5 @@ static inline bool pif_is_socket(uint8_t pif)
void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl, void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
uint8_t pif, const union inany_addr *addr, in_port_t port); uint8_t pif, const union inany_addr *addr, in_port_t port);
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
const union inany_addr *addr, const char *ifname,
in_port_t port, uint32_t data);
#endif /* PIF_H */ #endif /* PIF_H */

View file

@ -20,15 +20,6 @@ OUT="$(mktemp)"
[ -z "${ARCH}" ] && ARCH="$(uname -m)" [ -z "${ARCH}" ] && ARCH="$(uname -m)"
[ -z "${CC}" ] && CC="cc" [ -z "${CC}" ] && CC="cc"
AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z] \
| sed 's/^ARM.*/ARM/' \
| sed 's/I[456]86/I386/' \
| sed 's/PPC64/PPC/' \
| sed 's/PPCLE/PPC64LE/' \
| sed 's/MIPS64EL/MIPSEL64/' \
| sed 's/HPPA/PARISC/' \
| sed 's/SH4/SH/')"
HEADER="/* This file was automatically generated by $(basename ${0}) */ HEADER="/* This file was automatically generated by $(basename ${0}) */
#ifndef AUDIT_ARCH_PPC64LE #ifndef AUDIT_ARCH_PPC64LE
@ -41,7 +32,7 @@ struct sock_filter filter_@PROFILE@[] = {
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */ /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
(offsetof(struct seccomp_data, arch))), (offsetof(struct seccomp_data, arch))),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@), BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */ /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
(offsetof(struct seccomp_data, nr))), (offsetof(struct seccomp_data, nr))),
@ -242,8 +233,7 @@ gen_profile() {
sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}" sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
done done
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \ finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))"
"AUDIT_ARCH:${AUDIT_ARCH}"
} }
printf '%s\n' "${HEADER}" > "${OUT}" printf '%s\n' "${HEADER}" > "${OUT}"

142
tap.c
View file

@ -172,15 +172,11 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP); struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
char *data = (char *)(uh + 1); char *data = (char *)(uh + 1);
const struct iovec iov = {
.iov_base = (void *)in,
.iov_len = dlen
};
uh->source = htons(sport); uh->source = htons(sport);
uh->dest = htons(dport); uh->dest = htons(dport);
uh->len = htons(l4len); uh->len = htons(l4len);
csum_udp4(uh, src, dst, &iov, 1, 0); csum_udp4(uh, src, dst, in, dlen);
memcpy(data, in, dlen); memcpy(data, in, dlen);
tap_send_single(c, buf, dlen + (data - buf)); tap_send_single(c, buf, dlen + (data - buf));
@ -251,7 +247,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h,
void tap_udp6_send(const struct ctx *c, void tap_udp6_send(const struct ctx *c,
const struct in6_addr *src, in_port_t sport, const struct in6_addr *src, in_port_t sport,
const struct in6_addr *dst, in_port_t dport, const struct in6_addr *dst, in_port_t dport,
uint32_t flow, void *in, size_t dlen) uint32_t flow, const void *in, size_t dlen)
{ {
size_t l4len = dlen + sizeof(struct udphdr); size_t l4len = dlen + sizeof(struct udphdr);
char buf[USHRT_MAX]; char buf[USHRT_MAX];
@ -259,15 +255,11 @@ void tap_udp6_send(const struct ctx *c,
struct udphdr *uh = tap_push_ip6h(ip6h, src, dst, struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
l4len, IPPROTO_UDP, flow); l4len, IPPROTO_UDP, flow);
char *data = (char *)(uh + 1); char *data = (char *)(uh + 1);
const struct iovec iov = {
.iov_base = in,
.iov_len = dlen
};
uh->source = htons(sport); uh->source = htons(sport);
uh->dest = htons(dport); uh->dest = htons(dport);
uh->len = htons(l4len); uh->len = htons(l4len);
csum_udp6(uh, src, dst, &iov, 1, 0); csum_udp6(uh, src, dst, in, dlen);
memcpy(data, in, dlen); memcpy(data, in, dlen);
tap_send_single(c, buf, dlen + (data - buf)); tap_send_single(c, buf, dlen + (data - buf));
@ -990,17 +982,24 @@ static void tap_sock_reset(struct ctx *c)
} }
/** /**
* tap_passt_input() - Handler for new data on the socket to qemu * tap_handler_passt() - Packet handler for AF_UNIX file descriptor
* @c: Execution context * @c: Execution context
* @events: epoll events
* @now: Current timestamp * @now: Current timestamp
*/ */
static void tap_passt_input(struct ctx *c, const struct timespec *now) void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now)
{ {
static const char *partial_frame; static const char *partial_frame;
static ssize_t partial_len = 0; static ssize_t partial_len = 0;
ssize_t n; ssize_t n;
char *p; char *p;
if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
tap_sock_reset(c);
return;
}
tap_flush_pools(); tap_flush_pools();
if (partial_len) { if (partial_len) {
@ -1011,13 +1010,10 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
memmove(pkt_buf, partial_frame, partial_len); memmove(pkt_buf, partial_frame, partial_len);
} }
do { n = recv(c->fd_tap, pkt_buf + partial_len, TAP_BUF_BYTES - partial_len,
n = recv(c->fd_tap, pkt_buf + partial_len, MSG_DONTWAIT);
TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
} while ((n < 0) && errno == EINTR);
if (n < 0) { if (n < 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK) { if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
err_perror("Receive error on guest connection, reset"); err_perror("Receive error on guest connection, reset");
tap_sock_reset(c); tap_sock_reset(c);
} }
@ -1055,63 +1051,6 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
tap_handler(c, now); tap_handler(c, now);
} }
/**
 * tap_handler_passt() - Event handler for AF_UNIX file descriptor
 * @c:		Execution context
 * @events:	epoll events
 * @now:	Current timestamp
 *
 * Hangup and error events take precedence: the connection is reset and any
 * pending input is not processed. Otherwise, readable data is handed to
 * tap_passt_input().
 */
void tap_handler_passt(struct ctx *c, uint32_t events,
		       const struct timespec *now)
{
	const uint32_t disconnect = EPOLLRDHUP | EPOLLHUP | EPOLLERR;

	if (events & disconnect)
		tap_sock_reset(c);
	else if (events & EPOLLIN)
		tap_passt_input(c, now);
}
/**
 * tap_pasta_input() - Handler for new data on the tap device
 * @c:		Execution context
 * @now:	Current timestamp
 *
 * Reads from @c->fd_tap into pkt_buf, treating the result of each successful
 * read() as one frame, then passes everything collected to tap_handler().
 * Exits via die() on EOF or on any read error other than EINTR, EAGAIN or
 * EWOULDBLOCK.
 */
static void tap_pasta_input(struct ctx *c, const struct timespec *now)
{
	ssize_t n, len;

	tap_flush_pools();

	/* Keep reading as long as a frame of up to ETH_MAX_MTU bytes still
	 * fits below TAP_BUF_BYTES at offset n
	 */
	for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);

		if (len == 0) {
			die("EOF on tap device, exiting");
		} else if (len < 0) {
			if (errno == EINTR) {
				/* Retry at the same offset: the loop
				 * increment adds len to n
				 */
				len = 0;
				continue;
			}

			/* EAGAIN and EWOULDBLOCK are allowed to be distinct
			 * values: either one means there's nothing left to
			 * read, so this must be a disjunction
			 */
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				break; /* all done for now */

			die("Error on tap device, exiting");
		}

		/* Ignore frames of bad length */
		if (len < (ssize_t)sizeof(struct ethhdr) ||
		    len > (ssize_t)ETH_MAX_MTU)
			continue;

		tap_add_packet(c, len, pkt_buf + n);
	}

	tap_handler(c, now);
}
/** /**
* tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor
* @c: Execution context * @c: Execution context
@ -1121,11 +1060,46 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
void tap_handler_pasta(struct ctx *c, uint32_t events, void tap_handler_pasta(struct ctx *c, uint32_t events,
const struct timespec *now) const struct timespec *now)
{ {
ssize_t n, len;
int ret;
if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
die("Disconnect event on /dev/net/tun device, exiting"); die("Disconnect event on /dev/net/tun device, exiting");
if (events & EPOLLIN) redo:
tap_pasta_input(c, now); n = 0;
tap_flush_pools();
restart:
while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
if (len < (ssize_t)sizeof(struct ethhdr) ||
len > (ssize_t)ETH_MAX_MTU) {
n += len;
continue;
}
tap_add_packet(c, len, pkt_buf + n);
if ((n += len) == TAP_BUF_BYTES)
break;
}
if (len < 0 && errno == EINTR)
goto restart;
ret = errno;
tap_handler(c, now);
if (len > 0 || ret == EAGAIN)
return;
if (n == TAP_BUF_BYTES)
goto redo;
die("Error on tap device, exiting");
} }
/** /**
@ -1136,7 +1110,7 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
*/ */
int tap_sock_unix_open(char *sock_path) int tap_sock_unix_open(char *sock_path)
{ {
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); int fd = socket(AF_UNIX, SOCK_STREAM, 0);
struct sockaddr_un addr = { struct sockaddr_un addr = {
.sun_family = AF_UNIX, .sun_family = AF_UNIX,
}; };
@ -1151,12 +1125,10 @@ int tap_sock_unix_open(char *sock_path)
if (*sock_path) if (*sock_path)
memcpy(path, sock_path, UNIX_PATH_MAX); memcpy(path, sock_path, UNIX_PATH_MAX);
else if (snprintf_check(path, UNIX_PATH_MAX - 1, else
UNIX_SOCK_PATH, i)) snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i);
die_perror("Can't build UNIX domain socket path");
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
0);
if (ex < 0) if (ex < 0)
die_perror("Failed to check for UNIX domain conflicts"); die_perror("Failed to check for UNIX domain conflicts");
@ -1289,7 +1261,7 @@ static int tap_ns_tun(void *arg)
if (fd < 0) if (fd < 0)
die_perror("Failed to open() /dev/net/tun"); die_perror("Failed to open() /dev/net/tun");
rc = ioctl(fd, (int)TUNSETIFF, &ifr); rc = ioctl(fd, TUNSETIFF, &ifr);
if (rc < 0) if (rc < 0)
die_perror("TUNSETIFF ioctl on /dev/net/tun failed"); die_perror("TUNSETIFF ioctl on /dev/net/tun failed");

2
tap.h
View file

@ -53,7 +53,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
void tap_udp6_send(const struct ctx *c, void tap_udp6_send(const struct ctx *c,
const struct in6_addr *src, in_port_t sport, const struct in6_addr *src, in_port_t sport,
const struct in6_addr *dst, in_port_t dport, const struct in6_addr *dst, in_port_t dport,
uint32_t flow, void *in, size_t dlen); uint32_t flow, const void *in, size_t dlen);
void tap_icmp6_send(const struct ctx *c, void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst, const struct in6_addr *src, const struct in6_addr *dst,
const void *in, size_t l4len); const void *in, size_t l4len);

387
tcp.c
View file

@ -274,7 +274,6 @@
#include <net/if.h> #include <net/if.h>
#include <netinet/in.h> #include <netinet/in.h>
#include <netinet/ip.h> #include <netinet/ip.h>
#include <netinet/tcp.h>
#include <stdint.h> #include <stdint.h>
#include <stdbool.h> #include <stdbool.h>
#include <stddef.h> #include <stddef.h>
@ -287,6 +286,8 @@
#include <time.h> #include <time.h>
#include <arpa/inet.h> #include <arpa/inet.h>
#include <linux/tcp.h> /* For struct tcp_info */
#include "checksum.h" #include "checksum.h"
#include "util.h" #include "util.h"
#include "iov.h" #include "iov.h"
@ -299,7 +300,6 @@
#include "log.h" #include "log.h"
#include "inany.h" #include "inany.h"
#include "flow.h" #include "flow.h"
#include "linux_dep.h"
#include "flow_table.h" #include "flow_table.h"
#include "tcp_internal.h" #include "tcp_internal.h"
@ -308,6 +308,11 @@
/* MSS rounding: see SET_MSS() */ /* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536 #define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */ #define WINDOW_DEFAULT 14600 /* RFC 6928 */
#ifdef HAS_SND_WND
# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd)
#else
# define KERNEL_REPORTS_SND_WND(c) (0 && (c))
#endif
#define ACK_INTERVAL 10 /* ms */ #define ACK_INTERVAL 10 /* ms */
#define SYN_TIMEOUT 10 /* s */ #define SYN_TIMEOUT 10 /* s */
@ -318,6 +323,11 @@
#define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */ #define LOW_RTT_THRESHOLD 10 /* us */
/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
* <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
*/
#define SOL_TCP IPPROTO_TCP
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define CONN_IS_CLOSING(conn) \ #define CONN_IS_CLOSING(conn) \
@ -361,20 +371,6 @@ char tcp_buf_discard [MAX_WINDOW];
/* Does the kernel support TCP_PEEK_OFF? */ /* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap; bool peek_offset_cap;
/* Size of data returned by TCP_INFO getsockopt() */
socklen_t tcp_info_size;
#define tcp_info_cap(f_) \
((offsetof(struct tcp_info_linux, tcpi_##f_) + \
sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
#define snd_wnd_cap tcp_info_cap(snd_wnd)
/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
#define bytes_acked_cap tcp_info_cap(bytes_acked)
/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
#define min_rtt_cap tcp_info_cap(min_rtt)
/* sendmsg() to socket */ /* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV]; static struct iovec tcp_iov [UIO_MAXIOV];
@ -428,23 +424,27 @@ int tcp_set_peek_offset(int s, int offset)
*/ */
static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
{ {
uint32_t rdhup;
if (!events) if (!events)
return 0; return 0;
rdhup = (events & SOCK_FIN_RCVD) ? 0 : EPOLLRDHUP;
if (events & ESTABLISHED) { if (events & ESTABLISHED) {
if (events & TAP_FIN_SENT) if (events & TAP_FIN_SENT)
return EPOLLET; return EPOLLET;
if (conn_flags & STALLED) if (conn_flags & STALLED)
return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET; return EPOLLIN | EPOLLOUT | rdhup | EPOLLET;
return EPOLLIN | EPOLLRDHUP; return EPOLLIN | rdhup;
} }
if (events == TAP_SYN_RCVD) if (events == TAP_SYN_RCVD)
return EPOLLOUT | EPOLLET | EPOLLRDHUP; return EPOLLOUT | EPOLLET | rdhup;
return EPOLLET | EPOLLRDHUP; return rdhup;
} }
/** /**
@ -549,8 +549,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
(unsigned long long)it.it_value.tv_sec, (unsigned long long)it.it_value.tv_sec,
(unsigned long long)it.it_value.tv_nsec / 1000 / 1000); (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
if (timerfd_settime(conn->timer, 0, &it, NULL)) timerfd_settime(conn->timer, 0, &it, NULL);
flow_err(conn, "failed to set timer: %s", strerror(errno));
} }
/** /**
@ -680,12 +679,13 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
* @tinfo: Pointer to struct tcp_info for socket * @tinfo: Pointer to struct tcp_info for socket
*/ */
static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
const struct tcp_info_linux *tinfo) const struct tcp_info *tinfo)
{ {
#ifdef HAS_MIN_RTT
const struct flowside *tapside = TAPFLOW(conn); const struct flowside *tapside = TAPFLOW(conn);
int i, hole = -1; int i, hole = -1;
if (!min_rtt_cap || if (!tinfo->tcpi_min_rtt ||
(int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD) (int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
return; return;
@ -706,6 +706,10 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
if (hole == LOW_RTT_TABLE_SIZE) if (hole == LOW_RTT_TABLE_SIZE)
hole = 0; hole = 0;
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
#else
(void)conn;
(void)tinfo;
#endif /* HAS_MIN_RTT */
} }
/** /**
@ -752,106 +756,34 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
} }
/** /**
* tcp_update_check_tcp4() - Calculate TCP checksum for IPv4 * tcp_update_check_tcp4() - Update TCP checksum from stored one
* @iph: IPv4 header * @iph: IPv4 header
* @iov: Pointer to the array of IO vectors * @th: TCP header followed by TCP payload
* @iov_cnt: Length of the array
* @l4offset: IPv4 payload offset in the iovec array
*/ */
static void tcp_update_check_tcp4(const struct iphdr *iph, static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th)
const struct iovec *iov, int iov_cnt,
size_t l4offset)
{ {
uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
struct in_addr saddr = { .s_addr = iph->saddr }; struct in_addr saddr = { .s_addr = iph->saddr };
struct in_addr daddr = { .s_addr = iph->daddr }; struct in_addr daddr = { .s_addr = iph->daddr };
size_t check_ofs; uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
uint16_t *check;
int check_idx;
uint32_t sum;
char *ptr;
sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); th->check = 0;
th->check = csum(th, l4len, sum);
check_idx = iov_skip_bytes(iov, iov_cnt,
l4offset + offsetof(struct tcphdr, check),
&check_ofs);
if (check_idx >= iov_cnt) {
err("TCP4 buffer is too small, iov size %zd, check offset %zd",
iov_size(iov, iov_cnt),
l4offset + offsetof(struct tcphdr, check));
return;
}
if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
err("TCP4 checksum field memory is not contiguous "
"check_ofs %zd check_idx %d iov_len %zd",
check_ofs, check_idx, iov[check_idx].iov_len);
return;
}
ptr = (char *)iov[check_idx].iov_base + check_ofs;
if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
err("TCP4 checksum field is not correctly aligned in memory");
return;
}
check = (uint16_t *)ptr;
*check = 0;
*check = csum_iov(iov, iov_cnt, l4offset, sum);
} }
/** /**
* tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
* @ip6h: IPv6 header * @ip6h: IPv6 header
* @iov: Pointer to the array of IO vectors * @th: TCP header followed by TCP payload
* @iov_cnt: Length of the array
* @l4offset: IPv6 payload offset in the iovec array
*/ */
static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
const struct iovec *iov, int iov_cnt,
size_t l4offset)
{ {
uint16_t l4len = ntohs(ip6h->payload_len); uint16_t l4len = ntohs(ip6h->payload_len);
size_t check_ofs; uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
uint16_t *check; &ip6h->saddr, &ip6h->daddr);
int check_idx;
uint32_t sum;
char *ptr;
sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr, th->check = 0;
&ip6h->daddr); th->check = csum(th, l4len, sum);
check_idx = iov_skip_bytes(iov, iov_cnt,
l4offset + offsetof(struct tcphdr, check),
&check_ofs);
if (check_idx >= iov_cnt) {
err("TCP6 buffer is too small, iov size %zd, check offset %zd",
iov_size(iov, iov_cnt),
l4offset + offsetof(struct tcphdr, check));
return;
}
if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
err("TCP6 checksum field memory is not contiguous "
"check_ofs %zd check_idx %d iov_len %zd",
check_ofs, check_idx, iov[check_idx].iov_len);
return;
}
ptr = (char *)iov[check_idx].iov_base + check_ofs;
if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
err("TCP6 checksum field is not correctly aligned in memory");
return;
}
check = (uint16_t *)ptr;
*check = 0;
*check = csum_iov(iov, iov_cnt, l4offset, sum);
} }
/** /**
@ -937,6 +869,7 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
void tcp_defer_handler(struct ctx *c) void tcp_defer_handler(struct ctx *c)
{ {
tcp_flags_flush(c);
tcp_payload_flush(c); tcp_payload_flush(c);
} }
@ -970,24 +903,23 @@ static void tcp_fill_header(struct tcphdr *th,
* @conn: Connection pointer * @conn: Connection pointer
* @taph: tap backend specific header * @taph: tap backend specific header
* @iph: Pointer to IPv4 header * @iph: Pointer to IPv4 header
* @bp: Pointer to TCP header followed by TCP payload * @th: Pointer to TCP header
* @dlen: TCP payload length * @dlen: TCP payload length
* @check: Checksum, if already known * @check: Checksum, if already known
* @seq: Sequence number for this segment * @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
* *
* Return: The IPv4 payload length, host order * Return: The IPv4 payload length, host order
*/ */
static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
struct tap_hdr *taph, struct tap_hdr *taph,
struct iphdr *iph, struct tcp_payload_t *bp, struct iphdr *iph, struct tcphdr *th,
size_t dlen, const uint16_t *check, size_t dlen, const uint16_t *check,
uint32_t seq, bool no_tcp_csum) uint32_t seq)
{ {
const struct flowside *tapside = TAPFLOW(conn); const struct flowside *tapside = TAPFLOW(conn);
const struct in_addr *src4 = inany_v4(&tapside->oaddr); const struct in_addr *src4 = inany_v4(&tapside->oaddr);
const struct in_addr *dst4 = inany_v4(&tapside->eaddr); const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
size_t l4len = dlen + sizeof(bp->th); size_t l4len = dlen + sizeof(*th);
size_t l3len = l4len + sizeof(*iph); size_t l3len = l4len + sizeof(*iph);
ASSERT(src4 && dst4); ASSERT(src4 && dst4);
@ -999,18 +931,9 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
iph->check = check ? *check : iph->check = check ? *check :
csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4); csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);
tcp_fill_header(&bp->th, conn, seq); tcp_fill_header(th, conn, seq);
if (no_tcp_csum) { tcp_update_check_tcp4(iph, th);
bp->th.check = 0;
} else {
const struct iovec iov = {
.iov_base = bp,
.iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr),
};
tcp_update_check_tcp4(iph, &iov, 1, 0);
}
tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
@ -1022,21 +945,20 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
* @conn: Connection pointer * @conn: Connection pointer
* @taph: tap backend specific header * @taph: tap backend specific header
* @ip6h: Pointer to IPv6 header * @ip6h: Pointer to IPv6 header
* @bp: Pointer to TCP header followed by TCP payload * @th: Pointer to TCP header
* @dlen: TCP payload length * @dlen: TCP payload length
* @check: Checksum, if already known * @check: Checksum, if already known
* @seq: Sequence number for this segment * @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
* *
* Return: The IPv6 payload length, host order * Return: The IPv6 payload length, host order
*/ */
static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
struct tap_hdr *taph, struct tap_hdr *taph,
struct ipv6hdr *ip6h, struct tcp_payload_t *bp, struct ipv6hdr *ip6h, struct tcphdr *th,
size_t dlen, uint32_t seq, bool no_tcp_csum) size_t dlen, uint32_t seq)
{ {
const struct flowside *tapside = TAPFLOW(conn); const struct flowside *tapside = TAPFLOW(conn);
size_t l4len = dlen + sizeof(bp->th); size_t l4len = dlen + sizeof(*th);
ip6h->payload_len = htons(l4len); ip6h->payload_len = htons(l4len);
ip6h->saddr = tapside->oaddr.a6; ip6h->saddr = tapside->oaddr.a6;
@ -1050,18 +972,9 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
tcp_fill_header(&bp->th, conn, seq); tcp_fill_header(th, conn, seq);
if (no_tcp_csum) { tcp_update_check_tcp6(ip6h, th);
bp->th.check = 0;
} else {
const struct iovec iov = {
.iov_base = bp,
.iov_len = ntohs(ip6h->payload_len)
};
tcp_update_check_tcp6(ip6h, &iov, 1, 0);
}
tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
@ -1075,14 +988,12 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
* @dlen: TCP payload length * @dlen: TCP payload length
* @check: Checksum, if already known * @check: Checksum, if already known
* @seq: Sequence number for this segment * @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
* *
* Return: IP payload length, host order * Return: IP payload length, host order
*/ */
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
struct iovec *iov, size_t dlen, struct iovec *iov, size_t dlen,
const uint16_t *check, uint32_t seq, const uint16_t *check, uint32_t seq)
bool no_tcp_csum)
{ {
const struct flowside *tapside = TAPFLOW(conn); const struct flowside *tapside = TAPFLOW(conn);
const struct in_addr *a4 = inany_v4(&tapside->oaddr); const struct in_addr *a4 = inany_v4(&tapside->oaddr);
@ -1091,13 +1002,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base, return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, dlen, iov[TCP_IOV_PAYLOAD].iov_base, dlen,
check, seq, no_tcp_csum); check, seq);
} }
return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base, return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, dlen, iov[TCP_IOV_PAYLOAD].iov_base, dlen,
seq, no_tcp_csum); seq);
} }
/** /**
@ -1110,24 +1021,25 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
* Return: 1 if sequence or window were updated, 0 otherwise * Return: 1 if sequence or window were updated, 0 otherwise
*/ */
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
bool force_seq, struct tcp_info_linux *tinfo) int force_seq, struct tcp_info *tinfo)
{ {
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
/* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */ /* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */
socklen_t sl = sizeof(*tinfo); socklen_t sl = sizeof(*tinfo);
struct tcp_info_linux tinfo_new; struct tcp_info tinfo_new;
uint32_t new_wnd_to_tap = prev_wnd_to_tap; uint32_t new_wnd_to_tap = prev_wnd_to_tap;
int s = conn->sock; int s = conn->sock;
if (!bytes_acked_cap) { #ifndef HAS_BYTES_ACKED
(void)force_seq;
conn->seq_ack_to_tap = conn->seq_from_tap; conn->seq_ack_to_tap = conn->seq_from_tap;
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
conn->seq_ack_to_tap = prev_ack_to_tap; conn->seq_ack_to_tap = prev_ack_to_tap;
} else { #else
if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) || || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
(conn->flags & LOCAL) || force_seq) {
conn->seq_ack_to_tap = conn->seq_from_tap; conn->seq_ack_to_tap = conn->seq_from_tap;
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) { } else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
if (!tinfo) { if (!tinfo) {
@ -1142,9 +1054,9 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
conn->seq_ack_to_tap = prev_ack_to_tap; conn->seq_ack_to_tap = prev_ack_to_tap;
} }
} #endif /* !HAS_BYTES_ACKED */
if (!snd_wnd_cap) { if (!KERNEL_REPORTS_SND_WND(c)) {
tcp_get_sndbuf(conn); tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
@ -1162,6 +1074,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
} }
} }
#ifdef HAS_SND_WND
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
new_wnd_to_tap = tinfo->tcpi_snd_wnd; new_wnd_to_tap = tinfo->tcpi_snd_wnd;
} else { } else {
@ -1169,6 +1082,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
SNDBUF_GET(conn)); SNDBUF_GET(conn));
} }
#endif
new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
if (!(conn->events & ESTABLISHED)) if (!(conn->events & ESTABLISHED))
@ -1226,11 +1140,11 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
* 0 if there is no flag to send * 0 if there is no flag to send
* 1 otherwise * 1 otherwise
*/ */
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, struct tcp_syn_opts *opts, int flags, struct tcphdr *th, char *data,
size_t *optlen) size_t *optlen)
{ {
struct tcp_info_linux tinfo = { 0 }; struct tcp_info tinfo = { 0 };
socklen_t sl = sizeof(tinfo); socklen_t sl = sizeof(tinfo);
int s = conn->sock; int s = conn->sock;
@ -1243,16 +1157,27 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
return -ECONNRESET; return -ECONNRESET;
} }
#ifdef HAS_SND_WND
if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
c->tcp.kernel_snd_wnd = 1;
#endif
if (!(conn->flags & LOCAL)) if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo); tcp_rtt_dst_check(conn, &tinfo);
if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags) if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
return 0; return 0;
*optlen = 0; *optlen = 0;
if (flags & SYN) { if (flags & SYN) {
int mss; int mss;
/* Options: MSS, NOP and window scale (8 bytes) */
*optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
*data++ = OPT_MSS;
*data++ = OPT_MSS_LEN;
if (c->mtu == -1) { if (c->mtu == -1) {
mss = tinfo.tcpi_snd_mss; mss = tinfo.tcpi_snd_mss;
} else { } else {
@ -1268,11 +1193,16 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
else if (mss > PAGE_SIZE) else if (mss > PAGE_SIZE)
mss = ROUND_DOWN(mss, PAGE_SIZE); mss = ROUND_DOWN(mss, PAGE_SIZE);
} }
*(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
data += OPT_MSS_LEN - 2;
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale); conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
*opts = TCP_SYN_OPTS(mss, conn->ws_to_tap); *data++ = OPT_NOP;
*optlen = sizeof(*opts); *data++ = OPT_WS;
*data++ = OPT_WS_LEN;
*data++ = conn->ws_to_tap;
} else if (!(flags & RST)) { } else if (!(flags & RST)) {
flags |= ACK; flags |= ACK;
} }
@ -1309,8 +1239,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
* *
* Return: negative error code on connection reset, 0 otherwise * Return: negative error code on connection reset, 0 otherwise
*/ */
static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
int flags)
{ {
return tcp_buf_send_flag(c, conn, flags); return tcp_buf_send_flag(c, conn, flags);
} }
@ -1320,7 +1249,7 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
*/ */
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
{ {
if (conn->events == CLOSED) if (conn->events == CLOSED)
return; return;
@ -1410,7 +1339,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
{ {
int s; int s;
s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP); s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
if (s > FD_REF_MAX) { if (s > FD_REF_MAX) {
close(s); close(s);
@ -1538,7 +1467,7 @@ static void tcp_bind_outbound(const struct ctx *c,
* @optlen: Bytes in options: caller MUST ensure available length * @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp * @now: Current timestamp
*/ */
static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
const void *saddr, const void *daddr, const void *saddr, const void *daddr,
const struct tcphdr *th, const char *opts, const struct tcphdr *th, const char *opts,
size_t optlen, const struct timespec *now) size_t optlen, const struct timespec *now)
@ -1703,7 +1632,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
* *
* #syscalls recvmsg * #syscalls recvmsg
*/ */
static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{ {
return tcp_buf_data_from_sock(c, conn); return tcp_buf_data_from_sock(c, conn);
} }
@ -1719,7 +1648,7 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
* *
* Return: count of consumed packets * Return: count of consumed packets
*/ */
static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
const struct pool *p, int idx) const struct pool *p, int idx)
{ {
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
@ -1917,8 +1846,7 @@ out:
* @opts: Pointer to start of options * @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length * @optlen: Bytes in options: caller MUST ensure available length
*/ */
static void tcp_conn_from_sock_finish(const struct ctx *c, static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
struct tcp_tap_conn *conn,
const struct tcphdr *th, const struct tcphdr *th,
const char *opts, size_t optlen) const char *opts, size_t optlen)
{ {
@ -1941,12 +1869,11 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
return; return;
} }
tcp_send_flag(c, conn, ACK);
/* The client might have sent data already, which we didn't /* The client might have sent data already, which we didn't
* dequeue waiting for SYN,ACK from tap -- check now. * dequeue waiting for SYN,ACK from tap -- check now.
*/ */
tcp_data_from_sock(c, conn); tcp_data_from_sock(c, conn);
tcp_send_flag(c, conn, ACK);
} }
/** /**
@ -1962,7 +1889,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
* *
* Return: count of consumed packets * Return: count of consumed packets
*/ */
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr, const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now) const struct pool *p, int idx, const struct timespec *now)
{ {
@ -2100,7 +2027,7 @@ reset:
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
*/ */
static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn) static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
{ {
socklen_t sl; socklen_t sl;
int so; int so;
@ -2126,8 +2053,8 @@ static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
* @sa: Peer socket address (from accept()) * @sa: Peer socket address (from accept())
* @now: Current timestamp * @now: Current timestamp
*/ */
static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
int s, const struct timespec *now) const struct timespec *now)
{ {
struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
uint64_t hash; uint64_t hash;
@ -2158,7 +2085,7 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
* @ref: epoll reference of listening socket * @ref: epoll reference of listening socket
* @now: Current timestamp * @now: Current timestamp
*/ */
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
const struct timespec *now) const struct timespec *now)
{ {
const struct flowside *ini; const struct flowside *ini;
@ -2223,7 +2150,7 @@ cancel:
* *
* #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
*/ */
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
{ {
struct itimerspec check_armed = { { 0 }, { 0 } }; struct itimerspec check_armed = { { 0 }, { 0 } };
struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp; struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
@ -2235,9 +2162,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
* timer is currently armed, this event came from a previous setting, * timer is currently armed, this event came from a previous setting,
* and we just set the timer to a new point in the future: discard it. * and we just set the timer to a new point in the future: discard it.
*/ */
if (timerfd_gettime(conn->timer, &check_armed)) timerfd_gettime(conn->timer, &check_armed);
flow_err(conn, "failed to read timer: %s", strerror(errno));
if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
return; return;
@ -2275,10 +2200,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
* case. This avoids having to preemptively reset the timer on * case. This avoids having to preemptively reset the timer on
* ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
*/ */
if (timerfd_settime(conn->timer, 0, &new, &old)) timerfd_settime(conn->timer, 0, &new, &old);
flow_err(conn, "failed to set timer: %s",
strerror(errno));
if (old.it_value.tv_sec == ACT_TIMEOUT) { if (old.it_value.tv_sec == ACT_TIMEOUT) {
flow_dbg(conn, "activity timeout"); flow_dbg(conn, "activity timeout");
tcp_rst(c, conn); tcp_rst(c, conn);
@ -2292,14 +2214,19 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
* @ref: epoll reference * @ref: epoll reference
* @events: epoll events bitmap * @events: epoll events bitmap
*/ */
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
uint32_t events)
{ {
struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside); struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);
ASSERT(!c->no_tcp); ASSERT(!c->no_tcp);
ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP); ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP);
if (events & EPOLLRDHUP) {
flow_err(conn, "EPOLLRDHUP: events=0x%x conn->events=0x%x "
"conn->flags=0x%x\n", events, conn->events,
conn->flags);
}
if (conn->events == CLOSED) if (conn->events == CLOSED)
return; return;
@ -2324,7 +2251,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
tcp_data_from_sock(c, conn); tcp_data_from_sock(c, conn);
if (events & EPOLLOUT) if (events & EPOLLOUT)
tcp_update_seqack_wnd(c, conn, false, NULL); tcp_update_seqack_wnd(c, conn, 0, NULL);
return; return;
} }
@ -2347,16 +2274,17 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
} }
/** /**
* tcp_sock_init_one() - Initialise listening socket for address and port * tcp_sock_init_af() - Initialise listening socket for a given af and port
* @c: Execution context * @c: Execution context
* @addr: Pointer to address for binding, NULL for dual stack any * @af: Address family to listen on
* @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order * @port: Port, host order
* @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured
* *
* Return: fd for the new listening socket, negative error code on failure * Return: fd for the new listening socket, negative error code on failure
*/ */
static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr, static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
const char *ifname, in_port_t port) const void *addr, const char *ifname)
{ {
union tcp_listen_epoll_ref tref = { union tcp_listen_epoll_ref tref = {
.port = port, .port = port,
@ -2364,13 +2292,12 @@ static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
}; };
int s; int s;
s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr, s = sock_l4(c, af, EPOLL_TYPE_TCP_LISTEN, addr, ifname, port, tref.u32);
ifname, port, tref.u32);
if (c->tcp.fwd_in.mode == FWD_AUTO) { if (c->tcp.fwd_in.mode == FWD_AUTO) {
if (!addr || inany_v4(addr)) if (af == AF_INET || af == AF_UNSPEC)
tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s; tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
if (!addr || !inany_v4(addr)) if (af == AF_INET6 || af == AF_UNSPEC)
tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s; tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
} }
@ -2384,32 +2311,31 @@ static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
/** /**
* tcp_sock_init() - Create listening sockets for a given host ("inbound") port * tcp_sock_init() - Create listening sockets for a given host ("inbound") port
* @c: Execution context * @c: Execution context
* @af: Address family to select a specific IP version, or AF_UNSPEC
* @addr: Pointer to address for binding, NULL if not configured * @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order * @port: Port, host order
* *
* Return: 0 on (partial) success, negative error code on (complete) failure * Return: 0 on (partial) success, negative error code on (complete) failure
*/ */
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
const char *ifname, in_port_t port) const char *ifname, in_port_t port)
{ {
int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
ASSERT(!c->no_tcp); ASSERT(!c->no_tcp);
if (!addr && c->ifi4 && c->ifi6) if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
/* Attempt to get a dual stack socket */ /* Attempt to get a dual stack socket */
if (tcp_sock_init_one(c, NULL, ifname, port) >= 0) if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0)
return 0; return 0;
/* Otherwise create a socket per IP version */ /* Otherwise create a socket per IP version */
if ((!addr || inany_v4(addr)) && c->ifi4) if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4)
r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4, r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname);
ifname, port);
if ((!addr || !inany_v4(addr)) && c->ifi6) if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6, r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
ifname, port);
if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
return 0; return 0;
@ -2432,7 +2358,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
ASSERT(c->mode == MODE_PASTA); ASSERT(c->mode == MODE_PASTA);
s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4, s = sock_l4(c, AF_INET, EPOLL_TYPE_TCP_LISTEN, &in4addr_loopback,
NULL, port, tref.u32); NULL, port, tref.u32);
if (s >= 0) if (s >= 0)
tcp_sock_set_bufsize(c, s); tcp_sock_set_bufsize(c, s);
@ -2458,7 +2384,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
ASSERT(c->mode == MODE_PASTA); ASSERT(c->mode == MODE_PASTA);
s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6, s = sock_l4(c, AF_INET6, EPOLL_TYPE_TCP_LISTEN, &in6addr_loopback,
NULL, port, tref.u32); NULL, port, tref.u32);
if (s >= 0) if (s >= 0)
tcp_sock_set_bufsize(c, s); tcp_sock_set_bufsize(c, s);
@ -2561,7 +2487,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
* *
* Return: true if supported, false otherwise * Return: true if supported, false otherwise
*/ */
static bool tcp_probe_peek_offset_cap(sa_family_t af) bool tcp_probe_peek_offset_cap(sa_family_t af)
{ {
bool ret = false; bool ret = false;
int s, optv = 0; int s, optv = 0;
@ -2578,34 +2504,6 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
return ret; return ret;
} }
/**
* tcp_probe_tcp_info() - Check what data TCP_INFO reports
*
* Return: Number of bytes returned by TCP_INFO getsockopt()
*/
static socklen_t tcp_probe_tcp_info(void)
{
struct tcp_info_linux tinfo;
socklen_t sl = sizeof(tinfo);
int s;
s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
if (s < 0) {
warn_perror("Temporary TCP socket creation failed");
return false;
}
if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
warn_perror("Failed to get TCP_INFO on temporary socket");
close(s);
return false;
}
close(s);
return sl;
}
/** /**
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
* @c: Execution context * @c: Execution context
@ -2616,7 +2514,11 @@ int tcp_init(struct ctx *c)
{ {
ASSERT(!c->no_tcp); ASSERT(!c->no_tcp);
tcp_sock_iov_init(c); if (c->ifi4)
tcp_sock4_iov_init(c);
if (c->ifi6)
tcp_sock6_iov_init(c);
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
@ -2635,15 +2537,6 @@ int tcp_init(struct ctx *c)
(!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
tcp_info_size = tcp_probe_tcp_info();
#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \
STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
dbg_tcpi(snd_wnd);
dbg_tcpi(bytes_acked);
dbg_tcpi(min_rtt);
#undef dbg_tcpi
return 0; return 0;
} }
@ -2685,7 +2578,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
if (outbound) if (outbound)
tcp_ns_sock_init(c, port); tcp_ns_sock_init(c, port);
else else
tcp_sock_init(c, NULL, NULL, port); tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port);
} }
} }
} }

15
tcp.h
View file

@ -10,15 +10,14 @@
struct ctx; struct ctx;
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref); void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
const struct timespec *now); const struct timespec *now);
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
uint32_t events); int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr, const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now); const struct pool *p, int idx, const struct timespec *now);
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
const char *ifname, in_port_t port); const char *ifname, in_port_t port);
int tcp_init(struct ctx *c); int tcp_init(struct ctx *c);
void tcp_timer(struct ctx *c, const struct timespec *now); void tcp_timer(struct ctx *c, const struct timespec *now);
@ -59,12 +58,16 @@ union tcp_listen_epoll_ref {
* @fwd_in: Port forwarding configuration for inbound packets * @fwd_in: Port forwarding configuration for inbound packets
* @fwd_out: Port forwarding configuration for outbound packets * @fwd_out: Port forwarding configuration for outbound packets
* @timer_run: Timestamp of most recent timer run * @timer_run: Timestamp of most recent timer run
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
* @pipe_size: Size of pipes for spliced connections * @pipe_size: Size of pipes for spliced connections
*/ */
struct tcp_ctx { struct tcp_ctx {
struct fwd_ports fwd_in; struct fwd_ports fwd_in;
struct fwd_ports fwd_out; struct fwd_ports fwd_out;
struct timespec timer_run; struct timespec timer_run;
#ifdef HAS_SND_WND
int kernel_snd_wnd;
#endif
size_t pipe_size; size_t pipe_size;
}; };

370
tcp_buf.c
View file

@ -20,7 +20,7 @@
#include <netinet/ip.h> #include <netinet/ip.h>
#include <netinet/tcp.h> #include <linux/tcp.h>
#include "util.h" #include "util.h"
#include "ip.h" #include "ip.h"
@ -38,32 +38,88 @@
(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM) (c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
/* Static buffers */ /* Static buffers */
/**
* struct tcp_payload_t - TCP header and data to send segments with payload
* @th: TCP header
* @data: TCP data
*/
struct tcp_payload_t {
struct tcphdr th;
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/* Ethernet header for IPv4 and IPv6 frames */ /**
* struct tcp_flags_t - TCP header and data to send zero-length
* segments (flags)
* @th: TCP header
* @opts TCP options
*/
struct tcp_flags_t {
struct tcphdr th;
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/* Ethernet header for IPv4 frames */
static struct ethhdr tcp4_eth_src; static struct ethhdr tcp4_eth_src;
static struct ethhdr tcp6_eth_src;
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM]; static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
/* IPv4 headers */
static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
/* TCP segments with payload for IPv4 frames */
static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
/* IP headers for IPv4 and IPv6 */ static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
/* TCP segments with payload for IPv4 and IPv6 frames */
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM];
static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");
/* References tracking the owner connection of frames in the tap outqueue */ /* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM]; static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp_payload_used; static unsigned int tcp4_payload_used;
static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
/* IPv4 headers for TCP segment without payload */
static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
/* TCP segments without payload for IPv4 frames */
static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
static unsigned int tcp4_flags_used;
/* Ethernet header for IPv6 frames */
static struct ethhdr tcp6_eth_src;
static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
/* IPv6 headers */
static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
/* TCP headers and data for IPv6 frames */
static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
/* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp6_payload_used;
static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
/* IPv6 headers for TCP segment without payload */
static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
/* TCP segment without payload for IPv6 frames */
static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
static unsigned int tcp6_flags_used;
/* recvmsg()/sendmsg() data for tap */ /* recvmsg()/sendmsg() data for tap */
static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS]; static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
/** /**
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
* @eth_d: Ethernet destination address, NULL if unchanged * @eth_d: Ethernet destination address, NULL if unchanged
@ -76,30 +132,105 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
} }
/** /**
* tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
* @c: Execution context * @c: Execution context
*/ */
void tcp_sock_iov_init(const struct ctx *c) void tcp_sock4_iov_init(const struct ctx *c)
{ {
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
struct iovec *iov;
int i; int i;
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) { for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
tcp6_payload_ip[i] = ip6;
tcp4_payload_ip[i] = iph; tcp4_payload_ip[i] = iph;
tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
tcp4_payload[i].th.ack = 1;
}
for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
tcp4_flags_ip[i] = iph;
tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
tcp4_flags[i].th.ack = 1;
} }
for (i = 0; i < TCP_FRAMES_MEM; i++) { for (i = 0; i < TCP_FRAMES_MEM; i++) {
struct iovec *iov = tcp_l2_iov[i]; iov = tcp4_l2_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]); iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i]; iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
} }
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp4_l2_flags_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
}
}
/**
* tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
* @c: Execution context
*/
void tcp_sock6_iov_init(const struct ctx *c)
{
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
struct iovec *iov;
int i;
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
tcp6_payload_ip[i] = ip6;
tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
tcp6_payload[i].th.ack = 1;
}
for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
tcp6_flags_ip[i] = ip6;
tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
tcp6_flags[i].th .ack = 1;
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp6_l2_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp6_l2_flags_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
}
}
/**
* tcp_flags_flush() - Send out buffers for segments with no data (flags)
* @c: Execution context
*/
void tcp_flags_flush(const struct ctx *c)
{
tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
tcp6_flags_used);
tcp6_flags_used = 0;
tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
tcp4_flags_used);
tcp4_flags_used = 0;
} }
/** /**
@ -109,7 +240,7 @@ void tcp_sock_iov_init(const struct ctx *c)
* @frames: Two-dimensional array containing queued frames with sub-iovs * @frames: Two-dimensional array containing queued frames with sub-iovs
* @num_frames: Number of entries in the two arrays to be compared * @num_frames: Number of entries in the two arrays to be compared
*/ */
static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
struct iovec (*frames)[TCP_NUM_IOVS], int num_frames) struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
{ {
int i; int i;
@ -131,20 +262,28 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
} }
/** /**
* tcp_payload_flush() - Send out buffers for segments with data or flags * tcp_payload_flush() - Send out buffers for segments with data
* @c: Execution context * @c: Execution context
*/ */
void tcp_payload_flush(const struct ctx *c) void tcp_payload_flush(struct ctx *c)
{ {
size_t m; size_t m;
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS, m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
tcp_payload_used); tcp6_payload_used);
if (m != tcp_payload_used) { if (m != tcp6_payload_used) {
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m], tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m],
tcp_payload_used - m); tcp6_payload_used - m);
} }
tcp_payload_used = 0; tcp6_payload_used = 0;
m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
tcp4_payload_used);
if (m != tcp4_payload_used) {
tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m],
tcp4_payload_used - m);
}
tcp4_payload_used = 0;
} }
/** /**
@ -155,48 +294,58 @@ void tcp_payload_flush(const struct ctx *c)
* *
* Return: negative error code on connection reset, 0 otherwise * Return: negative error code on connection reset, 0 otherwise
*/ */
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{ {
struct tcp_payload_t *payload; struct tcp_flags_t *payload;
struct iovec *iov; struct iovec *iov;
size_t optlen; size_t optlen;
size_t l4len; size_t l4len;
uint32_t seq; uint32_t seq;
int ret; int ret;
iov = tcp_l2_iov[tcp_payload_used]; if (CONN_V4(conn))
if (CONN_V4(conn)) { iov = tcp4_l2_flags_iov[tcp4_flags_used++];
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]); else
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; iov = tcp6_l2_flags_iov[tcp6_flags_used++];
} else {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
payload = iov[TCP_IOV_PAYLOAD].iov_base; payload = iov[TCP_IOV_PAYLOAD].iov_base;
seq = conn->seq_to_tap; seq = conn->seq_to_tap;
ret = tcp_prepare_flags(c, conn, flags, &payload->th, ret = tcp_prepare_flags(c, conn, flags, &payload->th,
(struct tcp_syn_opts *)&payload->data, &optlen); payload->opts, &optlen);
if (ret <= 0) if (ret <= 0) {
if (CONN_V4(conn))
tcp4_flags_used--;
else
tcp6_flags_used--;
return ret; return ret;
tcp_payload_used++;
l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (flags & DUP_ACK) {
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_TAP].iov_len);
dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
} }
if (tcp_payload_used > TCP_FRAMES_MEM - 2) l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq);
tcp_payload_flush(c); iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (flags & DUP_ACK) {
struct iovec *dup_iov;
int i;
if (CONN_V4(conn))
dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
else
dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
for (i = 0; i < TCP_NUM_IOVS; i++)
memcpy(dup_iov[i].iov_base, iov[i].iov_base,
iov[i].iov_len);
dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
}
if (CONN_V4(conn)) {
if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
tcp_flags_flush(c);
} else {
if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
tcp_flags_flush(c);
}
return 0; return 0;
} }
@ -209,39 +358,39 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
* @seq: Sequence number to be sent * @seq: Sequence number to be sent
*/ */
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
ssize_t dlen, int no_csum, uint32_t seq) ssize_t dlen, int no_csum, uint32_t seq)
{ {
struct tcp_payload_t *payload;
const uint16_t *check = NULL;
struct iovec *iov; struct iovec *iov;
size_t l4len; size_t l4len;
conn->seq_to_tap = seq + dlen; conn->seq_to_tap = seq + dlen;
tcp_frame_conns[tcp_payload_used] = conn;
iov = tcp_l2_iov[tcp_payload_used];
if (CONN_V4(conn)) {
if (no_csum) {
struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
if (CONN_V4(conn)) {
struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
const uint16_t *check = NULL;
if (no_csum) {
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
check = &iph->check; check = &iph->check;
} }
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; tcp4_frame_conns[tcp4_payload_used] = conn;
} else if (CONN_V6(conn)) {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]); iov = tcp4_l2_iov[tcp4_payload_used++];
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq);
}
payload = iov[TCP_IOV_PAYLOAD].iov_base;
payload->th.th_off = sizeof(struct tcphdr) / 4;
payload->th.th_x2 = 0;
payload->th.th_flags = 0;
payload->th.ack = 1;
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
iov[TCP_IOV_PAYLOAD].iov_len = l4len; iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (++tcp_payload_used > TCP_FRAMES_MEM - 1) if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c); tcp_payload_flush(c);
} else if (CONN_V6(conn)) {
tcp6_frame_conns[tcp6_payload_used] = conn;
iov = tcp6_l2_iov[tcp6_payload_used++];
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
}
} }
/** /**
@ -253,11 +402,12 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
* *
* #syscalls recvmsg * #syscalls recvmsg
*/ */
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{ {
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
int len, dlen, i, s = conn->sock; int sendlen, len, dlen, v4 = CONN_V4(conn);
int s = conn->sock, i, ret = 0;
struct msghdr mh_sock = { 0 }; struct msghdr mh_sock = { 0 };
uint16_t mss = MSS_GET(conn); uint16_t mss = MSS_GET(conn);
uint32_t already_sent, seq; uint32_t already_sent, seq;
@ -304,15 +454,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
mh_sock.msg_iovlen = fill_bufs; mh_sock.msg_iovlen = fill_bufs;
} }
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) { if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
(!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
tcp_payload_flush(c); tcp_payload_flush(c);
/* Silence Coverity CWE-125 false positive */ /* Silence Coverity CWE-125 false positive */
tcp_payload_used = 0; tcp4_payload_used = tcp6_payload_used = 0;
} }
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
iov->iov_base = &tcp_payload[tcp_payload_used + i].data; if (v4)
iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
else
iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
iov->iov_len = mss; iov->iov_len = mss;
} }
if (iov_rem) if (iov_rem)
@ -323,19 +477,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
len = recvmsg(s, &mh_sock, MSG_PEEK); len = recvmsg(s, &mh_sock, MSG_PEEK);
while (len < 0 && errno == EINTR); while (len < 0 && errno == EINTR);
if (len < 0) { if (len < 0)
if (errno != EAGAIN && errno != EWOULDBLOCK) { goto err;
tcp_rst(c, conn);
return -errno;
}
return 0;
}
if (!len) { if (!len) {
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
int ret = tcp_buf_send_flag(c, conn, FIN | ACK); if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
if (ret) {
tcp_rst(c, conn); tcp_rst(c, conn);
return ret; return ret;
} }
@ -346,27 +493,28 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
return 0; return 0;
} }
sendlen = len;
if (!peek_offset_cap) if (!peek_offset_cap)
len -= already_sent; sendlen -= already_sent;
if (len <= 0) { if (sendlen <= 0) {
conn_flag(c, conn, STALLED); conn_flag(c, conn, STALLED);
return 0; return 0;
} }
conn_flag(c, conn, ~STALLED); conn_flag(c, conn, ~STALLED);
send_bufs = DIV_ROUND_UP(len, mss); send_bufs = DIV_ROUND_UP(sendlen, mss);
last_len = len - (send_bufs - 1) * mss; last_len = sendlen - (send_bufs - 1) * mss;
/* Likely, some new data was acked too. */ /* Likely, some new data was acked too. */
tcp_update_seqack_wnd(c, conn, false, NULL); tcp_update_seqack_wnd(c, conn, 0, NULL);
/* Finally, queue to tap */ /* Finally, queue to tap */
dlen = mss; dlen = mss;
seq = conn->seq_to_tap; seq = conn->seq_to_tap;
for (i = 0; i < send_bufs; i++) { for (i = 0; i < send_bufs; i++) {
int no_csum = i && i != send_bufs - 1 && tcp_payload_used; int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
if (i == send_bufs - 1) if (i == send_bufs - 1)
dlen = last_len; dlen = last_len;
@ -378,4 +526,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
conn_flag(c, conn, ACK_FROM_TAP_DUE); conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0; return 0;
err:
if (errno != EAGAIN && errno != EWOULDBLOCK) {
ret = -errno;
tcp_rst(c, conn);
}
return ret;
} }

View file

@ -6,9 +6,11 @@
#ifndef TCP_BUF_H #ifndef TCP_BUF_H
#define TCP_BUF_H #define TCP_BUF_H
void tcp_sock_iov_init(const struct ctx *c); void tcp_sock4_iov_init(const struct ctx *c);
void tcp_payload_flush(const struct ctx *c); void tcp_sock6_iov_init(const struct ctx *c);
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); void tcp_flags_flush(const struct ctx *c);
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); void tcp_payload_flush(struct ctx *c);
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
#endif /*TCP_BUF_H */ #endif /*TCP_BUF_H */

View file

@ -33,7 +33,9 @@
#define OPT_EOL 0 #define OPT_EOL 0
#define OPT_NOP 1 #define OPT_NOP 1
#define OPT_MSS 2 #define OPT_MSS 2
#define OPT_MSS_LEN 4
#define OPT_WS 3 #define OPT_WS 3
#define OPT_WS_LEN 3
#define OPT_SACKP 4 #define OPT_SACKP 4
#define OPT_SACK 5 #define OPT_SACK 5
#define OPT_TS 8 #define OPT_TS 8
@ -61,79 +63,6 @@ enum tcp_iov_parts {
TCP_NUM_IOVS TCP_NUM_IOVS
}; };
/**
* struct tcp_payload_t - TCP header and data to send segments with payload
* @th: TCP header
* @data: TCP data
*/
struct tcp_payload_t {
struct tcphdr th;
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/** struct tcp_opt_nop - TCP NOP option
* @kind: Option kind (OPT_NOP = 1)
*/
struct tcp_opt_nop {
uint8_t kind;
} __attribute__ ((packed));
#define TCP_OPT_NOP ((struct tcp_opt_nop){ .kind = OPT_NOP, })
/** struct tcp_opt_mss - TCP MSS option
* @kind: Option kind (OPT_MSS == 2)
* @len: Option length (4)
* @mss: Maximum Segment Size
*/
struct tcp_opt_mss {
uint8_t kind;
uint8_t len;
uint16_t mss;
} __attribute__ ((packed));
#define TCP_OPT_MSS(mss_) \
((struct tcp_opt_mss) { \
.kind = OPT_MSS, \
.len = sizeof(struct tcp_opt_mss), \
.mss = htons(mss_), \
})
/** struct tcp_opt_ws - TCP Window Scaling option
* @kind: Option kind (OPT_WS == 3)
* @len: Option length (3)
* @shift: Window scaling shift
*/
struct tcp_opt_ws {
uint8_t kind;
uint8_t len;
uint8_t shift;
} __attribute__ ((packed));
#define TCP_OPT_WS(shift_) \
((struct tcp_opt_ws) { \
.kind = OPT_WS, \
.len = sizeof(struct tcp_opt_ws), \
.shift = (shift_), \
})
/** struct tcp_syn_opts - TCP options we apply to SYN packets
* @mss: Maximum Segment Size (MSS) option
* @nop: NOP opt (for alignment)
* @ws: Window Scaling (WS) option
*/
struct tcp_syn_opts {
struct tcp_opt_mss mss;
struct tcp_opt_nop nop;
struct tcp_opt_ws ws;
} __attribute__ ((packed));
#define TCP_SYN_OPTS(mss_, ws_) \
((struct tcp_syn_opts){ \
.mss = TCP_OPT_MSS(mss_), \
.nop = TCP_OPT_NOP, \
.ws = TCP_OPT_WS(ws_), \
})
extern char tcp_buf_discard [MAX_WINDOW]; extern char tcp_buf_discard [MAX_WINDOW];
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
@ -153,23 +82,19 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
conn_event_do(c, conn, event); \ conn_event_do(c, conn, event); \
} while (0) } while (0)
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
#define tcp_rst(c, conn) \ #define tcp_rst(c, conn) \
do { \ do { \
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
tcp_rst_do(c, conn); \ tcp_rst_do(c, conn); \
} while (0) } while (0)
struct tcp_info_linux;
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
struct iovec *iov, size_t dlen, struct iovec *iov, size_t dlen,
const uint16_t *check, uint32_t seq, const uint16_t *check, uint32_t seq);
bool no_tcp_csum);
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
bool force_seq, struct tcp_info_linux *tinfo); int force_seq, struct tcp_info *tinfo);
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
int flags, struct tcphdr *th, struct tcp_syn_opts *opts, struct tcphdr *th, char *data, size_t *optlen);
size_t *optlen);
#endif /* TCP_INTERNAL_H */ #endif /* TCP_INTERNAL_H */

View file

@ -320,7 +320,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
} }
if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ, if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
c->tcp.pipe_size) != (int)c->tcp.pipe_size) { c->tcp.pipe_size)) {
flow_trace(conn, flow_trace(conn,
"cannot set %d->%d pipe size to %zu", "cannot set %d->%d pipe size to %zu",
sidei, !sidei, c->tcp.pipe_size); sidei, !sidei, c->tcp.pipe_size);
@ -503,7 +503,7 @@ swap:
lowat_act_flag = RCVLOWAT_ACT(fromsidei); lowat_act_flag = RCVLOWAT_ACT(fromsidei);
while (1) { while (1) {
ssize_t readlen, written, pending; ssize_t readlen, to_write = 0, written;
int more = 0; int more = 0;
retry: retry:
@ -518,11 +518,14 @@ retry:
if (errno != EAGAIN) if (errno != EAGAIN)
goto close; goto close;
to_write = c->tcp.pipe_size;
} else if (!readlen) { } else if (!readlen) {
eof = 1; eof = 1;
to_write = c->tcp.pipe_size;
} else { } else {
never_read = 0; never_read = 0;
to_write += readlen;
if (readlen >= (long)c->tcp.pipe_size * 90 / 100) if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
more = SPLICE_F_MORE; more = SPLICE_F_MORE;
@ -532,10 +535,10 @@ retry:
eintr: eintr:
written = splice(conn->pipe[fromsidei][0], NULL, written = splice(conn->pipe[fromsidei][0], NULL,
conn->s[!fromsidei], NULL, c->tcp.pipe_size, conn->s[!fromsidei], NULL, to_write,
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
flow_trace(conn, "%zi from write-side call (passed %zi)", flow_trace(conn, "%zi from write-side call (passed %zi)",
written, c->tcp.pipe_size); written, to_write);
/* Most common case: skip updating counters. */ /* Most common case: skip updating counters. */
if (readlen > 0 && readlen == written) { if (readlen > 0 && readlen == written) {
@ -581,9 +584,10 @@ eintr:
if (never_read && written == (long)(c->tcp.pipe_size)) if (never_read && written == (long)(c->tcp.pipe_size))
goto retry; goto retry;
pending = conn->read[fromsidei] - conn->written[fromsidei]; if (!never_read && written < to_write) {
if (!never_read && written > 0 && written < pending) to_write -= written;
goto retry; goto retry;
}
if (eof) if (eof)
break; break;
@ -672,7 +676,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
continue; continue;
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ, if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
c->tcp.pipe_size) != (int)c->tcp.pipe_size) { c->tcp.pipe_size)) {
trace("TCP (spliced): cannot set pool pipe size to %zu", trace("TCP (spliced): cannot set pool pipe size to %zu",
c->tcp.pipe_size); c->tcp.pipe_size);
} }

View file

@ -8,6 +8,7 @@
WGET = wget -c WGET = wget -c
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \ DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
debian-9-nocloud-amd64-daily-20200210-166.qcow2 \
debian-10-nocloud-amd64.qcow2 \ debian-10-nocloud-amd64.qcow2 \
debian-10-generic-arm64.qcow2 \ debian-10-generic-arm64.qcow2 \
debian-10-generic-ppc64el-20220911-1135.qcow2 \ debian-10-generic-ppc64el-20220911-1135.qcow2 \
@ -41,7 +42,8 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \
openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \ openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \ openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \ UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
trusty-server-cloudimg-i386-disk1.img \ trusty-server-cloudimg-i386-disk1.img \
@ -133,6 +135,9 @@ realclean: clean
debian-8.11.0-openstack-%.qcow2: debian-8.11.0-openstack-%.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2 $(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
debian-9-nocloud-%-daily-20200210-166.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2
debian-10-nocloud-%.qcow2: debian-10-nocloud-%.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2 $(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
@ -198,6 +203,9 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz: openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz $(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2:
$(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
# Ubuntu downloads # Ubuntu downloads
trusty-server-cloudimg-%-disk1.img: trusty-server-cloudimg-%-disk1.img:
$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img $(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img

View file

@ -58,7 +58,7 @@ setup_passt() {
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \ context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
' -machine accel=kvm' \ ' -machine accel=kvm' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \ ' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \
@ -159,7 +159,7 @@ setup_passt_in_ns() {
' -machine accel=kvm' \ ' -machine accel=kvm' \
' -M accel=kvm:tcg' \ ' -M accel=kvm:tcg' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \ ' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \
@ -230,7 +230,7 @@ setup_two_guests() {
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \ context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \ ' -M accel=kvm:tcg' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \ ' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \
@ -243,7 +243,7 @@ setup_two_guests() {
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \ context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \ ' -M accel=kvm:tcg' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \ ' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \

View file

@ -31,8 +31,8 @@ PR_DELAY_INIT=100 # ms
# $@: Message to print # $@: Message to print
info() { info() {
tmux select-pane -t ${PANE_INFO} tmux select-pane -t ${PANE_INFO}
printf "${@}\n" >> $STATEBASE/log_pipe echo "${@}" >> $STATEBASE/log_pipe
printf "${@}\n" >> "${LOGFILE}" echo "${@}" >> "${LOGFILE}"
} }
# info_n() - Highlight, print message to pane and to log file without newline # info_n() - Highlight, print message to pane and to log file without newline
@ -47,13 +47,13 @@ info_n() {
# $@: Message to print # $@: Message to print
info_nolog() { info_nolog() {
tmux select-pane -t ${PANE_INFO} tmux select-pane -t ${PANE_INFO}
printf "${@}\n" >> $STATEBASE/log_pipe echo "${@}" >> $STATEBASE/log_pipe
} }
# log() - Print message to log file # log() - Print message to log file
# $@: Message to print # $@: Message to print
log() { log() {
printf "${@}\n" >> "${LOGFILE}" echo "${@}" >> "${LOGFILE}"
} }
# info_nolog_n() - Send message to pane without highlighting it, without newline # info_nolog_n() - Send message to pane without highlighting it, without newline
@ -664,7 +664,7 @@ pause_continue() {
# run_term() - Start tmux session, running entry point, with recording if needed # run_term() - Start tmux session, running entry point, with recording if needed
run_term() { run_term() {
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL" TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG"
if [ ${CI} -eq 1 ]; then if [ ${CI} -eq 1 ]; then
printf '\e[8;50;240t' printf '\e[8;50;240t'

View file

@ -33,15 +33,10 @@
#define die(...) \ #define die(...) \
do { \ do { \
fprintf(stderr, "nstool: " __VA_ARGS__); \ fprintf(stderr, __VA_ARGS__); \
exit(1); \ exit(1); \
} while (0) } while (0)
#define err(...) \
do { \
fprintf(stderr, "nstool: " __VA_ARGS__); \
} while (0)
struct ns_type { struct ns_type {
int flag; int flag;
const char *name; const char *name;
@ -161,9 +156,6 @@ static int connect_ctl(const char *sockpath, bool wait,
static void cmd_hold(int argc, char *argv[]) static void cmd_hold(int argc, char *argv[])
{ {
struct sigaction sa = {
.sa_handler = SIG_IGN,
};
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX); int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
struct sockaddr_un addr; struct sockaddr_un addr;
const char *sockpath = argv[1]; const char *sockpath = argv[1];
@ -193,10 +185,6 @@ static void cmd_hold(int argc, char *argv[])
if (!getcwd(info.cwd, sizeof(info.cwd))) if (!getcwd(info.cwd, sizeof(info.cwd)))
die("getcwd(): %s\n", strerror(errno)); die("getcwd(): %s\n", strerror(errno));
rc = sigaction(SIGPIPE, &sa, NULL);
if (rc)
die("sigaction(SIGPIPE): %s\n", strerror(errno));
do { do {
int afd = accept(fd, NULL, NULL); int afd = accept(fd, NULL, NULL);
char buf; char buf;
@ -205,21 +193,17 @@ static void cmd_hold(int argc, char *argv[])
die("accept(): %s\n", strerror(errno)); die("accept(): %s\n", strerror(errno));
rc = write(afd, &info, sizeof(info)); rc = write(afd, &info, sizeof(info));
if (rc < 0) { if (rc < 0)
err("holder write() to control socket: %s\n", die("write(): %s\n", strerror(errno));
strerror(errno));
}
if ((size_t)rc < sizeof(info)) if ((size_t)rc < sizeof(info))
err("holder short write() on control socket\n"); die("short write() on control socket\n");
rc = read(afd, &buf, sizeof(buf)); rc = read(afd, &buf, sizeof(buf));
if (rc < 0) { if (rc < 0)
err("holder read() on control socket: %s\n", die("read(): %s\n", strerror(errno));
strerror(errno));
}
close(afd); close(afd);
} while (rc <= 0); } while (rc == 0);
unlink(sockpath); unlink(sockpath);
} }
@ -362,7 +346,7 @@ static int openns(const char *fmt, ...)
} }
static pid_t sig_pid; static pid_t sig_pid;
static void sig_propagate(int signum) static void sig_handler(int signum)
{ {
int err; int err;
@ -374,7 +358,7 @@ static void sig_propagate(int signum)
static void wait_for_child(pid_t pid) static void wait_for_child(pid_t pid)
{ {
struct sigaction sa = { struct sigaction sa = {
.sa_handler = sig_propagate, .sa_handler = sig_handler,
.sa_flags = SA_RESETHAND, .sa_flags = SA_RESETHAND,
}; };
int status, err; int status, err;

View file

@ -49,8 +49,6 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ]
test DHCPv6: address test DHCPv6: address
guest /sbin/dhclient -6 __IFNAME__ guest /sbin/dhclient -6 __IFNAME__
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR6__" = "__HOST_ADDR6__" ] check [ "__ADDR6__" = "__HOST_ADDR6__" ]

View file

@ -16,15 +16,13 @@ htools ip jq sipcalc grep cut
test Interface name test Interface name
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
guest ip link set dev __IFNAME__ up guest ip link set dev __IFNAME__ up && sleep 2
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME__" ] check [ -n "__IFNAME__" ]
test SLAAC: prefix test SLAAC: prefix
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
gout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4 gout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ] check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]

View file

@ -52,8 +52,6 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ]
test DHCPv6: address test DHCPv6: address
guest /sbin/dhclient -6 __IFNAME__ guest /sbin/dhclient -6 __IFNAME__
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR6__" = "__HOST_ADDR6__" ] check [ "__ADDR6__" = "__HOST_ADDR6__" ]

View file

@ -32,7 +32,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10001
guestw guestw
guest cmp test_big.bin /root/big.bin guest cmp test_big.bin /root/big.bin
test TCP/IPv4: host to ns (spliced): big transfer test TCP/IPv4: host to ns: big transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
sleep 1 sleep 1
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002 host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
@ -90,7 +90,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10001
guestw guestw
guest cmp test_small.bin /root/small.bin guest cmp test_small.bin /root/small.bin
test TCP/IPv4: host to ns (spliced): small transfer test TCP/IPv4: host to ns: small transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
sleep 1 sleep 1
host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002 host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
@ -146,7 +146,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10001
guestw guestw
guest cmp test_big.bin /root/big.bin guest cmp test_big.bin /root/big.bin
test TCP/IPv6: host to ns (spliced): big transfer test TCP/IPv6: host to ns: big transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
sleep 1 sleep 1
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002 host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
@ -204,7 +204,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10001
guestw guestw
guest cmp test_small.bin /root/small.bin guest cmp test_small.bin /root/small.bin
test TCP/IPv6: host to ns (spliced): small transfer test TCP/IPv6: host to ns: small transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
sleep 1 sleep 1
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002 host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002

View file

@ -30,7 +30,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10001,shut-null
guestw guestw
guest cmp test.bin /root/medium.bin guest cmp test.bin /root/medium.bin
test UDP/IPv4: host to ns (recvmmsg/sendmmsg) test UDP/IPv4: host to ns
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
sleep 1 sleep 1
host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
@ -88,7 +88,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10001,shut-null
guestw guestw
guest cmp test.bin /root/medium.bin guest cmp test.bin /root/medium.bin
test UDP/IPv6: host to ns (recvmmsg/sendmmsg) test UDP/IPv6: host to ns
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
sleep 1 sleep 1
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null

View file

@ -35,8 +35,6 @@ check [ __MTU__ = 65520 ]
test DHCPv6: address test DHCPv6: address
ns /sbin/dhclient -6 --no-pid __IFNAME__ ns /sbin/dhclient -6 --no-pid __IFNAME__
# Wait for DAD to complete
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'

View file

@ -18,12 +18,11 @@ test Interface name
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
check [ -n "__IFNAME__" ] check [ -n "__IFNAME__" ]
ns ip link set dev __IFNAME__ up ns ip link set dev __IFNAME__ up
# Wait for DAD to complete sleep 2
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
test SLAAC: prefix test SLAAC: prefix
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
nsout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4 nsout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ] check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]

View file

@ -19,8 +19,8 @@ set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
set TEMP_SMALL __STATEDIR__/test_small.bin set TEMP_SMALL __STATEDIR__/test_small.bin
set TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin set TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin
test TCP/IPv4: host to ns (spliced): big transfer test TCP/IPv4: host to ns: big transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_BIG__,create,trunc
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002 host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
nsw nsw
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__ check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
@ -38,8 +38,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
hostw hostw
check cmp __BASEPATH__/big.bin __TEMP_BIG__ check cmp __BASEPATH__/big.bin __TEMP_BIG__
test TCP/IPv4: host to ns (spliced): small transfer test TCP/IPv4: host to ns: small transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_SMALL__,create,trunc
host socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002 host socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
nsw nsw
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__ check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
@ -57,8 +57,8 @@ ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
hostw hostw
check cmp __BASEPATH__/small.bin __TEMP_SMALL__ check cmp __BASEPATH__/small.bin __TEMP_SMALL__
test TCP/IPv6: host to ns (spliced): big transfer test TCP/IPv6: host to ns: big transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_BIG__,create,trunc
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002 host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
nsw nsw
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__ check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
@ -77,8 +77,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
hostw hostw
check cmp __BASEPATH__/big.bin __TEMP_BIG__ check cmp __BASEPATH__/big.bin __TEMP_BIG__
test TCP/IPv6: host to ns (spliced): small transfer test TCP/IPv6: host to ns: small transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_SMALL__,create,trunc
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002 host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
nsw nsw
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__ check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__

View file

@ -17,8 +17,8 @@ htools dd socat ip jq
set TEMP __STATEDIR__/test.bin set TEMP __STATEDIR__/test.bin
set TEMP_NS __STATEDIR__/test_ns.bin set TEMP_NS __STATEDIR__/test_ns.bin
test UDP/IPv4: host to ns (recvmmsg/sendmmsg) test UDP/IPv4: host to ns
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc nsb socat -u UDP4-LISTEN:10002,bind=127.0.0.1,null-eof OPEN:__TEMP_NS__,create,trunc
host socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null host socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
nsw nsw
check cmp __BASEPATH__/medium.bin __TEMP_NS__ check cmp __BASEPATH__/medium.bin __TEMP_NS__
@ -37,8 +37,8 @@ ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
hostw hostw
check cmp __BASEPATH__/medium.bin __TEMP__ check cmp __BASEPATH__/medium.bin __TEMP__
test UDP/IPv6: host to ns (recvmmsg/sendmmsg) test UDP/IPv6: host to ns
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc nsb socat -u UDP6-LISTEN:10002,bind=[::1],null-eof OPEN:__TEMP_NS__,create,trunc
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
nsw nsw
check cmp __BASEPATH__/medium.bin __TEMP_NS__ check cmp __BASEPATH__/medium.bin __TEMP_NS__

View file

@ -116,8 +116,6 @@ iperf3k ns
# Reducing MTU below 1280 deconfigures IPv6, get our address back # Reducing MTU below 1280 deconfigures IPv6, get our address back
guest dhclient -6 -x guest dhclient -6 -x
guest dhclient -6 __IFNAME__ guest dhclient -6 __IFNAME__
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
tl TCP RR latency over IPv4: guest to host tl TCP RR latency over IPv4: guest to host
lat - lat -

View file

@ -211,7 +211,7 @@ tr TCP throughput over IPv6: host to ns
iperf3s ns 10002 iperf3s ns 10002
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local' nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
bw - bw -
bw - bw -
bw - bw -

View file

@ -196,7 +196,7 @@ tr UDP throughput over IPv6: host to ns
iperf3s ns 10002 iperf3s ns 10002
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local' nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472 iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472
bw __BW__ 0.3 0.5 bw __BW__ 0.3 0.5
iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972 iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972

View file

@ -38,9 +38,6 @@ TRACE=${TRACE:-0}
# If set, tell passt and pasta to take packet captures # If set, tell passt and pasta to take packet captures
PCAP=${PCAP:-0} PCAP=${PCAP:-0}
# Custom kernel to boot guests with, if given
KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"}
COMMIT="$(git log --oneline --no-decorate -1)" COMMIT="$(git log --oneline --no-decorate -1)"
. lib/util . lib/util

View file

@ -36,13 +36,9 @@ check [ "__ADDR2__" = "__HOST_ADDR__" ]
test DHCPv6: addresses test DHCPv6: addresses
# Link is up now, wait for DAD to complete # Link is up now, wait for DAD to complete
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done sleep 2
guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest1 /sbin/dhclient -6 __IFNAME1__ guest1 /sbin/dhclient -6 __IFNAME1__
guest2 /sbin/dhclient -6 __IFNAME2__ guest2 /sbin/dhclient -6 __IFNAME2__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
g2out ADDR2_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME2__").addr_info[] | select(.prefixlen == 128).local] | .[0]' g2out ADDR2_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME2__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
@ -52,33 +48,33 @@ check [ "__ADDR2_6__" = "__HOST_ADDR6__" ]
test TCP/IPv4: guest 1 > guest 2 test TCP/IPv4: guest 1 > guest 2
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
sleep 1
guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004 guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
guest2w guest2w
sleep 1
g2out MSG2 cat msg g2out MSG2 cat msg
check [ "__MSG2__" = "Hello_from_guest_1" ] check [ "__MSG2__" = "Hello_from_guest_1" ]
test TCP/IPv6: guest 2 > guest 1 test TCP/IPv6: guest 2 > guest 1
g2out GW2_6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' g2out GW2_6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
sleep 1
guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001 guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
guest1w guest1w
sleep 1
g1out MSG1 cat msg g1out MSG1 cat msg
check [ "__MSG1__" = "Hello_from_guest_2" ] check [ "__MSG1__" = "Hello_from_guest_2" ]
test UDP/IPv4: guest 1 > guest 2 test UDP/IPv4: guest 1 > guest 2
guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
sleep 1
guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004 guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
guest2w guest2w
sleep 1
g2out MSG2 cat msg g2out MSG2 cat msg
check [ "__MSG2__" = "Hello_from_guest_1" ] check [ "__MSG2__" = "Hello_from_guest_1" ]
test UDP/IPv6: guest 2 > guest 1 test UDP/IPv6: guest 2 > guest 1
guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
sleep 1
guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001 guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
guest1w guest1w
sleep 1
g1out MSG1 cat msg g1out MSG1 cat msg
check [ "__MSG1__" = "Hello_from_guest_2" ] check [ "__MSG1__" = "Hello_from_guest_2" ]

252
udp.c
View file

@ -169,11 +169,11 @@ udp_meta[UDP_MAX_FRAMES];
* @UDP_NUM_IOVS the number of entries in the iovec array * @UDP_NUM_IOVS the number of entries in the iovec array
*/ */
enum udp_iov_idx { enum udp_iov_idx {
UDP_IOV_TAP, UDP_IOV_TAP = 0,
UDP_IOV_ETH, UDP_IOV_ETH = 1,
UDP_IOV_IP, UDP_IOV_IP = 2,
UDP_IOV_PAYLOAD, UDP_IOV_PAYLOAD = 3,
UDP_NUM_IOVS, UDP_NUM_IOVS
}; };
/* IOVs and msghdr arrays for receiving datagrams from sockets */ /* IOVs and msghdr arrays for receiving datagrams from sockets */
@ -298,13 +298,11 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
* @bp: Pointer to udp_payload_t to update * @bp: Pointer to udp_payload_t to update
* @toside: Flowside for destination side * @toside: Flowside for destination side
* @dlen: Length of UDP payload * @dlen: Length of UDP payload
* @no_udp_csum: Do not set UDP checksum
* *
* Return: size of IPv4 payload (UDP header + data) * Return: size of IPv4 payload (UDP header + data)
*/ */
static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen, const struct flowside *toside, size_t dlen)
bool no_udp_csum)
{ {
const struct in_addr *src = inany_v4(&toside->oaddr); const struct in_addr *src = inany_v4(&toside->oaddr);
const struct in_addr *dst = inany_v4(&toside->eaddr); const struct in_addr *dst = inany_v4(&toside->eaddr);
@ -321,33 +319,22 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
bp->uh.source = htons(toside->oport); bp->uh.source = htons(toside->oport);
bp->uh.dest = htons(toside->eport); bp->uh.dest = htons(toside->eport);
bp->uh.len = htons(l4len); bp->uh.len = htons(l4len);
if (no_udp_csum) { csum_udp4(&bp->uh, *src, *dst, bp->data, dlen);
bp->uh.check = 0;
} else {
const struct iovec iov = {
.iov_base = bp->data,
.iov_len = dlen
};
csum_udp4(&bp->uh, *src, *dst, &iov, 1, 0);
}
return l4len; return l4len;
} }
/** /**
* udp_update_hdr6() - Update headers for one IPv6 datagram * udp_update_hdr6() - Update headers for one IPv6 datagram
* @ip6h: Pre-filled IPv6 header (except for payload_len and * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
* addresses)
* @bp: Pointer to udp_payload_t to update * @bp: Pointer to udp_payload_t to update
* @toside: Flowside for destination side * @toside: Flowside for destination side
* @dlen: Length of UDP payload * @dlen: Length of UDP payload
* @no_udp_csum: Do not set UDP checksum
* *
* Return: size of IPv6 payload (UDP header + data) * Return: size of IPv6 payload (UDP header + data)
*/ */
static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen, const struct flowside *toside, size_t dlen)
bool no_udp_csum)
{ {
uint16_t l4len = dlen + sizeof(bp->uh); uint16_t l4len = dlen + sizeof(bp->uh);
@ -361,20 +348,7 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
bp->uh.source = htons(toside->oport); bp->uh.source = htons(toside->oport);
bp->uh.dest = htons(toside->eport); bp->uh.dest = htons(toside->eport);
bp->uh.len = ip6h->payload_len; bp->uh.len = ip6h->payload_len;
if (no_udp_csum) { csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, bp->data, dlen);
/* 0 is an invalid checksum for UDP IPv6 and dropped by
* the kernel stack, even if the checksum is disabled by virtio
* flags. We need to put any non-zero value here.
*/
bp->uh.check = 0xffff;
} else {
const struct iovec iov = {
.iov_base = bp->data,
.iov_len = dlen
};
csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
&iov, 1, 0);
}
return l4len; return l4len;
} }
@ -384,11 +358,9 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
* @mmh: Receiving mmsghdr array * @mmh: Receiving mmsghdr array
* @idx: Index of the datagram to prepare * @idx: Index of the datagram to prepare
* @toside: Flowside for destination side * @toside: Flowside for destination side
* @no_udp_csum: Do not set UDP checksum
*/ */
static void udp_tap_prepare(const struct mmsghdr *mmh, static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
unsigned idx, const struct flowside *toside, const struct flowside *toside)
bool no_udp_csum)
{ {
struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx]; struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
struct udp_payload_t *bp = &udp_payload[idx]; struct udp_payload_t *bp = &udp_payload[idx];
@ -396,15 +368,13 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
size_t l4len; size_t l4len;
if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) { if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
l4len = udp_update_hdr6(&bm->ip6h, bp, toside, l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len);
mmh[idx].msg_len, no_udp_csum);
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
sizeof(udp6_eth_hdr)); sizeof(udp6_eth_hdr));
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
} else { } else {
l4len = udp_update_hdr4(&bm->ip4h, bp, toside, l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len);
mmh[idx].msg_len, no_udp_csum);
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
sizeof(udp4_eth_hdr)); sizeof(udp4_eth_hdr));
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
@ -417,8 +387,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
* udp_sock_recverr() - Receive and clear an error from a socket * udp_sock_recverr() - Receive and clear an error from a socket
* @s: Socket to receive from * @s: Socket to receive from
* *
* Return: 1 if error received and processed, 0 if no more errors in queue, < 0 * Return: ee_errno, 0 on empty queue
* if there was an error reading the queue
* *
* #syscalls recvmsg * #syscalls recvmsg
*/ */
@ -439,16 +408,15 @@ static int udp_sock_recverr(int s)
rc = recvmsg(s, &mh, MSG_ERRQUEUE); rc = recvmsg(s, &mh, MSG_ERRQUEUE);
if (rc < 0) { if (rc < 0) {
if (errno == EAGAIN || errno == EWOULDBLOCK) if (errno != EAGAIN && errno != EWOULDBLOCK)
return 0; err_perror("Failed to read error queue");
err_perror("UDP: Failed to read error queue"); return 0;
return -1;
} }
if (!(mh.msg_flags & MSG_ERRQUEUE)) { if (!(mh.msg_flags & MSG_ERRQUEUE)) {
err("Missing MSG_ERRQUEUE flag reading error queue"); err("Missing MSG_ERRQUEUE flag reading error queue");
return -1; return 0;
} }
hdr = CMSG_FIRSTHDR(&mh); hdr = CMSG_FIRSTHDR(&mh);
@ -457,7 +425,7 @@ static int udp_sock_recverr(int s)
(hdr->cmsg_level == IPPROTO_IPV6 && (hdr->cmsg_level == IPPROTO_IPV6 &&
hdr->cmsg_type == IPV6_RECVERR))) { hdr->cmsg_type == IPV6_RECVERR))) {
err("Unexpected cmsg reading error queue"); err("Unexpected cmsg reading error queue");
return -1; return 0;
} }
ee = (const struct sock_extended_err *)CMSG_DATA(hdr); ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
@ -466,54 +434,7 @@ static int udp_sock_recverr(int s)
debug("%s error on UDP socket %i: %s", debug("%s error on UDP socket %i: %s",
str_ee_origin(ee), s, strerror(ee->ee_errno)); str_ee_origin(ee), s, strerror(ee->ee_errno));
return 1; return ee->ee_errno;
}
/**
* udp_sock_errs() - Process errors on a socket
* @c: Execution context
* @s: Socket to receive from
* @events: epoll events bitmap
*
* Return: Number of errors handled, or < 0 if we have an unrecoverable error
*/
static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
{
unsigned n_err = 0;
socklen_t errlen;
int rc, err;
ASSERT(!c->no_udp);
if (!(events & EPOLLERR))
return 0; /* Nothing to do */
/* Empty the error queue */
while ((rc = udp_sock_recverr(s)) > 0)
n_err += rc;
if (rc < 0)
return -1; /* error reading error, unrecoverable */
errlen = sizeof(err);
if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 ||
errlen != sizeof(err)) {
err_perror("Error reading SO_ERROR");
return -1; /* error reading error, unrecoverable */
}
if (err) {
debug("Unqueued error on UDP socket %i: %s", s, strerror(err));
n_err++;
}
if (!n_err) {
/* EPOLLERR, but no errors to clear !? */
err("EPOLLERR event without reported errors on socket %i", s);
return -1; /* no way to clear, unrecoverable */
}
return n_err;
} }
/** /**
@ -521,14 +442,15 @@ static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
* @c: Execution context * @c: Execution context
* @s: Socket to receive from * @s: Socket to receive from
* @events: epoll events bitmap * @events: epoll events bitmap
* @mmh mmsghdr array to receive into * @mmh: mmsghdr array to receive into
* @recv_err: Set to last error in queue. If none: -1 on EPOLLERR, 0 otherwise
* *
* Return: Number of datagrams received * Return: count of datagrams received
* *
* #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64 * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
*/ */
static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
struct mmsghdr *mmh) struct mmsghdr *mmh, int *recv_err)
{ {
/* For not entirely clear reasons (data locality?) pasta gets better /* For not entirely clear reasons (data locality?) pasta gets better
* throughput if we receive tap datagrams one at a atime. For small * throughput if we receive tap datagrams one at a atime. For small
@ -541,6 +463,17 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
ASSERT(!c->no_udp); ASSERT(!c->no_udp);
/* Clear any errors first */
if (events & EPOLLERR) {
bool found = false;
int ret;
while ((ret = udp_sock_recverr(s)))
found = true;
*recv_err = found ? ret : -1;
}
if (!(events & EPOLLIN)) if (!(events & EPOLLIN))
return 0; return 0;
@ -566,16 +499,10 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now) uint32_t events, const struct timespec *now)
{ {
const socklen_t sasize = sizeof(udp_meta[0].s_in); const socklen_t sasize = sizeof(udp_meta[0].s_in);
int recv_err = 0;
int n, i; int n, i;
if (udp_sock_errs(c, ref.fd, events) < 0) { if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv, &recv_err)) <= 0)
err("UDP: Unrecoverable error on listening socket:"
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
/* FIXME: what now? close/re-open socket? */
return;
}
if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
return; return;
/* We divide datagrams into batches based on how we need to send them, /* We divide datagrams into batches based on how we need to send them,
@ -595,8 +522,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
udp_splice_prepare(udp_mh_recv, i); udp_splice_prepare(udp_mh_recv, i);
} else if (batchpif == PIF_TAP) { } else if (batchpif == PIF_TAP) {
udp_tap_prepare(udp_mh_recv, i, udp_tap_prepare(udp_mh_recv, i,
flowside_at_sidx(batchsidx), flowside_at_sidx(batchsidx));
false);
} }
if (++i >= n) if (++i >= n)
@ -644,21 +570,51 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
const struct flowside *toside = flowside_at_sidx(tosidx); const struct flowside *toside = flowside_at_sidx(tosidx);
struct udp_flow *uflow = udp_at_sidx(ref.flowside); struct udp_flow *uflow = udp_at_sidx(ref.flowside);
int from_s = uflow->s[ref.flowside.sidei];
uint8_t topif = pif_at_sidx(tosidx); uint8_t topif = pif_at_sidx(tosidx);
int n, i, from_s; int recv_err = 0;
int n, i;
ASSERT(!c->no_udp && uflow); ASSERT(!c->no_udp && uflow);
from_s = uflow->s[ref.flowside.sidei]; n = udp_sock_recv(c, from_s, events, udp_mh_recv, &recv_err);
if (recv_err == -1) {
struct flow_common *f = &uflow->f;
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
const struct flowside *ini = &f->side[INISIDE];
const struct flowside *tgt = &f->side[TGTSIDE];
flow_err(uflow, "EPOLLERR without error queue, closing flow");
err("Last recorded errno was: %i (%s)", uflow->last_errno,
strerror(uflow->last_errno));
flow_log_(f, LOG_ERR,
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
ini->eport,
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
ini->oport,
pif_name(f->pif[TGTSIDE]),
inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)),
tgt->oport,
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
tgt->eport);
if (udp_sock_errs(c, from_s, events) < 0) {
flow_err(uflow, "Unrecoverable error on reply socket");
flow_err_details(uflow);
udp_flow_close(c, uflow); udp_flow_close(c, uflow);
return; return;
} }
if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0) if (recv_err) {
struct udp_flow *uflow = udp_at_sidx(udp_meta[0].tosidx);
uflow->last_errno = recv_err;
flow_err(uflow, "Recorded errno %i (%s)", recv_err,
strerror(recv_err));
}
if (n <= 0)
return; return;
flow_trace(uflow, "Received %d datagrams on reply socket", n); flow_trace(uflow, "Received %d datagrams on reply socket", n);
@ -668,7 +624,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
if (pif_is_socket(topif)) if (pif_is_socket(topif))
udp_splice_prepare(udp_mh_recv, i); udp_splice_prepare(udp_mh_recv, i);
else if (topif == PIF_TAP) else if (topif == PIF_TAP)
udp_tap_prepare(udp_mh_recv, i, toside, false); udp_tap_prepare(udp_mh_recv, i, toside);
/* Restore sockaddr length clobbered by recvmsg() */ /* Restore sockaddr length clobbered by recvmsg() */
udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in); udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
} }
@ -795,61 +751,69 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
* udp_sock_init() - Initialise listening sockets for a given port * udp_sock_init() - Initialise listening sockets for a given port
* @c: Execution context * @c: Execution context
* @ns: In pasta mode, if set, bind with loopback address in namespace * @ns: In pasta mode, if set, bind with loopback address in namespace
* @af: Address family to select a specific IP version, or AF_UNSPEC
* @addr: Pointer to address for binding, NULL if not configured * @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order * @port: Port, host order
* *
* Return: 0 on (partial) success, negative error code on (complete) failure * Return: 0 on (partial) success, negative error code on (complete) failure
*/ */
int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const char *ifname, in_port_t port) const void *addr, const char *ifname, in_port_t port)
{ {
union udp_listen_epoll_ref uref = { union udp_listen_epoll_ref uref = { .port = port };
.pif = ns ? PIF_SPLICE : PIF_HOST,
.port = port,
};
int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
ASSERT(!c->no_udp); ASSERT(!c->no_udp);
if (!addr && c->ifi4 && c->ifi6 && !ns) { if (ns)
uref.pif = PIF_SPLICE;
else
uref.pif = PIF_HOST;
if (af == AF_UNSPEC && c->ifi4 && c->ifi6) {
int s; int s;
/* Attempt to get a dual stack socket */ /* Attempt to get a dual stack socket */
s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, if (!ns) {
NULL, ifname, port, uref.u32); s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
addr, ifname, port, uref.u32);
udp_splice_init[V4][port] = s < 0 ? -1 : s; udp_splice_init[V4][port] = s < 0 ? -1 : s;
udp_splice_init[V6][port] = s < 0 ? -1 : s; udp_splice_init[V6][port] = s < 0 ? -1 : s;
} else {
s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
&in4addr_loopback, ifname, port, uref.u32);
udp_splice_ns[V4][port] = s < 0 ? -1 : s;
udp_splice_ns[V6][port] = s < 0 ? -1 : s;
}
if (IN_INTERVAL(0, FD_REF_MAX, s)) if (IN_INTERVAL(0, FD_REF_MAX, s))
return 0; return 0;
} }
if ((!addr || inany_v4(addr)) && c->ifi4) { if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
if (!ns) { if (!ns) {
r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
addr ? addr : &inany_any4, ifname, addr, ifname, port, uref.u32);
port, uref.u32);
udp_splice_init[V4][port] = r4 < 0 ? -1 : r4; udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
} else { } else {
r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE, r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
&inany_loopback4, ifname, &in4addr_loopback,
port, uref.u32); ifname, port, uref.u32);
udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4; udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4;
} }
} }
if ((!addr || !inany_v4(addr)) && c->ifi6) { if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
if (!ns) { if (!ns) {
r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
addr ? addr : &inany_any6, ifname, addr, ifname, port, uref.u32);
port, uref.u32);
udp_splice_init[V6][port] = r6 < 0 ? -1 : r6; udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
} else { } else {
r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE, r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
&inany_loopback6, ifname, &in6addr_loopback,
port, uref.u32); ifname, port, uref.u32);
udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6; udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6;
} }
} }
@ -917,7 +881,7 @@ static void udp_port_rebind(struct ctx *c, bool outbound)
if ((c->ifi4 && socks[V4][port] == -1) || if ((c->ifi4 && socks[V4][port] == -1) ||
(c->ifi6 && socks[V6][port] == -1)) (c->ifi6 && socks[V6][port] == -1))
udp_sock_init(c, outbound, NULL, NULL, port); udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port);
} }
} }

4
udp.h
View file

@ -16,8 +16,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
int udp_tap_handler(const struct ctx *c, uint8_t pif, int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr, sa_family_t af, const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now); const struct pool *p, int idx, const struct timespec *now);
int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const char *ifname, in_port_t port); const void *addr, const char *ifname, in_port_t port);
int udp_init(struct ctx *c); int udp_init(struct ctx *c);
void udp_timer(struct ctx *c, const struct timespec *now); void udp_timer(struct ctx *c, const struct timespec *now);
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);

View file

@ -34,16 +34,13 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
return &flow->udp; return &flow->udp;
} }
/* /**
* udp_flow_close() - Close and clean up UDP flow * udp_flow_close() - Close and clean up UDP flow
* @c: Execution context * @c: Execution context
* @uflow: UDP flow * @uflow: UDP flow
*/ */
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
{ {
if (uflow->closed)
return; /* Nothing to do */
if (uflow->s[INISIDE] >= 0) { if (uflow->s[INISIDE] >= 0) {
/* The listening socket needs to stay in epoll */ /* The listening socket needs to stay in epoll */
close(uflow->s[INISIDE]); close(uflow->s[INISIDE]);
@ -56,11 +53,12 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
close(uflow->s[TGTSIDE]); close(uflow->s[TGTSIDE]);
uflow->s[TGTSIDE] = -1; uflow->s[TGTSIDE] = -1;
} }
uflow->last_errno = 0;
flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
if (!pif_is_socket(uflow->f.pif[TGTSIDE])) if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE)); flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
uflow->closed = true;
} }
/** /**
@ -261,17 +259,6 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
return udp_flow_new(c, flow, -1, now); return udp_flow_new(c, flow, -1, now);
} }
/**
* udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
* @uflow: Flow to handle
*
* Return: true if the connection is ready to free, false otherwise
*/
bool udp_flow_defer(const struct udp_flow *uflow)
{
return uflow->closed;
}
/** /**
* udp_flow_timer() - Handler for timed events related to a given flow * udp_flow_timer() - Handler for timed events related to a given flow
* @c: Execution context * @c: Execution context

View file

@ -10,7 +10,6 @@
/** /**
* struct udp - Descriptor for a flow of UDP packets * struct udp - Descriptor for a flow of UDP packets
* @f: Generic flow information * @f: Generic flow information
* @closed: Flow is already closed
* @ts: Activity timestamp * @ts: Activity timestamp
* @s: Socket fd (or -1) for each side of the flow * @s: Socket fd (or -1) for each side of the flow
*/ */
@ -18,9 +17,10 @@ struct udp_flow {
/* Must be first element */ /* Must be first element */
struct flow_common f; struct flow_common f;
bool closed :1;
time_t ts; time_t ts;
int s[SIDES]; int s[SIDES];
int last_errno;
}; };
struct udp_flow *udp_at_sidx(flow_sidx_t sidx); struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
@ -33,7 +33,6 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
in_port_t srcport, in_port_t dstport, in_port_t srcport, in_port_t dstport,
const struct timespec *now); const struct timespec *now);
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow); void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
bool udp_flow_defer(const struct udp_flow *uflow);
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now); const struct timespec *now);

210
util.c
View file

@ -28,7 +28,6 @@
#include <linux/errqueue.h> #include <linux/errqueue.h>
#include <getopt.h> #include <getopt.h>
#include "linux_dep.h"
#include "util.h" #include "util.h"
#include "iov.h" #include "iov.h"
#include "passt.h" #include "passt.h"
@ -53,7 +52,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
{ {
sa_family_t af = ((const struct sockaddr *)sa)->sa_family; sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
union epoll_ref ref = { .type = type, .data = data }; union epoll_ref ref = { .type = type, .data = data };
bool freebind = false;
struct epoll_event ev; struct epoll_event ev;
int fd, y = 1, ret; int fd, y = 1, ret;
uint8_t proto; uint8_t proto;
@ -63,11 +61,8 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
case EPOLL_TYPE_TCP_LISTEN: case EPOLL_TYPE_TCP_LISTEN:
proto = IPPROTO_TCP; proto = IPPROTO_TCP;
socktype = SOCK_STREAM | SOCK_NONBLOCK; socktype = SOCK_STREAM | SOCK_NONBLOCK;
freebind = c->freebind;
break; break;
case EPOLL_TYPE_UDP_LISTEN: case EPOLL_TYPE_UDP_LISTEN:
freebind = c->freebind;
/* fallthrough */
case EPOLL_TYPE_UDP_REPLY: case EPOLL_TYPE_UDP_REPLY:
proto = IPPROTO_UDP; proto = IPPROTO_UDP;
socktype = SOCK_DGRAM | SOCK_NONBLOCK; socktype = SOCK_DGRAM | SOCK_NONBLOCK;
@ -132,18 +127,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
} }
} }
if (freebind) {
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND;
if (setsockopt(fd, level, opt, &y, sizeof(y))) {
err_perror("Failed to set %s on socket %i",
af == AF_INET ? "IP_FREEBIND"
: "IPV6_FREEBIND",
fd);
}
}
if (bind(fd, sa, sl) < 0) { if (bind(fd, sa, sl) < 0) {
/* We'll fail to bind to low ports if we don't have enough /* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports, * capabilities, and we'll fail to bind on already bound ports,
@ -174,6 +157,58 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return fd; return fd;
} }
/**
* sock_l4() - Create and bind socket for given L4, add to epoll list
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @type: epoll type
* @bind_addr: Address for binding, NULL for any
* @ifname: Interface for binding, NULL for any
* @port: Port, host order
* @data: epoll reference portion for protocol handlers
*
* Return: newly created socket, negative error code on failure
*/
int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
const void *bind_addr, const char *ifname, uint16_t port,
uint32_t data)
{
switch (af) {
case AF_INET: {
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
.sin_port = htons(port),
{ 0 }, { 0 },
};
if (bind_addr)
addr4.sin_addr = *(struct in_addr *)bind_addr;
return sock_l4_sa(c, type, &addr4, sizeof(addr4), ifname,
false, data);
}
case AF_UNSPEC:
if (!DUAL_STACK_SOCKETS || bind_addr)
return -EINVAL;
/* fallthrough */
case AF_INET6: {
struct sockaddr_in6 addr6 = {
.sin6_family = AF_INET6,
.sin6_port = htons(port),
0, IN6ADDR_ANY_INIT, 0,
};
if (bind_addr) {
addr6.sin6_addr = *(struct in6_addr *)bind_addr;
if (IN6_IS_ADDR_LINKLOCAL(bind_addr))
addr6.sin6_scope_id = c->ifi6;
}
return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname,
af == AF_INET6, data);
}
default:
return -EINVAL;
}
}
/** /**
* sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
@ -184,8 +219,7 @@ void sock_probe_mem(struct ctx *c)
int v = INT_MAX / 2, s; int v = INT_MAX / 2, s;
socklen_t sl; socklen_t sl;
s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
if (s < 0) {
c->low_wmem = c->low_rmem = 1; c->low_wmem = c->low_rmem = 1;
return; return;
} }
@ -215,7 +249,7 @@ void sock_probe_mem(struct ctx *c)
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b) int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b)
{ {
if (a->tv_nsec < b->tv_nsec) { if (a->tv_nsec < b->tv_nsec) {
return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 + return (b->tv_nsec - a->tv_nsec) / 1000 +
(a->tv_sec - b->tv_sec - 1) * 1000000; (a->tv_sec - b->tv_sec - 1) * 1000000;
} }
@ -409,20 +443,25 @@ void pidfile_write(int fd, pid_t pid)
} }
/** /**
* output_file_open() - Open file for output, if needed * pidfile_open() - Open PID file if needed
* @path: Path for output file * @path: Path for PID file, empty string if no PID file is requested
* @flags: Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC
* *
* Return: file descriptor on success, -1 on failure with errno set by open() * Return: descriptor for PID file, -1 if path is NULL, won't return on failure
*/ */
int output_file_open(const char *path, int flags) int pidfile_open(const char *path)
{ {
/* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for int fd;
* it in the 'mode' argument if we have one
*/ if (!*path)
return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags, return -1;
/* NOLINTNEXTLINE(android-cloexec-open) */
S_IRUSR | S_IWUSR); if ((fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC,
S_IRUSR | S_IWUSR)) < 0) {
perror("PID file open");
exit(EXIT_FAILURE);
}
return fd;
} }
/** /**
@ -446,11 +485,16 @@ int __daemon(int pidfile_fd, int devnull_fd)
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
if (setsid() < 0 || errno = 0;
dup2(devnull_fd, STDIN_FILENO) < 0 ||
dup2(devnull_fd, STDOUT_FILENO) < 0 || setsid();
dup2(devnull_fd, STDERR_FILENO) < 0 ||
close(devnull_fd)) dup2(devnull_fd, STDIN_FILENO);
dup2(devnull_fd, STDOUT_FILENO);
dup2(devnull_fd, STDERR_FILENO);
close(devnull_fd);
if (errno)
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
return 0; return 0;
@ -538,36 +582,6 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
#endif #endif
} }
/* write_all_buf() - write all of a buffer to an fd
* @fd: File descriptor
* @buf: Pointer to base of buffer
* @len: Length of buffer
*
* Return: 0 on success, -1 on error (with errno set)
*
* #syscalls write
*/
int write_all_buf(int fd, const void *buf, size_t len)
{
const char *p = buf;
size_t left = len;
while (left) {
ssize_t rc;
do
rc = write(fd, p, left);
while ((rc < 0) && errno == EINTR);
if (rc < 0)
return -1;
p += rc;
left -= rc;
}
return 0;
}
/* write_remainder() - write the tail of an IO vector to an fd /* write_remainder() - write the tail of an IO vector to an fd
* @fd: File descriptor * @fd: File descriptor
* @iov: IO vector * @iov: IO vector
@ -576,30 +590,28 @@ int write_all_buf(int fd, const void *buf, size_t len)
* *
* Return: 0 on success, -1 on error (with errno set) * Return: 0 on success, -1 on error (with errno set)
* *
* #syscalls writev * #syscalls write writev
*/ */
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip) int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
{ {
size_t i = 0, offset; size_t offset, i;
while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) { while ((i = iov_skip_bytes(iov, iovcnt, skip, &offset)) < iovcnt) {
ssize_t rc; ssize_t rc;
if (offset) { if (offset) {
/* Write the remainder of the partially written buffer */ rc = write(fd, (char *)iov[i].iov_base + offset,
if (write_all_buf(fd, (char *)iov[i].iov_base + offset, iov[i].iov_len - offset);
iov[i].iov_len - offset) < 0) } else {
return -1; rc = writev(fd, &iov[i], iovcnt - i);
i++;
} }
/* Write as much of the remaining whole buffers as we can */
rc = writev(fd, &iov[i], iovcnt - i);
if (rc < 0) if (rc < 0)
return -1; return -1;
skip = rc; skip += rc;
} }
return 0; return 0;
} }
@ -738,48 +750,6 @@ void close_open_files(int argc, char **argv)
rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE); rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE);
} }
if (rc) { if (rc)
if (errno == ENOSYS || errno == EINVAL) {
/* This probably means close_range() or the
* CLOSE_RANGE_UNSHARE flag is not supported by the
* kernel. Not much we can do here except carry on and
* hope for the best.
*/
warn(
"Can't use close_range() to ensure no files leaked by parent");
} else {
die_perror("Failed to close files leaked by parent"); die_perror("Failed to close files leaked by parent");
} }
}
}
/**
* snprintf_check() - snprintf() wrapper, checking for truncation and errors
* @str: Output buffer
* @size: Maximum size to write to @str
* @format: Message
*
* Return: false on success, true on truncation or error, sets errno on failure
*/
bool snprintf_check(char *str, size_t size, const char *format, ...)
{
va_list ap;
int rc;
va_start(ap, format);
rc = vsnprintf(str, size, format, ap);
va_end(ap);
if (rc < 0) {
errno = EIO;
return true;
}
if ((size_t)rc >= size) {
errno = ENOBUFS;
return true;
}
return false;
}

54
util.h
View file

@ -11,12 +11,12 @@
#include <stdbool.h> #include <stdbool.h>
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <string.h> #include <string.h>
#include <signal.h> #include <signal.h>
#include <arpa/inet.h> #include <arpa/inet.h>
#include <unistd.h> #include <unistd.h>
#include <sys/syscall.h> #include <sys/syscall.h>
#include <linux/close_range.h>
#include "log.h" #include "log.h"
@ -67,15 +67,6 @@
#define STRINGIFY(x) #x #define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x) #define STR(x) STRINGIFY(x)
#ifdef CPPCHECK_6936
/* Some cppcheck versions get confused by aborts inside a loop, causing
* it to give false positive uninitialised variable warnings later in
* the function, because it doesn't realise the non-initialising path
* already exited. See https://trac.cppcheck.net/ticket/13227
*/
#define ASSERT(expr) \
((expr) ? (void)0 : abort())
#else
#define ASSERT(expr) \ #define ASSERT(expr) \
do { \ do { \
if (!(expr)) { \ if (!(expr)) { \
@ -87,7 +78,6 @@
abort(); \ abort(); \
} \ } \
} while (0) } while (0)
#endif
#ifdef P_tmpdir #ifdef P_tmpdir
#define TMPDIR P_tmpdir #define TMPDIR P_tmpdir
@ -101,9 +91,6 @@
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0]))) #define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
#define foreach(item, array) \
for ((item) = (array); (item) - (array) < ARRAY_SIZE(array); (item)++)
#define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b)) #define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b))
#define FD_PROTO(x, proto) \ #define FD_PROTO(x, proto) \
(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x))) (IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))
@ -144,7 +131,7 @@ static inline uint32_t ntohl_unaligned(const void *p)
return ntohl(val); return ntohl(val);
} }
#define NS_FN_STACK_SIZE (1024 * 1024) /* 1MiB */ #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8)
int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
void *arg); void *arg);
#define NS_CALL(fn, arg) \ #define NS_CALL(fn, arg) \
@ -157,9 +144,9 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
(void *)(arg)); \ (void *)(arg)); \
} while (0) } while (0)
#define RCVBUF_BIG (2ULL * 1024 * 1024) #define RCVBUF_BIG (2UL * 1024 * 1024)
#define SNDBUF_BIG (4ULL * 1024 * 1024) #define SNDBUF_BIG (4UL * 1024 * 1024)
#define SNDBUF_SMALL (128ULL * 1024) #define SNDBUF_SMALL (128UL * 1024)
#include <net/if.h> #include <net/if.h>
#include <limits.h> #include <limits.h>
@ -170,9 +157,33 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
struct ctx; struct ctx;
/* cppcheck-suppress funcArgNamesDifferent */
__attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); }
#ifdef CLOSE_RANGE_UNSHARE /* Linux kernel >= 5.9 */
/* glibc < 2.34 and musl as of 1.2.5 need these */
#ifndef SYS_close_range
#define SYS_close_range 436
#endif
__attribute__ ((weak))
/* cppcheck-suppress funcArgNamesDifferent */
int close_range(unsigned int first, unsigned int last, int flags) {
return syscall(SYS_close_range, first, last, flags);
}
#else
/* No reasonable fallback option */
/* cppcheck-suppress funcArgNamesDifferent */
int close_range(unsigned int first, unsigned int last, int flags) {
return 0;
}
#endif
int sock_l4_sa(const struct ctx *c, enum epoll_type type, int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl, const void *sa, socklen_t sl,
const char *ifname, bool v6only, uint32_t data); const char *ifname, bool v6only, uint32_t data);
int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
const void *bind_addr, const char *ifname, uint16_t port,
uint32_t data);
void sock_probe_mem(struct ctx *c); void sock_probe_mem(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b); long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b); int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);
@ -184,15 +195,13 @@ char *line_read(char *buf, size_t len, int fd);
void ns_enter(const struct ctx *c); void ns_enter(const struct ctx *c);
bool ns_is_init(void); bool ns_is_init(void);
int open_in_ns(const struct ctx *c, const char *path, int flags); int open_in_ns(const struct ctx *c, const char *path, int flags);
int output_file_open(const char *path, int flags); int pidfile_open(const char *path);
void pidfile_write(int fd, pid_t pid); void pidfile_write(int fd, pid_t pid);
int __daemon(int pidfile_fd, int devnull_fd); int __daemon(int pidfile_fd, int devnull_fd);
int fls(unsigned long x); int fls(unsigned long x);
int write_file(const char *path, const char *buf); int write_file(const char *path, const char *buf);
int write_all_buf(int fd, const void *buf, size_t len);
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip); int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
void close_open_files(int argc, char **argv); void close_open_files(int argc, char **argv);
bool snprintf_check(char *str, size_t size, const char *format, ...);
/** /**
* af_name() - Return name of an address family * af_name() - Return name of an address family
@ -260,9 +269,6 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
return mod_sub(x, i, m) < mod_sub(j, i, m); return mod_sub(x, i, m) < mod_sub(j, i, m);
} }
/* FPRINTF() intentionally silences cert-err33-c clang-tidy warnings */
#define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__)
/* /*
* Workarounds for https://github.com/llvm/llvm-project/issues/58992 * Workarounds for https://github.com/llvm/llvm-project/issues/58992
* *