Compare commits

..

4 commits

Author SHA1 Message Date
Stefano Brivio
0c6c20dee5 udp, udp_flow: Add instrumentation, handle EPOLLERR without queued errors
Link: https://github.com/containers/podman/issues/23686#issuecomment-2324945010
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2024-09-04 18:36:54 +02:00
David Gibson
d098e0527a additional debug 2024-09-04 18:36:54 +02:00
David Gibson
026fb71d1d tcp: Attempt to mitigate EPOLLRDHUP storms with half-closed connections
Link: https://github.com/containers/podman/issues/23686
2024-09-04 18:36:54 +02:00
Stefano Brivio
232e12529e log: Don't prefix log file messages with time and severity if they're continuations
In fecb1b65b1 ("log: Don't prefix message with timestamp on --debug
if it's a continuation"), I fixed this for --debug on standard error,
but not for log files: if messages are continuations, they shouldn't
be prefixed by timestamp and severity.

Otherwise, we'll print stuff like this:

  0.0028: ERROR:   Receive error on guest connection, reset0.0028:  ERROR:   : Bad file descriptor

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2024-09-04 16:22:27 +02:00
64 changed files with 1186 additions and 1762 deletions

View file

@ -1,126 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
#
# clang-format configuration file. Intended for clang-format >= 11.
#
# For more information, see:
#
# Documentation/dev-tools/clang-format.rst
# https://clang.llvm.org/docs/ClangFormat.html
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
#
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: None
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: false
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: false
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeComma
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 8
ContinuationIndentWidth: 8
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: false
# Taken from:
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
# | LC_ALL=C sort -u
ForEachMacros:
- 'for_each_nst'
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '.*'
Priority: 1
IncludeIsMainRegex: '(Test)?$'
IndentCaseLabels: false
IndentGotoLabels: false
IndentPPDirectives: None
IndentWidth: 8
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 8
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
# Taken from git's rules
PenaltyBreakAssignment: 10
PenaltyBreakBeforeFirstCallParameter: 30
PenaltyBreakComment: 10
PenaltyBreakFirstLessLess: 0
PenaltyBreakString: 10
PenaltyExcessCharacter: 100
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Right
ReflowComments: false
SortIncludes: false
SortUsingDeclarations: false
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatementsExceptForEachMacros
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp03
TabWidth: 8
UseTab: Always
...

View file

@ -1,93 +0,0 @@
---
Checks:
- "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*"
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
- "-clang-analyzer-valist.Uninitialized"
# Dubious value, would kill readability
- "-cppcoreguidelines-init-variables"
# Dubious value over the compiler's built-in warning. Would
# increase verbosity.
- "-bugprone-assignment-in-if-condition"
# Debatable whether these improve readability, right now it would look
# like a mess
- "-google-readability-braces-around-statements"
- "-hicpp-braces-around-statements"
- "-readability-braces-around-statements"
# TODO: in most cases they are justified, but probably not everywhere
#
- "-readability-magic-numbers"
- "-cppcoreguidelines-avoid-magic-numbers"
# TODO: this is Linux-only for the moment, nice to fix eventually
- "-llvmlibc-restrict-system-libc-headers"
# Those are needed for syscalls, epoll_wait flags, etc.
- "-hicpp-signed-bitwise"
# Probably not doable to implement this without plain memcpy(), memset()
- "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
# TODO: not really important, but nice to fix eventually
- "-llvm-include-order"
# Dubious value, would kill readability
- "-readability-isolate-declaration"
# TODO: nice to fix eventually
- "-bugprone-narrowing-conversions"
- "-cppcoreguidelines-narrowing-conversions"
# TODO: check, fix, and more in general constify wherever possible
- "-cppcoreguidelines-avoid-non-const-global-variables"
# TODO: check paths where it might make sense to improve performance
- "-altera-unroll-loops"
- "-altera-id-dependent-backward-branch"
# Not much can be done about them other than being careful
- "-bugprone-easily-swappable-parameters"
# TODO: split reported functions
- "-readability-function-cognitive-complexity"
# "Poor" alignment needed for structs reflecting message formats/headers
- "-altera-struct-pack-align"
# TODO: check again if multithreading is implemented
- "-concurrency-mt-unsafe"
# Complains about any identifier <3 characters, reasonable for
# globals, pointlessly verbose for locals and parameters.
- "-readability-identifier-length"
# Wants to include headers which *directly* provide the things
# we use. That sounds nice, but means it will often want an
# OS-specific header instead of a mostly standard one, such as
# <linux/limits.h> instead of <limits.h>.
- "-misc-include-cleaner"
# Wants to replace all #defines of integers with enums. Kind of
# makes sense when those defines form an enum-like set, but
# weird for cases like standalone constants, and causes other
# awkwardness for a bunch of cases we use
- "-cppcoreguidelines-macro-to-enum"
# It's been a couple of centuries since multiplication has been granted
# precedence over addition in modern mathematical notation. Adding
# parentheses to reinforce that certainly won't improve readability.
- "-readability-math-missing-parentheses"
WarningsAsErrors: "*"
HeaderFileExtensions:
- h
ImplementationFileExtensions:
- c
HeaderFilterRegex: ""
FormatStyle: none
CheckOptions:
bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false"
SystemHeaders: false

View file

@ -1,3 +0,0 @@
CompileFlags:
# Don't try to interpret our headers as C++
Add: [-xc, -Wall]

161
Makefile
View file

@ -15,11 +15,24 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
# the IPv6 socket API? (Linux does)
DUAL_STACK_SOCKETS := 1
RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s')
ifeq ($(RLIMIT_STACK_VAL),unlimited)
RLIMIT_STACK_VAL := 1024
endif
TARGET ?= $(shell $(CC) -dumpmachine)
# Get 'uname -m'-like architecture description for target
TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
# On some systems enabling optimization also enables source fortification,
# automagically. Do not override it.
FORTIFY_FLAG :=
@ -31,6 +44,10 @@ FLAGS := -Wall -Wextra -Wno-format-zero-length
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
FLAGS += -DARCH=\"$(TARGET_ARCH)\"
FLAGS += -DVERSION=\"$(VERSION)\"
FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
@ -50,6 +67,21 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
udp.h udp_flow.h util.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_SND_WND
endif
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_BYTES_ACKED
endif
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_MIN_RTT
endif
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_GETRANDOM
@ -59,6 +91,11 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec
FLAGS += -fstack-protector-strong
endif
C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE;
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
EXTRA_SYSCALLS += fallocate
endif
prefix ?= /usr/local
exec_prefix ?= $(prefix)
bindir ?= $(exec_prefix)/bin
@ -95,7 +132,7 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
ln -sf $< $@
qrap: $(QRAP_SRCS) passt.h
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
rt_sigreturn getpid gettid kill clock_gettime mmap \
@ -159,11 +196,116 @@ docs: README.md
done < README.md; \
) > README.plain.md
clang-tidy: $(PASST_SRCS) $(HEADERS)
clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
-DCLANG_TIDY_58992
# Checkers currently disabled for clang-tidy:
# - llvmlibc-restrict-system-libc-headers
# TODO: this is Linux-only for the moment, nice to fix eventually
#
# - google-readability-braces-around-statements
# - hicpp-braces-around-statements
# - readability-braces-around-statements
# Debatable whether that improves readability, right now it would look
# like a mess
#
# - readability-magic-numbers
# - cppcoreguidelines-avoid-magic-numbers
# TODO: in most cases they are justified, but probably not everywhere
#
# - clang-analyzer-valist.Uninitialized
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
#
# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling
# Probably not doable to implement this without plain memcpy(), memset()
#
# - cppcoreguidelines-init-variables
# Dubious value, would kill readability
#
# - hicpp-signed-bitwise
# Those are needed for syscalls, epoll_wait flags, etc.
#
# - llvm-include-order
# TODO: not really important, but nice to fix eventually
#
# - readability-isolate-declaration
# Dubious value, would kill readability
#
# - bugprone-narrowing-conversions
# - cppcoreguidelines-narrowing-conversions
# TODO: nice to fix eventually
#
# - cppcoreguidelines-avoid-non-const-global-variables
# TODO: check, fix, and more in general constify wherever possible
#
# - altera-unroll-loops
# - altera-id-dependent-backward-branch
# TODO: check paths where it might make sense to improve performance
#
# - bugprone-easily-swappable-parameters
# Not much can be done about them other than being careful
#
# - readability-function-cognitive-complexity
# TODO: split reported functions
#
# - altera-struct-pack-align
# "Poor" alignment needed for structs reflecting message formats/headers
#
# - concurrency-mt-unsafe
# TODO: check again if multithreading is implemented
#
# - readability-identifier-length
# Complains about any identifier <3 characters, reasonable for
# globals, pointlessly verbose for locals and parameters.
#
# - bugprone-assignment-in-if-condition
# Dubious value over the compiler's built-in warning. Would
# increase verbosity.
#
# - misc-include-cleaner
# Wants to include headers which *directly* provide the things
# we use. That sounds nice, but means it will often want an
# OS-specific header instead of a mostly standard one, such as
# <linux/limits.h> instead of <limits.h>.
#
# - cppcoreguidelines-macro-to-enum
# Wants to replace all #defines of integers with enums. Kind of
# makes sense when those defines form an enum-like set, but
# weird for cases like standalone constants, and causes other
# awkwardness for a bunch of cases we use
cppcheck: $(PASST_SRCS) $(HEADERS)
clang-tidy: $(SRCS) $(HEADERS)
clang-tidy -checks=*,-modernize-*,\
-clang-analyzer-valist.Uninitialized,\
-cppcoreguidelines-init-variables,\
-bugprone-assignment-in-if-condition,\
-google-readability-braces-around-statements,\
-hicpp-braces-around-statements,\
-readability-braces-around-statements,\
-readability-magic-numbers,\
-llvmlibc-restrict-system-libc-headers,\
-hicpp-signed-bitwise,\
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
-llvm-include-order,\
-cppcoreguidelines-avoid-magic-numbers,\
-readability-isolate-declaration,\
-bugprone-narrowing-conversions,\
-cppcoreguidelines-narrowing-conversions,\
-cppcoreguidelines-avoid-non-const-global-variables,\
-altera-unroll-loops,-altera-id-dependent-backward-branch,\
-bugprone-easily-swappable-parameters,\
-readability-function-cognitive-complexity,\
-altera-struct-pack-align,\
-concurrency-mt-unsafe,\
-readability-identifier-length,\
-misc-include-cleaner,\
-cppcoreguidelines-macro-to-enum \
-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
--warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
VER := $(shell $(CC) -dumpversion)
SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
endif
cppcheck: $(SRCS) $(HEADERS)
if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \
else \
@ -172,8 +314,11 @@ cppcheck: $(PASST_SRCS) $(HEADERS)
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
--inconclusive --library=posix --quiet \
$${CPPCHECK_EXHAUSTIVE} \
$(SYSTEM_INCLUDES:%=-I%) \
$(SYSTEM_INCLUDES:%=--config-exclude=%) \
$(SYSTEM_INCLUDES:%=--suppress=*:%/*) \
$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \
--inline-suppr \
--suppress=missingIncludeSystem \
--suppress=unusedStructMember \
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \
$(PASST_SRCS) $(HEADERS)
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
$(SRCS) $(HEADERS)

8
arch.c
View file

@ -19,7 +19,6 @@
#include <unistd.h>
#include "log.h"
#include "util.h"
/**
* arch_avx2_exec() - Switch to AVX2 build if supported
@ -41,11 +40,8 @@ void arch_avx2_exec(char **argv)
if (__builtin_cpu_supports("avx2")) {
char new_path[PATH_MAX + sizeof(".avx2")];
if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"),
"%s.avx2", exe))
die_perror("Can't build AVX2 executable path");
execv(new_path, argv);
snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
execve(new_path, argv, environ);
warn_perror("Can't run AVX2 build, using non-AVX2 version");
}
}

8
arp.c
View file

@ -59,12 +59,14 @@ int arp(const struct ctx *c, const struct pool *p)
ah->ar_op != htons(ARPOP_REQUEST))
return 1;
/* Discard announcements, but not 0.0.0.0 "probes" */
if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
/* Discard announcements (but not 0.0.0.0 "probes"): we might have the
* same IP address, hide that.
*/
if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
!memcmp(am->sip, am->tip, sizeof(am->sip)))
return 1;
/* Don't resolve the guest's assigned address, either. */
/* Don't resolve our own address, either. */
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
return 1;

View file

@ -59,7 +59,6 @@
#include "util.h"
#include "ip.h"
#include "checksum.h"
#include "iov.h"
/* Checksums are optional for UDP over IPv4, so we usually just set
* them to 0. Change this to 1 to calculate real UDP over IPv4
@ -166,24 +165,22 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
* @udp4hr: UDP header, initialised apart from checksum
* @saddr: IPv4 source address
* @daddr: IPv4 destination address
* @iov: Pointer to the array of IO vectors
* @iov_cnt: Length of the array
* @offset: UDP payload offset in the iovec array
* @payload: UDP packet payload
* @dlen: Length of @payload (not including UDP header)
*/
void csum_udp4(struct udphdr *udp4hr,
struct in_addr saddr, struct in_addr daddr,
const struct iovec *iov, int iov_cnt, size_t offset)
const void *payload, size_t dlen)
{
/* UDP checksums are optional, so don't bother */
udp4hr->check = 0;
if (UDP4_REAL_CHECKSUMS) {
uint16_t l4len = iov_size(iov, iov_cnt) - offset +
sizeof(struct udphdr);
uint16_t l4len = dlen + sizeof(struct udphdr);
uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
saddr, daddr);
psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
udp4hr->check = csum_iov(iov, iov_cnt, offset, psum);
udp4hr->check = csum(payload, dlen, psum);
}
}
@ -229,24 +226,19 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
/**
* csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
* @udp6hr: UDP header, initialised apart from checksum
* @saddr: Source address
* @daddr: Destination address
* @iov: Pointer to the array of IO vectors
* @iov_cnt: Length of the array
* @offset: UDP payload offset in the iovec array
* @payload: UDP packet payload
* @dlen: Length of @payload (not including UDP header)
*/
void csum_udp6(struct udphdr *udp6hr,
const struct in6_addr *saddr, const struct in6_addr *daddr,
const struct iovec *iov, int iov_cnt, size_t offset)
const void *payload, size_t dlen)
{
uint16_t l4len = iov_size(iov, iov_cnt) - offset +
sizeof(struct udphdr);
uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
saddr, daddr);
uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(struct udphdr),
IPPROTO_UDP, saddr, daddr);
udp6hr->check = 0;
psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
udp6hr->check = csum_iov(iov, iov_cnt, offset, psum);
udp6hr->check = csum(payload, dlen, psum);
}
/**
@ -505,26 +497,16 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
*
* @iov Pointer to the array of IO vectors
* @n Length of the array
* @offset: Offset of the data to checksum within the full data length
* @init Initial 32-bit checksum, 0 for no pre-computed checksum
*
* Return: 16-bit folded, complemented checksum
*/
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
uint32_t init)
/* cppcheck-suppress unusedFunction */
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
{
unsigned int i;
size_t first;
i = iov_skip_bytes(iov, n, offset, &first);
if (i >= n)
return (uint16_t)~csum_fold(init);
init = csum_unfolded((char *)iov[i].iov_base + first,
iov[i].iov_len - first, init);
i++;
for (; i < n; i++)
for (i = 0; i < n; i++)
init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
return (uint16_t)~csum_fold(init);

View file

@ -19,20 +19,19 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
struct in_addr saddr, struct in_addr daddr);
void csum_udp4(struct udphdr *udp4hr,
struct in_addr saddr, struct in_addr daddr,
const struct iovec *iov, int iov_cnt, size_t offset);
const void *payload, size_t dlen);
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
const struct in6_addr *saddr,
const struct in6_addr *daddr);
void csum_udp6(struct udphdr *udp6hr,
const struct in6_addr *saddr, const struct in6_addr *daddr,
const struct iovec *iov, int iov_cnt, size_t offset);
const void *payload, size_t dlen);
void csum_icmp6(struct icmp6hdr *icmp6hr,
const struct in6_addr *saddr, const struct in6_addr *daddr,
const void *payload, size_t dlen);
uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
uint16_t csum(const void *buf, size_t len, uint32_t init);
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
uint32_t init);
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init);
#endif /* CHECKSUM_H */

117
conf.c
View file

@ -46,8 +46,6 @@
#include "isolation.h"
#include "log.h"
#define NETNS_RUN_DIR "/run/netns"
/**
* next_chunk - Return the next piece of a string delimited by a character
* @s: String to search
@ -118,10 +116,11 @@ static int parse_port_range(const char *s, char **endptr,
static void conf_ports(const struct ctx *c, char optname, const char *optarg,
struct fwd_ports *fwd)
{
union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
char addr_buf[sizeof(struct in6_addr)] = { 0 }, *addr = addr_buf;
char buf[BUFSIZ], *spec, *ifname = NULL, *p;
bool exclude_only = true, bound_one = false;
uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
sa_family_t af = AF_UNSPEC;
unsigned i;
int ret;
@ -167,13 +166,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
bitmap_set(fwd->map, i);
if (optname == 't') {
ret = tcp_sock_init(c, NULL, NULL, i);
ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL,
i);
if (ret == -ENFILE || ret == -EMFILE)
goto enfile;
if (!ret)
bound_one = true;
} else if (optname == 'u') {
ret = udp_sock_init(c, 0, NULL, NULL, i);
ret = udp_sock_init(c, 0, AF_UNSPEC, NULL, NULL,
i);
if (ret == -ENFILE || ret == -EMFILE)
goto enfile;
if (!ret)
@ -225,7 +226,11 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
p++;
}
if (!inany_pton(p, addr))
if (inet_pton(AF_INET, p, addr))
af = AF_INET;
else if (inet_pton(AF_INET6, p, addr))
af = AF_INET6;
else
goto bad;
}
} else {
@ -271,13 +276,13 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
bitmap_set(fwd->map, i);
if (optname == 't') {
ret = tcp_sock_init(c, addr, ifname, i);
ret = tcp_sock_init(c, af, addr, ifname, i);
if (ret == -ENFILE || ret == -EMFILE)
goto enfile;
if (!ret)
bound_one = true;
} else if (optname == 'u') {
ret = udp_sock_init(c, 0, addr, ifname, i);
ret = udp_sock_init(c, 0, af, addr, ifname, i);
if (ret == -ENFILE || ret == -EMFILE)
goto enfile;
if (!ret)
@ -333,9 +338,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
ret = 0;
if (optname == 't')
ret = tcp_sock_init(c, addr, ifname, i);
ret = tcp_sock_init(c, af, addr, ifname, i);
else if (optname == 'u')
ret = udp_sock_init(c, 0, addr, ifname, i);
ret = udp_sock_init(c, 0, af, addr, ifname, i);
if (ret)
goto bind_fail;
}
@ -576,15 +581,10 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns,
if (pidval < 0 || pidval > INT_MAX)
die("Invalid PID %s", argv[optind]);
if (snprintf_check(netns, PATH_MAX,
"/proc/%ld/ns/net", pidval))
die_perror("Can't build netns path");
if (!*userns) {
if (snprintf_check(userns, PATH_MAX,
"/proc/%ld/ns/user", pidval))
die_perror("Can't build userns path");
}
snprintf(netns, PATH_MAX, "/proc/%ld/ns/net", pidval);
if (!*userns)
snprintf(userns, PATH_MAX, "/proc/%ld/ns/user",
pidval);
}
}
@ -735,19 +735,19 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
static void usage(const char *name, FILE *f, int status)
{
if (strstr(name, "pasta")) {
FPRINTF(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name);
FPRINTF(f, " %s [OPTION]... PID\n", name);
FPRINTF(f, " %s [OPTION]... --netns [PATH|NAME]\n", name);
FPRINTF(f,
fprintf(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name);
fprintf(f, " %s [OPTION]... PID\n", name);
fprintf(f, " %s [OPTION]... --netns [PATH|NAME]\n", name);
fprintf(f,
"\n"
"Without PID or --netns, run the given command or a\n"
"default shell in a new network and user namespace, and\n"
"connect it via pasta.\n");
} else {
FPRINTF(f, "Usage: %s [OPTION]...\n", name);
fprintf(f, "Usage: %s [OPTION]...\n", name);
}
FPRINTF(f,
fprintf(f,
"\n"
" -d, --debug Be verbose\n"
" --trace Be extra verbose, implies --debug\n"
@ -764,17 +764,17 @@ static void usage(const char *name, FILE *f, int status)
" --version Show version and exit\n");
if (strstr(name, "pasta")) {
FPRINTF(f,
fprintf(f,
" -I, --ns-ifname NAME namespace interface name\n"
" default: same interface name as external one\n");
} else {
FPRINTF(f,
fprintf(f,
" -s, --socket PATH UNIX domain socket path\n"
" default: probe free path starting from "
UNIX_SOCK_PATH "\n", 1);
}
FPRINTF(f,
fprintf(f,
" -F, --fd FD Use FD as pre-opened connected socket\n"
" -p, --pcap FILE Log tap-facing traffic to pcap file\n"
" -P, --pid FILE Write own PID to the given file\n"
@ -805,28 +805,28 @@ static void usage(const char *name, FILE *f, int status)
" can be specified multiple times\n"
" a single, empty option disables DNS information\n");
if (strstr(name, "pasta"))
FPRINTF(f, " default: don't use any addresses\n");
fprintf(f, " default: don't use any addresses\n");
else
FPRINTF(f, " default: use addresses from /etc/resolv.conf\n");
FPRINTF(f,
fprintf(f, " default: use addresses from /etc/resolv.conf\n");
fprintf(f,
" -S, --search LIST Space-separated list, search domains\n"
" a single, empty option disables the DNS search list\n");
if (strstr(name, "pasta"))
FPRINTF(f, " default: don't use any search list\n");
fprintf(f, " default: don't use any search list\n");
else
FPRINTF(f, " default: use search list from /etc/resolv.conf\n");
fprintf(f, " default: use search list from /etc/resolv.conf\n");
if (strstr(name, "pasta"))
FPRINTF(f, " --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP\n");
fprintf(f, " --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP\n");
else
FPRINTF(f, " --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP\n");
fprintf(f, " --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP\n");
if (strstr(name, "pasta"))
FPRINTF(f, " --dhcp-search Pass list via DHCP/DHCPv6/NDP\n");
fprintf(f, " --dhcp-search Pass list via DHCP/DHCPv6/NDP\n");
else
FPRINTF(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n");
fprintf(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n");
FPRINTF(f,
fprintf(f,
" --map-host-loopback ADDR Translate ADDR to refer to host\n"
" can be specified zero to two times (for IPv4 and IPv6)\n"
" default: gateway address\n"
@ -836,9 +836,6 @@ static void usage(const char *name, FILE *f, int status)
" --dns-forward ADDR Forward DNS queries sent to ADDR\n"
" can be specified zero to two times (for IPv4 and IPv6)\n"
" default: don't forward DNS queries\n"
" --dns-host ADDR Host nameserver to direct queries to\n"
" can be specified zero to two times (for IPv4 and IPv6)\n"
" default: first nameserver from host's /etc/resolv.conf\n"
" --no-tcp Disable TCP protocol handler\n"
" --no-udp Disable UDP protocol handler\n"
" --no-icmp Disable ICMP/ICMPv6 protocol handler\n"
@ -846,7 +843,6 @@ static void usage(const char *name, FILE *f, int status)
" --no-ndp Disable NDP responses\n"
" --no-dhcpv6 Disable DHCPv6 server\n"
" --no-ra Disable router advertisements\n"
" --freebind Bind to any address for forwarding\n"
" --no-map-gw Don't map gateway address to host\n"
" -4, --ipv4-only Enable IPv4 operation only\n"
" -6, --ipv6-only Enable IPv6 operation only\n");
@ -854,7 +850,7 @@ static void usage(const char *name, FILE *f, int status)
if (strstr(name, "pasta"))
goto pasta_opts;
FPRINTF(f,
fprintf(f,
" -1, --one-off Quit after handling one single client\n"
" -t, --tcp-ports SPEC TCP port forwarding to guest\n"
" can be specified multiple times\n"
@ -885,7 +881,7 @@ static void usage(const char *name, FILE *f, int status)
pasta_opts:
FPRINTF(f,
fprintf(f,
" -t, --tcp-ports SPEC TCP port forwarding to namespace\n"
" can be specified multiple times\n"
" SPEC can be:\n"
@ -919,9 +915,6 @@ pasta_opts:
" -U, --udp-ns SPEC UDP port forwarding to init namespace\n"
" SPEC is as described above\n"
" default: auto\n"
" --host-lo-to-ns-lo DEPRECATED:\n"
" Translate host-loopback forwards to\n"
" namespace loopback\n"
" --userns NSPATH Target user namespace to join\n"
" --netns PATH|NAME Target network namespace to join\n"
" --netns-only Don't join existing user namespace\n"
@ -1196,11 +1189,7 @@ static void conf_open_files(struct ctx *c)
if (c->mode != MODE_PASTA && c->fd_tap == -1)
c->fd_tap_listen = tap_sock_unix_open(c->sock_path);
if (*c->pidfile) {
c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
if (c->pidfile_fd < 0)
die_perror("Couldn't open PID file %s", c->pidfile);
}
c->pidfile_fd = pidfile_open(c->pidfile);
}
/**
@ -1273,7 +1262,6 @@ void conf(struct ctx *c, int argc, char **argv)
{"no-dhcpv6", no_argument, &c->no_dhcpv6, 1 },
{"no-ndp", no_argument, &c->no_ndp, 1 },
{"no-ra", no_argument, &c->no_ra, 1 },
{"freebind", no_argument, &c->freebind, 1 },
{"no-map-gw", no_argument, &no_map_gw, 1 },
{"ipv4-only", no_argument, NULL, '4' },
{"ipv6-only", no_argument, NULL, '6' },
@ -1303,8 +1291,6 @@ void conf(struct ctx *c, int argc, char **argv)
{"netns-only", no_argument, NULL, 20 },
{"map-host-loopback", required_argument, NULL, 21 },
{"map-guest-addr", required_argument, NULL, 22 },
{"host-lo-to-ns-lo", no_argument, NULL, 23 },
{"dns-host", required_argument, NULL, 24 },
{ 0 },
};
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@ -1427,9 +1413,9 @@ void conf(struct ctx *c, int argc, char **argv)
break;
case 14:
FPRINTF(stdout,
fprintf(stdout,
c->mode == MODE_PASTA ? "pasta " : "passt ");
FPRINTF(stdout, VERSION_BLOB);
fprintf(stdout, VERSION_BLOB);
exit(EXIT_SUCCESS);
case 15:
ret = snprintf(c->ip4.ifname_out,
@ -1482,23 +1468,6 @@ void conf(struct ctx *c, int argc, char **argv)
conf_nat(optarg, &c->ip4.map_guest_addr,
&c->ip6.map_guest_addr, NULL);
break;
case 23:
if (c->mode != MODE_PASTA)
die("--host-lo-to-ns-lo is for pasta mode only");
c->host_lo_to_ns_lo = 1;
break;
case 24:
if (inet_pton(AF_INET6, optarg, &c->ip6.dns_host) &&
!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
break;
if (inet_pton(AF_INET, optarg, &c->ip4.dns_host) &&
!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host) &&
!IN4_IS_ADDR_BROADCAST(&c->ip4.dns_host))
break;
die("Invalid host nameserver address: %s", optarg);
break;
case 'd':
c->debug = 1;
c->quiet = 0;

View file

@ -34,8 +34,6 @@
owner @{PROC}/@{pid}/uid_map r, # conf_ugid()
@{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral()
network netlink raw, # nl_sock_init_do(), netlink.c
network inet stream, # tcp.c

View file

@ -50,7 +50,6 @@ require {
type passwd_file_t;
class netlink_route_socket { bind create nlmsg_read };
type sysctl_net_t;
class capability { sys_tty_config setuid setgid };
class cap_userns { setpcap sys_admin sys_ptrace };
@ -105,8 +104,6 @@ allow passt_t net_conf_t:lnk_file read;
allow passt_t tmp_t:sock_file { create unlink write };
allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
kernel_search_network_sysctl(passt_t)
allow passt_t sysctl_net_t:dir search;
allow passt_t sysctl_net_t:file { open read };
corenet_tcp_bind_all_nodes(passt_t)
corenet_udp_bind_all_nodes(passt_t)

View file

@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch };
allow pasta_t self:tun_socket create;
allow pasta_t tun_tap_device_t:chr_file { ioctl open read write };
allow pasta_t sysctl_net_t:dir search;
allow pasta_t sysctl_net_t:file { open read write };
allow pasta_t sysctl_net_t:file { open write };
allow pasta_t kernel_t:system module_request;
allow pasta_t nsfs_t:file read;

View file

@ -296,42 +296,47 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
struct in6_addr *la)
{
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
const struct opt_ia_addr *opt_addr;
char buf[INET6_ADDRSTRLEN];
struct in6_addr req_addr;
const struct opt_hdr *h;
struct opt_hdr *ia;
size_t offset;
int ia_type;
foreach(ia_type, ia_types) {
ia_type = OPT_IA_NA;
ia_ta:
offset = 0;
while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
return NULL;
offset += sizeof(struct opt_ia_na);
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
const struct opt_ia_addr *opt_addr;
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
return NULL;
opt_addr = (const struct opt_ia_addr *)h;
req_addr = opt_addr->addr;
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
goto err;
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, &req_addr,
buf, sizeof(buf)));
return ia;
}
offset += sizeof(struct opt_ia_addr);
}
}
if (ia_type == OPT_IA_NA) {
ia_type = OPT_IA_TA;
goto ia_ta;
}
return NULL;
err:
info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
return ia;
}
/**
@ -423,11 +428,11 @@ search:
int dhcpv6(struct ctx *c, const struct pool *p,
const struct in6_addr *saddr, const struct in6_addr *daddr)
{
const struct opt_hdr *client_id, *server_id, *ia;
struct opt_hdr *ia, *bad_ia, *client_id;
const struct opt_hdr *server_id;
const struct in6_addr *src;
const struct msg_hdr *mh;
const struct udphdr *uh;
struct opt_hdr *bad_ia;
size_t mlen, n;
uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);

53
flow.c
View file

@ -283,23 +283,28 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
}
/** flow_log_details_() - Log the details of a flow
* @f: flow to log
* @pri: Log priority
* @state: State to log details according to
*
* Logs the details of the flow: endpoints, interfaces, type etc.
/**
* flow_set_state() - Change flow's state
* @f: Flow changing state
* @state: New state
*/
void flow_log_details_(const struct flow_common *f, int pri,
enum flow_state state)
static void flow_set_state(struct flow_common *f, enum flow_state state)
{
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
const struct flowside *ini = &f->side[INISIDE];
const struct flowside *tgt = &f->side[TGTSIDE];
uint8_t oldstate = f->state;
if (state >= FLOW_STATE_TGT)
flow_log_(f, pri,
ASSERT(state < FLOW_NUM_STATES);
ASSERT(oldstate < FLOW_NUM_STATES);
f->state = state;
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
FLOW_STATE(f));
if (MAX(state, oldstate) >= FLOW_STATE_TGT)
flow_log_(f, LOG_DEBUG,
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
@ -311,8 +316,8 @@ void flow_log_details_(const struct flow_common *f, int pri,
tgt->oport,
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
tgt->eport);
else if (state >= FLOW_STATE_INI)
flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
else if (MAX(state, oldstate) >= FLOW_STATE_INI)
flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => ?",
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
ini->eport,
@ -320,25 +325,6 @@ void flow_log_details_(const struct flow_common *f, int pri,
ini->oport);
}
/**
 * flow_set_state() - Move a flow to a new state and log the transition
 * @f:		Flow changing state
 * @state:	New state
 */
static void flow_set_state(struct flow_common *f, enum flow_state state)
{
	uint8_t prev = f->state;

	/* Both states index flow_state_str[], so both must be in range */
	ASSERT(state < FLOW_NUM_STATES);
	ASSERT(prev < FLOW_NUM_STATES);

	f->state = state;

	/* FLOW_STATE(f) is evaluated after the assignment above */
	flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[prev],
		  FLOW_STATE(f));

	/* Details are logged for whichever of the two states compares greater */
	flow_log_details_(f, LOG_DEBUG, MAX(state, prev));
}
/**
* flow_initiate_() - Move flow to INI, setting pif[INISIDE]
* @flow: Flow to change state
@ -711,7 +697,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
!(FLOW_PROTO(&flow->f) == proto &&
flow->f.pif[sidx.sidei] == pif &&
flowside_eq(&flow->f.side[sidx.sidei], side)))
b = mod_sub(b, 1, FLOW_HASH_SIZE);
b = (b + 1) % FLOW_HASH_SIZE;
return flow_hashtab[b];
}
@ -846,8 +832,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
closed = icmp_ping_timer(c, &flow->ping, now);
break;
case FLOW_UDP:
closed = udp_flow_defer(&flow->udp);
if (!closed && timer)
if (timer)
closed = udp_flow_timer(c, &flow->udp, now);
break;
default:

7
flow.h
View file

@ -264,11 +264,4 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
flow_dbg((f), __VA_ARGS__); \
} while (0)
void flow_log_details_(const struct flow_common *f, int pri,
enum flow_state state);
#define flow_log_details(f_, pri) \
flow_log_details_(&((f_)->f), (pri), (f_)->f.state)
#define flow_dbg_details(f_) flow_log_details((f_), LOG_DEBUG)
#define flow_err_details(f_) flow_log_details((f_), LOG_ERR)
#endif /* FLOW_H */

View file

@ -110,7 +110,7 @@ static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
const union flow *flow = flow_at_sidx(sidx);
if (!flow)
return NULL;
return PIF_NONE;
return &flow->f.side[sidx.sidei];
}

35
fwd.c
View file

@ -75,8 +75,8 @@ void fwd_probe_ephemeral(void)
if (*end || errno)
goto parse_err;
if (min < 0 || min >= (long)NUM_PORTS ||
max < 0 || max >= (long)NUM_PORTS)
if (min < 0 || min >= NUM_PORTS ||
max < 0 || max >= NUM_PORTS)
goto parse_err;
fwd_ephemeral_min = min;
@ -447,35 +447,20 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
(proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
/* spliceable */
/* The traffic will go over the guest's 'lo' interface, but by
* default use its external address, so we don't inadvertently
* expose services that listen only on the guest's loopback
* address. That can be overridden by --host-lo-to-ns-lo which
* will instead forward to the loopback address in the guest.
*
* In either case, let the kernel pick the source address to
* match.
/* Preserve the specific loopback address used, but let the
* kernel pick a source port on the target side
*/
if (inany_v4(&ini->eaddr)) {
if (c->host_lo_to_ns_lo)
tgt->eaddr = inany_loopback4;
else
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
tgt->oaddr = inany_any4;
} else {
if (c->host_lo_to_ns_lo)
tgt->eaddr = inany_loopback6;
else
tgt->eaddr.a6 = c->ip6.addr_seen;
tgt->oaddr = inany_any6;
}
/* Let the kernel pick source port */
tgt->oaddr = ini->eaddr;
tgt->oport = 0;
if (proto == IPPROTO_UDP)
/* But for UDP preserve the source port */
tgt->oport = ini->eport;
if (inany_v4(&ini->eaddr))
tgt->eaddr = inany_loopback4;
else
tgt->eaddr = inany_loopback6;
return PIF_SPLICE;
}

20
inany.c
View file

@ -36,23 +36,3 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
return inet_ntop(AF_INET6, &src->a6, dst, size);
}
/**
 * inany_pton() - Parse an IPv[46] address from text format
 * @src:	Text representation of an IPv4 or IPv6 address
 * @dst:	Output buffer, filled with parsed address
 *
 * IPv4 is tried first: on a match, the remaining parts of the v4mapped layout
 * are filled in (zero part with 0x00 bytes, one part with 0xff bytes).
 * Otherwise, @src is parsed as IPv6 directly into the a6 member.
 *
 * Return: 1 on success, 0 if no parseable address is found
 */
int inany_pton(const char *src, union inany_addr *dst)
{
	/* inet_pton() returns 1 only on success: 0 means the text didn't parse,
	 * -1 reports an error, and neither must be taken for a parsed address
	 */
	if (inet_pton(AF_INET, src, &dst->v4mapped.a4) == 1) {
		memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero));
		memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one));
		return 1;
	}

	if (inet_pton(AF_INET6, src, &dst->a6) == 1)
		return 1;

	return 0;
}

View file

@ -270,6 +270,5 @@ static inline void inany_siphash_feed(struct siphash_state *state,
#define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
int inany_pton(const char *src, union inany_addr *dst);
#endif /* INANY_H */

View file

@ -1,144 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright Red Hat
*
* Declarations for Linux specific dependencies
*/
#ifndef LINUX_DEP_H
#define LINUX_DEP_H
/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
*
* Largely derived from include/linux/tcp.h in the Linux kernel
*
* Some fields returned by TCP_INFO have been there for ages and are shared with
* BSD. struct tcp_info from netinet/tcp.h has only those fields. There are
* also many Linux-specific extensions to the structure, which are only found
* in the linux/tcp.h version of struct tcp_info.
*
* We want to use some of those extension fields, when available. We can test
* for availability in the runtime kernel using the length returned from
* getsockopt(). However, we won't necessarily be compiled against the same
* kernel headers as we'll run with, so compiling directly against linux/tcp.h
* means wrapping every field access in an #ifdef whose #else does the same
* thing as when the field is missing at runtime. This rapidly gets messy.
*
* Instead we define here struct tcp_info_linux which includes all the Linux
* extensions that we want to use. This is taken from v6.11 of the kernel.
*/
struct tcp_info_linux {
uint8_t tcpi_state;
uint8_t tcpi_ca_state;
uint8_t tcpi_retransmits;
uint8_t tcpi_probes;
uint8_t tcpi_backoff;
uint8_t tcpi_options;
uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
uint32_t tcpi_rto;
uint32_t tcpi_ato;
uint32_t tcpi_snd_mss;
uint32_t tcpi_rcv_mss;
uint32_t tcpi_unacked;
uint32_t tcpi_sacked;
uint32_t tcpi_lost;
uint32_t tcpi_retrans;
uint32_t tcpi_fackets;
/* Times. */
uint32_t tcpi_last_data_sent;
uint32_t tcpi_last_ack_sent;
uint32_t tcpi_last_data_recv;
uint32_t tcpi_last_ack_recv;
/* Metrics. */
uint32_t tcpi_pmtu;
uint32_t tcpi_rcv_ssthresh;
uint32_t tcpi_rtt;
uint32_t tcpi_rttvar;
uint32_t tcpi_snd_ssthresh;
uint32_t tcpi_snd_cwnd;
uint32_t tcpi_advmss;
uint32_t tcpi_reordering;
uint32_t tcpi_rcv_rtt;
uint32_t tcpi_rcv_space;
uint32_t tcpi_total_retrans;
/* Linux extensions */
uint64_t tcpi_pacing_rate;
uint64_t tcpi_max_pacing_rate;
uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
uint32_t tcpi_notsent_bytes;
uint32_t tcpi_min_rtt;
uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
uint64_t tcpi_delivery_rate;
uint64_t tcpi_busy_time; /* Time (usec) busy sending data */
uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */
uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
uint32_t tcpi_delivered;
uint32_t tcpi_delivered_ce;
uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */
uint32_t tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */
uint32_t tcpi_reord_seen; /* reordering events seen */
uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */
uint32_t tcpi_snd_wnd; /* peer's advertised receive window after
* scaling (bytes)
*/
uint32_t tcpi_rcv_wnd; /* local advertised receive window after
* scaling (bytes)
*/
uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */
uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including
* SYN/SYN-ACK and recurring timeouts.
*/
uint16_t tcpi_total_rto_recoveries; /* Total number of RTO
* recoveries, including any
* unfinished recovery.
*/
uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries
* in milliseconds, including any
* unfinished recovery.
*/
};
#include <linux/falloc.h>
#ifndef FALLOC_FL_COLLAPSE_RANGE
#define FALLOC_FL_COLLAPSE_RANGE 0x08
#endif
#include <linux/close_range.h>
/* glibc < 2.34 and musl as of 1.2.5 need these */
#ifndef SYS_close_range
#define SYS_close_range 436
#endif
#ifndef CLOSE_RANGE_UNSHARE /* Linux kernel < 5.9 */
#define CLOSE_RANGE_UNSHARE (1U << 1)
#endif
/* close_range() - Fallback wrapper issuing the close_range system call
 * @first:	First argument passed through to the system call
 * @last:	Second argument passed through to the system call
 * @flags:	Third argument passed through to the system call
 *
 * Declared weak; presumably so that a definition supplied by the C library,
 * when there is one, is used instead of this one -- confirm against the
 * supported libc versions.
 *
 * Return: value returned by syscall(), converted to int
 */
__attribute__ ((weak))
/* cppcheck-suppress funcArgNamesDifferent */
int close_range(unsigned int first, unsigned int last, int flags) {
return syscall(SYS_close_range, first, last, flags);
}
#endif /* LINUX_DEP_H */

19
log.c
View file

@ -26,7 +26,6 @@
#include <stdarg.h>
#include <sys/socket.h>
#include "linux_dep.h"
#include "log.h"
#include "util.h"
#include "passt.h"
@ -93,6 +92,7 @@ const char *logfile_prefix[] = {
" ", /* LOG_DEBUG */
};
#ifdef FALLOC_FL_COLLAPSE_RANGE
/**
* logfile_rotate_fallocate() - Write header, set log_written after fallocate()
* @fd: Log file descriptor
@ -126,6 +126,7 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
log_written -= log_cut_size;
}
#endif /* FALLOC_FL_COLLAPSE_RANGE */
/**
* logfile_rotate_move() - Fallback: move recent entries toward start, then cut
@ -197,17 +198,21 @@ out:
*
* Return: 0 on success, negative error code on failure
*
* #syscalls fcntl fallocate
* #syscalls fcntl
*
* fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there
*/
static int logfile_rotate(int fd, const struct timespec *now)
{
if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
return -errno;
#ifdef FALLOC_FL_COLLAPSE_RANGE
/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
logfile_rotate_fallocate(fd, now);
else
#endif
logfile_rotate_move(fd, now);
if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
@ -269,7 +274,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
char timestr[LOGTIME_STRLEN];
logtime_fmt(timestr, sizeof(timestr), now);
FPRINTF(stderr, "%s: ", timestr);
fprintf(stderr, "%s: ", timestr);
}
if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) {
@ -288,7 +293,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
(log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) {
(void)vfprintf(stderr, format, ap);
if (newline && format[strlen(format)] != '\n')
FPRINTF(stderr, "\n");
fprintf(stderr, "\n");
}
}
@ -394,7 +399,7 @@ void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
n += snprintf(buf + n, BUFSIZ - n, "\n");
if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
}
/**
@ -411,7 +416,8 @@ void logfile_init(const char *name, const char *path, size_t size)
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
die_perror("Failed to read own /proc/self/exe link");
log_file = output_file_open(path, O_APPEND | O_RDWR);
log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC,
S_IRUSR | S_IWUSR);
if (log_file == -1)
die_perror("Couldn't open log file %s", path);
@ -427,3 +433,4 @@ void logfile_init(const char *name, const char *path, size_t size)
/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
}

4
ndp.c
View file

@ -234,8 +234,8 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
return 1;
if (ih->icmp6_type == NS) {
const struct ndp_ns *ns =
packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
struct ndp_ns *ns = packet_get(p, 0, 0, sizeof(struct ndp_ns),
NULL);
if (!ns)
return -1;

View file

@ -353,7 +353,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
*/
bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
{
int nh_len = RTA_PAYLOAD(rta);
size_t nh_len = RTA_PAYLOAD(rta);
struct rtnexthop *rtnh;
bool found = false;
int hops = -1;
@ -582,7 +582,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
*(unsigned int *)RTA_DATA(rta) = ifi_dst;
} else if (rta->rta_type == RTA_MULTIPATH) {
int nh_len = RTA_PAYLOAD(rta);
size_t nh_len = RTA_PAYLOAD(rta);
struct rtnexthop *rtnh;
for (rtnh = (struct rtnexthop *)RTA_DATA(rta);

96
passt.1
View file

@ -95,7 +95,7 @@ detached PID namespace after starting, because the PID itself cannot change.
Default is to fork into background.
.TP
.BR \-e ", " \-\-stderr " " (DEPRECATED)
.BR \-e ", " \-\-stderr
This option has no effect, and is maintained for compatibility purposes only.
Note that this configuration option is \fBdeprecated\fR and will be removed in a
@ -249,19 +249,10 @@ the host.
.TP
.BR \-\-dns-forward " " \fIaddr
Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the
nameserver (with corresponding IP version) specified by the
\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or
port 853. Replies are translated back with a reverse mapping. This
option can be specified zero to two times (once for IPv4, once for
IPv6).
.TP
.BR \-\-dns-host " " \fIaddr
Configure the host nameserver which guest or namespace queries to the
\fB\-\-dns-forward\fR address will be redirected to. This option can
be specified zero to two times (once for IPv4, once for IPv6).
By default, the first nameserver from the host's
\fI/etc/resolv.conf\fR.
first configured DNS resolver (with corresponding IP version). Maps
only UDP and TCP traffic to port 53 or port 853. Replies are
translated back with a reverse mapping. This option can be specified
zero to two times (once for IPv4, once for IPv6).
.TP
.BR \-S ", " \-\-search " " \fIlist
@ -336,16 +327,6 @@ namespace will be silently dropped.
Disable Router Advertisements. Router Solicitations coming from guest or target
namespace will be ignored.
.TP
.BR \-\-freebind
Allow any binding address to be specified for \fB-t\fR and \fB-u\fR
options. Usually binding addresses must be addresses currently
configured on the host. With \fB\-\-freebind\fR, the
\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled
allowing any address to be used. This is typically used to bind
addresses which might be configured on the host in future, at which
point the forwarding will immediately start operating.
.TP
.BR \-\-map-host-loopback " " \fIaddr
Translate \fIaddr\fR to refer to the host. Packets from the guest to
@ -605,13 +586,6 @@ Configure UDP port forwarding from target namespace to init namespace.
Default is \fBauto\fR.
.TP
.BR \-\-host-lo-to-ns-lo " " (DEPRECATED)
If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
the host's loopback address will appear on the loopback address in the
guest as well. Without this option such forwarded packets will appear
to come from the guest's public address.
.TP
.BR \-\-userns " " \fIspec
Target user namespace to join, as a path. If PID is given, without this option,
@ -889,41 +863,38 @@ root@localhost's password:
.SH NOTES
.SS Handling of traffic with loopback destination and source addresses
.SS Handling of traffic with local destination and source addresses
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback
address (127.0.0.0/8 or ::1), depending on the configuration. Loopback
destination or source addresses need to be changed before packets are
delivered to the guest or target namespace: most operating systems
would drop packets received with loopback addresses on non-loopback
interfaces, and it would also be impossible for guest or target
namespace to route answers back.
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address,
depending on the configuration. Local destination or source addresses need to be
changed before packets are delivered to the guest or target namespace: most
operating systems would drop packets received from non-loopback interfaces with
local addresses, and it would also be impossible for guest or target namespace
to route answers back.
For convenience, the source address on these packets is translated to
the address specified by the \fB\-\-map-host-loopback\fR option (with
some exceptions in pasta mode, see next section below). If not
specified this defaults, somewhat arbitrarily, to the address of
default IPv4 or IPv6 gateway (if any) -- this is known to be an
existing, valid address on the same subnet. If \fB\-\-no-map-gw\fR or
\fB\-\-map-host-loopback none\fR are specified this translation is
disabled and packets with loopback addresses are simply dropped.
For convenience, and somewhat arbitrarily, the source address on these packets
is translated to the address of the default IPv4 or IPv6 gateway (if any) --
this is known to be an existing, valid address on the same subnet.
Loopback destination addresses are translated to the observed external
address of the guest or target namespace. For IPv6, the observed
link-local address is used if the translated source address is
link-local, otherwise the observed global address is used. For both
IPv4 and IPv6, if no addresses have been seen yet, the configured
addresses will be used instead.
Loopback destination addresses are instead translated to the observed external
address of the guest or target namespace. For IPv6 packets, if usage of a
link-local address by guest or namespace has ever been observed, and the
original destination address is also a link-local address, the observed
link-local address is used. Otherwise, the observed global address is used. For
both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses
will be used instead.
For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
the last observed source address from guest or namespace is 192.0.2.2, this will
be translated to a connection from 192.0.2.1 to 192.0.2.2.
Similarly, for traffic coming from guest or namespace, packets with
destination address corresponding to the \fB\-\-map-host-loopback\fR
address will have their destination address translated to a loopback
address.
Similarly, for traffic coming from guest or namespace, packets with destination
address corresponding to the default gateway will have their destination address
translated to a loopback address, if and only if a packet, in the opposite
direction, with a loopback destination or source address, port-wise matching for
UDP, or connection-wise for TCP, has been recently forwarded to guest or
namespace. This behaviour can be disabled with \-\-no\-map\-gw.
.SS Handling of local traffic in pasta
@ -939,15 +910,8 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair
of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
transfers.
Because it's not possible to bind sockets to foreign addresses, this
bypass only applies to local connections and traffic. It also means
that the address translation differs slightly from passt mode.
Connections from loopback to loopback on the host will appear to come
from the target namespace's public address within the guest, unless
\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will
appear to come from loopback in the namespace as well. The latter
behaviour used to be the default, but is usually undesirable, since it
can unintentionally expose namespace local services to the host.
This bypass only applies to local connections and traffic, because it's not
possible to bind sockets to foreign addresses.
.SS Binding to low numbered ports (well-known or system ports, up to 1023)

12
passt.c
View file

@ -207,8 +207,7 @@ int main(int argc, char **argv)
struct timespec now;
struct sigaction sa;
if (clock_gettime(CLOCK_MONOTONIC, &log_start))
die_perror("Failed to get CLOCK_MONOTONIC time");
clock_gettime(CLOCK_MONOTONIC, &log_start);
arch_avx2_exec(argv);
@ -266,8 +265,7 @@ int main(int argc, char **argv)
secret_init(&c);
if (clock_gettime(CLOCK_MONOTONIC, &now))
die_perror("Failed to get CLOCK_MONOTONIC time");
clock_gettime(CLOCK_MONOTONIC, &now);
flow_init();
@ -309,15 +307,13 @@ int main(int argc, char **argv)
timer_init(&c, &now);
loop:
/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
/* NOLINTEND(bugprone-branch-clone) */
if (nfds == -1 && errno != EINTR)
die_perror("epoll_wait() failed in main loop");
if (clock_gettime(CLOCK_MONOTONIC, &now))
err_perror("Failed to get CLOCK_MONOTONIC time");
clock_gettime(CLOCK_MONOTONIC, &now);
for (i = 0; i < nfds; i++) {
union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);

View file

@ -225,8 +225,6 @@ struct ip6_ctx {
* @no_dhcpv6: Disable DHCPv6 server
* @no_ndp: Disable NDP handler altogether
* @no_ra: Disable router advertisements
* @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses
* @freebind: Allow binding of non-local addresses for forwarding
* @low_wmem: Low probed net.core.wmem_max
* @low_rmem: Low probed net.core.rmem_max
*/
@ -286,8 +284,6 @@ struct ctx {
int no_dhcpv6;
int no_ndp;
int no_ra;
int host_lo_to_ns_lo;
int freebind;
int low_wmem;
int low_rmem;

14
pasta.c
View file

@ -102,9 +102,7 @@ static int pasta_wait_for_ns(void *arg)
int flags = O_RDONLY | O_CLOEXEC;
char ns[PATH_MAX];
if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid))
die_perror("Can't build netns path");
snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
do {
while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
if (errno != ENOENT)
@ -241,11 +239,8 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
c->quiet = 1;
/* Configure user and group mappings */
if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid))
die_perror("Can't build uidmap");
if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid))
die_perror("Can't build gidmap");
snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
if (write_file("/proc/self/uid_map", uidmap) ||
write_file("/proc/self/setgroups", "deny") ||
@ -432,12 +427,12 @@ static int pasta_netns_quit_timer(void)
*/
void pasta_netns_quit_init(const struct ctx *c)
{
union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY };
struct epoll_event ev = { .events = EPOLLIN };
int flags = O_NONBLOCK | O_CLOEXEC;
struct statfs s = { 0 };
bool try_inotify = true;
int fd = -1, dir_fd;
union epoll_ref ref;
if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
return;
@ -468,7 +463,6 @@ void pasta_netns_quit_init(const struct ctx *c)
ref.type = EPOLL_TYPE_NSQUIT_TIMER;
} else {
close(dir_fd);
ref.type = EPOLL_TYPE_NSQUIT_INOTIFY;
}
if (fd > FD_REF_MAX)

32
pcap.c
View file

@ -86,8 +86,9 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
.caplen = l2len,
.len = l2len
};
struct iovec hiov = { &h, sizeof(h) };
if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 ||
if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 ||
write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
debug_perror("Cannot log packet, length %zu", l2len);
}
@ -100,14 +101,12 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
void pcap(const char *pkt, size_t l2len)
{
struct iovec iov = { (char *)pkt, l2len };
struct timespec now = { 0 };
struct timespec now;
if (pcap_fd == -1)
return;
if (clock_gettime(CLOCK_REALTIME, &now))
err_perror("Failed to get CLOCK_REALTIME time");
clock_gettime(CLOCK_REALTIME, &now);
pcap_frame(&iov, 1, 0, &now);
}
@ -121,14 +120,13 @@ void pcap(const char *pkt, size_t l2len)
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
size_t offset)
{
struct timespec now = { 0 };
struct timespec now;
unsigned int i;
if (pcap_fd == -1)
return;
if (clock_gettime(CLOCK_REALTIME, &now))
err_perror("Failed to get CLOCK_REALTIME time");
clock_gettime(CLOCK_REALTIME, &now);
for (i = 0; i < n; i++)
pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
@ -141,20 +139,17 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
* @iov: Pointer to the array of struct iovec describing the I/O vector
* containing packet data to write, including L2 header
* @iovcnt: Number of buffers (@iov entries)
* @offset: Offset of the L2 frame within the full data length
*/
/* cppcheck-suppress unusedFunction */
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
void pcap_iov(const struct iovec *iov, size_t iovcnt)
{
struct timespec now = { 0 };
struct timespec now;
if (pcap_fd == -1)
return;
if (clock_gettime(CLOCK_REALTIME, &now))
err_perror("Failed to get CLOCK_REALTIME time");
pcap_frame(iov, iovcnt, offset, &now);
clock_gettime(CLOCK_REALTIME, &now);
pcap_frame(iov, iovcnt, 0, &now);
}
/**
@ -163,15 +158,18 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
*/
void pcap_init(struct ctx *c)
{
int flags = O_WRONLY | O_CREAT | O_TRUNC;
if (pcap_fd != -1)
return;
if (!*c->pcap)
return;
pcap_fd = output_file_open(c->pcap, O_WRONLY);
flags |= c->foreground ? O_CLOEXEC : 0;
pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR);
if (pcap_fd == -1) {
err_perror("Couldn't open pcap file %s", c->pcap);
perror("open");
return;
}

2
pcap.h
View file

@ -9,7 +9,7 @@
void pcap(const char *pkt, size_t l2len);
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
size_t offset);
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset);
void pcap_iov(const struct iovec *iov, size_t iovcnt);
void pcap_init(struct ctx *c);
#endif /* PCAP_H */

42
pif.c
View file

@ -59,45 +59,3 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
*sl = sizeof(sa->sa6);
}
}
/**
 * pif_sock_l4() - Open a socket bound to an address on a given interface
 * @c:		Execution context
 * @type:	Socket epoll type
 * @pif:	Interface for this socket
 * @addr:	Address to bind to, or NULL for dual-stack any
 * @ifname:	Interface for binding, NULL for any
 * @port:	Port number to bind to (host byte order)
 * @data:	epoll reference portion for protocol handlers
 *
 * NOTE: For namespace pifs, this must be called having already entered the
 * relevant namespace.
 *
 * Return: newly created socket, negative error code on failure
 */
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
		const union inany_addr *addr, const char *ifname,
		in_port_t port, uint32_t data)
{
	/* Default: IPv6 wildcard address, used as is when @addr is NULL */
	union sockaddr_inany sa = {
		.sa6.sin6_family = AF_INET6,
		.sa6.sin6_addr = in6addr_any,
		.sa6.sin6_port = htons(port),
	};
	socklen_t sl = sizeof(sa.sa6);
	bool v6only = false;

	ASSERT(pif_is_socket(pif));

	if (pif == PIF_SPLICE) {
		/* Sanity checks */
		ASSERT(!ifname);
		ASSERT(addr && inany_is_loopback(addr));
	}

	if (addr) {
		/* Specific address: build the matching sockaddr, and restrict
		 * the socket to IPv6 only if that's the resulting family
		 */
		pif_sockaddr(c, &sa, &sl, pif, addr, port);
		v6only = sa.sa_family == AF_INET6;
	}

	return sock_l4_sa(c, type, &sa, sl, ifname, v6only, data);
}

3
pif.h
View file

@ -59,8 +59,5 @@ static inline bool pif_is_socket(uint8_t pif)
void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
uint8_t pif, const union inany_addr *addr, in_port_t port);
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
const union inany_addr *addr, const char *ifname,
in_port_t port, uint32_t data);
#endif /* PIF_H */

View file

@ -20,15 +20,6 @@ OUT="$(mktemp)"
[ -z "${ARCH}" ] && ARCH="$(uname -m)"
[ -z "${CC}" ] && CC="cc"
AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z] \
| sed 's/^ARM.*/ARM/' \
| sed 's/I[456]86/I386/' \
| sed 's/PPC64/PPC/' \
| sed 's/PPCLE/PPC64LE/' \
| sed 's/MIPS64EL/MIPSEL64/' \
| sed 's/HPPA/PARISC/' \
| sed 's/SH4/SH/')"
HEADER="/* This file was automatically generated by $(basename ${0}) */
#ifndef AUDIT_ARCH_PPC64LE
@ -41,7 +32,7 @@ struct sock_filter filter_@PROFILE@[] = {
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
(offsetof(struct seccomp_data, arch))),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
(offsetof(struct seccomp_data, nr))),
@ -242,8 +233,7 @@ gen_profile() {
sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
done
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \
"AUDIT_ARCH:${AUDIT_ARCH}"
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))"
}
printf '%s\n' "${HEADER}" > "${OUT}"

142
tap.c
View file

@ -172,15 +172,11 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
char *data = (char *)(uh + 1);
const struct iovec iov = {
.iov_base = (void *)in,
.iov_len = dlen
};
uh->source = htons(sport);
uh->dest = htons(dport);
uh->len = htons(l4len);
csum_udp4(uh, src, dst, &iov, 1, 0);
csum_udp4(uh, src, dst, in, dlen);
memcpy(data, in, dlen);
tap_send_single(c, buf, dlen + (data - buf));
@ -251,7 +247,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h,
void tap_udp6_send(const struct ctx *c,
const struct in6_addr *src, in_port_t sport,
const struct in6_addr *dst, in_port_t dport,
uint32_t flow, void *in, size_t dlen)
uint32_t flow, const void *in, size_t dlen)
{
size_t l4len = dlen + sizeof(struct udphdr);
char buf[USHRT_MAX];
@ -259,15 +255,11 @@ void tap_udp6_send(const struct ctx *c,
struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
l4len, IPPROTO_UDP, flow);
char *data = (char *)(uh + 1);
const struct iovec iov = {
.iov_base = in,
.iov_len = dlen
};
uh->source = htons(sport);
uh->dest = htons(dport);
uh->len = htons(l4len);
csum_udp6(uh, src, dst, &iov, 1, 0);
csum_udp6(uh, src, dst, in, dlen);
memcpy(data, in, dlen);
tap_send_single(c, buf, dlen + (data - buf));
@ -990,17 +982,24 @@ static void tap_sock_reset(struct ctx *c)
}
/**
* tap_passt_input() - Handler for new data on the socket to qemu
* tap_handler_passt() - Packet handler for AF_UNIX file descriptor
* @c: Execution context
* @events: epoll events
* @now: Current timestamp
*/
static void tap_passt_input(struct ctx *c, const struct timespec *now)
void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now)
{
static const char *partial_frame;
static ssize_t partial_len = 0;
ssize_t n;
char *p;
if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
tap_sock_reset(c);
return;
}
tap_flush_pools();
if (partial_len) {
@ -1011,13 +1010,10 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
memmove(pkt_buf, partial_frame, partial_len);
}
do {
n = recv(c->fd_tap, pkt_buf + partial_len,
TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
} while ((n < 0) && errno == EINTR);
n = recv(c->fd_tap, pkt_buf + partial_len, TAP_BUF_BYTES - partial_len,
MSG_DONTWAIT);
if (n < 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK) {
if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
err_perror("Receive error on guest connection, reset");
tap_sock_reset(c);
}
@ -1055,63 +1051,6 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
tap_handler(c, now);
}
/**
 * tap_handler_passt() - Event handler for AF_UNIX file descriptor
 * @c:		Execution context
 * @events:	epoll events
 * @now:	Current timestamp
 */
void tap_handler_passt(struct ctx *c, uint32_t events,
		       const struct timespec *now)
{
	const uint32_t closing = EPOLLRDHUP | EPOLLHUP | EPOLLERR;

	/* Any hangup or error indication takes priority over pending input */
	if (events & closing) {
		tap_sock_reset(c);
		return;
	}

	if (!(events & EPOLLIN))
		return;

	tap_passt_input(c, now);
}
/**
 * tap_pasta_input() - Handler for new data on the tap device
 * @c:		Execution context
 * @now:	Current timestamp
 */
static void tap_pasta_input(struct ctx *c, const struct timespec *now)
{
	ssize_t n, len;

	tap_flush_pools();

	/* Each read() may fill up to ETH_MAX_MTU bytes at offset n: only go on
	 * while that much room is left in pkt_buf.
	 */
	for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
		len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);

		if (len == 0) {
			die("EOF on tap device, exiting");
		} else if (len < 0) {
			if (errno == EINTR) {
				/* Nothing was stored: retry at the same offset */
				len = 0;
				continue;
			}

			/* EAGAIN and EWOULDBLOCK are allowed to be distinct
			 * values: either one means there's nothing more to
			 * read right now, so check them with a logical OR.
			 */
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				break; /* all done for now */

			die("Error on tap device, exiting");
		}

		/* Ignore frames of bad length */
		if (len < (ssize_t)sizeof(struct ethhdr) ||
		    len > (ssize_t)ETH_MAX_MTU)
			continue;

		tap_add_packet(c, len, pkt_buf + n);
	}

	tap_handler(c, now);
}
/**
* tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor
* @c: Execution context
@ -1121,11 +1060,46 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
void tap_handler_pasta(struct ctx *c, uint32_t events,
const struct timespec *now)
{
ssize_t n, len;
int ret;
if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
die("Disconnect event on /dev/net/tun device, exiting");
if (events & EPOLLIN)
tap_pasta_input(c, now);
redo:
n = 0;
tap_flush_pools();
restart:
while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
if (len < (ssize_t)sizeof(struct ethhdr) ||
len > (ssize_t)ETH_MAX_MTU) {
n += len;
continue;
}
tap_add_packet(c, len, pkt_buf + n);
if ((n += len) == TAP_BUF_BYTES)
break;
}
if (len < 0 && errno == EINTR)
goto restart;
ret = errno;
tap_handler(c, now);
if (len > 0 || ret == EAGAIN)
return;
if (n == TAP_BUF_BYTES)
goto redo;
die("Error on tap device, exiting");
}
/**
@ -1136,7 +1110,7 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
*/
int tap_sock_unix_open(char *sock_path)
{
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
int fd = socket(AF_UNIX, SOCK_STREAM, 0);
struct sockaddr_un addr = {
.sun_family = AF_UNIX,
};
@ -1151,12 +1125,10 @@ int tap_sock_unix_open(char *sock_path)
if (*sock_path)
memcpy(path, sock_path, UNIX_PATH_MAX);
else if (snprintf_check(path, UNIX_PATH_MAX - 1,
UNIX_SOCK_PATH, i))
die_perror("Can't build UNIX domain socket path");
else
snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i);
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
0);
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
if (ex < 0)
die_perror("Failed to check for UNIX domain conflicts");
@ -1289,7 +1261,7 @@ static int tap_ns_tun(void *arg)
if (fd < 0)
die_perror("Failed to open() /dev/net/tun");
rc = ioctl(fd, (int)TUNSETIFF, &ifr);
rc = ioctl(fd, TUNSETIFF, &ifr);
if (rc < 0)
die_perror("TUNSETIFF ioctl on /dev/net/tun failed");

2
tap.h
View file

@ -53,7 +53,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
void tap_udp6_send(const struct ctx *c,
const struct in6_addr *src, in_port_t sport,
const struct in6_addr *dst, in_port_t dport,
uint32_t flow, void *in, size_t dlen);
uint32_t flow, const void *in, size_t dlen);
void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst,
const void *in, size_t l4len);

387
tcp.c
View file

@ -274,7 +274,6 @@
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
@ -287,6 +286,8 @@
#include <time.h>
#include <arpa/inet.h>
#include <linux/tcp.h> /* For struct tcp_info */
#include "checksum.h"
#include "util.h"
#include "iov.h"
@ -299,7 +300,6 @@
#include "log.h"
#include "inany.h"
#include "flow.h"
#include "linux_dep.h"
#include "flow_table.h"
#include "tcp_internal.h"
@ -308,6 +308,11 @@
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
#ifdef HAS_SND_WND
# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd)
#else
# define KERNEL_REPORTS_SND_WND(c) (0 && (c))
#endif
#define ACK_INTERVAL 10 /* ms */
#define SYN_TIMEOUT 10 /* s */
@ -318,6 +323,11 @@
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
* <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
*/
#define SOL_TCP IPPROTO_TCP
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define CONN_IS_CLOSING(conn) \
@ -361,20 +371,6 @@ char tcp_buf_discard [MAX_WINDOW];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
/* Size of data returned by TCP_INFO getsockopt() */
socklen_t tcp_info_size;
#define tcp_info_cap(f_) \
((offsetof(struct tcp_info_linux, tcpi_##f_) + \
sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
#define snd_wnd_cap tcp_info_cap(snd_wnd)
/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
#define bytes_acked_cap tcp_info_cap(bytes_acked)
/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
#define min_rtt_cap tcp_info_cap(min_rtt)
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@ -428,23 +424,27 @@ int tcp_set_peek_offset(int s, int offset)
*/
static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
{
uint32_t rdhup;
if (!events)
return 0;
rdhup = (events & SOCK_FIN_RCVD) ? 0 : EPOLLRDHUP;
if (events & ESTABLISHED) {
if (events & TAP_FIN_SENT)
return EPOLLET;
if (conn_flags & STALLED)
return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
return EPOLLIN | EPOLLOUT | rdhup | EPOLLET;
return EPOLLIN | EPOLLRDHUP;
return EPOLLIN | rdhup;
}
if (events == TAP_SYN_RCVD)
return EPOLLOUT | EPOLLET | EPOLLRDHUP;
return EPOLLOUT | EPOLLET | rdhup;
return EPOLLET | EPOLLRDHUP;
return rdhup;
}
/**
@ -549,8 +549,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
(unsigned long long)it.it_value.tv_sec,
(unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
if (timerfd_settime(conn->timer, 0, &it, NULL))
flow_err(conn, "failed to set timer: %s", strerror(errno));
timerfd_settime(conn->timer, 0, &it, NULL);
}
/**
@ -680,12 +679,13 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
* @tinfo: Pointer to struct tcp_info for socket
*/
static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
const struct tcp_info_linux *tinfo)
const struct tcp_info *tinfo)
{
#ifdef HAS_MIN_RTT
const struct flowside *tapside = TAPFLOW(conn);
int i, hole = -1;
if (!min_rtt_cap ||
if (!tinfo->tcpi_min_rtt ||
(int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
return;
@ -706,6 +706,10 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
if (hole == LOW_RTT_TABLE_SIZE)
hole = 0;
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
#else
(void)conn;
(void)tinfo;
#endif /* HAS_MIN_RTT */
}
/**
@ -752,106 +756,34 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
}
/**
* tcp_update_check_tcp4() - Calculate TCP checksum for IPv4
* tcp_update_check_tcp4() - Update TCP checksum from stored one
* @iph: IPv4 header
* @iov: Pointer to the array of IO vectors
* @iov_cnt: Length of the array
* @l4offset: IPv4 payload offset in the iovec array
* @th: TCP header followed by TCP payload
*/
static void tcp_update_check_tcp4(const struct iphdr *iph,
const struct iovec *iov, int iov_cnt,
size_t l4offset)
static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th)
{
uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
struct in_addr saddr = { .s_addr = iph->saddr };
struct in_addr daddr = { .s_addr = iph->daddr };
size_t check_ofs;
uint16_t *check;
int check_idx;
uint32_t sum;
char *ptr;
uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
check_idx = iov_skip_bytes(iov, iov_cnt,
l4offset + offsetof(struct tcphdr, check),
&check_ofs);
if (check_idx >= iov_cnt) {
err("TCP4 buffer is too small, iov size %zd, check offset %zd",
iov_size(iov, iov_cnt),
l4offset + offsetof(struct tcphdr, check));
return;
}
if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
err("TCP4 checksum field memory is not contiguous "
"check_ofs %zd check_idx %d iov_len %zd",
check_ofs, check_idx, iov[check_idx].iov_len);
return;
}
ptr = (char *)iov[check_idx].iov_base + check_ofs;
if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
err("TCP4 checksum field is not correctly aligned in memory");
return;
}
check = (uint16_t *)ptr;
*check = 0;
*check = csum_iov(iov, iov_cnt, l4offset, sum);
th->check = 0;
th->check = csum(th, l4len, sum);
}
/**
* tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
* @ip6h: IPv6 header
* @iov: Pointer to the array of IO vectors
* @iov_cnt: Length of the array
* @l4offset: IPv6 payload offset in the iovec array
* @th: TCP header followed by TCP payload
*/
static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
const struct iovec *iov, int iov_cnt,
size_t l4offset)
static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
{
uint16_t l4len = ntohs(ip6h->payload_len);
size_t check_ofs;
uint16_t *check;
int check_idx;
uint32_t sum;
char *ptr;
uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
&ip6h->saddr, &ip6h->daddr);
sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr,
&ip6h->daddr);
check_idx = iov_skip_bytes(iov, iov_cnt,
l4offset + offsetof(struct tcphdr, check),
&check_ofs);
if (check_idx >= iov_cnt) {
err("TCP6 buffer is too small, iov size %zd, check offset %zd",
iov_size(iov, iov_cnt),
l4offset + offsetof(struct tcphdr, check));
return;
}
if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
err("TCP6 checksum field memory is not contiguous "
"check_ofs %zd check_idx %d iov_len %zd",
check_ofs, check_idx, iov[check_idx].iov_len);
return;
}
ptr = (char *)iov[check_idx].iov_base + check_ofs;
if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
err("TCP6 checksum field is not correctly aligned in memory");
return;
}
check = (uint16_t *)ptr;
*check = 0;
*check = csum_iov(iov, iov_cnt, l4offset, sum);
th->check = 0;
th->check = csum(th, l4len, sum);
}
/**
@ -937,6 +869,7 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
void tcp_defer_handler(struct ctx *c)
{
tcp_flags_flush(c);
tcp_payload_flush(c);
}
@ -970,24 +903,23 @@ static void tcp_fill_header(struct tcphdr *th,
* @conn: Connection pointer
* @taph: tap backend specific header
* @iph: Pointer to IPv4 header
* @bp: Pointer to TCP header followed by TCP payload
* @th: Pointer to TCP header
* @dlen: TCP payload length
* @check: Checksum, if already known
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*
* Return: The IPv4 payload length, host order
*/
static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
struct tap_hdr *taph,
struct iphdr *iph, struct tcp_payload_t *bp,
struct iphdr *iph, struct tcphdr *th,
size_t dlen, const uint16_t *check,
uint32_t seq, bool no_tcp_csum)
uint32_t seq)
{
const struct flowside *tapside = TAPFLOW(conn);
const struct in_addr *src4 = inany_v4(&tapside->oaddr);
const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
size_t l4len = dlen + sizeof(bp->th);
size_t l4len = dlen + sizeof(*th);
size_t l3len = l4len + sizeof(*iph);
ASSERT(src4 && dst4);
@ -999,18 +931,9 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
iph->check = check ? *check :
csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);
tcp_fill_header(&bp->th, conn, seq);
tcp_fill_header(th, conn, seq);
if (no_tcp_csum) {
bp->th.check = 0;
} else {
const struct iovec iov = {
.iov_base = bp,
.iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr),
};
tcp_update_check_tcp4(iph, &iov, 1, 0);
}
tcp_update_check_tcp4(iph, th);
tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
@ -1022,21 +945,20 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
* @conn: Connection pointer
* @taph: tap backend specific header
* @ip6h: Pointer to IPv6 header
* @bp: Pointer to TCP header followed by TCP payload
* @th: Pointer to TCP header
* @dlen: TCP payload length
* @check: Checksum, if already known
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*
* Return: The IPv6 payload length, host order
*/
static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
struct tap_hdr *taph,
struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
size_t dlen, uint32_t seq, bool no_tcp_csum)
struct ipv6hdr *ip6h, struct tcphdr *th,
size_t dlen, uint32_t seq)
{
const struct flowside *tapside = TAPFLOW(conn);
size_t l4len = dlen + sizeof(bp->th);
size_t l4len = dlen + sizeof(*th);
ip6h->payload_len = htons(l4len);
ip6h->saddr = tapside->oaddr.a6;
@ -1050,18 +972,9 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
tcp_fill_header(&bp->th, conn, seq);
tcp_fill_header(th, conn, seq);
if (no_tcp_csum) {
bp->th.check = 0;
} else {
const struct iovec iov = {
.iov_base = bp,
.iov_len = ntohs(ip6h->payload_len)
};
tcp_update_check_tcp6(ip6h, &iov, 1, 0);
}
tcp_update_check_tcp6(ip6h, th);
tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
@ -1075,14 +988,12 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
* @dlen: TCP payload length
* @check: Checksum, if already known
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*
* Return: IP payload length, host order
*/
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
struct iovec *iov, size_t dlen,
const uint16_t *check, uint32_t seq,
bool no_tcp_csum)
const uint16_t *check, uint32_t seq)
{
const struct flowside *tapside = TAPFLOW(conn);
const struct in_addr *a4 = inany_v4(&tapside->oaddr);
@ -1091,13 +1002,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, dlen,
check, seq, no_tcp_csum);
check, seq);
}
return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, dlen,
seq, no_tcp_csum);
seq);
}
/**
@ -1110,24 +1021,25 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
* Return: 1 if sequence or window were updated, 0 otherwise
*/
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
bool force_seq, struct tcp_info_linux *tinfo)
int force_seq, struct tcp_info *tinfo)
{
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
/* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */
socklen_t sl = sizeof(*tinfo);
struct tcp_info_linux tinfo_new;
struct tcp_info tinfo_new;
uint32_t new_wnd_to_tap = prev_wnd_to_tap;
int s = conn->sock;
if (!bytes_acked_cap) {
#ifndef HAS_BYTES_ACKED
(void)force_seq;
conn->seq_ack_to_tap = conn->seq_from_tap;
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
conn->seq_ack_to_tap = prev_ack_to_tap;
} else {
if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL ||
tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) ||
(conn->flags & LOCAL) || force_seq) {
#else
if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
|| CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
conn->seq_ack_to_tap = conn->seq_from_tap;
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
if (!tinfo) {
@ -1142,9 +1054,9 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
conn->seq_ack_to_tap = prev_ack_to_tap;
}
}
#endif /* !HAS_BYTES_ACKED */
if (!snd_wnd_cap) {
if (!KERNEL_REPORTS_SND_WND(c)) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
@ -1162,6 +1074,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
}
}
#ifdef HAS_SND_WND
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
} else {
@ -1169,6 +1082,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
SNDBUF_GET(conn));
}
#endif
new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
if (!(conn->events & ESTABLISHED))
@ -1226,11 +1140,11 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
* 0 if there is no flag to send
* 1 otherwise
*/
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, char *data,
size_t *optlen)
{
struct tcp_info_linux tinfo = { 0 };
struct tcp_info tinfo = { 0 };
socklen_t sl = sizeof(tinfo);
int s = conn->sock;
@ -1243,16 +1157,27 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
return -ECONNRESET;
}
#ifdef HAS_SND_WND
if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
c->tcp.kernel_snd_wnd = 1;
#endif
if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo);
if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags)
if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
return 0;
*optlen = 0;
if (flags & SYN) {
int mss;
/* Options: MSS, NOP and window scale (8 bytes) */
*optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
*data++ = OPT_MSS;
*data++ = OPT_MSS_LEN;
if (c->mtu == -1) {
mss = tinfo.tcpi_snd_mss;
} else {
@ -1268,11 +1193,16 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
else if (mss > PAGE_SIZE)
mss = ROUND_DOWN(mss, PAGE_SIZE);
}
*(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
data += OPT_MSS_LEN - 2;
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
*opts = TCP_SYN_OPTS(mss, conn->ws_to_tap);
*optlen = sizeof(*opts);
*data++ = OPT_NOP;
*data++ = OPT_WS;
*data++ = OPT_WS_LEN;
*data++ = conn->ws_to_tap;
} else if (!(flags & RST)) {
flags |= ACK;
}
@ -1309,8 +1239,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
*
* Return: negative error code on connection reset, 0 otherwise
*/
static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
int flags)
int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
return tcp_buf_send_flag(c, conn, flags);
}
@ -1320,7 +1249,7 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
* @c: Execution context
* @conn: Connection pointer
*/
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
{
if (conn->events == CLOSED)
return;
@ -1410,7 +1339,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
{
int s;
s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP);
s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
if (s > FD_REF_MAX) {
close(s);
@ -1538,7 +1467,7 @@ static void tcp_bind_outbound(const struct ctx *c,
* @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
*/
static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
const void *saddr, const void *daddr,
const struct tcphdr *th, const char *opts,
size_t optlen, const struct timespec *now)
@ -1703,7 +1632,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
*
* #syscalls recvmsg
*/
static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
return tcp_buf_data_from_sock(c, conn);
}
@ -1719,7 +1648,7 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
*
* Return: count of consumed packets
*/
static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
const struct pool *p, int idx)
{
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
@ -1917,8 +1846,7 @@ out:
* @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
*/
static void tcp_conn_from_sock_finish(const struct ctx *c,
struct tcp_tap_conn *conn,
static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
@ -1941,12 +1869,11 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
return;
}
tcp_send_flag(c, conn, ACK);
/* The client might have sent data already, which we didn't
* dequeue waiting for SYN,ACK from tap -- check now.
*/
tcp_data_from_sock(c, conn);
tcp_send_flag(c, conn, ACK);
}
/**
@ -1962,7 +1889,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
*
* Return: count of consumed packets
*/
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now)
{
@ -2100,7 +2027,7 @@ reset:
* @c: Execution context
* @conn: Connection pointer
*/
static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
{
socklen_t sl;
int so;
@ -2126,8 +2053,8 @@ static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
* @sa: Peer socket address (from accept())
* @now: Current timestamp
*/
static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
int s, const struct timespec *now)
static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
const struct timespec *now)
{
struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
uint64_t hash;
@ -2158,7 +2085,7 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
* @ref: epoll reference of listening socket
* @now: Current timestamp
*/
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
const struct flowside *ini;
@ -2223,7 +2150,7 @@ cancel:
*
* #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
*/
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
{
struct itimerspec check_armed = { { 0 }, { 0 } };
struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
@ -2235,9 +2162,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
* timer is currently armed, this event came from a previous setting,
* and we just set the timer to a new point in the future: discard it.
*/
if (timerfd_gettime(conn->timer, &check_armed))
flow_err(conn, "failed to read timer: %s", strerror(errno));
timerfd_gettime(conn->timer, &check_armed);
if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
return;
@ -2275,10 +2200,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
* case. This avoids having to preemptively reset the timer on
* ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
*/
if (timerfd_settime(conn->timer, 0, &new, &old))
flow_err(conn, "failed to set timer: %s",
strerror(errno));
timerfd_settime(conn->timer, 0, &new, &old);
if (old.it_value.tv_sec == ACT_TIMEOUT) {
flow_dbg(conn, "activity timeout");
tcp_rst(c, conn);
@ -2292,14 +2214,19 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
* @ref: epoll reference
* @events: epoll events bitmap
*/
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events)
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
{
struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);
ASSERT(!c->no_tcp);
ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP);
if (events & EPOLLRDHUP) {
flow_err(conn, "EPOLLRDHUP: events=0x%x conn->events=0x%x "
"conn->flags=0x%x\n", events, conn->events,
conn->flags);
}
if (conn->events == CLOSED)
return;
@ -2324,7 +2251,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
tcp_data_from_sock(c, conn);
if (events & EPOLLOUT)
tcp_update_seqack_wnd(c, conn, false, NULL);
tcp_update_seqack_wnd(c, conn, 0, NULL);
return;
}
@ -2347,16 +2274,17 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
}
/**
* tcp_sock_init_one() - Initialise listening socket for address and port
* tcp_sock_init_af() - Initialise listening socket for a given af and port
* @c: Execution context
* @addr: Pointer to address for binding, NULL for dual stack any
* @ifname: Name of interface to bind to, NULL if not configured
* @af: Address family to listen on
* @port: Port, host order
* @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured
*
* Return: fd for the new listening socket, negative error code on failure
*/
static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
const char *ifname, in_port_t port)
static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
const void *addr, const char *ifname)
{
union tcp_listen_epoll_ref tref = {
.port = port,
@ -2364,13 +2292,12 @@ static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
};
int s;
s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr,
ifname, port, tref.u32);
s = sock_l4(c, af, EPOLL_TYPE_TCP_LISTEN, addr, ifname, port, tref.u32);
if (c->tcp.fwd_in.mode == FWD_AUTO) {
if (!addr || inany_v4(addr))
if (af == AF_INET || af == AF_UNSPEC)
tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
if (!addr || !inany_v4(addr))
if (af == AF_INET6 || af == AF_UNSPEC)
tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
}
@ -2384,32 +2311,31 @@ static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
/**
* tcp_sock_init() - Create listening sockets for a given host ("inbound") port
* @c: Execution context
* @af: Address family to select a specific IP version, or AF_UNSPEC
* @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order
*
* Return: 0 on (partial) success, negative error code on (complete) failure
*/
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
const char *ifname, in_port_t port)
{
int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
ASSERT(!c->no_tcp);
if (!addr && c->ifi4 && c->ifi6)
if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
/* Attempt to get a dual stack socket */
if (tcp_sock_init_one(c, NULL, ifname, port) >= 0)
if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0)
return 0;
/* Otherwise create a socket per IP version */
if ((!addr || inany_v4(addr)) && c->ifi4)
r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4,
ifname, port);
if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4)
r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname);
if ((!addr || !inany_v4(addr)) && c->ifi6)
r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6,
ifname, port);
if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
return 0;
@ -2432,7 +2358,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
ASSERT(c->mode == MODE_PASTA);
s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4,
s = sock_l4(c, AF_INET, EPOLL_TYPE_TCP_LISTEN, &in4addr_loopback,
NULL, port, tref.u32);
if (s >= 0)
tcp_sock_set_bufsize(c, s);
@ -2458,7 +2384,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
ASSERT(c->mode == MODE_PASTA);
s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6,
s = sock_l4(c, AF_INET6, EPOLL_TYPE_TCP_LISTEN, &in6addr_loopback,
NULL, port, tref.u32);
if (s >= 0)
tcp_sock_set_bufsize(c, s);
@ -2561,7 +2487,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
*
* Return: true if supported, false otherwise
*/
static bool tcp_probe_peek_offset_cap(sa_family_t af)
bool tcp_probe_peek_offset_cap(sa_family_t af)
{
bool ret = false;
int s, optv = 0;
@ -2578,34 +2504,6 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
return ret;
}
/**
 * tcp_probe_tcp_info() - Check what data TCP_INFO reports
 *
 * Opens a throwaway IPv4 TCP socket, queries TCP_INFO on it and closes it
 * again on every path.
 *
 * Return: number of bytes returned by TCP_INFO getsockopt(), 0 if the socket
 *	   could not be created or the option could not be read
 */
static socklen_t tcp_probe_tcp_info(void)
{
	struct tcp_info_linux tinfo;
	socklen_t sl = sizeof(tinfo);
	int s;

	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
	if (s < 0) {
		warn_perror("Temporary TCP socket creation failed");
		/* A length, not a truth value: report "no bytes available" */
		return 0;
	}

	if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
		warn_perror("Failed to get TCP_INFO on temporary socket");
		close(s);
		return 0;
	}

	close(s);

	/* getsockopt() updated sl to the size the kernel actually filled in */
	return sl;
}
/**
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
* @c: Execution context
@ -2616,7 +2514,11 @@ int tcp_init(struct ctx *c)
{
ASSERT(!c->no_tcp);
tcp_sock_iov_init(c);
if (c->ifi4)
tcp_sock4_iov_init(c);
if (c->ifi6)
tcp_sock6_iov_init(c);
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
@ -2635,15 +2537,6 @@ int tcp_init(struct ctx *c)
(!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
tcp_info_size = tcp_probe_tcp_info();
#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \
STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
dbg_tcpi(snd_wnd);
dbg_tcpi(bytes_acked);
dbg_tcpi(min_rtt);
#undef dbg_tcpi
return 0;
}
@ -2685,7 +2578,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
if (outbound)
tcp_ns_sock_init(c, port);
else
tcp_sock_init(c, NULL, NULL, port);
tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port);
}
}
}

15
tcp.h
View file

@ -10,15 +10,14 @@
struct ctx;
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref);
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
const struct timespec *now);
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events);
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now);
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
const char *ifname, in_port_t port);
int tcp_init(struct ctx *c);
void tcp_timer(struct ctx *c, const struct timespec *now);
@ -59,12 +58,16 @@ union tcp_listen_epoll_ref {
* @fwd_in: Port forwarding configuration for inbound packets
* @fwd_out: Port forwarding configuration for outbound packets
* @timer_run: Timestamp of most recent timer run
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
* @pipe_size: Size of pipes for spliced connections
*/
struct tcp_ctx {
struct fwd_ports fwd_in;
struct fwd_ports fwd_out;
struct timespec timer_run;
#ifdef HAS_SND_WND
int kernel_snd_wnd;
#endif
size_t pipe_size;
};

370
tcp_buf.c
View file

@ -20,7 +20,7 @@
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <linux/tcp.h>
#include "util.h"
#include "ip.h"
@ -38,32 +38,88 @@
(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
/* Static buffers */
/**
* struct tcp_payload_t - TCP header and data to send segments with payload
* @th: TCP header
* @data: TCP data
*/
struct tcp_payload_t {
struct tcphdr th;
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/* Ethernet header for IPv4 and IPv6 frames */
/**
* struct tcp_flags_t - TCP header and data to send zero-length
* segments (flags)
* @th: TCP header
* @opts: TCP options
*/
struct tcp_flags_t {
struct tcphdr th;
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/* Ethernet header for IPv4 frames */
static struct ethhdr tcp4_eth_src;
static struct ethhdr tcp6_eth_src;
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
/* IPv4 headers */
static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
/* TCP segments with payload for IPv4 frames */
static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
/* IP headers for IPv4 and IPv6 */
struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
/* TCP segments with payload for IPv4 and IPv6 frames */
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM];
static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");
static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
/* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp_payload_used;
static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp4_payload_used;
static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
/* IPv4 headers for TCP segment without payload */
static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
/* TCP segments without payload for IPv4 frames */
static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
static unsigned int tcp4_flags_used;
/* Ethernet header for IPv6 frames */
static struct ethhdr tcp6_eth_src;
static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
/* IPv6 headers */
static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
/* TCP headers and data for IPv6 frames */
static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
/* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp6_payload_used;
static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
/* IPv6 headers for TCP segment without payload */
static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
/* TCP segment without payload for IPv6 frames */
static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
static unsigned int tcp6_flags_used;
/* recvmsg()/sendmsg() data for tap */
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
/**
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
* @eth_d: Ethernet destination address, NULL if unchanged
@ -76,30 +132,105 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
}
/**
* tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
* @c: Execution context
*/
void tcp_sock_iov_init(const struct ctx *c)
void tcp_sock4_iov_init(const struct ctx *c)
{
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
struct iovec *iov;
int i;
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
tcp6_payload_ip[i] = ip6;
for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
tcp4_payload_ip[i] = iph;
tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
tcp4_payload[i].th.ack = 1;
}
for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
tcp4_flags_ip[i] = iph;
tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
tcp4_flags[i].th.ack = 1;
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
struct iovec *iov = tcp_l2_iov[i];
iov = tcp4_l2_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp4_l2_flags_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
}
}
/**
* tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
* @c: Execution context
*/
void tcp_sock6_iov_init(const struct ctx *c)
{
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
struct iovec *iov;
int i;
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
tcp6_payload_ip[i] = ip6;
tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
tcp6_payload[i].th.ack = 1;
}
for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
tcp6_flags_ip[i] = ip6;
tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
tcp6_flags[i].th .ack = 1;
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp6_l2_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp6_l2_flags_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
}
}
/**
 * tcp_flags_flush() - Hand queued zero-length (flags) segments to the tap
 * @c:		Execution context
 *
 * The IPv6 queue is submitted first, then the IPv4 one. The count returned by
 * tap_send_frames() is not checked here, and both counters restart from zero.
 */
void tcp_flags_flush(const struct ctx *c)
{
	struct iovec *frames6 = tcp6_l2_flags_iov[0];
	struct iovec *frames4 = tcp4_l2_flags_iov[0];

	tap_send_frames(c, frames6, TCP_NUM_IOVS, tcp6_flags_used);
	tcp6_flags_used = 0;

	tap_send_frames(c, frames4, TCP_NUM_IOVS, tcp4_flags_used);
	tcp4_flags_used = 0;
}
/**
@ -109,7 +240,7 @@ void tcp_sock_iov_init(const struct ctx *c)
* @frames: Two-dimensional array containing queued frames with sub-iovs
* @num_frames: Number of entries in the two arrays to be compared
*/
static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
{
int i;
@ -131,20 +262,28 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
}
/**
* tcp_payload_flush() - Send out buffers for segments with data or flags
* tcp_payload_flush() - Send out buffers for segments with data
* @c: Execution context
*/
void tcp_payload_flush(const struct ctx *c)
void tcp_payload_flush(struct ctx *c)
{
size_t m;
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
tcp_payload_used);
if (m != tcp_payload_used) {
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
tcp_payload_used - m);
m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
tcp6_payload_used);
if (m != tcp6_payload_used) {
tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m],
tcp6_payload_used - m);
}
tcp_payload_used = 0;
tcp6_payload_used = 0;
m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
tcp4_payload_used);
if (m != tcp4_payload_used) {
tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m],
tcp4_payload_used - m);
}
tcp4_payload_used = 0;
}
/**
@ -155,48 +294,58 @@ void tcp_payload_flush(const struct ctx *c)
*
* Return: negative error code on connection reset, 0 otherwise
*/
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
struct tcp_payload_t *payload;
struct tcp_flags_t *payload;
struct iovec *iov;
size_t optlen;
size_t l4len;
uint32_t seq;
int ret;
iov = tcp_l2_iov[tcp_payload_used];
if (CONN_V4(conn)) {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
if (CONN_V4(conn))
iov = tcp4_l2_flags_iov[tcp4_flags_used++];
else
iov = tcp6_l2_flags_iov[tcp6_flags_used++];
payload = iov[TCP_IOV_PAYLOAD].iov_base;
seq = conn->seq_to_tap;
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
(struct tcp_syn_opts *)&payload->data, &optlen);
if (ret <= 0)
payload->opts, &optlen);
if (ret <= 0) {
if (CONN_V4(conn))
tcp4_flags_used--;
else
tcp6_flags_used--;
return ret;
tcp_payload_used++;
l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (flags & DUP_ACK) {
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_TAP].iov_len);
dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
}
if (tcp_payload_used > TCP_FRAMES_MEM - 2)
tcp_payload_flush(c);
l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (flags & DUP_ACK) {
struct iovec *dup_iov;
int i;
if (CONN_V4(conn))
dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
else
dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
for (i = 0; i < TCP_NUM_IOVS; i++)
memcpy(dup_iov[i].iov_base, iov[i].iov_base,
iov[i].iov_len);
dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
}
if (CONN_V4(conn)) {
if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
tcp_flags_flush(c);
} else {
if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
tcp_flags_flush(c);
}
return 0;
}
@ -209,39 +358,39 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
* @seq: Sequence number to be sent
*/
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
ssize_t dlen, int no_csum, uint32_t seq)
{
struct tcp_payload_t *payload;
const uint16_t *check = NULL;
struct iovec *iov;
size_t l4len;
conn->seq_to_tap = seq + dlen;
tcp_frame_conns[tcp_payload_used] = conn;
iov = tcp_l2_iov[tcp_payload_used];
if (CONN_V4(conn)) {
if (no_csum) {
struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
if (CONN_V4(conn)) {
struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
const uint16_t *check = NULL;
if (no_csum) {
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
check = &iph->check;
}
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else if (CONN_V6(conn)) {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
payload = iov[TCP_IOV_PAYLOAD].iov_base;
payload->th.th_off = sizeof(struct tcphdr) / 4;
payload->th.th_x2 = 0;
payload->th.th_flags = 0;
payload->th.ack = 1;
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
tcp4_frame_conns[tcp4_payload_used] = conn;
iov = tcp4_l2_iov[tcp4_payload_used++];
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
} else if (CONN_V6(conn)) {
tcp6_frame_conns[tcp6_payload_used] = conn;
iov = tcp6_l2_iov[tcp6_payload_used++];
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
}
}
/**
@ -253,11 +402,12 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
*
* #syscalls recvmsg
*/
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
int len, dlen, i, s = conn->sock;
int sendlen, len, dlen, v4 = CONN_V4(conn);
int s = conn->sock, i, ret = 0;
struct msghdr mh_sock = { 0 };
uint16_t mss = MSS_GET(conn);
uint32_t already_sent, seq;
@ -304,15 +454,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
mh_sock.msg_iovlen = fill_bufs;
}
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
(!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
tcp_payload_flush(c);
/* Silence Coverity CWE-125 false positive */
tcp_payload_used = 0;
tcp4_payload_used = tcp6_payload_used = 0;
}
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
if (v4)
iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
else
iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
@ -323,19 +477,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
len = recvmsg(s, &mh_sock, MSG_PEEK);
while (len < 0 && errno == EINTR);
if (len < 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK) {
tcp_rst(c, conn);
return -errno;
}
return 0;
}
if (len < 0)
goto err;
if (!len) {
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
if (ret) {
if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
tcp_rst(c, conn);
return ret;
}
@ -346,27 +493,28 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
return 0;
}
sendlen = len;
if (!peek_offset_cap)
len -= already_sent;
sendlen -= already_sent;
if (len <= 0) {
if (sendlen <= 0) {
conn_flag(c, conn, STALLED);
return 0;
}
conn_flag(c, conn, ~STALLED);
send_bufs = DIV_ROUND_UP(len, mss);
last_len = len - (send_bufs - 1) * mss;
send_bufs = DIV_ROUND_UP(sendlen, mss);
last_len = sendlen - (send_bufs - 1) * mss;
/* Likely, some new data was acked too. */
tcp_update_seqack_wnd(c, conn, false, NULL);
tcp_update_seqack_wnd(c, conn, 0, NULL);
/* Finally, queue to tap */
dlen = mss;
seq = conn->seq_to_tap;
for (i = 0; i < send_bufs; i++) {
int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
if (i == send_bufs - 1)
dlen = last_len;
@ -378,4 +526,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0;
err:
if (errno != EAGAIN && errno != EWOULDBLOCK) {
ret = -errno;
tcp_rst(c, conn);
}
return ret;
}

View file

@ -6,9 +6,11 @@
#ifndef TCP_BUF_H
#define TCP_BUF_H
void tcp_sock_iov_init(const struct ctx *c);
void tcp_payload_flush(const struct ctx *c);
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
void tcp_sock4_iov_init(const struct ctx *c);
void tcp_sock6_iov_init(const struct ctx *c);
void tcp_flags_flush(const struct ctx *c);
void tcp_payload_flush(struct ctx *c);
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
#endif /*TCP_BUF_H */

View file

@ -33,7 +33,9 @@
#define OPT_EOL 0
#define OPT_NOP 1
#define OPT_MSS 2
#define OPT_MSS_LEN 4
#define OPT_WS 3
#define OPT_WS_LEN 3
#define OPT_SACKP 4
#define OPT_SACK 5
#define OPT_TS 8
@ -61,79 +63,6 @@ enum tcp_iov_parts {
TCP_NUM_IOVS
};
/**
* struct tcp_payload_t - TCP header and data to send segments with payload
* @th: TCP header
* @data: TCP data
*/
struct tcp_payload_t {
struct tcphdr th;
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/** struct tcp_opt_nop - TCP NOP option
* @kind: Option kind (OPT_NOP = 1)
*/
struct tcp_opt_nop {
uint8_t kind;
} __attribute__ ((packed));
#define TCP_OPT_NOP ((struct tcp_opt_nop){ .kind = OPT_NOP, })
/** struct tcp_opt_mss - TCP MSS option
* @kind: Option kind (OPT_MSS == 2)
* @len: Option length (4)
* @mss: Maximum Segment Size
*/
struct tcp_opt_mss {
uint8_t kind;
uint8_t len;
uint16_t mss;
} __attribute__ ((packed));
#define TCP_OPT_MSS(mss_) \
((struct tcp_opt_mss) { \
.kind = OPT_MSS, \
.len = sizeof(struct tcp_opt_mss), \
.mss = htons(mss_), \
})
/** struct tcp_opt_ws - TCP Window Scaling option
* @kind: Option kind (OPT_WS == 3)
* @len: Option length (3)
* @shift: Window scaling shift
*/
struct tcp_opt_ws {
uint8_t kind;
uint8_t len;
uint8_t shift;
} __attribute__ ((packed));
#define TCP_OPT_WS(shift_) \
((struct tcp_opt_ws) { \
.kind = OPT_WS, \
.len = sizeof(struct tcp_opt_ws), \
.shift = (shift_), \
})
/** struct tcp_syn_opts - TCP options we apply to SYN packets
* @mss: Maximum Segment Size (MSS) option
* @nop: NOP opt (for alignment)
* @ws: Window Scaling (WS) option
*/
struct tcp_syn_opts {
struct tcp_opt_mss mss;
struct tcp_opt_nop nop;
struct tcp_opt_ws ws;
} __attribute__ ((packed));
#define TCP_SYN_OPTS(mss_, ws_) \
((struct tcp_syn_opts){ \
.mss = TCP_OPT_MSS(mss_), \
.nop = TCP_OPT_NOP, \
.ws = TCP_OPT_WS(ws_), \
})
extern char tcp_buf_discard [MAX_WINDOW];
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
@ -153,23 +82,19 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
conn_event_do(c, conn, event); \
} while (0)
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
#define tcp_rst(c, conn) \
do { \
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
tcp_rst_do(c, conn); \
} while (0)
struct tcp_info_linux;
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
struct iovec *iov, size_t dlen,
const uint16_t *check, uint32_t seq,
bool no_tcp_csum);
const uint16_t *check, uint32_t seq);
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
bool force_seq, struct tcp_info_linux *tinfo);
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
size_t *optlen);
int force_seq, struct tcp_info *tinfo);
int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
struct tcphdr *th, char *data, size_t *optlen);
#endif /* TCP_INTERNAL_H */

View file

@ -320,7 +320,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
}
if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
c->tcp.pipe_size)) {
flow_trace(conn,
"cannot set %d->%d pipe size to %zu",
sidei, !sidei, c->tcp.pipe_size);
@ -503,7 +503,7 @@ swap:
lowat_act_flag = RCVLOWAT_ACT(fromsidei);
while (1) {
ssize_t readlen, written, pending;
ssize_t readlen, to_write = 0, written;
int more = 0;
retry:
@ -518,11 +518,14 @@ retry:
if (errno != EAGAIN)
goto close;
to_write = c->tcp.pipe_size;
} else if (!readlen) {
eof = 1;
to_write = c->tcp.pipe_size;
} else {
never_read = 0;
to_write += readlen;
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
more = SPLICE_F_MORE;
@ -532,10 +535,10 @@ retry:
eintr:
written = splice(conn->pipe[fromsidei][0], NULL,
conn->s[!fromsidei], NULL, c->tcp.pipe_size,
conn->s[!fromsidei], NULL, to_write,
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
flow_trace(conn, "%zi from write-side call (passed %zi)",
written, c->tcp.pipe_size);
written, to_write);
/* Most common case: skip updating counters. */
if (readlen > 0 && readlen == written) {
@ -581,9 +584,10 @@ eintr:
if (never_read && written == (long)(c->tcp.pipe_size))
goto retry;
pending = conn->read[fromsidei] - conn->written[fromsidei];
if (!never_read && written > 0 && written < pending)
if (!never_read && written < to_write) {
to_write -= written;
goto retry;
}
if (eof)
break;
@ -672,7 +676,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
continue;
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
c->tcp.pipe_size)) {
trace("TCP (spliced): cannot set pool pipe size to %zu",
c->tcp.pipe_size);
}

View file

@ -8,6 +8,7 @@
WGET = wget -c
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
debian-9-nocloud-amd64-daily-20200210-166.qcow2 \
debian-10-nocloud-amd64.qcow2 \
debian-10-generic-arm64.qcow2 \
debian-10-generic-ppc64el-20220911-1135.qcow2 \
@ -41,7 +42,8 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \
openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
trusty-server-cloudimg-i386-disk1.img \
@ -133,6 +135,9 @@ realclean: clean
debian-8.11.0-openstack-%.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
debian-9-nocloud-%-daily-20200210-166.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2
debian-10-nocloud-%.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
@ -198,6 +203,9 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2:
$(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
# Ubuntu downloads
trusty-server-cloudimg-%-disk1.img:
$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img

View file

@ -58,7 +58,7 @@ setup_passt() {
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
' -machine accel=kvm' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
@ -159,7 +159,7 @@ setup_passt_in_ns() {
' -machine accel=kvm' \
' -M accel=kvm:tcg' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
@ -230,7 +230,7 @@ setup_two_guests() {
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
@ -243,7 +243,7 @@ setup_two_guests() {
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel '"${KERNEL}" \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
' -nodefaults' \
' -append "console=ttyS0 mitigations=off apparmor=0" ' \

View file

@ -31,8 +31,8 @@ PR_DELAY_INIT=100 # ms
# $@: Message to print
info() {
tmux select-pane -t ${PANE_INFO}
printf "${@}\n" >> $STATEBASE/log_pipe
printf "${@}\n" >> "${LOGFILE}"
echo "${@}" >> $STATEBASE/log_pipe
echo "${@}" >> "${LOGFILE}"
}
# info_n() - Highlight, print message to pane and to log file without newline
@ -47,13 +47,13 @@ info_n() {
# $@: Message to print
info_nolog() {
tmux select-pane -t ${PANE_INFO}
printf "${@}\n" >> $STATEBASE/log_pipe
echo "${@}" >> $STATEBASE/log_pipe
}
# log() - Print message to log file
# $@: Message to print
log() {
printf "${@}\n" >> "${LOGFILE}"
echo "${@}" >> "${LOGFILE}"
}
# info_nolog_n() - Send message to pane without highlighting it, without newline
@ -664,7 +664,7 @@ pause_continue() {
# run_term() - Start tmux session, running entry point, with recording if needed
run_term() {
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL"
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG"
if [ ${CI} -eq 1 ]; then
printf '\e[8;50;240t'

View file

@ -33,15 +33,10 @@
#define die(...) \
do { \
fprintf(stderr, "nstool: " __VA_ARGS__); \
fprintf(stderr, __VA_ARGS__); \
exit(1); \
} while (0)
#define err(...) \
do { \
fprintf(stderr, "nstool: " __VA_ARGS__); \
} while (0)
struct ns_type {
int flag;
const char *name;
@ -161,9 +156,6 @@ static int connect_ctl(const char *sockpath, bool wait,
static void cmd_hold(int argc, char *argv[])
{
struct sigaction sa = {
.sa_handler = SIG_IGN,
};
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
struct sockaddr_un addr;
const char *sockpath = argv[1];
@ -193,10 +185,6 @@ static void cmd_hold(int argc, char *argv[])
if (!getcwd(info.cwd, sizeof(info.cwd)))
die("getcwd(): %s\n", strerror(errno));
rc = sigaction(SIGPIPE, &sa, NULL);
if (rc)
die("sigaction(SIGPIPE): %s\n", strerror(errno));
do {
int afd = accept(fd, NULL, NULL);
char buf;
@ -205,21 +193,17 @@ static void cmd_hold(int argc, char *argv[])
die("accept(): %s\n", strerror(errno));
rc = write(afd, &info, sizeof(info));
if (rc < 0) {
err("holder write() to control socket: %s\n",
strerror(errno));
}
if (rc < 0)
die("write(): %s\n", strerror(errno));
if ((size_t)rc < sizeof(info))
err("holder short write() on control socket\n");
die("short write() on control socket\n");
rc = read(afd, &buf, sizeof(buf));
if (rc < 0) {
err("holder read() on control socket: %s\n",
strerror(errno));
}
if (rc < 0)
die("read(): %s\n", strerror(errno));
close(afd);
} while (rc <= 0);
} while (rc == 0);
unlink(sockpath);
}
@ -362,7 +346,7 @@ static int openns(const char *fmt, ...)
}
static pid_t sig_pid;
static void sig_propagate(int signum)
static void sig_handler(int signum)
{
int err;
@ -374,7 +358,7 @@ static void sig_propagate(int signum)
static void wait_for_child(pid_t pid)
{
struct sigaction sa = {
.sa_handler = sig_propagate,
.sa_handler = sig_handler,
.sa_flags = SA_RESETHAND,
};
int status, err;

View file

@ -49,8 +49,6 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ]
test DHCPv6: address
guest /sbin/dhclient -6 __IFNAME__
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR6__" = "__HOST_ADDR6__" ]

View file

@ -16,15 +16,13 @@ htools ip jq sipcalc grep cut
test Interface name
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
guest ip link set dev __IFNAME__ up
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest ip link set dev __IFNAME__ up && sleep 2
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
check [ -n "__IFNAME__" ]
test SLAAC: prefix
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
gout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
gout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]

View file

@ -52,8 +52,6 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ]
test DHCPv6: address
guest /sbin/dhclient -6 __IFNAME__
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
check [ "__ADDR6__" = "__HOST_ADDR6__" ]

View file

@ -32,7 +32,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10001
guestw
guest cmp test_big.bin /root/big.bin
test TCP/IPv4: host to ns (spliced): big transfer
test TCP/IPv4: host to ns: big transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
sleep 1
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
@ -90,7 +90,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10001
guestw
guest cmp test_small.bin /root/small.bin
test TCP/IPv4: host to ns (spliced): small transfer
test TCP/IPv4: host to ns: small transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
sleep 1
host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
@ -146,7 +146,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10001
guestw
guest cmp test_big.bin /root/big.bin
test TCP/IPv6: host to ns (spliced): big transfer
test TCP/IPv6: host to ns: big transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
sleep 1
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
@ -204,7 +204,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10001
guestw
guest cmp test_small.bin /root/small.bin
test TCP/IPv6: host to ns (spliced): small transfer
test TCP/IPv6: host to ns: small transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
sleep 1
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002

View file

@ -30,7 +30,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10001,shut-null
guestw
guest cmp test.bin /root/medium.bin
test UDP/IPv4: host to ns (recvmmsg/sendmmsg)
test UDP/IPv4: host to ns
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
sleep 1
host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
@ -88,7 +88,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10001,shut-null
guestw
guest cmp test.bin /root/medium.bin
test UDP/IPv6: host to ns (recvmmsg/sendmmsg)
test UDP/IPv6: host to ns
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
sleep 1
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null

View file

@ -35,8 +35,6 @@ check [ __MTU__ = 65520 ]
test DHCPv6: address
ns /sbin/dhclient -6 --no-pid __IFNAME__
# Wait for DAD to complete
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'

View file

@ -18,12 +18,11 @@ test Interface name
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
check [ -n "__IFNAME__" ]
ns ip link set dev __IFNAME__ up
# Wait for DAD to complete
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
sleep 2
test SLAAC: prefix
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
nsout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
nsout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]

View file

@ -19,8 +19,8 @@ set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
set TEMP_SMALL __STATEDIR__/test_small.bin
set TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin
test TCP/IPv4: host to ns (spliced): big transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
test TCP/IPv4: host to ns: big transfer
nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_BIG__,create,trunc
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
nsw
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
@ -38,8 +38,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
hostw
check cmp __BASEPATH__/big.bin __TEMP_BIG__
test TCP/IPv4: host to ns (spliced): small transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
test TCP/IPv4: host to ns: small transfer
nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_SMALL__,create,trunc
host socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
nsw
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
@ -57,8 +57,8 @@ ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
hostw
check cmp __BASEPATH__/small.bin __TEMP_SMALL__
test TCP/IPv6: host to ns (spliced): big transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
test TCP/IPv6: host to ns: big transfer
nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_BIG__,create,trunc
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
nsw
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
@ -77,8 +77,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
hostw
check cmp __BASEPATH__/big.bin __TEMP_BIG__
test TCP/IPv6: host to ns (spliced): small transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
test TCP/IPv6: host to ns: small transfer
nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_SMALL__,create,trunc
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
nsw
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__

View file

@ -17,8 +17,8 @@ htools dd socat ip jq
set TEMP __STATEDIR__/test.bin
set TEMP_NS __STATEDIR__/test_ns.bin
test UDP/IPv4: host to ns (recvmmsg/sendmmsg)
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
test UDP/IPv4: host to ns
nsb socat -u UDP4-LISTEN:10002,bind=127.0.0.1,null-eof OPEN:__TEMP_NS__,create,trunc
host socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
nsw
check cmp __BASEPATH__/medium.bin __TEMP_NS__
@ -37,8 +37,8 @@ ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
hostw
check cmp __BASEPATH__/medium.bin __TEMP__
test UDP/IPv6: host to ns (recvmmsg/sendmmsg)
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
test UDP/IPv6: host to ns
nsb socat -u UDP6-LISTEN:10002,bind=[::1],null-eof OPEN:__TEMP_NS__,create,trunc
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
nsw
check cmp __BASEPATH__/medium.bin __TEMP_NS__

View file

@ -116,8 +116,6 @@ iperf3k ns
# Reducing MTU below 1280 deconfigures IPv6, get our address back
guest dhclient -6 -x
guest dhclient -6 __IFNAME__
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
tl TCP RR latency over IPv4: guest to host
lat -

View file

@ -211,7 +211,7 @@ tr TCP throughput over IPv6: host to ns
iperf3s ns 10002
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
bw -
bw -
bw -

View file

@ -196,7 +196,7 @@ tr UDP throughput over IPv6: host to ns
iperf3s ns 10002
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472
bw __BW__ 0.3 0.5
iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972

View file

@ -38,9 +38,6 @@ TRACE=${TRACE:-0}
# If set, tell passt and pasta to take packet captures
PCAP=${PCAP:-0}
# Custom kernel to boot guests with, if given
KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"}
COMMIT="$(git log --oneline --no-decorate -1)"
. lib/util

View file

@ -36,13 +36,9 @@ check [ "__ADDR2__" = "__HOST_ADDR__" ]
test DHCPv6: addresses
# Link is up now, wait for DAD to complete
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
sleep 2
guest1 /sbin/dhclient -6 __IFNAME1__
guest2 /sbin/dhclient -6 __IFNAME2__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
g2out ADDR2_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME2__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
@ -52,33 +48,33 @@ check [ "__ADDR2_6__" = "__HOST_ADDR6__" ]
test TCP/IPv4: guest 1 > guest 2
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
sleep 1
guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
guest2w
sleep 1
g2out MSG2 cat msg
check [ "__MSG2__" = "Hello_from_guest_1" ]
test TCP/IPv6: guest 2 > guest 1
g2out GW2_6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
sleep 1
guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
guest1w
sleep 1
g1out MSG1 cat msg
check [ "__MSG1__" = "Hello_from_guest_2" ]
test UDP/IPv4: guest 1 > guest 2
guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc
sleep 1
guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004
guest2w
sleep 1
g2out MSG2 cat msg
check [ "__MSG2__" = "Hello_from_guest_1" ]
test UDP/IPv6: guest 2 > guest 1
guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc
sleep 1
guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001
guest1w
sleep 1
g1out MSG1 cat msg
check [ "__MSG1__" = "Hello_from_guest_2" ]

252
udp.c
View file

@ -169,11 +169,11 @@ udp_meta[UDP_MAX_FRAMES];
* @UDP_NUM_IOVS the number of entries in the iovec array
*/
enum udp_iov_idx {
UDP_IOV_TAP,
UDP_IOV_ETH,
UDP_IOV_IP,
UDP_IOV_PAYLOAD,
UDP_NUM_IOVS,
UDP_IOV_TAP = 0,
UDP_IOV_ETH = 1,
UDP_IOV_IP = 2,
UDP_IOV_PAYLOAD = 3,
UDP_NUM_IOVS
};
/* IOVs and msghdr arrays for receiving datagrams from sockets */
@ -298,13 +298,11 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
* @bp: Pointer to udp_payload_t to update
* @toside: Flowside for destination side
* @dlen: Length of UDP payload
* @no_udp_csum: Do not set UDP checksum
*
* Return: size of IPv4 payload (UDP header + data)
*/
static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen,
bool no_udp_csum)
const struct flowside *toside, size_t dlen)
{
const struct in_addr *src = inany_v4(&toside->oaddr);
const struct in_addr *dst = inany_v4(&toside->eaddr);
@ -321,33 +319,22 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
bp->uh.source = htons(toside->oport);
bp->uh.dest = htons(toside->eport);
bp->uh.len = htons(l4len);
if (no_udp_csum) {
bp->uh.check = 0;
} else {
const struct iovec iov = {
.iov_base = bp->data,
.iov_len = dlen
};
csum_udp4(&bp->uh, *src, *dst, &iov, 1, 0);
}
csum_udp4(&bp->uh, *src, *dst, bp->data, dlen);
return l4len;
}
/**
* udp_update_hdr6() - Update headers for one IPv6 datagram
* @ip6h: Pre-filled IPv6 header (except for payload_len and
* addresses)
* @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
* @bp: Pointer to udp_payload_t to update
* @toside: Flowside for destination side
* @dlen: Length of UDP payload
* @no_udp_csum: Do not set UDP checksum
*
* Return: size of IPv6 payload (UDP header + data)
*/
static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen,
bool no_udp_csum)
const struct flowside *toside, size_t dlen)
{
uint16_t l4len = dlen + sizeof(bp->uh);
@ -361,20 +348,7 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
bp->uh.source = htons(toside->oport);
bp->uh.dest = htons(toside->eport);
bp->uh.len = ip6h->payload_len;
if (no_udp_csum) {
/* 0 is an invalid checksum for UDP IPv6 and dropped by
* the kernel stack, even if the checksum is disabled by virtio
* flags. We need to put any non-zero value here.
*/
bp->uh.check = 0xffff;
} else {
const struct iovec iov = {
.iov_base = bp->data,
.iov_len = dlen
};
csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
&iov, 1, 0);
}
csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, bp->data, dlen);
return l4len;
}
@ -384,11 +358,9 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
* @mmh: Receiving mmsghdr array
* @idx: Index of the datagram to prepare
* @toside: Flowside for destination side
* @no_udp_csum: Do not set UDP checksum
*/
static void udp_tap_prepare(const struct mmsghdr *mmh,
unsigned idx, const struct flowside *toside,
bool no_udp_csum)
static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
const struct flowside *toside)
{
struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
struct udp_payload_t *bp = &udp_payload[idx];
@ -396,15 +368,13 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
size_t l4len;
if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len);
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
sizeof(udp6_eth_hdr));
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
} else {
l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len);
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
sizeof(udp4_eth_hdr));
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
@ -417,8 +387,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
* udp_sock_recverr() - Receive and clear an error from a socket
* @s: Socket to receive from
*
* Return: 1 if error received and processed, 0 if no more errors in queue, < 0
* if there was an error reading the queue
* Return: ee_errno, 0 on empty queue
*
* #syscalls recvmsg
*/
@ -439,16 +408,15 @@ static int udp_sock_recverr(int s)
rc = recvmsg(s, &mh, MSG_ERRQUEUE);
if (rc < 0) {
if (errno == EAGAIN || errno == EWOULDBLOCK)
return 0;
if (errno != EAGAIN && errno != EWOULDBLOCK)
err_perror("Failed to read error queue");
err_perror("UDP: Failed to read error queue");
return -1;
return 0;
}
if (!(mh.msg_flags & MSG_ERRQUEUE)) {
err("Missing MSG_ERRQUEUE flag reading error queue");
return -1;
return 0;
}
hdr = CMSG_FIRSTHDR(&mh);
@ -457,7 +425,7 @@ static int udp_sock_recverr(int s)
(hdr->cmsg_level == IPPROTO_IPV6 &&
hdr->cmsg_type == IPV6_RECVERR))) {
err("Unexpected cmsg reading error queue");
return -1;
return 0;
}
ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
@ -466,54 +434,7 @@ static int udp_sock_recverr(int s)
debug("%s error on UDP socket %i: %s",
str_ee_origin(ee), s, strerror(ee->ee_errno));
return 1;
}
/**
* udp_sock_errs() - Process errors on a socket
* @c: Execution context
* @s: Socket to receive from
* @events: epoll events bitmap
*
* Return: Number of errors handled, or < 0 if we have an unrecoverable error
*/
static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
{
unsigned n_err = 0;
socklen_t errlen;
int rc, err;
ASSERT(!c->no_udp);
if (!(events & EPOLLERR))
return 0; /* Nothing to do */
/* Empty the error queue */
while ((rc = udp_sock_recverr(s)) > 0)
n_err += rc;
if (rc < 0)
return -1; /* error reading error, unrecoverable */
errlen = sizeof(err);
if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 ||
errlen != sizeof(err)) {
err_perror("Error reading SO_ERROR");
return -1; /* error reading error, unrecoverable */
}
if (err) {
debug("Unqueued error on UDP socket %i: %s", s, strerror(err));
n_err++;
}
if (!n_err) {
/* EPOLLERR, but no errors to clear !? */
err("EPOLLERR event without reported errors on socket %i", s);
return -1; /* no way to clear, unrecoverable */
}
return n_err;
return ee->ee_errno;
}
/**
@ -521,14 +442,15 @@ static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
* @c: Execution context
* @s: Socket to receive from
* @events: epoll events bitmap
* @mmh mmsghdr array to receive into
* @mmh: mmsghdr array to receive into
* @recv_err: Set to last error in queue. If none: -1 on EPOLLERR, 0 otherwise
*
* Return: Number of datagrams received
* Return: count of datagrams received
*
* #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
*/
static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
struct mmsghdr *mmh)
struct mmsghdr *mmh, int *recv_err)
{
/* For not entirely clear reasons (data locality?) pasta gets better
* throughput if we receive tap datagrams one at a time. For small
@ -541,6 +463,17 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
ASSERT(!c->no_udp);
/* Clear any errors first */
if (events & EPOLLERR) {
bool found = false;
int ret;
while ((ret = udp_sock_recverr(s)))
found = true;
*recv_err = found ? ret : -1;
}
if (!(events & EPOLLIN))
return 0;
@ -566,16 +499,10 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now)
{
const socklen_t sasize = sizeof(udp_meta[0].s_in);
int recv_err = 0;
int n, i;
if (udp_sock_errs(c, ref.fd, events) < 0) {
err("UDP: Unrecoverable error on listening socket:"
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
/* FIXME: what now? close/re-open socket? */
return;
}
if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv, &recv_err)) <= 0)
return;
/* We divide datagrams into batches based on how we need to send them,
@ -595,8 +522,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
udp_splice_prepare(udp_mh_recv, i);
} else if (batchpif == PIF_TAP) {
udp_tap_prepare(udp_mh_recv, i,
flowside_at_sidx(batchsidx),
false);
flowside_at_sidx(batchsidx));
}
if (++i >= n)
@ -644,21 +570,51 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
const struct flowside *toside = flowside_at_sidx(tosidx);
struct udp_flow *uflow = udp_at_sidx(ref.flowside);
int from_s = uflow->s[ref.flowside.sidei];
uint8_t topif = pif_at_sidx(tosidx);
int n, i, from_s;
int recv_err = 0;
int n, i;
ASSERT(!c->no_udp && uflow);
from_s = uflow->s[ref.flowside.sidei];
n = udp_sock_recv(c, from_s, events, udp_mh_recv, &recv_err);
if (recv_err == -1) {
struct flow_common *f = &uflow->f;
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
const struct flowside *ini = &f->side[INISIDE];
const struct flowside *tgt = &f->side[TGTSIDE];
flow_err(uflow, "EPOLLERR without error queue, closing flow");
err("Last recorded errno was: %i (%s)", uflow->last_errno,
strerror(uflow->last_errno));
flow_log_(f, LOG_ERR,
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
ini->eport,
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
ini->oport,
pif_name(f->pif[TGTSIDE]),
inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)),
tgt->oport,
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
tgt->eport);
if (udp_sock_errs(c, from_s, events) < 0) {
flow_err(uflow, "Unrecoverable error on reply socket");
flow_err_details(uflow);
udp_flow_close(c, uflow);
return;
}
if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
if (recv_err) {
struct udp_flow *uflow = udp_at_sidx(udp_meta[0].tosidx);
uflow->last_errno = recv_err;
flow_err(uflow, "Recorded errno %i (%s)", recv_err,
strerror(recv_err));
}
if (n <= 0)
return;
flow_trace(uflow, "Received %d datagrams on reply socket", n);
@ -668,7 +624,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
if (pif_is_socket(topif))
udp_splice_prepare(udp_mh_recv, i);
else if (topif == PIF_TAP)
udp_tap_prepare(udp_mh_recv, i, toside, false);
udp_tap_prepare(udp_mh_recv, i, toside);
/* Restore sockaddr length clobbered by recvmsg() */
udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
}
@ -795,61 +751,69 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
* udp_sock_init() - Initialise listening sockets for a given port
* @c: Execution context
* @ns: In pasta mode, if set, bind with loopback address in namespace
* @af: Address family to select a specific IP version, or AF_UNSPEC
* @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order
*
* Return: 0 on (partial) success, negative error code on (complete) failure
*/
int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
const char *ifname, in_port_t port)
int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const void *addr, const char *ifname, in_port_t port)
{
union udp_listen_epoll_ref uref = {
.pif = ns ? PIF_SPLICE : PIF_HOST,
.port = port,
};
union udp_listen_epoll_ref uref = { .port = port };
int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
ASSERT(!c->no_udp);
if (!addr && c->ifi4 && c->ifi6 && !ns) {
if (ns)
uref.pif = PIF_SPLICE;
else
uref.pif = PIF_HOST;
if (af == AF_UNSPEC && c->ifi4 && c->ifi6) {
int s;
/* Attempt to get a dual stack socket */
s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
NULL, ifname, port, uref.u32);
if (!ns) {
s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
addr, ifname, port, uref.u32);
udp_splice_init[V4][port] = s < 0 ? -1 : s;
udp_splice_init[V6][port] = s < 0 ? -1 : s;
} else {
s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
&in4addr_loopback, ifname, port, uref.u32);
udp_splice_ns[V4][port] = s < 0 ? -1 : s;
udp_splice_ns[V6][port] = s < 0 ? -1 : s;
}
if (IN_INTERVAL(0, FD_REF_MAX, s))
return 0;
}
if ((!addr || inany_v4(addr)) && c->ifi4) {
if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
if (!ns) {
r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
addr ? addr : &inany_any4, ifname,
port, uref.u32);
r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
addr, ifname, port, uref.u32);
udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
} else {
r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
&inany_loopback4, ifname,
port, uref.u32);
r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
&in4addr_loopback,
ifname, port, uref.u32);
udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4;
}
}
if ((!addr || !inany_v4(addr)) && c->ifi6) {
if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
if (!ns) {
r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
addr ? addr : &inany_any6, ifname,
port, uref.u32);
r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
addr, ifname, port, uref.u32);
udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
} else {
r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
&inany_loopback6, ifname,
port, uref.u32);
r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
&in6addr_loopback,
ifname, port, uref.u32);
udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6;
}
}
@ -917,7 +881,7 @@ static void udp_port_rebind(struct ctx *c, bool outbound)
if ((c->ifi4 && socks[V4][port] == -1) ||
(c->ifi6 && socks[V6][port] == -1))
udp_sock_init(c, outbound, NULL, NULL, port);
udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port);
}
}

4
udp.h
View file

@ -16,8 +16,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now);
int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
const char *ifname, in_port_t port);
int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const void *addr, const char *ifname, in_port_t port);
int udp_init(struct ctx *c);
void udp_timer(struct ctx *c, const struct timespec *now);
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);

View file

@ -34,16 +34,13 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
return &flow->udp;
}
/*
/**
* udp_flow_close() - Close and clean up UDP flow
* @c: Execution context
* @uflow: UDP flow
*/
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
{
if (uflow->closed)
return; /* Nothing to do */
if (uflow->s[INISIDE] >= 0) {
/* The listening socket needs to stay in epoll */
close(uflow->s[INISIDE]);
@ -56,11 +53,12 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
close(uflow->s[TGTSIDE]);
uflow->s[TGTSIDE] = -1;
}
uflow->last_errno = 0;
flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
uflow->closed = true;
}
/**
@ -261,17 +259,6 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
return udp_flow_new(c, flow, -1, now);
}
/**
 * udp_flow_defer() - Deferred handling hook for a single UDP flow
 * @uflow:	Flow to inspect
 *
 * Only reports the flow's "closed" flag, it doesn't modify the flow.
 *
 * Return: true if the flow is marked closed (ready to free), false otherwise
 */
bool udp_flow_defer(const struct udp_flow *uflow)
{
	bool ready_to_free = uflow->closed;

	return ready_to_free;
}
/**
* udp_flow_timer() - Handler for timed events related to a given flow
* @c: Execution context

View file

@ -10,7 +10,6 @@
/**
* struct udp_flow - Descriptor for a flow of UDP packets
* @f: Generic flow information
* @closed: Flow is already closed
* @ts: Activity timestamp
* @s: Socket fd (or -1) for each side of the flow
*/
@ -18,9 +17,10 @@ struct udp_flow {
/* Must be first element */
struct flow_common f;
bool closed :1;
time_t ts;
int s[SIDES];
int last_errno;
};
struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
@ -33,7 +33,6 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
in_port_t srcport, in_port_t dstport,
const struct timespec *now);
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
bool udp_flow_defer(const struct udp_flow *uflow);
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now);

210
util.c
View file

@ -28,7 +28,6 @@
#include <linux/errqueue.h>
#include <getopt.h>
#include "linux_dep.h"
#include "util.h"
#include "iov.h"
#include "passt.h"
@ -53,7 +52,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
{
sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
union epoll_ref ref = { .type = type, .data = data };
bool freebind = false;
struct epoll_event ev;
int fd, y = 1, ret;
uint8_t proto;
@ -63,11 +61,8 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
case EPOLL_TYPE_TCP_LISTEN:
proto = IPPROTO_TCP;
socktype = SOCK_STREAM | SOCK_NONBLOCK;
freebind = c->freebind;
break;
case EPOLL_TYPE_UDP_LISTEN:
freebind = c->freebind;
/* fallthrough */
case EPOLL_TYPE_UDP_REPLY:
proto = IPPROTO_UDP;
socktype = SOCK_DGRAM | SOCK_NONBLOCK;
@ -132,18 +127,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
}
}
if (freebind) {
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND;
if (setsockopt(fd, level, opt, &y, sizeof(y))) {
err_perror("Failed to set %s on socket %i",
af == AF_INET ? "IP_FREEBIND"
: "IPV6_FREEBIND",
fd);
}
}
if (bind(fd, sa, sl) < 0) {
/* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports,
@ -174,6 +157,58 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return fd;
}
/**
 * sock_l4() - Build bind address for given L4, create socket via sock_l4_sa()
 * @c:		Execution context
 * @af:		Address family: AF_INET, AF_INET6, or AF_UNSPEC to request a
 *		dual-stack socket (only if DUAL_STACK_SOCKETS, without address)
 * @type:	epoll type
 * @bind_addr:	Address for binding (struct in_addr or struct in6_addr,
 *		depending on @af), NULL for any
 * @ifname:	Interface for binding, NULL for any
 * @port:	Port, host order
 * @data:	epoll reference portion for protocol handlers
 *
 * Return: newly created socket, negative error code on failure (-EINVAL for
 *	   unsupported @af, or AF_UNSPEC that can't be honoured)
 */
int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
	    const void *bind_addr, const char *ifname, uint16_t port,
	    uint32_t data)
{
	if (af == AF_INET) {
		/* Unset members (address, padding) are zero: INADDR_ANY */
		struct sockaddr_in sa4 = {
			.sin_family	= AF_INET,
			.sin_port	= htons(port),
		};

		if (bind_addr)
			sa4.sin_addr = *(const struct in_addr *)bind_addr;

		return sock_l4_sa(c, type, &sa4, sizeof(sa4), ifname,
				  false, data);
	}

	if (af == AF_UNSPEC) {
		/* Dual-stack request: needs support, and a wildcard bind */
		if (!DUAL_STACK_SOCKETS || bind_addr)
			return -EINVAL;
	} else if (af != AF_INET6) {
		return -EINVAL;
	}

	/* AF_INET6, or AF_UNSPEC served by an IPv6 socket address */
	struct sockaddr_in6 sa6 = {
		.sin6_family	= AF_INET6,
		.sin6_port	= htons(port),
		.sin6_flowinfo	= 0,
		.sin6_addr	= IN6ADDR_ANY_INIT,
		.sin6_scope_id	= 0,
	};

	if (bind_addr) {
		sa6.sin6_addr = *(const struct in6_addr *)bind_addr;

		/* Link-local addresses need the interface index as scope */
		if (IN6_IS_ADDR_LINKLOCAL(bind_addr))
			sa6.sin6_scope_id = c->ifi6;
	}

	/* Sixth argument is true only for an explicit AF_INET6 request */
	return sock_l4_sa(c, type, &sa6, sizeof(sa6), ifname,
			  af == AF_INET6, data);
}
/**
* sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
@ -184,8 +219,7 @@ void sock_probe_mem(struct ctx *c)
int v = INT_MAX / 2, s;
socklen_t sl;
s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
if (s < 0) {
if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
c->low_wmem = c->low_rmem = 1;
return;
}
@ -215,7 +249,7 @@ void sock_probe_mem(struct ctx *c)
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b)
{
if (a->tv_nsec < b->tv_nsec) {
return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 +
return (b->tv_nsec - a->tv_nsec) / 1000 +
(a->tv_sec - b->tv_sec - 1) * 1000000;
}
@ -409,20 +443,25 @@ void pidfile_write(int fd, pid_t pid)
}
/**
* output_file_open() - Open file for output, if needed
* @path: Path for output file
* @flags: Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC
* pidfile_open() - Open PID file if needed
* @path: Path for PID file, empty string if no PID file is requested
*
* Return: file descriptor on success, -1 on failure with errno set by open()
* Return: descriptor for PID file, -1 if path is empty, won't return on failure
*/
int output_file_open(const char *path, int flags)
int pidfile_open(const char *path)
{
/* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for
* it in the 'mode' argument if we have one
*/
return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags,
/* NOLINTNEXTLINE(android-cloexec-open) */
S_IRUSR | S_IWUSR);
int fd;
if (!*path)
return -1;
if ((fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC,
S_IRUSR | S_IWUSR)) < 0) {
perror("PID file open");
exit(EXIT_FAILURE);
}
return fd;
}
/**
@ -446,11 +485,16 @@ int __daemon(int pidfile_fd, int devnull_fd)
exit(EXIT_SUCCESS);
}
if (setsid() < 0 ||
dup2(devnull_fd, STDIN_FILENO) < 0 ||
dup2(devnull_fd, STDOUT_FILENO) < 0 ||
dup2(devnull_fd, STDERR_FILENO) < 0 ||
close(devnull_fd))
errno = 0;
setsid();
dup2(devnull_fd, STDIN_FILENO);
dup2(devnull_fd, STDOUT_FILENO);
dup2(devnull_fd, STDERR_FILENO);
close(devnull_fd);
if (errno)
exit(EXIT_FAILURE);
return 0;
@ -538,36 +582,6 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
#endif
}
/* write_all_buf() - write all of a buffer to an fd
 * @fd:		File descriptor
 * @buf:	Pointer to base of buffer
 * @len:	Length of buffer
 *
 * Keeps calling write() until every byte of @buf has been accepted. Short
 * writes advance through the buffer, writes interrupted by a signal (EINTR)
 * are retried with the same arguments.
 *
 * Return: 0 on success, -1 on error (with errno set)
 *
 * #syscalls write
 */
int write_all_buf(int fd, const void *buf, size_t len)
{
	const char *cursor = buf;
	const char *end = cursor + len;

	while (cursor < end) {
		ssize_t n = write(fd, cursor, (size_t)(end - cursor));

		if (n >= 0) {
			/* Possibly short write: skip what was accepted */
			cursor += n;
			continue;
		}

		/* Only an interrupted call is worth retrying */
		if (errno != EINTR)
			return -1;
	}

	return 0;
}
/* write_remainder() - write the tail of an IO vector to an fd
* @fd: File descriptor
* @iov: IO vector
@ -576,30 +590,28 @@ int write_all_buf(int fd, const void *buf, size_t len)
*
* Return: 0 on success, -1 on error (with errno set)
*
* #syscalls writev
* #syscalls write writev
*/
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
{
size_t i = 0, offset;
size_t offset, i;
while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) {
while ((i = iov_skip_bytes(iov, iovcnt, skip, &offset)) < iovcnt) {
ssize_t rc;
if (offset) {
/* Write the remainder of the partially written buffer */
if (write_all_buf(fd, (char *)iov[i].iov_base + offset,
iov[i].iov_len - offset) < 0)
return -1;
i++;
rc = write(fd, (char *)iov[i].iov_base + offset,
iov[i].iov_len - offset);
} else {
rc = writev(fd, &iov[i], iovcnt - i);
}
/* Write as much of the remaining whole buffers as we can */
rc = writev(fd, &iov[i], iovcnt - i);
if (rc < 0)
return -1;
skip = rc;
skip += rc;
}
return 0;
}
@ -738,48 +750,6 @@ void close_open_files(int argc, char **argv)
rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE);
}
if (rc) {
if (errno == ENOSYS || errno == EINVAL) {
/* This probably means close_range() or the
* CLOSE_RANGE_UNSHARE flag is not supported by the
* kernel. Not much we can do here except carry on and
* hope for the best.
*/
warn(
"Can't use close_range() to ensure no files leaked by parent");
} else {
if (rc)
die_perror("Failed to close files leaked by parent");
}
}
}
/**
 * snprintf_check() - Format into a buffer, reporting truncation and errors
 * @str:	Output buffer
 * @size:	Maximum size to write to @str
 * @format:	Message
 *
 * Thin wrapper around vsnprintf(): a negative return from it is reported
 * with errno set to EIO, output that doesn't fit in @size bytes (including
 * the terminating NUL) with errno set to ENOBUFS.
 *
 * Return: false on success, true on truncation or error, sets errno on failure
 */
bool snprintf_check(char *str, size_t size, const char *format, ...)
{
	va_list args;
	int n;

	va_start(args, format);
	n = vsnprintf(str, size, format, args);
	va_end(args);

	/* Success: no formatting error and everything fit, NUL included */
	if (n >= 0 && (size_t)n < size)
		return false;

	errno = (n < 0) ? EIO : ENOBUFS;
	return true;
}

54
util.h
View file

@ -11,12 +11,12 @@
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/close_range.h>
#include "log.h"
@ -67,15 +67,6 @@
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#ifdef CPPCHECK_6936
/* Some cppcheck versions get confused by aborts inside a loop, causing
* it to give false positive uninitialised variable warnings later in
* the function, because it doesn't realise the non-initialising path
* already exited. See https://trac.cppcheck.net/ticket/13227
*/
#define ASSERT(expr) \
((expr) ? (void)0 : abort())
#else
#define ASSERT(expr) \
do { \
if (!(expr)) { \
@ -87,7 +78,6 @@
abort(); \
} \
} while (0)
#endif
#ifdef P_tmpdir
#define TMPDIR P_tmpdir
@ -101,9 +91,6 @@
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
#define foreach(item, array) \
for ((item) = (array); (item) - (array) < ARRAY_SIZE(array); (item)++)
#define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b))
#define FD_PROTO(x, proto) \
(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))
@ -144,7 +131,7 @@ static inline uint32_t ntohl_unaligned(const void *p)
return ntohl(val);
}
#define NS_FN_STACK_SIZE (1024 * 1024) /* 1MiB */
#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8)
int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
void *arg);
#define NS_CALL(fn, arg) \
@ -157,9 +144,9 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
(void *)(arg)); \
} while (0)
#define RCVBUF_BIG (2ULL * 1024 * 1024)
#define SNDBUF_BIG (4ULL * 1024 * 1024)
#define SNDBUF_SMALL (128ULL * 1024)
#define RCVBUF_BIG (2UL * 1024 * 1024)
#define SNDBUF_BIG (4UL * 1024 * 1024)
#define SNDBUF_SMALL (128UL * 1024)
#include <net/if.h>
#include <limits.h>
@ -170,9 +157,33 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
struct ctx;
/* ffsl() - Weak definition built on the compiler builtin: a non-weak ffsl()
 * available at link time takes precedence over this one.
 */
__attribute__ ((weak))
/* cppcheck-suppress funcArgNamesDifferent */
int ffsl(long int i)
{
	return __builtin_ffsl(i);
}
/* close_range() definitions, selected on whether the headers included so far
 * define CLOSE_RANGE_UNSHARE.
 */
#ifdef CLOSE_RANGE_UNSHARE /* Linux kernel >= 5.9 */
/* glibc < 2.34 and musl as of 1.2.5 need these */
#ifndef SYS_close_range
#define SYS_close_range 436
#endif
/* Weak definition issuing the raw system call: a non-weak close_range()
 * available at link time takes precedence over this one. The return value is
 * whatever syscall() returns.
 */
__attribute__ ((weak))
/* cppcheck-suppress funcArgNamesDifferent */
int close_range(unsigned int first, unsigned int last, int flags) {
return syscall(SYS_close_range, first, last, flags);
}
#else
/* No reasonable fallback option */
/* This stub ignores all its arguments and reports success without closing
 * anything.
 * NOTE(review): unlike the variant above, this definition is neither weak nor
 * static; if this header is included from more than one translation unit it
 * would presumably cause duplicate-symbol link errors -- confirm.
 */
/* cppcheck-suppress funcArgNamesDifferent */
int close_range(unsigned int first, unsigned int last, int flags) {
return 0;
}
#endif
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
const char *ifname, bool v6only, uint32_t data);
int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
const void *bind_addr, const char *ifname, uint16_t port,
uint32_t data);
void sock_probe_mem(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);
@ -184,15 +195,13 @@ char *line_read(char *buf, size_t len, int fd);
void ns_enter(const struct ctx *c);
bool ns_is_init(void);
int open_in_ns(const struct ctx *c, const char *path, int flags);
int output_file_open(const char *path, int flags);
int pidfile_open(const char *path);
void pidfile_write(int fd, pid_t pid);
int __daemon(int pidfile_fd, int devnull_fd);
int fls(unsigned long x);
int write_file(const char *path, const char *buf);
int write_all_buf(int fd, const void *buf, size_t len);
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
void close_open_files(int argc, char **argv);
bool snprintf_check(char *str, size_t size, const char *format, ...);
/**
* af_name() - Return name of an address family
@ -260,9 +269,6 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
return mod_sub(x, i, m) < mod_sub(j, i, m);
}
/* FPRINTF() intentionally silences cert-err33-c clang-tidy warnings */
#define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__)
/*
* Workarounds for https://github.com/llvm/llvm-project/issues/58992
*