Compare commits
396 commits
vhost-user
...
master
Author | SHA1 | Date | |
---|---|---|---|
|
0588163b1f | ||
|
14dd70e2b3 | ||
|
d64f257243 | ||
|
b84cd05098 | ||
|
58fa5508bd | ||
|
71869e2912 | ||
|
d4f09c9b96 | ||
|
87940f9aa7 | ||
|
1feb90fe62 | ||
|
5f5e814cfc | ||
|
78da088f7b | ||
|
9a0e544f05 | ||
|
910f4f9103 | ||
|
1699083f29 | ||
|
b456ee1b53 | ||
|
867db07fcf | ||
|
6f913b3af0 | ||
|
d8e05a3fe0 | ||
|
0d7b8201ed | ||
|
c5f4e4d146 | ||
|
1e76a19895 | ||
|
1d7cff3779 | ||
|
c560e2f65b | ||
|
13fc6d511e | ||
|
7917159005 | ||
|
93bce404c1 | ||
|
c938d8a93e | ||
|
f6b546c6e4 | ||
|
30b4f88167 | ||
|
b78e72da0b | ||
|
8346216c9a | ||
|
8f1b6a0ca6 | ||
|
5e93bcd8bf | ||
|
9afce0b45c | ||
|
d165d36a0c | ||
|
ee7d0b62a7 | ||
|
b1a607fba1 | ||
|
099ace64ce | ||
|
59fe34ee36 | ||
|
134b4d58b4 | ||
|
744247856d | ||
|
98efe7c2fd | ||
|
988a4d75f8 | ||
|
ba38e67cf4 | ||
|
2053c36dec | ||
|
5563d5f668 | ||
|
f43f7d5e89 | ||
|
e7fcd0c348 | ||
|
81143813a6 | ||
|
13f0291ede | ||
|
9e4615b40b | ||
|
149f457b23 | ||
|
9e5df350d6 | ||
|
b4dace8f46 | ||
|
58e6d68599 | ||
|
1fa421192c | ||
|
ef8a5161d0 | ||
|
53176ca91d | ||
|
75b9c0feb0 | ||
|
f9d677bff6 | ||
|
2d7f734c45 | ||
|
7612cb80fe | ||
|
b40880c157 | ||
|
ff63ac922a | ||
|
9d66df9a9a | ||
|
151dbe0d3d | ||
|
3d484aa370 | ||
|
e6548c6437 | ||
|
fd8334b25d | ||
|
72e7d3024b | ||
|
def8acdcd8 | ||
|
b55013b1a7 | ||
|
cbde4192ee | ||
|
b8d4fac6a2 | ||
|
204e77cd11 | ||
|
8f8c4d27eb | ||
|
4fe5f4e813 | ||
|
d836d9e345 | ||
|
bfc294b90d | ||
|
bb41901c71 | ||
|
265b2099c7 | ||
|
4aff6f9392 | ||
|
7d8804beb8 | ||
|
5ff5d55291 | ||
|
1f414ed8f0 | ||
|
6b38f07239 | ||
|
116bc8266d | ||
|
a33ecafbd9 | ||
|
d2a1dc744b | ||
|
11e29054fe | ||
|
49fc4e0414 | ||
|
63513e54f3 | ||
|
748ef4cd6e | ||
|
afedc2412e | ||
|
aff5a49b0e | ||
|
bd99f02a64 | ||
|
bd092ca421 | ||
|
88bfa3801e | ||
|
1166401c2f | ||
|
7ad9f9bd2b | ||
|
0ea60e5a77 | ||
|
38363964fc | ||
|
eedc81b6ef | ||
|
4a41dc58d6 | ||
|
1daf6f4615 | ||
|
712ca32353 | ||
|
e0be6bc2f4 | ||
|
c78b194001 | ||
|
620e19a1b4 | ||
|
418feb37ec | ||
|
1d6142f362 | ||
|
f00ebda369 | ||
|
05453ea590 | ||
|
1a66806c18 | ||
|
6e9ecf5741 | ||
|
7291b70ba7 | ||
|
396307541e | ||
|
d6817b3930 | ||
|
34be8eeb38 | ||
|
aded2b671c | ||
|
2aea1da143 | ||
|
57b7bd2a48 | ||
|
8436c0d61b | ||
|
e813a4df7d | ||
|
dbaaebbe00 | ||
|
935bd81936 | ||
|
90e83d50a9 | ||
|
356de97e43 | ||
|
4d8dd1fbe7 | ||
|
975cfa5f32 | ||
|
8d4baa4446 | ||
|
a42fb9c000 | ||
|
c9f0ec3227 | ||
|
57532f1ded | ||
|
0b25cac94e | ||
|
a6066f4e27 | ||
|
7c083ee41c | ||
|
1d10760c9f | ||
|
6852bd07cc | ||
|
c679894668 | ||
|
ceea52ca93 | ||
|
905ecd2b0b | ||
|
066e69986b | ||
|
e6feb5a892 | ||
|
32c386834d | ||
|
f4e9f26480 | ||
|
d6f0220731 | ||
|
74e508cf79 | ||
|
0c74068f56 | ||
|
8231ce54c3 | ||
|
b91d3373ac | ||
|
946206437a | ||
|
61c0b0d0f1 | ||
|
baba284912 | ||
|
c16141eda5 | ||
|
f6d5a52392 | ||
|
86bdd968ea | ||
|
fecb1b65b1 | ||
|
baccfb95ce | ||
|
09603cab28 | ||
|
755f9fd911 | ||
|
5ca61c2f34 | ||
|
a628cb93a7 | ||
|
2c7558dc43 | ||
|
b91bae1ded | ||
|
95569e4aa4 | ||
|
fbb0c9523e | ||
|
ee36266a55 | ||
|
3a082c4ecb | ||
|
031df332e9 | ||
|
e877f905e5 | ||
|
623ceb1f2b | ||
|
a5bbefa6fb | ||
|
f30ed68c52 | ||
|
0149d11cc5 | ||
|
8fae3b73cb | ||
|
f87b11c7be | ||
|
57a21d2df1 | ||
|
37e3b24d90 | ||
|
4684f60344 | ||
|
9e3f2355c4 | ||
|
a06db27c49 | ||
|
77c092ee5e | ||
|
e5c37ba0f4 | ||
|
327d9d482f | ||
|
2ce1d37831 | ||
|
e9a542321f | ||
|
becf81ab88 | ||
|
0ada84e3f8 | ||
|
4a333c88d7 | ||
|
6ff702f325 | ||
|
f72d35a78d | ||
|
606e0c7b95 | ||
|
f16f8f5bf6 | ||
|
1cd773081f | ||
|
13295583f8 | ||
|
d19b396f11 | ||
|
a09aeb4bd6 | ||
|
9cb6b50815 | ||
|
882599e180 | ||
|
d29fa0856e | ||
|
d89b3aa097 | ||
|
898f797174 | ||
|
b7ad19347f | ||
|
8126f7a660 | ||
|
e0647ad80c | ||
|
a45a7e9798 | ||
|
8abd06e9fa | ||
|
c000f2aba6 | ||
|
060f24e310 | ||
|
4cd753e65c | ||
|
781164e25b | ||
|
2faf6fcd8b | ||
|
2f40a01944 | ||
|
6d76278c21 | ||
|
5cffb1bf64 | ||
|
508adde342 | ||
|
acca4235c4 | ||
|
163a339214 | ||
|
f19a8f71f9 | ||
|
528a6517f8 | ||
|
e2ea10e246 | ||
|
52d45f1737 | ||
|
f9fe212b1f | ||
|
4e2d36e83f | ||
|
8012f5ff55 | ||
|
ba74b1fea1 | ||
|
01e5611ec3 | ||
|
66a02c9f7c | ||
|
5235c47c79 | ||
|
71d7985188 | ||
|
9b125e7776 | ||
|
2fa91ee391 | ||
|
6bd8283bf9 | ||
|
4e1f850f61 | ||
|
272d1d033c | ||
|
f79c42317f | ||
|
a740e16fd1 | ||
|
e63d281871 | ||
|
8bd57bf25b | ||
|
ec2691a12e | ||
|
299c407501 | ||
|
be0214cca6 | ||
|
69e5393c37 | ||
|
c6c61a9e1a | ||
|
55aff45bc1 | ||
|
9f9b15f949 | ||
|
fbd78b6f3e | ||
|
f62c33d85f | ||
|
8f8eb73482 | ||
|
74c1c5efcf | ||
|
b625ed5fee | ||
|
403a7c14a0 | ||
|
21ee1eb2de | ||
|
1ee2ecade3 | ||
|
054697598f | ||
|
c66f0341d9 | ||
|
e7323e515a | ||
|
dba7f0f5ce | ||
|
92a22fef93 | ||
|
c1140df889 | ||
|
afd9cdc9bb | ||
|
8c2f24a560 | ||
|
bca0fefa32 | ||
|
b74801645c | ||
|
65923ba798 | ||
|
62de6140d9 | ||
|
1544a43863 | ||
|
f301bb18b5 | ||
|
450a6131be | ||
|
54a9d3801b | ||
|
020ff7a40e | ||
|
7e87bd98ac | ||
|
ff57f8ddc6 | ||
|
63db7dcdbf | ||
|
523fbc5af7 | ||
|
4070bac7a4 | ||
|
7290335b14 | ||
|
0c335d751a | ||
|
377b666dc9 | ||
|
e7ac995217 | ||
|
9ecf7fedc5 | ||
|
fba2b544b6 | ||
|
ec26fa013a | ||
|
d949667436 | ||
|
8a83b530fe | ||
|
ec416fdcc4 | ||
|
3f63743a65 | ||
|
f9e8ee0777 | ||
|
c919bbbdd3 | ||
|
bda80ef53f | ||
|
0e36fe1a43 | ||
|
7094b91d10 | ||
|
c80fa6a6bb | ||
|
d2afb4b625 | ||
|
b3aeb004ea | ||
|
7cb2088835 | ||
|
e651197b5c | ||
|
e84a01e94c | ||
|
765eb0bf16 | ||
|
0608ec42f2 | ||
|
c9b2413465 | ||
|
ba23b05545 | ||
|
57d8aa8ffe | ||
|
cbca08cd38 | ||
|
fcfb592adc | ||
|
d02bb6ca05 | ||
|
45b8632dcc | ||
|
3f917b326b | ||
|
cc801fb38f | ||
|
8a2accb847 | ||
|
43571852e6 | ||
|
0060acd11b | ||
|
a63199832a | ||
|
7a832a8a0e | ||
|
1a20370b36 | ||
|
3ff3a8a467 | ||
|
6cdc9fd51b | ||
|
80f7ff2996 | ||
|
29bd08ff0f | ||
|
26c71db332 | ||
|
623c2fd621 | ||
|
72884484b0 | ||
|
7e6a606c32 | ||
|
1ba76c9e8c | ||
|
d4598e1d18 | ||
|
6170688616 | ||
|
2d16946bac | ||
|
6c4d26a364 | ||
|
3f9bd867b5 | ||
|
fcd9308856 | ||
|
eea5d3ef2d | ||
|
3559899586 | ||
|
40f8b2976a | ||
|
68d1b0a152 | ||
|
5566386f5f | ||
|
9e22c53aa9 | ||
|
1095a7b0c9 | ||
|
5d37dab012 | ||
|
34fb381b5a | ||
|
c27ca91564 | ||
|
76e32022c4 | ||
|
d03c4e2020 | ||
|
bfc83b54c4 | ||
|
ff2ff2fbca | ||
|
2681366966 | ||
|
0804fdbc28 | ||
|
95601237ef | ||
|
27f1c762b1 | ||
|
eca8baa028 | ||
|
ee338a256e | ||
|
5d5208b67d | ||
|
954589b64b | ||
|
489b28e216 | ||
|
f9fe3ae5dd | ||
|
e8b78217bb | ||
|
ef2cb13b49 | ||
|
97e8b33f87 | ||
|
67a6258918 | ||
|
f4e38b5cd2 | ||
|
88c2f08eba | ||
|
100919ce74 | ||
|
dc7b7f28b7 | ||
|
bbea2752f6 | ||
|
4988e2b406 | ||
|
5894a245b9 | ||
|
16c2d8da0d | ||
|
99355e25b9 | ||
|
71dd405460 | ||
|
639fdf06ed | ||
|
d35bcbee90 | ||
|
615d370ca2 | ||
|
d989eae308 | ||
|
f919dc7a4b | ||
|
f00b153414 | ||
|
d3eb0d7b59 | ||
|
4db947d17c | ||
|
1ebe787fe4 | ||
|
2d0e0084b6 | ||
|
f67238aa86 | ||
|
3fe9878db7 | ||
|
0761f29a14 | ||
|
4d05ba2c58 | ||
|
43881636c2 | ||
|
b299942bbd | ||
|
413c15988e | ||
|
ae69838db0 | ||
|
b0419d150a | ||
|
8a842e03cd | ||
|
76571ae869 | ||
|
205b140dec | ||
|
860d2764dd | ||
|
4779dfe12f | ||
|
02cbdb0b86 | ||
|
3af5e9fdba | ||
|
383a6f67e5 |
110 changed files with 8454 additions and 4930 deletions
126
.clang-format
Normal file
126
.clang-format
Normal file
|
@ -0,0 +1,126 @@
|
||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
#
|
||||||
|
# clang-format configuration file. Intended for clang-format >= 11.
|
||||||
|
#
|
||||||
|
# For more information, see:
|
||||||
|
#
|
||||||
|
# Documentation/dev-tools/clang-format.rst
|
||||||
|
# https://clang.llvm.org/docs/ClangFormat.html
|
||||||
|
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
|
||||||
|
#
|
||||||
|
---
|
||||||
|
AccessModifierOffset: -4
|
||||||
|
AlignAfterOpenBracket: Align
|
||||||
|
AlignConsecutiveAssignments: false
|
||||||
|
AlignConsecutiveDeclarations: false
|
||||||
|
AlignEscapedNewlines: Left
|
||||||
|
AlignOperands: true
|
||||||
|
AlignTrailingComments: false
|
||||||
|
AllowAllParametersOfDeclarationOnNextLine: false
|
||||||
|
AllowShortBlocksOnASingleLine: false
|
||||||
|
AllowShortCaseLabelsOnASingleLine: false
|
||||||
|
AllowShortFunctionsOnASingleLine: None
|
||||||
|
AllowShortIfStatementsOnASingleLine: false
|
||||||
|
AllowShortLoopsOnASingleLine: false
|
||||||
|
AlwaysBreakAfterDefinitionReturnType: None
|
||||||
|
AlwaysBreakAfterReturnType: None
|
||||||
|
AlwaysBreakBeforeMultilineStrings: false
|
||||||
|
AlwaysBreakTemplateDeclarations: false
|
||||||
|
BinPackArguments: true
|
||||||
|
BinPackParameters: true
|
||||||
|
BraceWrapping:
|
||||||
|
AfterClass: false
|
||||||
|
AfterControlStatement: false
|
||||||
|
AfterEnum: false
|
||||||
|
AfterFunction: true
|
||||||
|
AfterNamespace: true
|
||||||
|
AfterObjCDeclaration: false
|
||||||
|
AfterStruct: false
|
||||||
|
AfterUnion: false
|
||||||
|
AfterExternBlock: false
|
||||||
|
BeforeCatch: false
|
||||||
|
BeforeElse: false
|
||||||
|
IndentBraces: false
|
||||||
|
SplitEmptyFunction: true
|
||||||
|
SplitEmptyRecord: true
|
||||||
|
SplitEmptyNamespace: true
|
||||||
|
BreakBeforeBinaryOperators: None
|
||||||
|
BreakBeforeBraces: Custom
|
||||||
|
BreakBeforeInheritanceComma: false
|
||||||
|
BreakBeforeTernaryOperators: false
|
||||||
|
BreakConstructorInitializersBeforeComma: false
|
||||||
|
BreakConstructorInitializers: BeforeComma
|
||||||
|
BreakAfterJavaFieldAnnotations: false
|
||||||
|
BreakStringLiterals: false
|
||||||
|
ColumnLimit: 80
|
||||||
|
CommentPragmas: '^ IWYU pragma:'
|
||||||
|
CompactNamespaces: false
|
||||||
|
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||||
|
ConstructorInitializerIndentWidth: 8
|
||||||
|
ContinuationIndentWidth: 8
|
||||||
|
Cpp11BracedListStyle: false
|
||||||
|
DerivePointerAlignment: false
|
||||||
|
DisableFormat: false
|
||||||
|
ExperimentalAutoDetectBinPacking: false
|
||||||
|
FixNamespaceComments: false
|
||||||
|
|
||||||
|
# Taken from:
|
||||||
|
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
|
||||||
|
# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
|
||||||
|
# | LC_ALL=C sort -u
|
||||||
|
ForEachMacros:
|
||||||
|
- 'for_each_nst'
|
||||||
|
|
||||||
|
IncludeBlocks: Preserve
|
||||||
|
IncludeCategories:
|
||||||
|
- Regex: '.*'
|
||||||
|
Priority: 1
|
||||||
|
IncludeIsMainRegex: '(Test)?$'
|
||||||
|
IndentCaseLabels: false
|
||||||
|
IndentGotoLabels: false
|
||||||
|
IndentPPDirectives: None
|
||||||
|
IndentWidth: 8
|
||||||
|
IndentWrappedFunctionNames: false
|
||||||
|
JavaScriptQuotes: Leave
|
||||||
|
JavaScriptWrapImports: true
|
||||||
|
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||||
|
MacroBlockBegin: ''
|
||||||
|
MacroBlockEnd: ''
|
||||||
|
MaxEmptyLinesToKeep: 1
|
||||||
|
NamespaceIndentation: None
|
||||||
|
ObjCBinPackProtocolList: Auto
|
||||||
|
ObjCBlockIndentWidth: 8
|
||||||
|
ObjCSpaceAfterProperty: true
|
||||||
|
ObjCSpaceBeforeProtocolList: true
|
||||||
|
|
||||||
|
# Taken from git's rules
|
||||||
|
PenaltyBreakAssignment: 10
|
||||||
|
PenaltyBreakBeforeFirstCallParameter: 30
|
||||||
|
PenaltyBreakComment: 10
|
||||||
|
PenaltyBreakFirstLessLess: 0
|
||||||
|
PenaltyBreakString: 10
|
||||||
|
PenaltyExcessCharacter: 100
|
||||||
|
PenaltyReturnTypeOnItsOwnLine: 60
|
||||||
|
|
||||||
|
PointerAlignment: Right
|
||||||
|
ReflowComments: false
|
||||||
|
SortIncludes: false
|
||||||
|
SortUsingDeclarations: false
|
||||||
|
SpaceAfterCStyleCast: false
|
||||||
|
SpaceAfterTemplateKeyword: true
|
||||||
|
SpaceBeforeAssignmentOperators: true
|
||||||
|
SpaceBeforeCtorInitializerColon: true
|
||||||
|
SpaceBeforeInheritanceColon: true
|
||||||
|
SpaceBeforeParens: ControlStatementsExceptForEachMacros
|
||||||
|
SpaceBeforeRangeBasedForLoopColon: true
|
||||||
|
SpaceInEmptyParentheses: false
|
||||||
|
SpacesBeforeTrailingComments: 1
|
||||||
|
SpacesInAngles: false
|
||||||
|
SpacesInContainerLiterals: false
|
||||||
|
SpacesInCStyleCastParentheses: false
|
||||||
|
SpacesInParentheses: false
|
||||||
|
SpacesInSquareBrackets: false
|
||||||
|
Standard: Cpp03
|
||||||
|
TabWidth: 8
|
||||||
|
UseTab: Always
|
||||||
|
...
|
93
.clang-tidy
Normal file
93
.clang-tidy
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
---
|
||||||
|
Checks:
|
||||||
|
- "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*"
|
||||||
|
|
||||||
|
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
|
||||||
|
- "-clang-analyzer-valist.Uninitialized"
|
||||||
|
|
||||||
|
# Dubious value, would kill readability
|
||||||
|
- "-cppcoreguidelines-init-variables"
|
||||||
|
|
||||||
|
# Dubious value over the compiler's built-in warning. Would
|
||||||
|
# increase verbosity.
|
||||||
|
- "-bugprone-assignment-in-if-condition"
|
||||||
|
|
||||||
|
# Debatable whether these improve readability, right now it would look
|
||||||
|
# like a mess
|
||||||
|
- "-google-readability-braces-around-statements"
|
||||||
|
- "-hicpp-braces-around-statements"
|
||||||
|
- "-readability-braces-around-statements"
|
||||||
|
|
||||||
|
# TODO: in most cases they are justified, but probably not everywhere
|
||||||
|
#
|
||||||
|
- "-readability-magic-numbers"
|
||||||
|
- "-cppcoreguidelines-avoid-magic-numbers"
|
||||||
|
|
||||||
|
# TODO: this is Linux-only for the moment, nice to fix eventually
|
||||||
|
- "-llvmlibc-restrict-system-libc-headers"
|
||||||
|
|
||||||
|
# Those are needed for syscalls, epoll_wait flags, etc.
|
||||||
|
- "-hicpp-signed-bitwise"
|
||||||
|
|
||||||
|
# Probably not doable to implement this without plain memcpy(), memset()
|
||||||
|
- "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
|
||||||
|
|
||||||
|
# TODO: not really important, but nice to fix eventually
|
||||||
|
- "-llvm-include-order"
|
||||||
|
|
||||||
|
# Dubious value, would kill readability
|
||||||
|
- "-readability-isolate-declaration"
|
||||||
|
|
||||||
|
# TODO: nice to fix eventually
|
||||||
|
- "-bugprone-narrowing-conversions"
|
||||||
|
- "-cppcoreguidelines-narrowing-conversions"
|
||||||
|
|
||||||
|
# TODO: check, fix, and more in general constify wherever possible
|
||||||
|
- "-cppcoreguidelines-avoid-non-const-global-variables"
|
||||||
|
|
||||||
|
# TODO: check paths where it might make sense to improve performance
|
||||||
|
- "-altera-unroll-loops"
|
||||||
|
- "-altera-id-dependent-backward-branch"
|
||||||
|
|
||||||
|
# Not much can be done about them other than being careful
|
||||||
|
- "-bugprone-easily-swappable-parameters"
|
||||||
|
|
||||||
|
# TODO: split reported functions
|
||||||
|
- "-readability-function-cognitive-complexity"
|
||||||
|
|
||||||
|
# "Poor" alignment needed for structs reflecting message formats/headers
|
||||||
|
- "-altera-struct-pack-align"
|
||||||
|
|
||||||
|
# TODO: check again if multithreading is implemented
|
||||||
|
- "-concurrency-mt-unsafe"
|
||||||
|
|
||||||
|
# Complains about any identifier <3 characters, reasonable for
|
||||||
|
# globals, pointlessly verbose for locals and parameters.
|
||||||
|
- "-readability-identifier-length"
|
||||||
|
|
||||||
|
# Wants to include headers which *directly* provide the things
|
||||||
|
# we use. That sounds nice, but means it will often want an OS
|
||||||
|
# specific header instead of a mostly standard one, such as
|
||||||
|
# <linux/limits.h> instead of <limits.h>.
|
||||||
|
- "-misc-include-cleaner"
|
||||||
|
|
||||||
|
# Want to replace all #defines of integers with enums. Kind of
|
||||||
|
# makes sense when those defines form an enum-like set, but
|
||||||
|
# weird for cases like standalone constants, and causes other
|
||||||
|
# awkwardness for a bunch of cases we use
|
||||||
|
- "-cppcoreguidelines-macro-to-enum"
|
||||||
|
|
||||||
|
# It's been a couple of centuries since multiplication has been granted
|
||||||
|
# precedence over addition in modern mathematical notation. Adding
|
||||||
|
# parentheses to reinforce that certainly won't improve readability.
|
||||||
|
- "-readability-math-missing-parentheses"
|
||||||
|
WarningsAsErrors: "*"
|
||||||
|
HeaderFileExtensions:
|
||||||
|
- h
|
||||||
|
ImplementationFileExtensions:
|
||||||
|
- c
|
||||||
|
HeaderFilterRegex: ""
|
||||||
|
FormatStyle: none
|
||||||
|
CheckOptions:
|
||||||
|
bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false"
|
||||||
|
SystemHeaders: false
|
3
.clangd
Normal file
3
.clangd
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
CompileFlags:
|
||||||
|
# Don't try to interpret our headers as C++
|
||||||
|
Add: [-xc, -Wall]
|
176
Makefile
176
Makefile
|
@ -15,65 +15,41 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio
|
||||||
# the IPv6 socket API? (Linux does)
|
# the IPv6 socket API? (Linux does)
|
||||||
DUAL_STACK_SOCKETS := 1
|
DUAL_STACK_SOCKETS := 1
|
||||||
|
|
||||||
RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s')
|
|
||||||
ifeq ($(RLIMIT_STACK_VAL),unlimited)
|
|
||||||
RLIMIT_STACK_VAL := 1024
|
|
||||||
endif
|
|
||||||
|
|
||||||
TARGET ?= $(shell $(CC) -dumpmachine)
|
TARGET ?= $(shell $(CC) -dumpmachine)
|
||||||
# Get 'uname -m'-like architecture description for target
|
# Get 'uname -m'-like architecture description for target
|
||||||
TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
|
TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z])
|
||||||
TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
|
TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/')
|
||||||
|
|
||||||
AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/')
|
# On some systems enabling optimization also enables source fortification,
|
||||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/')
|
# automagically. Do not override it.
|
||||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/')
|
FORTIFY_FLAG :=
|
||||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/')
|
ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /dev/null; echo $$?),1)
|
||||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
|
FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
|
||||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
|
endif
|
||||||
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
|
|
||||||
|
|
||||||
FLAGS := -Wall -Wextra -Wno-format-zero-length
|
FLAGS := -Wall -Wextra -Wno-format-zero-length
|
||||||
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
|
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
|
||||||
FLAGS += -D_FORTIFY_SOURCE=2 -O2 -pie -fPIE
|
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
|
||||||
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
|
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
|
||||||
FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
|
|
||||||
FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
|
|
||||||
FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL)
|
|
||||||
FLAGS += -DARCH=\"$(TARGET_ARCH)\"
|
|
||||||
FLAGS += -DVERSION=\"$(VERSION)\"
|
FLAGS += -DVERSION=\"$(VERSION)\"
|
||||||
FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
|
FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
|
||||||
|
|
||||||
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
|
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
|
||||||
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
|
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
|
||||||
ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
|
ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
|
||||||
tcp_splice.c udp.c util.c
|
tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c
|
||||||
QRAP_SRCS = qrap.c
|
QRAP_SRCS = qrap.c
|
||||||
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
|
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
|
||||||
|
|
||||||
MANPAGES = passt.1 pasta.1 qrap.1
|
MANPAGES = passt.1 pasta.1 qrap.1
|
||||||
|
|
||||||
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
|
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
|
||||||
flow_table.h icmp.h inany.h iov.h ip.h isolation.h lineread.h log.h \
|
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
|
||||||
ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h siphash.h tap.h \
|
lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
|
||||||
tcp.h tcp_conn.h tcp_splice.h udp.h util.h
|
siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
|
||||||
|
udp.h udp_flow.h util.h
|
||||||
HEADERS = $(PASST_HEADERS) seccomp.h
|
HEADERS = $(PASST_HEADERS) seccomp.h
|
||||||
|
|
||||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
|
|
||||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
|
||||||
FLAGS += -DHAS_SND_WND
|
|
||||||
endif
|
|
||||||
|
|
||||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
|
|
||||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
|
||||||
FLAGS += -DHAS_BYTES_ACKED
|
|
||||||
endif
|
|
||||||
|
|
||||||
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 };
|
|
||||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
|
||||||
FLAGS += -DHAS_MIN_RTT
|
|
||||||
endif
|
|
||||||
|
|
||||||
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
|
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
|
||||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
||||||
FLAGS += -DHAS_GETRANDOM
|
FLAGS += -DHAS_GETRANDOM
|
||||||
|
@ -83,11 +59,6 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec
|
||||||
FLAGS += -fstack-protector-strong
|
FLAGS += -fstack-protector-strong
|
||||||
endif
|
endif
|
||||||
|
|
||||||
C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE;
|
|
||||||
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
|
|
||||||
EXTRA_SYSCALLS += fallocate
|
|
||||||
endif
|
|
||||||
|
|
||||||
prefix ?= /usr/local
|
prefix ?= /usr/local
|
||||||
exec_prefix ?= $(prefix)
|
exec_prefix ?= $(prefix)
|
||||||
bindir ?= $(exec_prefix)/bin
|
bindir ?= $(exec_prefix)/bin
|
||||||
|
@ -124,11 +95,11 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
|
||||||
ln -sf $< $@
|
ln -sf $< $@
|
||||||
|
|
||||||
qrap: $(QRAP_SRCS) passt.h
|
qrap: $(QRAP_SRCS) passt.h
|
||||||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
|
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)
|
||||||
|
|
||||||
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
|
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
|
||||||
getpid gettid kill clock_gettime mmap \
|
rt_sigreturn getpid gettid kill clock_gettime mmap \
|
||||||
munmap open unlink gettimeofday futex
|
mmap2 munmap open unlink gettimeofday futex
|
||||||
valgrind: FLAGS += -g -DVALGRIND
|
valgrind: FLAGS += -g -DVALGRIND
|
||||||
valgrind: all
|
valgrind: all
|
||||||
|
|
||||||
|
@ -188,111 +159,11 @@ docs: README.md
|
||||||
done < README.md; \
|
done < README.md; \
|
||||||
) > README.plain.md
|
) > README.plain.md
|
||||||
|
|
||||||
# Checkers currently disabled for clang-tidy:
|
clang-tidy: $(PASST_SRCS) $(HEADERS)
|
||||||
# - llvmlibc-restrict-system-libc-headers
|
clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
|
||||||
# TODO: this is Linux-only for the moment, nice to fix eventually
|
-DCLANG_TIDY_58992
|
||||||
#
|
|
||||||
# - bugprone-macro-parentheses
|
|
||||||
# - google-readability-braces-around-statements
|
|
||||||
# - hicpp-braces-around-statements
|
|
||||||
# - readability-braces-around-statements
|
|
||||||
# Debatable whether that improves readability, right now it would look
|
|
||||||
# like a mess
|
|
||||||
#
|
|
||||||
# - readability-magic-numbers
|
|
||||||
# - cppcoreguidelines-avoid-magic-numbers
|
|
||||||
# TODO: in most cases they are justified, but probably not everywhere
|
|
||||||
#
|
|
||||||
# - clang-analyzer-valist.Uninitialized
|
|
||||||
# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed
|
|
||||||
#
|
|
||||||
# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling
|
|
||||||
# Probably not doable to impement this without plain memcpy(), memset()
|
|
||||||
#
|
|
||||||
# - cppcoreguidelines-init-variables
|
|
||||||
# Dubious value, would kill readability
|
|
||||||
#
|
|
||||||
# - hicpp-signed-bitwise
|
|
||||||
# Those are needed for syscalls, epoll_wait flags, etc.
|
|
||||||
#
|
|
||||||
# - llvm-include-order
|
|
||||||
# TODO: not really important, but nice to fix eventually
|
|
||||||
#
|
|
||||||
# - readability-isolate-declaration
|
|
||||||
# Dubious value, would kill readability
|
|
||||||
#
|
|
||||||
# - bugprone-narrowing-conversions
|
|
||||||
# - cppcoreguidelines-narrowing-conversions
|
|
||||||
# TODO: nice to fix eventually
|
|
||||||
#
|
|
||||||
# - cppcoreguidelines-avoid-non-const-global-variables
|
|
||||||
# TODO: check, fix, and more in general constify wherever possible
|
|
||||||
#
|
|
||||||
# - altera-unroll-loops
|
|
||||||
# - altera-id-dependent-backward-branch
|
|
||||||
# TODO: check paths where it might make sense to improve performance
|
|
||||||
#
|
|
||||||
# - bugprone-easily-swappable-parameters
|
|
||||||
# Not much can be done about them other than being careful
|
|
||||||
#
|
|
||||||
# - readability-function-cognitive-complexity
|
|
||||||
# TODO: split reported functions
|
|
||||||
#
|
|
||||||
# - altera-struct-pack-align
|
|
||||||
# "Poor" alignment needed for structs reflecting message formats/headers
|
|
||||||
#
|
|
||||||
# - concurrency-mt-unsafe
|
|
||||||
# TODO: check again if multithreading is implemented
|
|
||||||
#
|
|
||||||
# - readability-identifier-length
|
|
||||||
# Complains about any identifier <3 characters, reasonable for
|
|
||||||
# globals, pointlessly verbose for locals and parameters.
|
|
||||||
#
|
|
||||||
# - bugprone-assignment-in-if-condition
|
|
||||||
# Dubious value over the compiler's built-in warning. Would
|
|
||||||
# increase verbosity.
|
|
||||||
#
|
|
||||||
# - misc-include-cleaner
|
|
||||||
# Wants to include headers which *directly* provide the things
|
|
||||||
# we use. That sounds nice, but means it will often want a OS
|
|
||||||
# specific header instead of a mostly standard one, such as
|
|
||||||
# <linux/limits.h> instead of <limits.h>.
|
|
||||||
|
|
||||||
clang-tidy: $(SRCS) $(HEADERS)
|
cppcheck: $(PASST_SRCS) $(HEADERS)
|
||||||
clang-tidy -checks=*,-modernize-*,\
|
|
||||||
-clang-analyzer-valist.Uninitialized,\
|
|
||||||
-cppcoreguidelines-init-variables,\
|
|
||||||
-bugprone-assignment-in-if-condition,\
|
|
||||||
-bugprone-macro-parentheses,\
|
|
||||||
-google-readability-braces-around-statements,\
|
|
||||||
-hicpp-braces-around-statements,\
|
|
||||||
-readability-braces-around-statements,\
|
|
||||||
-readability-magic-numbers,\
|
|
||||||
-llvmlibc-restrict-system-libc-headers,\
|
|
||||||
-hicpp-signed-bitwise,\
|
|
||||||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
|
|
||||||
-llvm-include-order,\
|
|
||||||
-cppcoreguidelines-avoid-magic-numbers,\
|
|
||||||
-readability-isolate-declaration,\
|
|
||||||
-bugprone-narrowing-conversions,\
|
|
||||||
-cppcoreguidelines-narrowing-conversions,\
|
|
||||||
-cppcoreguidelines-avoid-non-const-global-variables,\
|
|
||||||
-altera-unroll-loops,-altera-id-dependent-backward-branch,\
|
|
||||||
-bugprone-easily-swappable-parameters,\
|
|
||||||
-readability-function-cognitive-complexity,\
|
|
||||||
-altera-struct-pack-align,\
|
|
||||||
-concurrency-mt-unsafe,\
|
|
||||||
-readability-identifier-length,\
|
|
||||||
-misc-include-cleaner \
|
|
||||||
-config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \
|
|
||||||
--warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992
|
|
||||||
|
|
||||||
SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET))
|
|
||||||
ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1)
|
|
||||||
VER := $(shell $(CC) -dumpversion)
|
|
||||||
SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include
|
|
||||||
endif
|
|
||||||
cppcheck: $(SRCS) $(HEADERS)
|
|
||||||
if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
|
if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \
|
||||||
CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \
|
CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \
|
||||||
else \
|
else \
|
||||||
|
@ -301,11 +172,8 @@ cppcheck: $(SRCS) $(HEADERS)
|
||||||
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
|
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
|
||||||
--inconclusive --library=posix --quiet \
|
--inconclusive --library=posix --quiet \
|
||||||
$${CPPCHECK_EXHAUSTIVE} \
|
$${CPPCHECK_EXHAUSTIVE} \
|
||||||
$(SYSTEM_INCLUDES:%=-I%) \
|
|
||||||
$(SYSTEM_INCLUDES:%=--config-exclude=%) \
|
|
||||||
$(SYSTEM_INCLUDES:%=--suppress=*:%/*) \
|
|
||||||
$(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \
|
|
||||||
--inline-suppr \
|
--inline-suppr \
|
||||||
|
--suppress=missingIncludeSystem \
|
||||||
--suppress=unusedStructMember \
|
--suppress=unusedStructMember \
|
||||||
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
|
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \
|
||||||
.
|
$(PASST_SRCS) $(HEADERS)
|
||||||
|
|
10
README.md
10
README.md
|
@ -338,20 +338,24 @@ speeding up local connections, and usually requiring NAT. _pasta_:
|
||||||
[_slirp4netns_ replacement](/passt/tree/slirp4netns.sh)
|
[_slirp4netns_ replacement](/passt/tree/slirp4netns.sh)
|
||||||
* ✅ out-of-tree patch for
|
* ✅ out-of-tree patch for
|
||||||
[Kata Containers](/passt/tree/contrib/kata-containers) available
|
[Kata Containers](/passt/tree/contrib/kata-containers) available
|
||||||
* ⌚ drop-in replacement for VPNKit (rootless Docker)
|
* ✅ rootless Docker
|
||||||
|
[network back-end](https://docs.docker.com/engine/security/rootless/#networking-errors)
|
||||||
|
via moby/rootlesskit
|
||||||
|
|
||||||
### Availability
|
### Availability
|
||||||
* official packages for:
|
* official packages for:
|
||||||
|
* ✅ [Alpine Linux](https://pkgs.alpinelinux.org/packages?name=passt)
|
||||||
* ✅ [Arch Linux](https://archlinux.org/packages/extra/x86_64/passt/) ([aarch64](https://archlinuxarm.org/packages/aarch64/passt), [i486](https://www.archlinux32.org/packages/?q=passt))
|
* ✅ [Arch Linux](https://archlinux.org/packages/extra/x86_64/passt/) ([aarch64](https://archlinuxarm.org/packages/aarch64/passt), [i486](https://www.archlinux32.org/packages/?q=passt))
|
||||||
* ✅ [CentOS Stream](https://gitlab.com/redhat/centos-stream/rpms/passt)
|
* ✅ [CentOS Stream](https://gitlab.com/redhat/centos-stream/rpms/passt)
|
||||||
* ✅ [Debian](https://tracker.debian.org/pkg/passt)
|
* ✅ [Debian](https://tracker.debian.org/pkg/passt)
|
||||||
* ✅ [Fedora](https://src.fedoraproject.org/rpms/passt)
|
* ✅ [Fedora](https://src.fedoraproject.org/rpms/passt)
|
||||||
* ✅ [Gentoo](https://packages.gentoo.org/packages/net-misc/passt)
|
* ✅ [Gentoo](https://packages.gentoo.org/packages/net-misc/passt)
|
||||||
|
* ✅ [GNU Guix](https://packages.guix.gnu.org/packages/passt/)
|
||||||
|
* ✅ [OpenSUSE](https://build.opensuse.org/package/requests/Virtualization:containers/passt)
|
||||||
* ✅ [Ubuntu](https://launchpad.net/ubuntu/+source/passt)
|
* ✅ [Ubuntu](https://launchpad.net/ubuntu/+source/passt)
|
||||||
* ✅ [Void Linux](https://voidlinux.org/packages/?q=passt)
|
* ✅ [Void Linux](https://voidlinux.org/packages/?q=passt)
|
||||||
* unofficial packages for:
|
* unofficial packages for:
|
||||||
* ✅ [EPEL, Mageia](https://copr.fedorainfracloud.org/coprs/sbrivio/passt/)
|
* ✅ [EPEL, Mageia](https://copr.fedorainfracloud.org/coprs/sbrivio/passt/)
|
||||||
* 🛠 [openSUSE](https://build.opensuse.org/package/show/Virtualization:containers/passt)
|
|
||||||
* ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64
|
* ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64
|
||||||
static builds for other RPM-based distributions
|
static builds for other RPM-based distributions
|
||||||
* ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64
|
* ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64
|
||||||
|
@ -396,7 +400,7 @@ services:
|
||||||
and nameserver using SLAAC
|
and nameserver using SLAAC
|
||||||
* [DHCPv6 server](/passt/tree/dhcpv6.c): a simple
|
* [DHCPv6 server](/passt/tree/dhcpv6.c): a simple
|
||||||
implementation handing out one single IPv6 address to the guest or namespace,
|
implementation handing out one single IPv6 address to the guest or namespace,
|
||||||
namely, the the same address as the first one configured for the upstream host
|
namely, the same address as the first one configured for the upstream host
|
||||||
interface, and passing the nameservers configured on the host
|
interface, and passing the nameservers configured on the host
|
||||||
|
|
||||||
## Addresses
|
## Addresses
|
||||||
|
|
18
arch.c
18
arch.c
|
@ -18,6 +18,9 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include "log.h"
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* arch_avx2_exec() - Switch to AVX2 build if supported
|
* arch_avx2_exec() - Switch to AVX2 build if supported
|
||||||
* @argv: Arguments from command line
|
* @argv: Arguments from command line
|
||||||
|
@ -28,10 +31,8 @@ void arch_avx2_exec(char **argv)
|
||||||
char exe[PATH_MAX] = { 0 };
|
char exe[PATH_MAX] = { 0 };
|
||||||
const char *p;
|
const char *p;
|
||||||
|
|
||||||
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) {
|
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
|
||||||
perror("readlink /proc/self/exe");
|
die_perror("Failed to read own /proc/self/exe link");
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
p = strstr(exe, ".avx2");
|
p = strstr(exe, ".avx2");
|
||||||
if (p && strlen(p) == strlen(".avx2"))
|
if (p && strlen(p) == strlen(".avx2"))
|
||||||
|
@ -40,9 +41,12 @@ void arch_avx2_exec(char **argv)
|
||||||
if (__builtin_cpu_supports("avx2")) {
|
if (__builtin_cpu_supports("avx2")) {
|
||||||
char new_path[PATH_MAX + sizeof(".avx2")];
|
char new_path[PATH_MAX + sizeof(".avx2")];
|
||||||
|
|
||||||
snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe);
|
if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"),
|
||||||
execve(new_path, argv, environ);
|
"%s.avx2", exe))
|
||||||
perror("Can't run AVX2 build, using non-AVX2 version");
|
die_perror("Can't build AVX2 executable path");
|
||||||
|
|
||||||
|
execv(new_path, argv);
|
||||||
|
warn_perror("Can't run AVX2 build, using non-AVX2 version");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
20
arp.c
20
arp.c
|
@ -43,8 +43,7 @@ int arp(const struct ctx *c, const struct pool *p)
|
||||||
struct ethhdr *eh;
|
struct ethhdr *eh;
|
||||||
struct arphdr *ah;
|
struct arphdr *ah;
|
||||||
struct arpmsg *am;
|
struct arpmsg *am;
|
||||||
size_t len;
|
size_t l2len;
|
||||||
int ret;
|
|
||||||
|
|
||||||
eh = packet_get(p, 0, 0, sizeof(*eh), NULL);
|
eh = packet_get(p, 0, 0, sizeof(*eh), NULL);
|
||||||
ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL);
|
ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL);
|
||||||
|
@ -60,31 +59,28 @@ int arp(const struct ctx *c, const struct pool *p)
|
||||||
ah->ar_op != htons(ARPOP_REQUEST))
|
ah->ar_op != htons(ARPOP_REQUEST))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
/* Discard announcements (but not 0.0.0.0 "probes"): we might have the
|
/* Discard announcements, but not 0.0.0.0 "probes" */
|
||||||
* same IP address, hide that.
|
if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
|
||||||
*/
|
|
||||||
if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
|
|
||||||
!memcmp(am->sip, am->tip, sizeof(am->sip)))
|
!memcmp(am->sip, am->tip, sizeof(am->sip)))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
/* Don't resolve our own address, either. */
|
/* Don't resolve the guest's assigned address, either. */
|
||||||
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
|
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
ah->ar_op = htons(ARPOP_REPLY);
|
ah->ar_op = htons(ARPOP_REPLY);
|
||||||
memcpy(am->tha, am->sha, sizeof(am->tha));
|
memcpy(am->tha, am->sha, sizeof(am->tha));
|
||||||
memcpy(am->sha, c->mac, sizeof(am->sha));
|
memcpy(am->sha, c->our_tap_mac, sizeof(am->sha));
|
||||||
|
|
||||||
memcpy(swap, am->tip, sizeof(am->tip));
|
memcpy(swap, am->tip, sizeof(am->tip));
|
||||||
memcpy(am->tip, am->sip, sizeof(am->tip));
|
memcpy(am->tip, am->sip, sizeof(am->tip));
|
||||||
memcpy(am->sip, swap, sizeof(am->sip));
|
memcpy(am->sip, swap, sizeof(am->sip));
|
||||||
|
|
||||||
len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
|
l2len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
|
||||||
memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest));
|
memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest));
|
||||||
memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
|
memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
|
||||||
|
|
||||||
if ((ret = tap_send(c, eh, len)) < 0)
|
tap_send_single(c, eh, l2len);
|
||||||
warn("ARP: send: %s", strerror(ret));
|
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
96
checksum.c
96
checksum.c
|
@ -59,6 +59,7 @@
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "ip.h"
|
#include "ip.h"
|
||||||
#include "checksum.h"
|
#include "checksum.h"
|
||||||
|
#include "iov.h"
|
||||||
|
|
||||||
/* Checksums are optional for UDP over IPv4, so we usually just set
|
/* Checksums are optional for UDP over IPv4, so we usually just set
|
||||||
* them to 0. Change this to 1 to calculate real UDP over IPv4
|
* them to 0. Change this to 1 to calculate real UDP over IPv4
|
||||||
|
@ -116,19 +117,19 @@ uint16_t csum_fold(uint32_t sum)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* csum_ip4_header() - Calculate IPv4 header checksum
|
* csum_ip4_header() - Calculate IPv4 header checksum
|
||||||
* @tot_len: IPv4 payload length (data + IP header, network order)
|
* @l3len: IPv4 packet length (host order)
|
||||||
* @protocol: Protocol number (network order)
|
* @protocol: Protocol number
|
||||||
* @saddr: IPv4 source address (network order)
|
* @saddr: IPv4 source address
|
||||||
* @daddr: IPv4 destination address (network order)
|
* @daddr: IPv4 destination address
|
||||||
*
|
*
|
||||||
* Return: 16-bit folded sum of the IPv4 header
|
* Return: 16-bit folded sum of the IPv4 header
|
||||||
*/
|
*/
|
||||||
uint16_t csum_ip4_header(uint16_t tot_len, uint8_t protocol,
|
uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
|
||||||
struct in_addr saddr, struct in_addr daddr)
|
struct in_addr saddr, struct in_addr daddr)
|
||||||
{
|
{
|
||||||
uint32_t sum = L2_BUF_IP4_PSUM(protocol);
|
uint32_t sum = L2_BUF_IP4_PSUM(protocol);
|
||||||
|
|
||||||
sum += tot_len;
|
sum += htons(l3len);
|
||||||
sum += (saddr.s_addr >> 16) & 0xffff;
|
sum += (saddr.s_addr >> 16) & 0xffff;
|
||||||
sum += saddr.s_addr & 0xffff;
|
sum += saddr.s_addr & 0xffff;
|
||||||
sum += (daddr.s_addr >> 16) & 0xffff;
|
sum += (daddr.s_addr >> 16) & 0xffff;
|
||||||
|
@ -140,13 +141,13 @@ uint16_t csum_ip4_header(uint16_t tot_len, uint8_t protocol,
|
||||||
/**
|
/**
|
||||||
* proto_ipv4_header_psum() - Calculates the partial checksum of an
|
* proto_ipv4_header_psum() - Calculates the partial checksum of an
|
||||||
* IPv4 header for UDP or TCP
|
* IPv4 header for UDP or TCP
|
||||||
* @tot_len: IPv4 Payload length (host order)
|
* @l4len: IPv4 Payload length (host order)
|
||||||
* @proto: Protocol number (host order)
|
* @proto: Protocol number
|
||||||
* @saddr: Source address (network order)
|
* @saddr: Source address
|
||||||
* @daddr: Destination address (network order)
|
* @daddr: Destination address
|
||||||
* Returns: Partial checksum of the IPv4 header
|
* Returns: Partial checksum of the IPv4 header
|
||||||
*/
|
*/
|
||||||
uint32_t proto_ipv4_header_psum(uint16_t tot_len, uint8_t protocol,
|
uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
||||||
struct in_addr saddr, struct in_addr daddr)
|
struct in_addr saddr, struct in_addr daddr)
|
||||||
{
|
{
|
||||||
uint32_t psum = htons(protocol);
|
uint32_t psum = htons(protocol);
|
||||||
|
@ -155,7 +156,7 @@ uint32_t proto_ipv4_header_psum(uint16_t tot_len, uint8_t protocol,
|
||||||
psum += saddr.s_addr & 0xffff;
|
psum += saddr.s_addr & 0xffff;
|
||||||
psum += (daddr.s_addr >> 16) & 0xffff;
|
psum += (daddr.s_addr >> 16) & 0xffff;
|
||||||
psum += daddr.s_addr & 0xffff;
|
psum += daddr.s_addr & 0xffff;
|
||||||
psum += htons(tot_len);
|
psum += htons(l4len);
|
||||||
|
|
||||||
return psum;
|
return psum;
|
||||||
}
|
}
|
||||||
|
@ -165,22 +166,24 @@ uint32_t proto_ipv4_header_psum(uint16_t tot_len, uint8_t protocol,
|
||||||
* @udp4hr: UDP header, initialised apart from checksum
|
* @udp4hr: UDP header, initialised apart from checksum
|
||||||
* @saddr: IPv4 source address
|
* @saddr: IPv4 source address
|
||||||
* @daddr: IPv4 destination address
|
* @daddr: IPv4 destination address
|
||||||
* @payload: ICMPv4 packet payload
|
* @iov: Pointer to the array of IO vectors
|
||||||
* @len: Length of @payload (not including UDP)
|
* @iov_cnt: Length of the array
|
||||||
|
* @offset: UDP payload offset in the iovec array
|
||||||
*/
|
*/
|
||||||
void csum_udp4(struct udphdr *udp4hr,
|
void csum_udp4(struct udphdr *udp4hr,
|
||||||
struct in_addr saddr, struct in_addr daddr,
|
struct in_addr saddr, struct in_addr daddr,
|
||||||
const void *payload, size_t len)
|
const struct iovec *iov, int iov_cnt, size_t offset)
|
||||||
{
|
{
|
||||||
/* UDP checksums are optional, so don't bother */
|
/* UDP checksums are optional, so don't bother */
|
||||||
udp4hr->check = 0;
|
udp4hr->check = 0;
|
||||||
|
|
||||||
if (UDP4_REAL_CHECKSUMS) {
|
if (UDP4_REAL_CHECKSUMS) {
|
||||||
uint16_t tot_len = len + sizeof(struct udphdr);
|
uint16_t l4len = iov_size(iov, iov_cnt) - offset +
|
||||||
uint32_t psum = proto_ipv4_header_psum(tot_len, IPPROTO_UDP,
|
sizeof(struct udphdr);
|
||||||
|
uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
|
||||||
saddr, daddr);
|
saddr, daddr);
|
||||||
psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
|
psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum);
|
||||||
udp4hr->check = csum(payload, len, psum);
|
udp4hr->check = csum_iov(iov, iov_cnt, offset, psum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -188,9 +191,9 @@ void csum_udp4(struct udphdr *udp4hr,
|
||||||
* csum_icmp4() - Calculate and set checksum for an ICMP packet
|
* csum_icmp4() - Calculate and set checksum for an ICMP packet
|
||||||
* @icmp4hr: ICMP header, initialised apart from checksum
|
* @icmp4hr: ICMP header, initialised apart from checksum
|
||||||
* @payload: ICMP packet payload
|
* @payload: ICMP packet payload
|
||||||
* @len: Length of @payload (not including ICMP header)
|
* @dlen: Length of @payload (not including ICMP header)
|
||||||
*/
|
*/
|
||||||
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t len)
|
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen)
|
||||||
{
|
{
|
||||||
uint32_t psum;
|
uint32_t psum;
|
||||||
|
|
||||||
|
@ -199,16 +202,16 @@ void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t len)
|
||||||
/* Partial checksum for ICMP header alone */
|
/* Partial checksum for ICMP header alone */
|
||||||
psum = sum_16b(icmp4hr, sizeof(*icmp4hr));
|
psum = sum_16b(icmp4hr, sizeof(*icmp4hr));
|
||||||
|
|
||||||
icmp4hr->checksum = csum(payload, len, psum);
|
icmp4hr->checksum = csum(payload, dlen, psum);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* proto_ipv6_header_psum() - Calculates the partial checksum of an
|
* proto_ipv6_header_psum() - Calculates the partial checksum of an
|
||||||
* IPv6 header for UDP or TCP
|
* IPv6 header for UDP or TCP
|
||||||
* @payload_len: IPv6 payload length (host order)
|
* @payload_len: IPv6 payload length (host order)
|
||||||
* @proto: Protocol number (host order)
|
* @proto: Protocol number
|
||||||
* @saddr: Source address (network order)
|
* @saddr: Source address
|
||||||
* @daddr: Destination address (network order)
|
* @daddr: Destination address
|
||||||
* Returns: Partial checksum of the IPv6 header
|
* Returns: Partial checksum of the IPv6 header
|
||||||
*/
|
*/
|
||||||
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
||||||
|
@ -226,19 +229,24 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
||||||
/**
|
/**
|
||||||
* csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
|
* csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet
|
||||||
* @udp6hr: UDP header, initialised apart from checksum
|
* @udp6hr: UDP header, initialised apart from checksum
|
||||||
* @payload: UDP packet payload
|
* @saddr: Source address
|
||||||
* @len: Length of @payload (not including UDP header)
|
* @daddr: Destination address
|
||||||
|
* @iov: Pointer to the array of IO vectors
|
||||||
|
* @iov_cnt: Length of the array
|
||||||
|
* @offset: UDP payload offset in the iovec array
|
||||||
*/
|
*/
|
||||||
void csum_udp6(struct udphdr *udp6hr,
|
void csum_udp6(struct udphdr *udp6hr,
|
||||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||||
const void *payload, size_t len)
|
const struct iovec *iov, int iov_cnt, size_t offset)
|
||||||
{
|
{
|
||||||
uint32_t psum = proto_ipv6_header_psum(len + sizeof(struct udphdr),
|
uint16_t l4len = iov_size(iov, iov_cnt) - offset +
|
||||||
IPPROTO_UDP, saddr, daddr);
|
sizeof(struct udphdr);
|
||||||
|
uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
|
||||||
|
saddr, daddr);
|
||||||
udp6hr->check = 0;
|
udp6hr->check = 0;
|
||||||
|
|
||||||
psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
|
psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum);
|
||||||
udp6hr->check = csum(payload, len, psum);
|
udp6hr->check = csum_iov(iov, iov_cnt, offset, psum);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -247,21 +255,19 @@ void csum_udp6(struct udphdr *udp6hr,
|
||||||
* @saddr: IPv6 source address
|
* @saddr: IPv6 source address
|
||||||
* @daddr: IPv6 destination address
|
* @daddr: IPv6 destination address
|
||||||
* @payload: ICMP packet payload
|
* @payload: ICMP packet payload
|
||||||
* @len: Length of @payload (not including ICMPv6 header)
|
* @dlen: Length of @payload (not including ICMPv6 header)
|
||||||
*/
|
*/
|
||||||
void csum_icmp6(struct icmp6hdr *icmp6hr,
|
void csum_icmp6(struct icmp6hdr *icmp6hr,
|
||||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||||
const void *payload, size_t len)
|
const void *payload, size_t dlen)
|
||||||
{
|
{
|
||||||
/* Partial checksum for the pseudo-IPv6 header */
|
uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(*icmp6hr),
|
||||||
uint32_t psum = sum_16b(saddr, sizeof(*saddr)) +
|
IPPROTO_ICMPV6, saddr, daddr);
|
||||||
sum_16b(daddr, sizeof(*daddr)) +
|
|
||||||
htons(len + sizeof(*icmp6hr)) + htons(IPPROTO_ICMPV6);
|
|
||||||
|
|
||||||
icmp6hr->icmp6_cksum = 0;
|
icmp6hr->icmp6_cksum = 0;
|
||||||
/* Add in partial checksum for the ICMPv6 header alone */
|
/* Add in partial checksum for the ICMPv6 header alone */
|
||||||
psum += sum_16b(icmp6hr, sizeof(*icmp6hr));
|
psum += sum_16b(icmp6hr, sizeof(*icmp6hr));
|
||||||
icmp6hr->icmp6_cksum = csum(payload, len, psum);
|
icmp6hr->icmp6_cksum = csum(payload, dlen, psum);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
|
@ -499,16 +505,26 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
|
||||||
*
|
*
|
||||||
* @iov Pointer to the array of IO vectors
|
* @iov Pointer to the array of IO vectors
|
||||||
* @n Length of the array
|
* @n Length of the array
|
||||||
|
* @offset: Offset of the data to checksum within the full data length
|
||||||
* @init Initial 32-bit checksum, 0 for no pre-computed checksum
|
* @init Initial 32-bit checksum, 0 for no pre-computed checksum
|
||||||
*
|
*
|
||||||
* Return: 16-bit folded, complemented checksum
|
* Return: 16-bit folded, complemented checksum
|
||||||
*/
|
*/
|
||||||
/* cppcheck-suppress unusedFunction */
|
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
|
||||||
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
|
uint32_t init)
|
||||||
{
|
{
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
|
size_t first;
|
||||||
|
|
||||||
for (i = 0; i < n; i++)
|
i = iov_skip_bytes(iov, n, offset, &first);
|
||||||
|
if (i >= n)
|
||||||
|
return (uint16_t)~csum_fold(init);
|
||||||
|
|
||||||
|
init = csum_unfolded((char *)iov[i].iov_base + first,
|
||||||
|
iov[i].iov_len - first, init);
|
||||||
|
i++;
|
||||||
|
|
||||||
|
for (; i < n; i++)
|
||||||
init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
|
init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init);
|
||||||
|
|
||||||
return (uint16_t)~csum_fold(init);
|
return (uint16_t)~csum_fold(init);
|
||||||
|
|
15
checksum.h
15
checksum.h
|
@ -13,25 +13,26 @@ struct icmp6hdr;
|
||||||
uint32_t sum_16b(const void *buf, size_t len);
|
uint32_t sum_16b(const void *buf, size_t len);
|
||||||
uint16_t csum_fold(uint32_t sum);
|
uint16_t csum_fold(uint32_t sum);
|
||||||
uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);
|
uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);
|
||||||
uint16_t csum_ip4_header(uint16_t tot_len, uint8_t protocol,
|
uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
|
||||||
struct in_addr saddr, struct in_addr daddr);
|
struct in_addr saddr, struct in_addr daddr);
|
||||||
uint32_t proto_ipv4_header_psum(uint16_t tot_len, uint8_t protocol,
|
uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
||||||
struct in_addr saddr, struct in_addr daddr);
|
struct in_addr saddr, struct in_addr daddr);
|
||||||
void csum_udp4(struct udphdr *udp4hr,
|
void csum_udp4(struct udphdr *udp4hr,
|
||||||
struct in_addr saddr, struct in_addr daddr,
|
struct in_addr saddr, struct in_addr daddr,
|
||||||
const void *payload, size_t len);
|
const struct iovec *iov, int iov_cnt, size_t offset);
|
||||||
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t len);
|
void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen);
|
||||||
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
|
||||||
const struct in6_addr *saddr,
|
const struct in6_addr *saddr,
|
||||||
const struct in6_addr *daddr);
|
const struct in6_addr *daddr);
|
||||||
void csum_udp6(struct udphdr *udp6hr,
|
void csum_udp6(struct udphdr *udp6hr,
|
||||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||||
const void *payload, size_t len);
|
const struct iovec *iov, int iov_cnt, size_t offset);
|
||||||
void csum_icmp6(struct icmp6hdr *icmp6hr,
|
void csum_icmp6(struct icmp6hdr *icmp6hr,
|
||||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||||
const void *payload, size_t len);
|
const void *payload, size_t dlen);
|
||||||
uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
|
uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
|
||||||
uint16_t csum(const void *buf, size_t len, uint32_t init);
|
uint16_t csum(const void *buf, size_t len, uint32_t init);
|
||||||
uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init);
|
uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset,
|
||||||
|
uint32_t init);
|
||||||
|
|
||||||
#endif /* CHECKSUM_H */
|
#endif /* CHECKSUM_H */
|
||||||
|
|
|
@ -26,13 +26,16 @@
|
||||||
capability sys_ptrace,
|
capability sys_ptrace,
|
||||||
|
|
||||||
/ r, # isolate_prefork(), isolation.c
|
/ r, # isolate_prefork(), isolation.c
|
||||||
mount options=(rw, runbindable) /,
|
mount options=(rw, runbindable) -> /,
|
||||||
|
mount "" -> "/",
|
||||||
mount "" -> "/tmp/",
|
mount "" -> "/tmp/",
|
||||||
pivot_root "/tmp/" -> "/tmp/",
|
pivot_root "/tmp/" -> "/tmp/",
|
||||||
umount "/",
|
umount "/",
|
||||||
|
|
||||||
owner @{PROC}/@{pid}/uid_map r, # conf_ugid()
|
owner @{PROC}/@{pid}/uid_map r, # conf_ugid()
|
||||||
|
|
||||||
|
@{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral()
|
||||||
|
|
||||||
network netlink raw, # nl_sock_init_do(), netlink.c
|
network netlink raw, # nl_sock_init_do(), netlink.c
|
||||||
|
|
||||||
network inet stream, # tcp.c
|
network inet stream, # tcp.c
|
||||||
|
|
|
@ -27,8 +27,9 @@
|
||||||
@{PROC}/@{pid}/net/udp r,
|
@{PROC}/@{pid}/net/udp r,
|
||||||
@{PROC}/@{pid}/net/udp6 r,
|
@{PROC}/@{pid}/net/udp6 r,
|
||||||
|
|
||||||
@{run}/user/@{uid}/netns/* r, # pasta_open_ns(), pasta.c
|
@{run}/user/@{uid}/** rw, # pasta_open_ns()
|
||||||
|
|
||||||
|
@{PROC}/[0-9]*/ns/ r, # pasta_netns_quit_init(),
|
||||||
@{PROC}/[0-9]*/ns/net r, # pasta_wait_for_ns(),
|
@{PROC}/[0-9]*/ns/net r, # pasta_wait_for_ns(),
|
||||||
@{PROC}/[0-9]*/ns/user r, # conf_pasta_ns()
|
@{PROC}/[0-9]*/ns/user r, # conf_pasta_ns()
|
||||||
|
|
||||||
|
@ -42,3 +43,5 @@
|
||||||
/{usr/,}bin/** Ux,
|
/{usr/,}bin/** Ux,
|
||||||
|
|
||||||
/usr/bin/pasta.avx2 ix, # arch_avx2_exec(), arch.c
|
/usr/bin/pasta.avx2 ix, # arch_avx2_exec(), arch.c
|
||||||
|
|
||||||
|
ptrace r, # pasta_open_ns()
|
||||||
|
|
|
@ -19,9 +19,12 @@ profile passt /usr/bin/passt{,.avx2} {
|
||||||
include <abstractions/passt>
|
include <abstractions/passt>
|
||||||
|
|
||||||
# Alternatively: include <abstractions/user-tmp>
|
# Alternatively: include <abstractions/user-tmp>
|
||||||
owner /tmp/** w, # tap_sock_unix_init(), pcap(),
|
owner /tmp/** w, # tap_sock_unix_open(),
|
||||||
# write_pidfile(),
|
# tap_sock_unix_init(), pcap(),
|
||||||
|
# pidfile_open(),
|
||||||
|
# pidfile_write(),
|
||||||
# logfile_init()
|
# logfile_init()
|
||||||
|
|
||||||
owner @{HOME}/** w, # pcap(), write_pidfile()
|
owner @{HOME}/** w, # pcap(), pidfile_open(),
|
||||||
|
# pidfile_write()
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,9 +19,13 @@ profile pasta /usr/bin/pasta{,.avx2} flags=(attach_disconnected) {
|
||||||
include <abstractions/pasta>
|
include <abstractions/pasta>
|
||||||
|
|
||||||
# Alternatively: include <abstractions/user-tmp>
|
# Alternatively: include <abstractions/user-tmp>
|
||||||
owner /tmp/** w, # tap_sock_unix_init(), pcap(),
|
/tmp/** rw, # tap_sock_unix_open(),
|
||||||
# write_pidfile(),
|
# tap_sock_unix_init(), pcap(),
|
||||||
# logfile_init()
|
# pidfile_open(),
|
||||||
|
# pidfile_write(),
|
||||||
|
# logfile_init(),
|
||||||
|
# pasta_open_ns()
|
||||||
|
|
||||||
owner @{HOME}/** w, # pcap(), write_pidfile()
|
owner @{HOME}/** w, # pcap(), pidfile_open(),
|
||||||
|
# pidfile_write()
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@ Name: passt
|
||||||
Version: {{{ git_version }}}
|
Version: {{{ git_version }}}
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: User-mode networking daemons for virtual machines and namespaces
|
Summary: User-mode networking daemons for virtual machines and namespaces
|
||||||
License: GPLv2+ and BSD
|
License: GPL-2.0-or-later AND BSD-3-Clause
|
||||||
Group: System Environment/Daemons
|
Group: System Environment/Daemons
|
||||||
URL: https://passt.top/
|
URL: https://passt.top/
|
||||||
Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz
|
Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz
|
||||||
|
|
|
@ -29,7 +29,11 @@ function passt_git_changelog_entry {
|
||||||
[ -z "${__from}" ] && __from="$(git rev-list --max-parents=0 HEAD)"
|
[ -z "${__from}" ] && __from="$(git rev-list --max-parents=0 HEAD)"
|
||||||
|
|
||||||
__date="$(git log --pretty="format:%cI" "${__to}" -1)"
|
__date="$(git log --pretty="format:%cI" "${__to}" -1)"
|
||||||
__author="$(git log -1 --pretty="format:%an <%ae>" ${__to} -- contrib/fedora)"
|
__author="Stefano Brivio <sbrivio@redhat.com>"
|
||||||
|
# Use:
|
||||||
|
# __author="$(git log -1 --pretty="format:%an <%ae>" ${__to} -- contrib/fedora)"
|
||||||
|
# if you want the author of changelog entries to match the latest
|
||||||
|
# author for contrib/fedora
|
||||||
|
|
||||||
printf "* %s %s - %s\n" "$(date "+%a %b %e %Y" -d "${__date}")" "${__author}" "$(git_version "${__to}")-1"
|
printf "* %s %s - %s\n" "$(date "+%a %b %e %Y" -d "${__date}")" "${__author}" "$(git_version "${__to}")-1"
|
||||||
|
|
||||||
|
|
|
@ -50,6 +50,7 @@ require {
|
||||||
type passwd_file_t;
|
type passwd_file_t;
|
||||||
|
|
||||||
class netlink_route_socket { bind create nlmsg_read };
|
class netlink_route_socket { bind create nlmsg_read };
|
||||||
|
type sysctl_net_t;
|
||||||
|
|
||||||
class capability { sys_tty_config setuid setgid };
|
class capability { sys_tty_config setuid setgid };
|
||||||
class cap_userns { setpcap sys_admin sys_ptrace };
|
class cap_userns { setpcap sys_admin sys_ptrace };
|
||||||
|
@ -104,6 +105,8 @@ allow passt_t net_conf_t:lnk_file read;
|
||||||
allow passt_t tmp_t:sock_file { create unlink write };
|
allow passt_t tmp_t:sock_file { create unlink write };
|
||||||
allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
|
allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
|
||||||
kernel_search_network_sysctl(passt_t)
|
kernel_search_network_sysctl(passt_t)
|
||||||
|
allow passt_t sysctl_net_t:dir search;
|
||||||
|
allow passt_t sysctl_net_t:file { open read };
|
||||||
|
|
||||||
corenet_tcp_bind_all_nodes(passt_t)
|
corenet_tcp_bind_all_nodes(passt_t)
|
||||||
corenet_udp_bind_all_nodes(passt_t)
|
corenet_udp_bind_all_nodes(passt_t)
|
||||||
|
|
|
@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch };
|
||||||
allow pasta_t self:tun_socket create;
|
allow pasta_t self:tun_socket create;
|
||||||
allow pasta_t tun_tap_device_t:chr_file { ioctl open read write };
|
allow pasta_t tun_tap_device_t:chr_file { ioctl open read write };
|
||||||
allow pasta_t sysctl_net_t:dir search;
|
allow pasta_t sysctl_net_t:dir search;
|
||||||
allow pasta_t sysctl_net_t:file { open write };
|
allow pasta_t sysctl_net_t:file { open read write };
|
||||||
allow pasta_t kernel_t:system module_request;
|
allow pasta_t kernel_t:system module_request;
|
||||||
|
|
||||||
allow pasta_t nsfs_t:file read;
|
allow pasta_t nsfs_t:file read;
|
||||||
|
@ -211,3 +211,4 @@ allow pasta_t ifconfig_t:process { noatsecure rlimitinh siginh };
|
||||||
allow pasta_t netutils_t:process { noatsecure rlimitinh siginh };
|
allow pasta_t netutils_t:process { noatsecure rlimitinh siginh };
|
||||||
allow pasta_t ping_t:process { noatsecure rlimitinh siginh };
|
allow pasta_t ping_t:process { noatsecure rlimitinh siginh };
|
||||||
allow pasta_t user_tty_device_t:chr_file { append read write };
|
allow pasta_t user_tty_device_t:chr_file { append read write };
|
||||||
|
allow pasta_t user_devpts_t:chr_file { append read write };
|
||||||
|
|
25
dhcp.c
25
dhcp.c
|
@ -275,7 +275,8 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
|
||||||
*/
|
*/
|
||||||
int dhcp(const struct ctx *c, const struct pool *p)
|
int dhcp(const struct ctx *c, const struct pool *p)
|
||||||
{
|
{
|
||||||
size_t mlen, len, offset = 0, opt_len, opt_off = 0;
|
size_t mlen, dlen, offset = 0, opt_len, opt_off = 0;
|
||||||
|
char macstr[ETH_ADDRSTRLEN];
|
||||||
const struct ethhdr *eh;
|
const struct ethhdr *eh;
|
||||||
const struct iphdr *iph;
|
const struct iphdr *iph;
|
||||||
const struct udphdr *uh;
|
const struct udphdr *uh;
|
||||||
|
@ -340,26 +341,26 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
info(" from %02x:%02x:%02x:%02x:%02x:%02x",
|
info(" from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr)));
|
||||||
m->chaddr[0], m->chaddr[1], m->chaddr[2],
|
|
||||||
m->chaddr[3], m->chaddr[4], m->chaddr[5]);
|
|
||||||
|
|
||||||
m->yiaddr = c->ip4.addr;
|
m->yiaddr = c->ip4.addr;
|
||||||
mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len));
|
mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len));
|
||||||
memcpy(opts[1].s, &mask, sizeof(mask));
|
memcpy(opts[1].s, &mask, sizeof(mask));
|
||||||
memcpy(opts[3].s, &c->ip4.gw, sizeof(c->ip4.gw));
|
memcpy(opts[3].s, &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
|
||||||
memcpy(opts[54].s, &c->ip4.gw, sizeof(c->ip4.gw));
|
memcpy(opts[54].s, &c->ip4.our_tap_addr, sizeof(c->ip4.our_tap_addr));
|
||||||
|
|
||||||
/* If the gateway is not on the assigned subnet, send an option 121
|
/* If the gateway is not on the assigned subnet, send an option 121
|
||||||
* (Classless Static Routing) adding a dummy route to it.
|
* (Classless Static Routing) adding a dummy route to it.
|
||||||
*/
|
*/
|
||||||
if ((c->ip4.addr.s_addr & mask.s_addr)
|
if ((c->ip4.addr.s_addr & mask.s_addr)
|
||||||
!= (c->ip4.gw.s_addr & mask.s_addr)) {
|
!= (c->ip4.guest_gw.s_addr & mask.s_addr)) {
|
||||||
/* a.b.c.d/32:0.0.0.0, 0:a.b.c.d */
|
/* a.b.c.d/32:0.0.0.0, 0:a.b.c.d */
|
||||||
opts[121].slen = 14;
|
opts[121].slen = 14;
|
||||||
opts[121].s[0] = 32;
|
opts[121].s[0] = 32;
|
||||||
memcpy(opts[121].s + 1, &c->ip4.gw, sizeof(c->ip4.gw));
|
memcpy(opts[121].s + 1,
|
||||||
memcpy(opts[121].s + 10, &c->ip4.gw, sizeof(c->ip4.gw));
|
&c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
|
||||||
|
memcpy(opts[121].s + 10,
|
||||||
|
&c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c->mtu != -1) {
|
if (c->mtu != -1) {
|
||||||
|
@ -377,8 +378,8 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
||||||
if (!c->no_dhcp_dns_search)
|
if (!c->no_dhcp_dns_search)
|
||||||
opt_set_dns_search(c, sizeof(m->o));
|
opt_set_dns_search(c, sizeof(m->o));
|
||||||
|
|
||||||
len = offsetof(struct msg, o) + fill(m);
|
dlen = offsetof(struct msg, o) + fill(m);
|
||||||
tap_udp4_send(c, c->ip4.gw, 67, c->ip4.addr, 68, m, len);
|
tap_udp4_send(c, c->ip4.our_tap_addr, 67, c->ip4.addr, 68, m, dlen);
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
70
dhcpv6.c
70
dhcpv6.c
|
@ -296,45 +296,42 @@ static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
|
||||||
static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
|
static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
|
||||||
struct in6_addr *la)
|
struct in6_addr *la)
|
||||||
{
|
{
|
||||||
|
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
|
||||||
|
const struct opt_ia_addr *opt_addr;
|
||||||
char buf[INET6_ADDRSTRLEN];
|
char buf[INET6_ADDRSTRLEN];
|
||||||
struct in6_addr req_addr;
|
struct in6_addr req_addr;
|
||||||
struct opt_hdr *ia, *h;
|
const struct opt_hdr *h;
|
||||||
|
struct opt_hdr *ia;
|
||||||
size_t offset;
|
size_t offset;
|
||||||
int ia_type;
|
|
||||||
|
|
||||||
ia_type = OPT_IA_NA;
|
foreach(ia_type, ia_types) {
|
||||||
ia_ta:
|
offset = 0;
|
||||||
offset = 0;
|
while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
|
||||||
while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
|
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
|
||||||
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
offset += sizeof(struct opt_ia_na);
|
|
||||||
|
|
||||||
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
|
|
||||||
struct opt_ia_addr *opt_addr = (struct opt_ia_addr *)h;
|
|
||||||
|
|
||||||
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
|
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
memcpy(&req_addr, &opt_addr->addr, sizeof(req_addr));
|
offset += sizeof(struct opt_ia_na);
|
||||||
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
|
|
||||||
info("DHCPv6: requested address %s not on link",
|
|
||||||
inet_ntop(AF_INET6, &req_addr,
|
|
||||||
buf, sizeof(buf)));
|
|
||||||
return ia;
|
|
||||||
}
|
|
||||||
|
|
||||||
offset += sizeof(struct opt_ia_addr);
|
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
|
||||||
|
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
opt_addr = (const struct opt_ia_addr *)h;
|
||||||
|
req_addr = opt_addr->addr;
|
||||||
|
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
|
||||||
|
goto err;
|
||||||
|
|
||||||
|
offset += sizeof(struct opt_ia_addr);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ia_type == OPT_IA_NA) {
|
|
||||||
ia_type = OPT_IA_TA;
|
|
||||||
goto ia_ta;
|
|
||||||
}
|
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
err:
|
||||||
|
info("DHCPv6: requested address %s not on link",
|
||||||
|
inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
|
||||||
|
return ia;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -363,7 +360,7 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset)
|
||||||
srv->hdr.l = 0;
|
srv->hdr.l = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(&srv->addr[i], &c->ip6.dns[i], sizeof(srv->addr[i]));
|
srv->addr[i] = c->ip6.dns[i];
|
||||||
srv->hdr.l += sizeof(srv->addr[i]);
|
srv->hdr.l += sizeof(srv->addr[i]);
|
||||||
offset += sizeof(srv->addr[i]);
|
offset += sizeof(srv->addr[i]);
|
||||||
}
|
}
|
||||||
|
@ -426,11 +423,11 @@ search:
|
||||||
int dhcpv6(struct ctx *c, const struct pool *p,
|
int dhcpv6(struct ctx *c, const struct pool *p,
|
||||||
const struct in6_addr *saddr, const struct in6_addr *daddr)
|
const struct in6_addr *saddr, const struct in6_addr *daddr)
|
||||||
{
|
{
|
||||||
struct opt_hdr *ia, *bad_ia, *client_id;
|
const struct opt_hdr *client_id, *server_id, *ia;
|
||||||
const struct opt_hdr *server_id;
|
|
||||||
const struct in6_addr *src;
|
const struct in6_addr *src;
|
||||||
const struct msg_hdr *mh;
|
const struct msg_hdr *mh;
|
||||||
const struct udphdr *uh;
|
const struct udphdr *uh;
|
||||||
|
struct opt_hdr *bad_ia;
|
||||||
size_t mlen, n;
|
size_t mlen, n;
|
||||||
|
|
||||||
uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
|
uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
|
||||||
|
@ -451,10 +448,7 @@ int dhcpv6(struct ctx *c, const struct pool *p,
|
||||||
|
|
||||||
c->ip6.addr_ll_seen = *saddr;
|
c->ip6.addr_ll_seen = *saddr;
|
||||||
|
|
||||||
if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
|
src = &c->ip6.our_tap_ll;
|
||||||
src = &c->ip6.gw;
|
|
||||||
else
|
|
||||||
src = &c->ip6.addr_ll;
|
|
||||||
|
|
||||||
mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL);
|
mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL);
|
||||||
if (!mh)
|
if (!mh)
|
||||||
|
@ -574,8 +568,10 @@ void dhcpv6_init(const struct ctx *c)
|
||||||
resp.server_id.duid_time = duid_time;
|
resp.server_id.duid_time = duid_time;
|
||||||
resp_not_on_link.server_id.duid_time = duid_time;
|
resp_not_on_link.server_id.duid_time = duid_time;
|
||||||
|
|
||||||
memcpy(resp.server_id.duid_lladdr, c->mac, sizeof(c->mac));
|
memcpy(resp.server_id.duid_lladdr,
|
||||||
memcpy(resp_not_on_link.server_id.duid_lladdr, c->mac, sizeof(c->mac));
|
c->our_tap_mac, sizeof(c->our_tap_mac));
|
||||||
|
memcpy(resp_not_on_link.server_id.duid_lladdr,
|
||||||
|
c->our_tap_mac, sizeof(c->our_tap_mac));
|
||||||
|
|
||||||
resp.ia_addr.addr = c->ip6.addr;
|
resp.ia_addr.addr = c->ip6.addr;
|
||||||
}
|
}
|
||||||
|
|
3
doc/platform-requirements/.gitignore
vendored
Normal file
3
doc/platform-requirements/.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
/reuseaddr-priority
|
||||||
|
/recv-zero
|
||||||
|
/udp-close-dup
|
45
doc/platform-requirements/Makefile
Normal file
45
doc/platform-requirements/Makefile
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
#
|
||||||
|
# Copyright Red Hat
|
||||||
|
# Author: David Gibson <david@gibson.dropbear.id.au>
|
||||||
|
|
||||||
|
TARGETS = reuseaddr-priority recv-zero udp-close-dup
|
||||||
|
SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
|
||||||
|
CFLAGS = -Wall
|
||||||
|
|
||||||
|
all: cppcheck clang-tidy $(TARGETS:%=check-%)
|
||||||
|
|
||||||
|
$(TARGETS): %: %.c common.c common.h
|
||||||
|
|
||||||
|
check-%: %
|
||||||
|
./$<
|
||||||
|
|
||||||
|
cppcheck:
|
||||||
|
cppcheck --std=c11 --error-exitcode=1 --enable=all --force \
|
||||||
|
--check-level=exhaustive --inline-suppr \
|
||||||
|
--inconclusive --library=posix --quiet \
|
||||||
|
--suppress=missingIncludeSystem \
|
||||||
|
$(SRCS)
|
||||||
|
|
||||||
|
clang-tidy:
|
||||||
|
clang-tidy --checks=*,\
|
||||||
|
-altera-id-dependent-backward-branch,\
|
||||||
|
-altera-unroll-loops,\
|
||||||
|
-bugprone-easily-swappable-parameters,\
|
||||||
|
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\
|
||||||
|
-concurrency-mt-unsafe,\
|
||||||
|
-cppcoreguidelines-avoid-non-const-global-variables,\
|
||||||
|
-cppcoreguidelines-init-variables,\
|
||||||
|
-cppcoreguidelines-macro-to-enum,\
|
||||||
|
-google-readability-braces-around-statements,\
|
||||||
|
-hicpp-braces-around-statements,\
|
||||||
|
-llvmlibc-restrict-system-libc-headers,\
|
||||||
|
-misc-include-cleaner,\
|
||||||
|
-modernize-macro-to-enum,\
|
||||||
|
-readability-braces-around-statements,\
|
||||||
|
-readability-identifier-length,\
|
||||||
|
-readability-isolate-declaration \
|
||||||
|
$(SRCS)
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f $(TARGETS) *.o *~
|
18
doc/platform-requirements/README
Normal file
18
doc/platform-requirements/README
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
Platform Requirements
|
||||||
|
=====================
|
||||||
|
|
||||||
|
TODO: document the various Linux specific features we currently require
|
||||||
|
|
||||||
|
|
||||||
|
Test Programs
|
||||||
|
-------------
|
||||||
|
|
||||||
|
In some places we rely on quite specific behaviour of sockets.
|
||||||
|
Although Linux, at least, seems to behave as required, it's not always
|
||||||
|
clear from the available documentation if this is required by POSIX or
|
||||||
|
some other specification.
|
||||||
|
|
||||||
|
To specifically document those expectations this directory has some
|
||||||
|
test programs which explicitly check for the behaviour we need.
|
||||||
|
When/if we attempt a port to a new platform, running these to check
|
||||||
|
behaviour would be a good place to start.
|
66
doc/platform-requirements/common.c
Normal file
66
doc/platform-requirements/common.c
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
/* common.c
|
||||||
|
*
|
||||||
|
* Common helper functions for testing SO_REUSEADDR behaviour
|
||||||
|
*
|
||||||
|
* Copyright Red Hat
|
||||||
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <errno.h>
|
||||||
|
#include <netinet/in.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/socket.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Create an IPv4 UDP socket with SO_REUSEADDR enabled
 *
 * Returns the new socket's file descriptor. Dies if the socket can't be created
 * or the option can't be set.
 */
int sock_reuseaddr(void)
{
	int y = 1;
	int s;

	s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (s < 0)
		die("socket(): %s\n", strerror(errno));

	/* setsockopt() reports failure with a negative return value */
	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
		die("SO_REUSEADDR: %s\n", strerror(errno));

	return s;
}
|
||||||
|
|
||||||
|
/* Send a token via the given connected socket
 *
 * Dies if send() fails or transfers fewer bytes than the size of the token.
 */
void send_token(int s, long token)
{
	ssize_t sent = send(s, &token, sizeof(token), 0);

	if (sent < 0)
		die("send(): %s\n", strerror(errno));

	/* Not negative at this point, so the conversion is value-preserving */
	if ((size_t)sent < sizeof(token))
		die("short send()\n");
}
|
||||||
|
|
||||||
|
/* Attempt to receive a token via the given socket, without blocking.
 *
 * Returns true if we received the token, false if nothing was pending (recv()
 * failed with EAGAIN or EWOULDBLOCK), dies in any other case: another recv()
 * error, a datagram shorter than the token, or a payload that isn't @token.
 */
bool recv_token(int s, long token)
{
	ssize_t rc;
	long buf;

	rc = recv(s, &buf, sizeof(buf), MSG_DONTWAIT);
	if (rc < 0) {
		/* POSIX permits either value here, and they may be distinct */
		if (errno == EAGAIN || errno == EWOULDBLOCK)
			return false;
		die("recv(): %s\n", strerror(errno));
	}

	/* rc is not negative here, so the conversion is value-preserving */
	if ((size_t)rc < sizeof(buf))
		die("short recv()\n");
	if (buf != token)
		die("data mismatch\n");

	return true;
}
|
47
doc/platform-requirements/common.h
Normal file
47
doc/platform-requirements/common.h
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
/* common.h
|
||||||
|
*
|
||||||
|
* Useful shared functions
|
||||||
|
*
|
||||||
|
* Copyright Red Hat
|
||||||
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||||
|
*/
|
||||||
|
#ifndef REUSEADDR_COMMON_H
|
||||||
|
#define REUSEADDR_COMMON_H
|
||||||
|
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
/* Print a printf()-style message to stderr, then exit with EXIT_FAILURE */
static inline void die(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	(void)vfprintf(stderr, fmt, args);
	va_end(args);

	exit(EXIT_FAILURE);
}
|
||||||
|
|
||||||
|
#if __BYTE_ORDER == __BIG_ENDIAN
|
||||||
|
#define htons_constant(x) (x)
|
||||||
|
#define htonl_constant(x) (x)
|
||||||
|
#else
|
||||||
|
#define htons_constant(x) (__bswap_constant_16(x))
|
||||||
|
#define htonl_constant(x) (__bswap_constant_32(x))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define SOCKADDR_INIT(addr, port) \
|
||||||
|
{ \
|
||||||
|
.sin_family = AF_INET, \
|
||||||
|
.sin_addr = { .s_addr = htonl_constant(addr) }, \
|
||||||
|
.sin_port = htons_constant(port), \
|
||||||
|
}
|
||||||
|
|
||||||
|
int sock_reuseaddr(void);
|
||||||
|
void send_token(int s, long token);
|
||||||
|
bool recv_token(int s, long token);
|
||||||
|
|
||||||
|
#endif /* REUSEADDR_COMMON_H */
|
118
doc/platform-requirements/recv-zero.c
Normal file
118
doc/platform-requirements/recv-zero.c
Normal file
|
@ -0,0 +1,118 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
/* recv-zero.c
|
||||||
|
*
|
||||||
|
* Verify that we're able to discard datagrams by recv()ing into a zero-length
|
||||||
|
* buffer.
|
||||||
|
*
|
||||||
|
* Copyright Red Hat
|
||||||
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <arpa/inet.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <net/if.h>
|
||||||
|
#include <netinet/in.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define DSTPORT 13257U
|
||||||
|
|
||||||
|
enum discard_method {
|
||||||
|
DISCARD_NULL_BUF,
|
||||||
|
DISCARD_ZERO_IOV,
|
||||||
|
DISCARD_NULL_IOV,
|
||||||
|
NUM_METHODS,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* 127.0.0.1:DSTPORT */
|
||||||
|
static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT);
|
||||||
|
|
||||||
|
static void test_discard(enum discard_method method)
|
||||||
|
{
|
||||||
|
struct iovec zero_iov = { .iov_base = NULL, .iov_len = 0, };
|
||||||
|
struct msghdr mh_zero = {
|
||||||
|
.msg_iov = &zero_iov,
|
||||||
|
.msg_iovlen = 1,
|
||||||
|
};
|
||||||
|
struct msghdr mh_null = {
|
||||||
|
.msg_iov = NULL,
|
||||||
|
.msg_iovlen = 0,
|
||||||
|
};
|
||||||
|
long token1, token2;
|
||||||
|
int recv_s, send_s;
|
||||||
|
ssize_t rc;
|
||||||
|
|
||||||
|
token1 = random();
|
||||||
|
token2 = random();
|
||||||
|
|
||||||
|
recv_s = sock_reuseaddr();
|
||||||
|
if (bind(recv_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0)
|
||||||
|
die("bind(): %s\n", strerror(errno));
|
||||||
|
|
||||||
|
send_s = sock_reuseaddr();
|
||||||
|
if (connect(send_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0)
|
||||||
|
die("connect(): %s\n", strerror(errno));
|
||||||
|
|
||||||
|
send_token(send_s, token1);
|
||||||
|
send_token(send_s, token2);
|
||||||
|
|
||||||
|
switch (method) {
|
||||||
|
case DISCARD_NULL_BUF:
|
||||||
|
/* cppcheck-suppress nullPointer */
|
||||||
|
rc = recv(recv_s, NULL, 0, MSG_DONTWAIT);
|
||||||
|
if (rc < 0)
|
||||||
|
die("discarding recv(): %s\n", strerror(errno));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case DISCARD_ZERO_IOV:
|
||||||
|
rc = recvmsg(recv_s, &mh_zero, MSG_DONTWAIT);
|
||||||
|
if (rc < 0)
|
||||||
|
die("recvmsg() with zero-length buffer: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
if (!((unsigned)mh_zero.msg_flags & MSG_TRUNC))
|
||||||
|
die("Missing MSG_TRUNC flag\n");
|
||||||
|
break;
|
||||||
|
|
||||||
|
case DISCARD_NULL_IOV:
|
||||||
|
rc = recvmsg(recv_s, &mh_null, MSG_DONTWAIT);
|
||||||
|
if (rc < 0)
|
||||||
|
die("recvmsg() with zero-length iov: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
if (!((unsigned)mh_null.msg_flags & MSG_TRUNC))
|
||||||
|
die("Missing MSG_TRUNC flag\n");
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
die("Bad method\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
recv_token(recv_s, token2);
|
||||||
|
|
||||||
|
/* cppcheck-suppress nullPointer */
|
||||||
|
rc = recv(recv_s, NULL, 0, MSG_DONTWAIT);
|
||||||
|
if (rc < 0 && errno != EAGAIN)
|
||||||
|
die("redundant discarding recv(): %s\n", strerror(errno));
|
||||||
|
if (rc >= 0)
|
||||||
|
die("Unexpected receive: rc=%zd\n", rc);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
enum discard_method method;
|
||||||
|
|
||||||
|
(void)argc;
|
||||||
|
(void)argv;
|
||||||
|
|
||||||
|
for (method = 0; method < NUM_METHODS; method++)
|
||||||
|
test_discard(method);
|
||||||
|
|
||||||
|
printf("Discarding datagrams with 0-length receives seems to work\n");
|
||||||
|
|
||||||
|
exit(0);
|
||||||
|
}
|
240
doc/platform-requirements/reuseaddr-priority.c
Normal file
240
doc/platform-requirements/reuseaddr-priority.c
Normal file
|
@ -0,0 +1,240 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
/* reuseaddr-priority.c
|
||||||
|
*
|
||||||
|
* Verify which SO_REUSEADDR UDP sockets get priority to receive
|
||||||
|
* =============================================================
|
||||||
|
*
|
||||||
|
* SO_REUSEADDR allows multiple sockets to bind to overlapping addresses, so
|
||||||
|
* there can be multiple sockets eligible to receive the same packet. The exact
|
||||||
|
* semantics of which socket will receive in this circumstance isn't very well
|
||||||
|
* documented.
|
||||||
|
*
|
||||||
|
* This program verifies that things behave the way we expect. Specifically we
|
||||||
|
* expect:
|
||||||
|
*
|
||||||
|
* - If both a connected and an unconnected socket could receive a datagram, the
|
||||||
|
* connected one will receive it in preference to the unconnected one.
|
||||||
|
*
|
||||||
|
* - If an unconnected socket bound to a specific address and an unconnected
|
||||||
|
* socket bound to the "any" address (0.0.0.0 or ::) could receive a datagram,
|
||||||
|
* then the one with a specific address will receive it in preference to the
|
||||||
|
* other.
|
||||||
|
*
|
||||||
|
* These should be true regardless of the order the sockets are created in, or
|
||||||
|
* the order they're polled in.
|
||||||
|
*
|
||||||
|
* Copyright Red Hat
|
||||||
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <arpa/inet.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <net/if.h>
|
||||||
|
#include <netinet/in.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define SRCPORT 13246U
|
||||||
|
#define DSTPORT 13247U
|
||||||
|
|
||||||
|
/* Different cases for receiving socket configuration */
|
||||||
|
enum sock_type {
|
||||||
|
/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
|
||||||
|
SOCK_BOUND_ANY = 0,
|
||||||
|
|
||||||
|
/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
|
||||||
|
SOCK_BOUND_LO = 1,
|
||||||
|
|
||||||
|
/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
|
||||||
|
SOCK_CONNECTED = 2,
|
||||||
|
|
||||||
|
NUM_SOCK_TYPES,
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef enum sock_type order_t[NUM_SOCK_TYPES];
|
||||||
|
|
||||||
|
static order_t orders[] = {
|
||||||
|
{0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0},
|
||||||
|
};
|
||||||
|
|
||||||
|
/* 127.0.0.2 */
|
||||||
|
#define INADDR_LOOPBACK2 ((in_addr_t)(0x7f000002))
|
||||||
|
|
||||||
|
/* 0.0.0.0:DSTPORT */
|
||||||
|
static const struct sockaddr_in any_dst = SOCKADDR_INIT(INADDR_ANY, DSTPORT);
|
||||||
|
/* 127.0.0.1:DSTPORT */
|
||||||
|
static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT);
|
||||||
|
|
||||||
|
/* 127.0.0.2:DSTPORT */
|
||||||
|
static const struct sockaddr_in lo2_dst = SOCKADDR_INIT(INADDR_LOOPBACK2, DSTPORT);
|
||||||
|
|
||||||
|
/* 127.0.0.1:SRCPORT */
|
||||||
|
static const struct sockaddr_in lo_src = SOCKADDR_INIT(INADDR_LOOPBACK, SRCPORT);
|
||||||
|
|
||||||
|
/* Random token to send in datagram */
|
||||||
|
static long token;
|
||||||
|
|
||||||
|
/* Get a socket of the specified type for receiving */
|
||||||
|
static int sock_recv(enum sock_type type)
|
||||||
|
{
|
||||||
|
const struct sockaddr *connect_sa = NULL;
|
||||||
|
const struct sockaddr *bind_sa = NULL;
|
||||||
|
int s;
|
||||||
|
|
||||||
|
s = sock_reuseaddr();
|
||||||
|
|
||||||
|
switch (type) {
|
||||||
|
case SOCK_CONNECTED:
|
||||||
|
connect_sa = (struct sockaddr *)&lo_src;
|
||||||
|
/* fallthrough */
|
||||||
|
case SOCK_BOUND_ANY:
|
||||||
|
bind_sa = (struct sockaddr *)&any_dst;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SOCK_BOUND_LO:
|
||||||
|
bind_sa = (struct sockaddr *)&lo_dst;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
die("bug");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bind_sa)
|
||||||
|
if (bind(s, bind_sa, sizeof(struct sockaddr_in)) < 0)
|
||||||
|
die("bind(): %s\n", strerror(errno));
|
||||||
|
if (connect_sa)
|
||||||
|
if (connect(s, connect_sa, sizeof(struct sockaddr_in)) < 0)
|
||||||
|
die("connect(): %s\n", strerror(errno));
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get a socket suitable for sending to the given type of receiving socket */
|
||||||
|
static int sock_send(enum sock_type type)
|
||||||
|
{
|
||||||
|
const struct sockaddr *connect_sa = NULL;
|
||||||
|
const struct sockaddr *bind_sa = NULL;
|
||||||
|
int s;
|
||||||
|
|
||||||
|
s = sock_reuseaddr();
|
||||||
|
|
||||||
|
switch (type) {
|
||||||
|
case SOCK_BOUND_ANY:
|
||||||
|
connect_sa = (struct sockaddr *)&lo2_dst;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SOCK_CONNECTED:
|
||||||
|
bind_sa = (struct sockaddr *)&lo_src;
|
||||||
|
/* fallthrough */
|
||||||
|
case SOCK_BOUND_LO:
|
||||||
|
connect_sa = (struct sockaddr *)&lo_dst;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
die("bug");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bind_sa)
|
||||||
|
if (bind(s, bind_sa, sizeof(struct sockaddr_in)) < 0)
|
||||||
|
die("bind(): %s\n", strerror(errno));
|
||||||
|
if (connect_sa)
|
||||||
|
if (connect(s, connect_sa, sizeof(struct sockaddr_in)) < 0)
|
||||||
|
die("connect(): %s\n", strerror(errno));
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check for expected behaviour with one specific ordering for various operations:
|
||||||
|
*
|
||||||
|
* @recv_create_order: Order to create receiving sockets in
|
||||||
|
* @send_create_order: Order to create sending sockets in
|
||||||
|
* @test_order: Order to test the behaviour of different types
|
||||||
|
* @recv_order: Order to check the receiving sockets
|
||||||
|
*/
|
||||||
|
static void check_one_order(const order_t recv_create_order,
|
||||||
|
const order_t send_create_order,
|
||||||
|
const order_t test_order,
|
||||||
|
const order_t recv_order)
|
||||||
|
{
|
||||||
|
int rs[NUM_SOCK_TYPES];
|
||||||
|
int ss[NUM_SOCK_TYPES];
|
||||||
|
int nfds = 0;
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
for (i = 0; i < NUM_SOCK_TYPES; i++) {
|
||||||
|
enum sock_type t = recv_create_order[i];
|
||||||
|
int s;
|
||||||
|
|
||||||
|
s = sock_recv(t);
|
||||||
|
if (s >= nfds)
|
||||||
|
nfds = s + 1;
|
||||||
|
|
||||||
|
rs[t] = s;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < NUM_SOCK_TYPES; i++) {
|
||||||
|
enum sock_type t = send_create_order[i];
|
||||||
|
|
||||||
|
ss[t] = sock_send(t);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < NUM_SOCK_TYPES; i++) {
|
||||||
|
enum sock_type ti = test_order[i];
|
||||||
|
int recv_via = -1;
|
||||||
|
|
||||||
|
send_token(ss[ti], token);
|
||||||
|
|
||||||
|
for (j = 0; j < NUM_SOCK_TYPES; j++) {
|
||||||
|
enum sock_type tj = recv_order[j];
|
||||||
|
|
||||||
|
if (recv_token(rs[tj], token)) {
|
||||||
|
if (recv_via != -1)
|
||||||
|
die("Received token more than once\n");
|
||||||
|
recv_via = tj;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (recv_via == -1)
|
||||||
|
die("Didn't receive token at all\n");
|
||||||
|
if (recv_via != ti)
|
||||||
|
die("Received token via unexpected socket\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < NUM_SOCK_TYPES; i++) {
|
||||||
|
close(rs[i]);
|
||||||
|
close(ss[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void check_all_orders(void)
|
||||||
|
{
|
||||||
|
int norders = sizeof(orders) / sizeof(orders[0]);
|
||||||
|
int i, j, k, l;
|
||||||
|
|
||||||
|
for (i = 0; i < norders; i++)
|
||||||
|
for (j = 0; j < norders; j++)
|
||||||
|
for (k = 0; k < norders; k++)
|
||||||
|
for (l = 0; l < norders; l++)
|
||||||
|
check_one_order(orders[i], orders[j],
|
||||||
|
orders[k], orders[l]);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
(void)argc;
|
||||||
|
(void)argv;
|
||||||
|
|
||||||
|
token = random();
|
||||||
|
|
||||||
|
check_all_orders();
|
||||||
|
|
||||||
|
printf("SO_REUSEADDR receive priorities seem to work as expected\n");
|
||||||
|
|
||||||
|
exit(0);
|
||||||
|
}
|
105
doc/platform-requirements/udp-close-dup.c
Normal file
105
doc/platform-requirements/udp-close-dup.c
Normal file
|
@ -0,0 +1,105 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
/* udp-close-dup.c
|
||||||
|
*
|
||||||
|
* Verify that closing one dup() of a UDP socket won't stop other dups from
|
||||||
|
* receiving packets.
|
||||||
|
*
|
||||||
|
* Copyright Red Hat
|
||||||
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <arpa/inet.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <net/if.h>
|
||||||
|
#include <netinet/in.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define DSTPORT 13257U
|
||||||
|
|
||||||
|
/* 127.0.0.1:DSTPORT */
|
||||||
|
static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT);
|
||||||
|
|
||||||
|
enum dup_method {
|
||||||
|
DUP_DUP,
|
||||||
|
DUP_FCNTL,
|
||||||
|
NUM_METHODS,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void test_close_dup(enum dup_method method)
|
||||||
|
{
|
||||||
|
long token;
|
||||||
|
int s1, s2, send_s;
|
||||||
|
ssize_t rc;
|
||||||
|
|
||||||
|
s1 = sock_reuseaddr();
|
||||||
|
if (bind(s1, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0)
|
||||||
|
die("bind(): %s\n", strerror(errno));
|
||||||
|
|
||||||
|
send_s = sock_reuseaddr();
|
||||||
|
if (connect(send_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0)
|
||||||
|
die("connect(): %s\n", strerror(errno));
|
||||||
|
|
||||||
|
/* Receive before duplicating */
|
||||||
|
token = random();
|
||||||
|
send_token(send_s, token);
|
||||||
|
recv_token(s1, token);
|
||||||
|
|
||||||
|
switch (method) {
|
||||||
|
case DUP_DUP:
|
||||||
|
/* NOLINTNEXTLINE(android-cloexec-dup) */
|
||||||
|
s2 = dup(s1);
|
||||||
|
if (s2 < 0)
|
||||||
|
die("dup(): %s\n", strerror(errno));
|
||||||
|
break;
|
||||||
|
case DUP_FCNTL:
|
||||||
|
s2 = fcntl(s1, F_DUPFD_CLOEXEC, 0);
|
||||||
|
if (s2 < 0)
|
||||||
|
die("F_DUPFD_CLOEXEC: %s\n", strerror(errno));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
die("Bad method\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Receive via original handle */
|
||||||
|
token = random();
|
||||||
|
send_token(send_s, token);
|
||||||
|
recv_token(s1, token);
|
||||||
|
|
||||||
|
/* Receive via duplicated handle */
|
||||||
|
token = random();
|
||||||
|
send_token(send_s, token);
|
||||||
|
recv_token(s2, token);
|
||||||
|
|
||||||
|
/* Close duplicate */
|
||||||
|
rc = close(s2);
|
||||||
|
if (rc < 0)
|
||||||
|
die("close() dup: %s\n", strerror(errno));
|
||||||
|
|
||||||
|
/* Receive after closing duplicate */
|
||||||
|
token = random();
|
||||||
|
send_token(send_s, token);
|
||||||
|
recv_token(s1, token);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
enum dup_method method;
|
||||||
|
|
||||||
|
(void)argc;
|
||||||
|
(void)argv;
|
||||||
|
|
||||||
|
for (method = 0; method < NUM_METHODS; method++)
|
||||||
|
test_close_dup(method);
|
||||||
|
|
||||||
|
printf("Closing dup()ed UDP sockets seems to work as expected\n");
|
||||||
|
|
||||||
|
exit(0);
|
||||||
|
}
|
43
epoll_type.h
Normal file
43
epoll_type.h
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
* Copyright Red Hat
|
||||||
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef EPOLL_TYPE_H
|
||||||
|
#define EPOLL_TYPE_H
|
||||||
|
|
||||||
|
/**
 * enum epoll_type - Kinds of file descriptor we poll over
 *
 * EPOLL_NUM_TYPES comes last and so equals the number of values before it.
 */
enum epoll_type {
	EPOLL_TYPE_NONE = 0,		/* Special value: invalid type */
	EPOLL_TYPE_TCP,			/* Connected TCP socket */
	EPOLL_TYPE_TCP_SPLICE,		/* Connected TCP socket (spliced) */
	EPOLL_TYPE_TCP_LISTEN,		/* Listening TCP socket */
	EPOLL_TYPE_TCP_TIMER,		/* timerfd used for TCP timers */
	EPOLL_TYPE_UDP_LISTEN,		/* UDP "listening" socket */
	EPOLL_TYPE_UDP_REPLY,		/* UDP socket, replies on one flow */
	EPOLL_TYPE_PING,		/* ICMP/ICMPv6 ping socket */
	EPOLL_TYPE_NSQUIT_INOTIFY,	/* inotify fd, end of netns (pasta) */
	EPOLL_TYPE_NSQUIT_TIMER,	/* timer fd, inotify fallback (pasta) */
	EPOLL_TYPE_TAP_PASTA,		/* tuntap character device */
	EPOLL_TYPE_TAP_PASST,		/* Socket connected to qemu */
	EPOLL_TYPE_TAP_LISTEN,		/* Socket listening for qemu */

	EPOLL_NUM_TYPES,
};
|
||||||
|
|
||||||
|
#endif /* EPOLL_TYPE_H */
|
697
flow.c
697
flow.c
|
@ -5,9 +5,11 @@
|
||||||
* Tracking for logical "flows" of packets.
|
* Tracking for logical "flows" of packets.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <errno.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include <sched.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
@ -18,10 +20,24 @@
|
||||||
#include "flow.h"
|
#include "flow.h"
|
||||||
#include "flow_table.h"
|
#include "flow_table.h"
|
||||||
|
|
||||||
|
const char *flow_state_str[] = {
|
||||||
|
[FLOW_STATE_FREE] = "FREE",
|
||||||
|
[FLOW_STATE_NEW] = "NEW",
|
||||||
|
[FLOW_STATE_INI] = "INI",
|
||||||
|
[FLOW_STATE_TGT] = "TGT",
|
||||||
|
[FLOW_STATE_TYPED] = "TYPED",
|
||||||
|
[FLOW_STATE_ACTIVE] = "ACTIVE",
|
||||||
|
};
|
||||||
|
static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES,
|
||||||
|
"flow_state_str[] doesn't match enum flow_state");
|
||||||
|
|
||||||
const char *flow_type_str[] = {
|
const char *flow_type_str[] = {
|
||||||
[FLOW_TYPE_NONE] = "<none>",
|
[FLOW_TYPE_NONE] = "<none>",
|
||||||
[FLOW_TCP] = "TCP connection",
|
[FLOW_TCP] = "TCP connection",
|
||||||
[FLOW_TCP_SPLICE] = "TCP connection (spliced)",
|
[FLOW_TCP_SPLICE] = "TCP connection (spliced)",
|
||||||
|
[FLOW_PING4] = "ICMP ping sequence",
|
||||||
|
[FLOW_PING6] = "ICMPv6 ping sequence",
|
||||||
|
[FLOW_UDP] = "UDP flow",
|
||||||
};
|
};
|
||||||
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
|
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
|
||||||
"flow_type_str[] doesn't match enum flow_type");
|
"flow_type_str[] doesn't match enum flow_type");
|
||||||
|
@ -29,52 +45,15 @@ static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
|
||||||
const uint8_t flow_proto[] = {
|
const uint8_t flow_proto[] = {
|
||||||
[FLOW_TCP] = IPPROTO_TCP,
|
[FLOW_TCP] = IPPROTO_TCP,
|
||||||
[FLOW_TCP_SPLICE] = IPPROTO_TCP,
|
[FLOW_TCP_SPLICE] = IPPROTO_TCP,
|
||||||
|
[FLOW_PING4] = IPPROTO_ICMP,
|
||||||
|
[FLOW_PING6] = IPPROTO_ICMPV6,
|
||||||
|
[FLOW_UDP] = IPPROTO_UDP,
|
||||||
};
|
};
|
||||||
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
||||||
"flow_proto[] doesn't match enum flow_type");
|
"flow_proto[] doesn't match enum flow_type");
|
||||||
|
|
||||||
/* Global Flow Table */
|
/* Global Flow Table */
|
||||||
|
|
||||||
/**
|
|
||||||
* DOC: Theory of Operation - flow entry life cycle
|
|
||||||
*
|
|
||||||
* An individual flow table entry moves through these logical states, usually in
|
|
||||||
* this order.
|
|
||||||
*
|
|
||||||
* FREE - Part of the general pool of free flow table entries
|
|
||||||
* Operations:
|
|
||||||
* - flow_alloc() finds an entry and moves it to ALLOC state
|
|
||||||
*
|
|
||||||
* ALLOC - A tentatively allocated entry
|
|
||||||
* Operations:
|
|
||||||
* - flow_alloc_cancel() returns the entry to FREE state
|
|
||||||
* - FLOW_START() set the entry's type and moves to START state
|
|
||||||
* Caveats:
|
|
||||||
* - It's not safe to write fields in the flow entry
|
|
||||||
* - It's not safe to allocate further entries with flow_alloc()
|
|
||||||
* - It's not safe to return to the main epoll loop (use FLOW_START()
|
|
||||||
* to move to START state before doing so)
|
|
||||||
* - It's not safe to use flow_*() logging functions
|
|
||||||
*
|
|
||||||
* START - An entry being prepared by flow type specific code
|
|
||||||
* Operations:
|
|
||||||
* - Flow type specific fields may be accessed
|
|
||||||
* - flow_*() logging functions
|
|
||||||
* - flow_alloc_cancel() returns the entry to FREE state
|
|
||||||
* Caveats:
|
|
||||||
* - Returning to the main epoll loop or allocating another entry
|
|
||||||
* with flow_alloc() implicitly moves the entry to ACTIVE state.
|
|
||||||
*
|
|
||||||
* ACTIVE - An active flow entry managed by flow type specific code
|
|
||||||
* Operations:
|
|
||||||
* - Flow type specific fields may be accessed
|
|
||||||
* - flow_*() logging functions
|
|
||||||
* - Flow may be expired by returning 'true' from flow type specific
|
|
||||||
* deferred or timer handler. This will return it to FREE state.
|
|
||||||
* Caveats:
|
|
||||||
* - It's not safe to call flow_alloc_cancel()
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* DOC: Theory of Operation - allocating and freeing flow entries
|
* DOC: Theory of Operation - allocating and freeing flow entries
|
||||||
*
|
*
|
||||||
|
@ -128,10 +107,156 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
||||||
|
|
||||||
unsigned flow_first_free;
|
unsigned flow_first_free;
|
||||||
union flow flowtab[FLOW_MAX];
|
union flow flowtab[FLOW_MAX];
|
||||||
|
static const union flow *flow_new_entry; /* = NULL */
|
||||||
|
|
||||||
|
/* Hash table to index it */
|
||||||
|
#define FLOW_HASH_LOAD 70 /* % */
|
||||||
|
#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD))
|
||||||
|
|
||||||
|
/* Table for lookup from flowside information */
|
||||||
|
static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];
|
||||||
|
|
||||||
|
static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX,
|
||||||
|
"Safe linear probing requires hash table with more entries than the number of sides in the flow table");
|
||||||
|
|
||||||
/* Last time the flow timers ran */
|
/* Last time the flow timers ran */
|
||||||
static struct timespec flow_timer_run;
|
static struct timespec flow_timer_run;
|
||||||
|
|
||||||
|
/** flowside_from_af() - Initialise flowside from addresses
|
||||||
|
* @side: flowside to initialise
|
||||||
|
* @af: Address family (AF_INET or AF_INET6)
|
||||||
|
* @eaddr: Endpoint address (pointer to in_addr or in6_addr)
|
||||||
|
* @eport: Endpoint port
|
||||||
|
* @oaddr: Our address (pointer to in_addr or in6_addr)
|
||||||
|
* @oport: Our port
|
||||||
|
*/
|
||||||
|
static void flowside_from_af(struct flowside *side, sa_family_t af,
|
||||||
|
const void *eaddr, in_port_t eport,
|
||||||
|
const void *oaddr, in_port_t oport)
|
||||||
|
{
|
||||||
|
if (oaddr)
|
||||||
|
inany_from_af(&side->oaddr, af, oaddr);
|
||||||
|
else
|
||||||
|
side->oaddr = inany_any6;
|
||||||
|
side->oport = oport;
|
||||||
|
|
||||||
|
if (eaddr)
|
||||||
|
inany_from_af(&side->eaddr, af, eaddr);
|
||||||
|
else
|
||||||
|
side->eaddr = inany_any6;
|
||||||
|
side->eport = eport;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct flowside_sock_args - Parameters for flowside_sock_splice()
|
||||||
|
* @c: Execution context
|
||||||
|
* @fd: Filled in with new socket fd
|
||||||
|
* @err: Filled in with errno if something failed
|
||||||
|
* @type: Socket epoll type
|
||||||
|
* @sa: Socket address
|
||||||
|
* @sl: Length of @sa
|
||||||
|
* @data: epoll reference data
|
||||||
|
*/
|
||||||
|
struct flowside_sock_args {
|
||||||
|
const struct ctx *c;
|
||||||
|
int fd;
|
||||||
|
int err;
|
||||||
|
enum epoll_type type;
|
||||||
|
const struct sockaddr *sa;
|
||||||
|
socklen_t sl;
|
||||||
|
const char *path;
|
||||||
|
uint32_t data;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** flowside_sock_splice() - Create and bind socket for PIF_SPLICE based on flowside
|
||||||
|
* @arg: Argument as a struct flowside_sock_args
|
||||||
|
*
|
||||||
|
* Return: 0
|
||||||
|
*/
|
||||||
|
static int flowside_sock_splice(void *arg)
|
||||||
|
{
|
||||||
|
struct flowside_sock_args *a = arg;
|
||||||
|
|
||||||
|
ns_enter(a->c);
|
||||||
|
|
||||||
|
a->fd = sock_l4_sa(a->c, a->type, a->sa, a->sl, NULL,
|
||||||
|
a->sa->sa_family == AF_INET6, a->data);
|
||||||
|
a->err = errno;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** flowside_sock_l4() - Create and bind socket based on flowside
|
||||||
|
* @c: Execution context
|
||||||
|
* @type: Socket epoll type
|
||||||
|
* @pif: Interface for this socket
|
||||||
|
* @tgt: Target flowside
|
||||||
|
* @data: epoll reference portion for protocol handlers
|
||||||
|
*
|
||||||
|
* Return: socket fd of epoll type @type bound to our address and port from @tgt
|
||||||
|
* (if specified).
|
||||||
|
*/
|
||||||
|
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||||
|
const struct flowside *tgt, uint32_t data)
|
||||||
|
{
|
||||||
|
const char *ifname = NULL;
|
||||||
|
union sockaddr_inany sa;
|
||||||
|
socklen_t sl;
|
||||||
|
|
||||||
|
ASSERT(pif_is_socket(pif));
|
||||||
|
|
||||||
|
pif_sockaddr(c, &sa, &sl, pif, &tgt->oaddr, tgt->oport);
|
||||||
|
|
||||||
|
switch (pif) {
|
||||||
|
case PIF_HOST:
|
||||||
|
if (inany_is_loopback(&tgt->oaddr))
|
||||||
|
ifname = NULL;
|
||||||
|
else if (sa.sa_family == AF_INET)
|
||||||
|
ifname = c->ip4.ifname_out;
|
||||||
|
else if (sa.sa_family == AF_INET6)
|
||||||
|
ifname = c->ip6.ifname_out;
|
||||||
|
|
||||||
|
return sock_l4_sa(c, type, &sa, sl, ifname,
|
||||||
|
sa.sa_family == AF_INET6, data);
|
||||||
|
|
||||||
|
case PIF_SPLICE: {
|
||||||
|
struct flowside_sock_args args = {
|
||||||
|
.c = c, .type = type,
|
||||||
|
.sa = &sa.sa, .sl = sl, .data = data,
|
||||||
|
};
|
||||||
|
NS_CALL(flowside_sock_splice, &args);
|
||||||
|
errno = args.err;
|
||||||
|
return args.fd;
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
/* If we add new socket pifs, they'll need to be implemented
|
||||||
|
* here
|
||||||
|
*/
|
||||||
|
ASSERT(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** flowside_connect() - Connect a socket based on flowside
|
||||||
|
* @c: Execution context
|
||||||
|
* @s: Socket to connect
|
||||||
|
* @pif: Target pif
|
||||||
|
* @tgt: Target flowside
|
||||||
|
*
|
||||||
|
* Connect @s to the endpoint address and port from @tgt.
|
||||||
|
*
|
||||||
|
* Return: 0 on success, negative on error
|
||||||
|
*/
|
||||||
|
int flowside_connect(const struct ctx *c, int s,
|
||||||
|
uint8_t pif, const struct flowside *tgt)
|
||||||
|
{
|
||||||
|
union sockaddr_inany sa;
|
||||||
|
socklen_t sl;
|
||||||
|
|
||||||
|
pif_sockaddr(c, &sa, &sl, pif, &tgt->eaddr, tgt->eport);
|
||||||
|
return connect(s, &sa.sa, sl);
|
||||||
|
}
|
||||||
|
|
||||||
/** flow_log_ - Log flow-related message
|
/** flow_log_ - Log flow-related message
|
||||||
* @f: flow the message is related to
|
* @f: flow the message is related to
|
||||||
* @pri: Log priority
|
* @pri: Log priority
|
||||||
|
@ -140,6 +265,7 @@ static struct timespec flow_timer_run;
|
||||||
*/
|
*/
|
||||||
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
||||||
{
|
{
|
||||||
|
const char *type_or_state;
|
||||||
char msg[BUFSIZ];
|
char msg[BUFSIZ];
|
||||||
va_list args;
|
va_list args;
|
||||||
|
|
||||||
|
@ -147,40 +273,221 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
||||||
(void)vsnprintf(msg, sizeof(msg), fmt, args);
|
(void)vsnprintf(msg, sizeof(msg), fmt, args);
|
||||||
va_end(args);
|
va_end(args);
|
||||||
|
|
||||||
logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg);
|
/* Show type if it's set, otherwise the state */
|
||||||
|
if (f->state < FLOW_STATE_TYPED)
|
||||||
|
type_or_state = FLOW_STATE(f);
|
||||||
|
else
|
||||||
|
type_or_state = FLOW_TYPE(f);
|
||||||
|
|
||||||
|
logmsg(true, false, pri,
|
||||||
|
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** flow_log_details_() - Log the details of a flow
|
||||||
|
* @f: flow to log
|
||||||
|
* @pri: Log priority
|
||||||
|
* @state: State to log details according to
|
||||||
|
*
|
||||||
|
* Logs the details of the flow: endpoints, interfaces, type etc.
|
||||||
|
*/
|
||||||
|
void flow_log_details_(const struct flow_common *f, int pri,
|
||||||
|
enum flow_state state)
|
||||||
|
{
|
||||||
|
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
|
||||||
|
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
|
||||||
|
const struct flowside *ini = &f->side[INISIDE];
|
||||||
|
const struct flowside *tgt = &f->side[TGTSIDE];
|
||||||
|
|
||||||
|
if (state >= FLOW_STATE_TGT)
|
||||||
|
flow_log_(f, pri,
|
||||||
|
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
|
||||||
|
pif_name(f->pif[INISIDE]),
|
||||||
|
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||||
|
ini->eport,
|
||||||
|
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
|
||||||
|
ini->oport,
|
||||||
|
pif_name(f->pif[TGTSIDE]),
|
||||||
|
inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)),
|
||||||
|
tgt->oport,
|
||||||
|
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
|
||||||
|
tgt->eport);
|
||||||
|
else if (state >= FLOW_STATE_INI)
|
||||||
|
flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
|
||||||
|
pif_name(f->pif[INISIDE]),
|
||||||
|
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||||
|
ini->eport,
|
||||||
|
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
|
||||||
|
ini->oport);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* flow_start() - Set flow type for new flow and log
|
* flow_set_state() - Change flow's state
|
||||||
* @flow: Flow to set type for
|
* @f: Flow changing state
|
||||||
* @type: Type for new flow
|
* @state: New state
|
||||||
* @iniside: Which side initiated the new flow
|
|
||||||
*
|
|
||||||
* Return: @flow
|
|
||||||
*
|
|
||||||
* Should be called before setting any flow type specific fields in the flow
|
|
||||||
* table entry.
|
|
||||||
*/
|
*/
|
||||||
union flow *flow_start(union flow *flow, enum flow_type type,
|
static void flow_set_state(struct flow_common *f, enum flow_state state)
|
||||||
unsigned iniside)
|
|
||||||
{
|
{
|
||||||
(void)iniside;
|
uint8_t oldstate = f->state;
|
||||||
flow->f.type = type;
|
|
||||||
flow_dbg(flow, "START %s", flow_type_str[flow->f.type]);
|
ASSERT(state < FLOW_NUM_STATES);
|
||||||
|
ASSERT(oldstate < FLOW_NUM_STATES);
|
||||||
|
|
||||||
|
f->state = state;
|
||||||
|
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
|
||||||
|
FLOW_STATE(f));
|
||||||
|
|
||||||
|
flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_initiate_() - Move flow to INI, setting pif[INISIDE]
|
||||||
|
* @flow: Flow to change state
|
||||||
|
* @pif: pif of the initiating side
|
||||||
|
*/
|
||||||
|
static void flow_initiate_(union flow *flow, uint8_t pif)
|
||||||
|
{
|
||||||
|
struct flow_common *f = &flow->f;
|
||||||
|
|
||||||
|
ASSERT(pif != PIF_NONE);
|
||||||
|
ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW);
|
||||||
|
ASSERT(f->type == FLOW_TYPE_NONE);
|
||||||
|
ASSERT(f->pif[INISIDE] == PIF_NONE && f->pif[TGTSIDE] == PIF_NONE);
|
||||||
|
|
||||||
|
f->pif[INISIDE] = pif;
|
||||||
|
flow_set_state(f, FLOW_STATE_INI);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_initiate_af() - Move flow to INI, setting INISIDE details
|
||||||
|
* @flow: Flow to change state
|
||||||
|
* @pif: pif of the initiating side
|
||||||
|
* @af: Address family of @saddr and @daddr
|
||||||
|
* @saddr: Source address (pointer to in_addr or in6_addr)
|
||||||
|
* @sport: Source port
|
||||||
|
* @daddr: Destination address (pointer to in_addr or in6_addr)
|
||||||
|
* @dport: Destination port
|
||||||
|
*
|
||||||
|
* Return: pointer to the initiating flowside information
|
||||||
|
*/
|
||||||
|
const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
|
||||||
|
sa_family_t af,
|
||||||
|
const void *saddr, in_port_t sport,
|
||||||
|
const void *daddr, in_port_t dport)
|
||||||
|
{
|
||||||
|
struct flowside *ini = &flow->f.side[INISIDE];
|
||||||
|
|
||||||
|
flowside_from_af(ini, af, saddr, sport, daddr, dport);
|
||||||
|
flow_initiate_(flow, pif);
|
||||||
|
return ini;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_initiate_sa() - Move flow to INI, setting INISIDE details
|
||||||
|
* @flow: Flow to change state
|
||||||
|
* @pif: pif of the initiating side
|
||||||
|
* @ssa: Source socket address
|
||||||
|
* @dport: Destination port
|
||||||
|
*
|
||||||
|
* Return: pointer to the initiating flowside information
|
||||||
|
*/
|
||||||
|
const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||||
|
const union sockaddr_inany *ssa,
|
||||||
|
in_port_t dport)
|
||||||
|
{
|
||||||
|
struct flowside *ini = &flow->f.side[INISIDE];
|
||||||
|
|
||||||
|
inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
|
||||||
|
if (inany_v4(&ini->eaddr))
|
||||||
|
ini->oaddr = inany_any4;
|
||||||
|
else
|
||||||
|
ini->oaddr = inany_any6;
|
||||||
|
ini->oport = dport;
|
||||||
|
flow_initiate_(flow, pif);
|
||||||
|
return ini;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_target() - Determine where flow should forward to, and move to TGT
|
||||||
|
* @c: Execution context
|
||||||
|
* @flow: Flow to forward
|
||||||
|
* @proto: Protocol
|
||||||
|
*
|
||||||
|
* Return: pointer to the target flowside information
|
||||||
|
*/
|
||||||
|
const struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
||||||
|
uint8_t proto)
|
||||||
|
{
|
||||||
|
char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
|
||||||
|
struct flow_common *f = &flow->f;
|
||||||
|
const struct flowside *ini = &f->side[INISIDE];
|
||||||
|
struct flowside *tgt = &f->side[TGTSIDE];
|
||||||
|
uint8_t tgtpif = PIF_NONE;
|
||||||
|
|
||||||
|
ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_INI);
|
||||||
|
ASSERT(f->type == FLOW_TYPE_NONE);
|
||||||
|
ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] == PIF_NONE);
|
||||||
|
ASSERT(flow->f.state == FLOW_STATE_INI);
|
||||||
|
|
||||||
|
switch (f->pif[INISIDE]) {
|
||||||
|
case PIF_TAP:
|
||||||
|
tgtpif = fwd_nat_from_tap(c, proto, ini, tgt);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case PIF_SPLICE:
|
||||||
|
tgtpif = fwd_nat_from_splice(c, proto, ini, tgt);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case PIF_HOST:
|
||||||
|
tgtpif = fwd_nat_from_host(c, proto, ini, tgt);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
flow_err(flow, "No rules to forward %s [%s]:%hu -> [%s]:%hu",
|
||||||
|
pif_name(f->pif[INISIDE]),
|
||||||
|
inany_ntop(&ini->eaddr, estr, sizeof(estr)),
|
||||||
|
ini->eport,
|
||||||
|
inany_ntop(&ini->oaddr, fstr, sizeof(fstr)),
|
||||||
|
ini->oport);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tgtpif == PIF_NONE)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
f->pif[TGTSIDE] = tgtpif;
|
||||||
|
flow_set_state(f, FLOW_STATE_TGT);
|
||||||
|
return tgt;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_set_type() - Set type and move to TYPED
|
||||||
|
* @flow: Flow to change state
|
||||||
|
* @type: Type for new flow
|
||||||
|
*/
|
||||||
|
union flow *flow_set_type(union flow *flow, enum flow_type type)
|
||||||
|
{
|
||||||
|
struct flow_common *f = &flow->f;
|
||||||
|
|
||||||
|
ASSERT(type != FLOW_TYPE_NONE);
|
||||||
|
ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_TGT);
|
||||||
|
ASSERT(f->type == FLOW_TYPE_NONE);
|
||||||
|
ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE);
|
||||||
|
|
||||||
|
f->type = type;
|
||||||
|
flow_set_state(f, FLOW_STATE_TYPED);
|
||||||
return flow;
|
return flow;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* flow_end() - Clear flow type for finished flow and log
|
* flow_activate() - Move flow to ACTIVE
|
||||||
* @flow: Flow to clear
|
* @f: Flow to change state
|
||||||
*/
|
*/
|
||||||
static void flow_end(union flow *flow)
|
void flow_activate(struct flow_common *f)
|
||||||
{
|
{
|
||||||
if (flow->f.type == FLOW_TYPE_NONE)
|
ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED);
|
||||||
return; /* Nothing to do */
|
ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE);
|
||||||
|
|
||||||
flow_dbg(flow, "END %s", flow_type_str[flow->f.type]);
|
flow_set_state(f, FLOW_STATE_ACTIVE);
|
||||||
flow->f.type = FLOW_TYPE_NONE;
|
flow_new_entry = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -192,9 +499,12 @@ union flow *flow_alloc(void)
|
||||||
{
|
{
|
||||||
union flow *flow = &flowtab[flow_first_free];
|
union flow *flow = &flowtab[flow_first_free];
|
||||||
|
|
||||||
|
ASSERT(!flow_new_entry);
|
||||||
|
|
||||||
if (flow_first_free >= FLOW_MAX)
|
if (flow_first_free >= FLOW_MAX)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
ASSERT(flow->f.state == FLOW_STATE_FREE);
|
||||||
ASSERT(flow->f.type == FLOW_TYPE_NONE);
|
ASSERT(flow->f.type == FLOW_TYPE_NONE);
|
||||||
ASSERT(flow->free.n >= 1);
|
ASSERT(flow->free.n >= 1);
|
||||||
ASSERT(flow_first_free + flow->free.n <= FLOW_MAX);
|
ASSERT(flow_first_free + flow->free.n <= FLOW_MAX);
|
||||||
|
@ -217,7 +527,10 @@ union flow *flow_alloc(void)
|
||||||
flow_first_free = flow->free.next;
|
flow_first_free = flow->free.next;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
flow_new_entry = flow;
|
||||||
memset(flow, 0, sizeof(*flow));
|
memset(flow, 0, sizeof(*flow));
|
||||||
|
flow_set_state(&flow->f, FLOW_STATE_NEW);
|
||||||
|
|
||||||
return flow;
|
return flow;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -229,15 +542,228 @@ union flow *flow_alloc(void)
|
||||||
*/
|
*/
|
||||||
void flow_alloc_cancel(union flow *flow)
|
void flow_alloc_cancel(union flow *flow)
|
||||||
{
|
{
|
||||||
|
ASSERT(flow_new_entry == flow);
|
||||||
|
ASSERT(flow->f.state == FLOW_STATE_NEW ||
|
||||||
|
flow->f.state == FLOW_STATE_INI ||
|
||||||
|
flow->f.state == FLOW_STATE_TGT ||
|
||||||
|
flow->f.state == FLOW_STATE_TYPED);
|
||||||
ASSERT(flow_first_free > FLOW_IDX(flow));
|
ASSERT(flow_first_free > FLOW_IDX(flow));
|
||||||
|
|
||||||
flow_end(flow);
|
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
||||||
|
memset(flow, 0, sizeof(*flow));
|
||||||
|
|
||||||
/* Put it back in a length 1 free cluster, don't attempt to fully
|
/* Put it back in a length 1 free cluster, don't attempt to fully
|
||||||
* reverse flow_alloc()'s steps. This will get folded together the next
|
* reverse flow_alloc()'s steps. This will get folded together the next
|
||||||
* time flow_defer_handler() runs anyway */
|
* time flow_defer_handler() runs anyway */
|
||||||
flow->free.n = 1;
|
flow->free.n = 1;
|
||||||
flow->free.next = flow_first_free;
|
flow->free.next = flow_first_free;
|
||||||
flow_first_free = FLOW_IDX(flow);
|
flow_first_free = FLOW_IDX(flow);
|
||||||
|
flow_new_entry = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_hash() - Calculate hash value for one side of a flow
|
||||||
|
* @c: Execution context
|
||||||
|
* @proto: Protocol of this flow (IP L4 protocol number)
|
||||||
|
* @pif: pif of the side to hash
|
||||||
|
* @side: Flowside (must not have unspecified parts)
|
||||||
|
*
|
||||||
|
* Return: hash value
|
||||||
|
*/
|
||||||
|
static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||||
|
const struct flowside *side)
|
||||||
|
{
|
||||||
|
struct siphash_state state = SIPHASH_INIT(c->hash_secret);
|
||||||
|
|
||||||
|
inany_siphash_feed(&state, &side->oaddr);
|
||||||
|
inany_siphash_feed(&state, &side->eaddr);
|
||||||
|
|
||||||
|
return siphash_final(&state, 38, (uint64_t)proto << 40 |
|
||||||
|
(uint64_t)pif << 32 |
|
||||||
|
(uint64_t)side->oport << 16 |
|
||||||
|
(uint64_t)side->eport);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_sidx_hash() - Calculate hash value for given side of a given flow
|
||||||
|
* @c: Execution context
|
||||||
|
* @sidx: Flow & side index to get hash for
|
||||||
|
*
|
||||||
|
* Return: hash value, of the flow & side represented by @sidx
|
||||||
|
*/
|
||||||
|
static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
|
||||||
|
{
|
||||||
|
const struct flow_common *f = &flow_at_sidx(sidx)->f;
|
||||||
|
const struct flowside *side = &f->side[sidx.sidei];
|
||||||
|
uint8_t pif = f->pif[sidx.sidei];
|
||||||
|
|
||||||
|
/* For the hash table to work, entries must have complete endpoint
|
||||||
|
* information, and at least a forwarding port.
|
||||||
|
*/
|
||||||
|
ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
|
||||||
|
side->eport != 0 && side->oport != 0);
|
||||||
|
|
||||||
|
return flow_hash(c, FLOW_PROTO(f), pif, side);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_hash_probe_() - Find hash bucket for a flow, given hash
|
||||||
|
* @hash: Raw hash value for flow & side
|
||||||
|
* @sidx: Flow and side to find bucket for
|
||||||
|
*
|
||||||
|
* Return: If @sidx is in the hash table, its current bucket, otherwise a
|
||||||
|
* suitable free bucket for it.
|
||||||
|
*/
|
||||||
|
static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
|
||||||
|
{
|
||||||
|
unsigned b = hash % FLOW_HASH_SIZE;
|
||||||
|
|
||||||
|
/* Linear probing */
|
||||||
|
while (flow_sidx_valid(flow_hashtab[b]) &&
|
||||||
|
!flow_sidx_eq(flow_hashtab[b], sidx))
|
||||||
|
b = mod_sub(b, 1, FLOW_HASH_SIZE);
|
||||||
|
|
||||||
|
return b;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_hash_probe() - Find hash bucket for a flow
|
||||||
|
* @c: Execution context
|
||||||
|
* @sidx: Flow and side to find bucket for
|
||||||
|
*
|
||||||
|
* Return: If @sidx is in the hash table, its current bucket, otherwise a
|
||||||
|
* suitable free bucket for it.
|
||||||
|
*/
|
||||||
|
static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
|
||||||
|
{
|
||||||
|
return flow_hash_probe_(flow_sidx_hash(c, sidx), sidx);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_hash_insert() - Insert side of a flow into hash table
|
||||||
|
* @c: Execution context
|
||||||
|
* @sidx: Flow & side index
|
||||||
|
*
|
||||||
|
* Return: raw (un-modded) hash value of side of flow
|
||||||
|
*/
|
||||||
|
uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx)
|
||||||
|
{
|
||||||
|
uint64_t hash = flow_sidx_hash(c, sidx);
|
||||||
|
unsigned b = flow_hash_probe_(hash, sidx);
|
||||||
|
|
||||||
|
flow_hashtab[b] = sidx;
|
||||||
|
flow_dbg(flow_at_sidx(sidx), "Side %u hash table insert: bucket: %u",
|
||||||
|
sidx.sidei, b);
|
||||||
|
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_hash_remove() - Drop side of a flow from the hash table
|
||||||
|
* @c: Execution context
|
||||||
|
* @sidx: Side of flow to remove
|
||||||
|
*/
|
||||||
|
void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx)
|
||||||
|
{
|
||||||
|
unsigned b = flow_hash_probe(c, sidx), s;
|
||||||
|
|
||||||
|
if (!flow_sidx_valid(flow_hashtab[b]))
|
||||||
|
return; /* Redundant remove */
|
||||||
|
|
||||||
|
flow_dbg(flow_at_sidx(sidx), "Side %u hash table remove: bucket: %u",
|
||||||
|
sidx.sidei, b);
|
||||||
|
|
||||||
|
/* Scan the remainder of the cluster */
|
||||||
|
for (s = mod_sub(b, 1, FLOW_HASH_SIZE);
|
||||||
|
flow_sidx_valid(flow_hashtab[s]);
|
||||||
|
s = mod_sub(s, 1, FLOW_HASH_SIZE)) {
|
||||||
|
unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE;
|
||||||
|
|
||||||
|
if (!mod_between(h, s, b, FLOW_HASH_SIZE)) {
|
||||||
|
/* flow_hashtab[s] can live in flow_hashtab[b]'s slot */
|
||||||
|
debug("hash table remove: shuffle %u -> %u", s, b);
|
||||||
|
flow_hashtab[b] = flow_hashtab[s];
|
||||||
|
b = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
flow_hashtab[b] = FLOW_SIDX_NONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flowside_lookup() - Look for a matching flowside in the flow table
|
||||||
|
* @c: Execution context
|
||||||
|
* @proto: Protocol of the flow (IP L4 protocol number)
|
||||||
|
* @pif: pif to look for in the table
|
||||||
|
* @side: Flowside to look for in the table
|
||||||
|
*
|
||||||
|
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
||||||
|
*/
|
||||||
|
static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
|
||||||
|
uint8_t pif, const struct flowside *side)
|
||||||
|
{
|
||||||
|
flow_sidx_t sidx;
|
||||||
|
union flow *flow;
|
||||||
|
unsigned b;
|
||||||
|
|
||||||
|
b = flow_hash(c, proto, pif, side) % FLOW_HASH_SIZE;
|
||||||
|
while ((sidx = flow_hashtab[b], flow = flow_at_sidx(sidx)) &&
|
||||||
|
!(FLOW_PROTO(&flow->f) == proto &&
|
||||||
|
flow->f.pif[sidx.sidei] == pif &&
|
||||||
|
flowside_eq(&flow->f.side[sidx.sidei], side)))
|
||||||
|
b = mod_sub(b, 1, FLOW_HASH_SIZE);
|
||||||
|
|
||||||
|
return flow_hashtab[b];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_lookup_af() - Look up a flow given addressing information
|
||||||
|
* @c: Execution context
|
||||||
|
* @proto: Protocol of the flow (IP L4 protocol number)
|
||||||
|
* @pif: Interface of the flow
|
||||||
|
* @af: Address family, AF_INET or AF_INET6
|
||||||
|
* @eaddr: Guest side endpoint address (guest local address)
|
||||||
|
* @oaddr: Our guest side address (guest remote address)
|
||||||
|
* @eport: Guest side endpoint port (guest local port)
|
||||||
|
* @oport: Our guest side port (guest remote port)
|
||||||
|
*
|
||||||
|
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
||||||
|
*/
|
||||||
|
flow_sidx_t flow_lookup_af(const struct ctx *c,
|
||||||
|
uint8_t proto, uint8_t pif, sa_family_t af,
|
||||||
|
const void *eaddr, const void *oaddr,
|
||||||
|
in_port_t eport, in_port_t oport)
|
||||||
|
{
|
||||||
|
struct flowside side;
|
||||||
|
|
||||||
|
flowside_from_af(&side, af, eaddr, eport, oaddr, oport);
|
||||||
|
return flowside_lookup(c, proto, pif, &side);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_lookup_sa() - Look up a flow given an endpoint socket address
|
||||||
|
* @c: Execution context
|
||||||
|
* @proto: Protocol of the flow (IP L4 protocol number)
|
||||||
|
* @pif: Interface of the flow
|
||||||
|
* @esa: Socket address of the endpoint
|
||||||
|
* @oport: Our port number
|
||||||
|
*
|
||||||
|
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
||||||
|
*/
|
||||||
|
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||||
|
const void *esa, in_port_t oport)
|
||||||
|
{
|
||||||
|
struct flowside side = {
|
||||||
|
.oport = oport,
|
||||||
|
};
|
||||||
|
|
||||||
|
inany_from_sockaddr(&side.eaddr, &side.eport, esa);
|
||||||
|
if (inany_v4(&side.eaddr))
|
||||||
|
side.oaddr = inany_any4;
|
||||||
|
else
|
||||||
|
side.oaddr = inany_any6;
|
||||||
|
|
||||||
|
return flowside_lookup(c, proto, pif, &side);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -257,11 +783,14 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
||||||
flow_timer_run = *now;
|
flow_timer_run = *now;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
|
||||||
|
|
||||||
for (idx = 0; idx < FLOW_MAX; idx++) {
|
for (idx = 0; idx < FLOW_MAX; idx++) {
|
||||||
union flow *flow = &flowtab[idx];
|
union flow *flow = &flowtab[idx];
|
||||||
bool closed = false;
|
bool closed = false;
|
||||||
|
|
||||||
if (flow->f.type == FLOW_TYPE_NONE) {
|
switch (flow->f.state) {
|
||||||
|
case FLOW_STATE_FREE: {
|
||||||
unsigned skip = flow->free.n;
|
unsigned skip = flow->free.n;
|
||||||
|
|
||||||
/* First entry of a free cluster must have n >= 1 */
|
/* First entry of a free cluster must have n >= 1 */
|
||||||
|
@ -283,17 +812,43 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case FLOW_STATE_NEW:
|
||||||
|
case FLOW_STATE_INI:
|
||||||
|
case FLOW_STATE_TGT:
|
||||||
|
case FLOW_STATE_TYPED:
|
||||||
|
/* Incomplete flow at end of cycle */
|
||||||
|
ASSERT(false);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case FLOW_STATE_ACTIVE:
|
||||||
|
/* Nothing to do */
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
switch (flow->f.type) {
|
switch (flow->f.type) {
|
||||||
case FLOW_TYPE_NONE:
|
case FLOW_TYPE_NONE:
|
||||||
ASSERT(false);
|
ASSERT(false);
|
||||||
break;
|
break;
|
||||||
case FLOW_TCP:
|
case FLOW_TCP:
|
||||||
closed = tcp_flow_defer(flow);
|
closed = tcp_flow_defer(&flow->tcp);
|
||||||
break;
|
break;
|
||||||
case FLOW_TCP_SPLICE:
|
case FLOW_TCP_SPLICE:
|
||||||
closed = tcp_splice_flow_defer(flow);
|
closed = tcp_splice_flow_defer(&flow->tcp_splice);
|
||||||
if (!closed && timer)
|
if (!closed && timer)
|
||||||
tcp_splice_timer(c, flow);
|
tcp_splice_timer(c, &flow->tcp_splice);
|
||||||
|
break;
|
||||||
|
case FLOW_PING4:
|
||||||
|
case FLOW_PING6:
|
||||||
|
if (timer)
|
||||||
|
closed = icmp_ping_timer(c, &flow->ping, now);
|
||||||
|
break;
|
||||||
|
case FLOW_UDP:
|
||||||
|
closed = udp_flow_defer(&flow->udp);
|
||||||
|
if (!closed && timer)
|
||||||
|
closed = udp_flow_timer(c, &flow->udp, now);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
/* Assume other flow types don't need any handling */
|
/* Assume other flow types don't need any handling */
|
||||||
|
@ -301,7 +856,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (closed) {
|
if (closed) {
|
||||||
flow_end(flow);
|
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
||||||
|
memset(flow, 0, sizeof(*flow));
|
||||||
|
|
||||||
if (free_head) {
|
if (free_head) {
|
||||||
/* Add slot to current free cluster */
|
/* Add slot to current free cluster */
|
||||||
|
@ -328,7 +884,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
||||||
*/
|
*/
|
||||||
void flow_init(void)
|
void flow_init(void)
|
||||||
{
|
{
|
||||||
|
unsigned b;
|
||||||
|
|
||||||
/* Initial state is a single free cluster containing the whole table */
|
/* Initial state is a single free cluster containing the whole table */
|
||||||
flowtab[0].free.n = FLOW_MAX;
|
flowtab[0].free.n = FLOW_MAX;
|
||||||
flowtab[0].free.next = FLOW_MAX;
|
flowtab[0].free.next = FLOW_MAX;
|
||||||
|
|
||||||
|
for (b = 0; b < FLOW_HASH_SIZE; b++)
|
||||||
|
flow_hashtab[b] = FLOW_SIDX_NONE;
|
||||||
}
|
}
|
||||||
|
|
199
flow.h
199
flow.h
|
@ -9,6 +9,98 @@
|
||||||
|
|
||||||
#define FLOW_TIMER_INTERVAL 1000 /* ms */
|
#define FLOW_TIMER_INTERVAL 1000 /* ms */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* enum flow_state - States of a flow table entry
|
||||||
|
*
|
||||||
|
* An individual flow table entry moves through these states, usually in this
|
||||||
|
* order.
|
||||||
|
* General rules:
|
||||||
|
* - Code outside flow.c should never write common fields of union flow.
|
||||||
|
* - The state field may always be read.
|
||||||
|
*
|
||||||
|
* FREE - Part of the general pool of free flow table entries
|
||||||
|
* Operations:
|
||||||
|
* - flow_alloc() finds an entry and moves it to NEW
|
||||||
|
*
|
||||||
|
* NEW - Freshly allocated, uninitialised entry
|
||||||
|
* Operations:
|
||||||
|
* - flow_alloc_cancel() returns the entry to FREE
|
||||||
|
* - flow_initiate() sets the entry's INISIDE details and moves to
|
||||||
|
* INI
|
||||||
|
* - FLOW_SET_TYPE() sets the entry's type and moves to TYPED
|
||||||
|
* Caveats:
|
||||||
|
* - No fields other than state may be accessed
|
||||||
|
* - At most one entry may be NEW, INI, TGT or TYPED at a time, so
|
||||||
|
* it's unsafe to use flow_alloc() again until this entry moves to
|
||||||
|
* ACTIVE or FREE
|
||||||
|
* - You may not return to the main epoll loop while any flow is NEW
|
||||||
|
*
|
||||||
|
* INI - An entry with INISIDE common information completed
|
||||||
|
* Operations:
|
||||||
|
* - Common fields related to INISIDE may be read
|
||||||
|
* - flow_alloc_cancel() returns the entry to FREE
|
||||||
|
* - flow_target() sets the entry's TGTSIDE details and moves to TGT
|
||||||
|
* Caveats:
|
||||||
|
* - Other common fields may not be read
|
||||||
|
* - Type specific fields may not be read or written
|
||||||
|
* - At most one entry may be NEW, INI, TGT or TYPED at a time, so
|
||||||
|
* it's unsafe to use flow_alloc() again until this entry moves to
|
||||||
|
* ACTIVE or FREE
|
||||||
|
* - You may not return to the main epoll loop while any flow is INI
|
||||||
|
*
|
||||||
|
* TGT - An entry with only INISIDE and TGTSIDE common information completed
|
||||||
|
* Operations:
|
||||||
|
* - Common fields related to INISIDE & TGTSIDE may be read
|
||||||
|
* - flow_alloc_cancel() returns the entry to FREE
|
||||||
|
* - FLOW_SET_TYPE() sets the entry's type and moves to TYPED
|
||||||
|
* Caveats:
|
||||||
|
* - Other common fields may not be read
|
||||||
|
* - Type specific fields may not be read or written
|
||||||
|
* - At most one entry may be NEW, INI, TGT or TYPED at a time, so
|
||||||
|
* it's unsafe to use flow_alloc() again until this entry moves to
|
||||||
|
* ACTIVE or FREE
|
||||||
|
* - You may not return to the main epoll loop while any flow is TGT
|
||||||
|
*
|
||||||
|
* TYPED - Generic info initialised, type specific initialisation underway
|
||||||
|
* Operations:
|
||||||
|
* - All common fields may be read
|
||||||
|
* - Type specific fields may be read and written
|
||||||
|
* - flow_alloc_cancel() returns the entry to FREE
|
||||||
|
* - FLOW_ACTIVATE() moves the entry to ACTIVE
|
||||||
|
* Caveats:
|
||||||
|
* - At most one entry may be NEW, INI, TGT or TYPED at a time, so
|
||||||
|
* it's unsafe to use flow_alloc() again until this entry moves to
|
||||||
|
* ACTIVE or FREE
|
||||||
|
* - You may not return to the main epoll loop while any flow is
|
||||||
|
* TYPED
|
||||||
|
*
|
||||||
|
* ACTIVE - An active, fully-initialised flow entry
|
||||||
|
* Operations:
|
||||||
|
* - All common fields may be read
|
||||||
|
* - Type specific fields may be read and written
|
||||||
|
* - Flow returns to FREE when it expires, signalled by returning
|
||||||
|
* 'true' from flow type specific deferred or timer handler
|
||||||
|
* Caveats:
|
||||||
|
* - flow_alloc_cancel() may not be called on it
|
||||||
|
*/
|
||||||
|
enum flow_state {
|
||||||
|
FLOW_STATE_FREE,
|
||||||
|
FLOW_STATE_NEW,
|
||||||
|
FLOW_STATE_INI,
|
||||||
|
FLOW_STATE_TGT,
|
||||||
|
FLOW_STATE_TYPED,
|
||||||
|
FLOW_STATE_ACTIVE,
|
||||||
|
|
||||||
|
FLOW_NUM_STATES,
|
||||||
|
};
|
||||||
|
#define FLOW_STATE_BITS 8
|
||||||
|
static_assert(FLOW_NUM_STATES <= (1 << FLOW_STATE_BITS),
|
||||||
|
"Too many flow states for FLOW_STATE_BITS");
|
||||||
|
|
||||||
|
extern const char *flow_state_str[];
|
||||||
|
#define FLOW_STATE(f) \
|
||||||
|
((f)->state < FLOW_NUM_STATES ? flow_state_str[(f)->state] : "?")
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* enum flow_type - Different types of packet flows we track
|
* enum flow_type - Different types of packet flows we track
|
||||||
*/
|
*/
|
||||||
|
@ -19,9 +111,18 @@ enum flow_type {
|
||||||
FLOW_TCP,
|
FLOW_TCP,
|
||||||
/* A TCP connection between a host socket and ns socket */
|
/* A TCP connection between a host socket and ns socket */
|
||||||
FLOW_TCP_SPLICE,
|
FLOW_TCP_SPLICE,
|
||||||
|
/* ICMP echo requests from guest to host and matching replies back */
|
||||||
|
FLOW_PING4,
|
||||||
|
/* ICMPv6 echo requests from guest to host and matching replies back */
|
||||||
|
FLOW_PING6,
|
||||||
|
/* UDP pseudo-connection */
|
||||||
|
FLOW_UDP,
|
||||||
|
|
||||||
FLOW_NUM_TYPES,
|
FLOW_NUM_TYPES,
|
||||||
};
|
};
|
||||||
|
#define FLOW_TYPE_BITS 8
|
||||||
|
static_assert(FLOW_NUM_TYPES <= (1 << FLOW_TYPE_BITS),
|
||||||
|
"Too many flow types for FLOW_TYPE_BITS");
|
||||||
|
|
||||||
extern const char *flow_type_str[];
|
extern const char *flow_type_str[];
|
||||||
#define FLOW_TYPE(f) \
|
#define FLOW_TYPE(f) \
|
||||||
|
@ -31,12 +132,66 @@ extern const uint8_t flow_proto[];
|
||||||
#define FLOW_PROTO(f) \
|
#define FLOW_PROTO(f) \
|
||||||
((f)->type < FLOW_NUM_TYPES ? flow_proto[(f)->type] : 0)
|
((f)->type < FLOW_NUM_TYPES ? flow_proto[(f)->type] : 0)
|
||||||
|
|
||||||
|
#define SIDES 2
|
||||||
|
|
||||||
|
#define INISIDE 0 /* Initiating side index */
|
||||||
|
#define TGTSIDE 1 /* Target side index */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct flowside - Address information for one side of a flow
|
||||||
|
* @eaddr: Endpoint address (remote address from passt's PoV)
|
||||||
|
* @oaddr: Our address (local address from passt's PoV)
|
||||||
|
* @eport: Endpoint port
|
||||||
|
* @oport: Our port
|
||||||
|
*/
|
||||||
|
struct flowside {
|
||||||
|
union inany_addr oaddr;
|
||||||
|
union inany_addr eaddr;
|
||||||
|
in_port_t oport;
|
||||||
|
in_port_t eport;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flowside_eq() - Check if two flowsides are equal
|
||||||
|
* @left, @right: Flowsides to compare
|
||||||
|
*
|
||||||
|
* Return: true if equal, false otherwise
|
||||||
|
*/
|
||||||
|
static inline bool flowside_eq(const struct flowside *left,
|
||||||
|
const struct flowside *right)
|
||||||
|
{
|
||||||
|
return inany_equals(&left->eaddr, &right->eaddr) &&
|
||||||
|
left->eport == right->eport &&
|
||||||
|
inany_equals(&left->oaddr, &right->oaddr) &&
|
||||||
|
left->oport == right->oport;
|
||||||
|
}
|
||||||
|
|
||||||
|
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||||
|
const struct flowside *tgt, uint32_t data);
|
||||||
|
int flowside_connect(const struct ctx *c, int s,
|
||||||
|
uint8_t pif, const struct flowside *tgt);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct flow_common - Common fields for packet flows
|
* struct flow_common - Common fields for packet flows
|
||||||
|
* @state: State of the flow table entry
|
||||||
* @type: Type of packet flow
|
* @type: Type of packet flow
|
||||||
|
* @pif[]: Interface for each side of the flow
|
||||||
|
* @side[]: Information for each side of the flow
|
||||||
*/
|
*/
|
||||||
struct flow_common {
|
struct flow_common {
|
||||||
|
#ifdef __GNUC__
|
||||||
|
enum flow_state state:FLOW_STATE_BITS;
|
||||||
|
enum flow_type type:FLOW_TYPE_BITS;
|
||||||
|
#else
|
||||||
|
uint8_t state;
|
||||||
|
static_assert(sizeof(uint8_t) * 8 >= FLOW_STATE_BITS,
|
||||||
|
"Not enough bits for state field");
|
||||||
uint8_t type;
|
uint8_t type;
|
||||||
|
static_assert(sizeof(uint8_t) * 8 >= FLOW_TYPE_BITS,
|
||||||
|
"Not enough bits for type field");
|
||||||
|
#endif
|
||||||
|
uint8_t pif[SIDES];
|
||||||
|
struct flowside side[SIDES];
|
||||||
};
|
};
|
||||||
|
|
||||||
#define FLOW_INDEX_BITS 17 /* 128k - 1 */
|
#define FLOW_INDEX_BITS 17 /* 128k - 1 */
|
||||||
|
@ -45,24 +200,30 @@ struct flow_common {
|
||||||
#define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */
|
#define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */
|
||||||
#define FLOW_FILE_PRESSURE 30 /* % of c->nofile */
|
#define FLOW_FILE_PRESSURE 30 /* % of c->nofile */
|
||||||
|
|
||||||
union flow *flow_start(union flow *flow, enum flow_type type,
|
|
||||||
unsigned iniside);
|
|
||||||
#define FLOW_START(flow_, t_, var_, i_) \
|
|
||||||
(&flow_start((flow_), (t_), (i_))->var_)
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct flow_sidx - ID for one side of a specific flow
|
* struct flow_sidx - ID for one side of a specific flow
|
||||||
* @side: Side referenced (0 or 1)
|
* @sidei: Index of side referenced (0 or 1)
|
||||||
* @flow: Index of flow referenced
|
* @flowi: Index of flow referenced
|
||||||
*/
|
*/
|
||||||
typedef struct flow_sidx {
|
typedef struct flow_sidx {
|
||||||
unsigned side :1;
|
unsigned sidei :1;
|
||||||
unsigned flow :FLOW_INDEX_BITS;
|
unsigned flowi :FLOW_INDEX_BITS;
|
||||||
} flow_sidx_t;
|
} flow_sidx_t;
|
||||||
static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t),
|
static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t),
|
||||||
"flow_sidx_t must fit within 32 bits");
|
"flow_sidx_t must fit within 32 bits");
|
||||||
|
|
||||||
#define FLOW_SIDX_NONE ((flow_sidx_t){ .flow = FLOW_MAX })
|
#define FLOW_SIDX_NONE ((flow_sidx_t){ .flowi = FLOW_MAX })
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_sidx_valid() - Test if a sidx is valid
|
||||||
|
* @sidx: sidx value
|
||||||
|
*
|
||||||
|
* Return: true if @sidx refers to a valid flow & side
|
||||||
|
*/
|
||||||
|
static inline bool flow_sidx_valid(flow_sidx_t sidx)
|
||||||
|
{
|
||||||
|
return sidx.flowi < FLOW_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* flow_sidx_eq() - Test if two sidx values are equal
|
* flow_sidx_eq() - Test if two sidx values are equal
|
||||||
|
@ -72,9 +233,18 @@ static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t),
|
||||||
*/
|
*/
|
||||||
static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b)
|
static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b)
|
||||||
{
|
{
|
||||||
return (a.flow == b.flow) && (a.side == b.side);
|
return (a.flowi == b.flowi) && (a.sidei == b.sidei);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx);
|
||||||
|
void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx);
|
||||||
|
flow_sidx_t flow_lookup_af(const struct ctx *c,
|
||||||
|
uint8_t proto, uint8_t pif, sa_family_t af,
|
||||||
|
const void *eaddr, const void *oaddr,
|
||||||
|
in_port_t eport, in_port_t oport);
|
||||||
|
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||||
|
const void *esa, in_port_t oport);
|
||||||
|
|
||||||
union flow;
|
union flow;
|
||||||
|
|
||||||
void flow_init(void);
|
void flow_init(void);
|
||||||
|
@ -94,4 +264,11 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
||||||
flow_dbg((f), __VA_ARGS__); \
|
flow_dbg((f), __VA_ARGS__); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
void flow_log_details_(const struct flow_common *f, int pri,
|
||||||
|
enum flow_state state);
|
||||||
|
#define flow_log_details(f_, pri) \
|
||||||
|
flow_log_details_(&((f_)->f), (pri), (f_)->f.state)
|
||||||
|
#define flow_dbg_details(f_) flow_log_details((f_), LOG_DEBUG)
|
||||||
|
#define flow_err_details(f_) flow_log_details((f_), LOG_ERR)
|
||||||
|
|
||||||
#endif /* FLOW_H */
|
#endif /* FLOW_H */
|
||||||
|
|
103
flow_table.h
103
flow_table.h
|
@ -8,6 +8,8 @@
|
||||||
#define FLOW_TABLE_H
|
#define FLOW_TABLE_H
|
||||||
|
|
||||||
#include "tcp_conn.h"
|
#include "tcp_conn.h"
|
||||||
|
#include "icmp_flow.h"
|
||||||
|
#include "udp_flow.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct flow_free_cluster - Information about a cluster of free entries
|
* struct flow_free_cluster - Information about a cluster of free entries
|
||||||
|
@ -33,14 +35,22 @@ union flow {
|
||||||
struct flow_free_cluster free;
|
struct flow_free_cluster free;
|
||||||
struct tcp_tap_conn tcp;
|
struct tcp_tap_conn tcp;
|
||||||
struct tcp_splice_conn tcp_splice;
|
struct tcp_splice_conn tcp_splice;
|
||||||
|
struct icmp_ping_flow ping;
|
||||||
|
struct udp_flow udp;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Global Flow Table */
|
/* Global Flow Table */
|
||||||
extern unsigned flow_first_free;
|
extern unsigned flow_first_free;
|
||||||
extern union flow flowtab[];
|
extern union flow flowtab[];
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow_foreach_sidei() - 'for' type macro to step through each side of flow
|
||||||
|
* @sidei_: Takes value INISIDE, then TGTSIDE
|
||||||
|
*/
|
||||||
|
#define flow_foreach_sidei(sidei_) \
|
||||||
|
for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++)
|
||||||
|
|
||||||
/** flow_idx - Index of flow from common structure
|
/** flow_idx() - Index of flow from common structure
|
||||||
* @f: Common flow fields pointer
|
* @f: Common flow fields pointer
|
||||||
*
|
*
|
||||||
* Return: index of @f in the flow table
|
* Return: index of @f in the flow table
|
||||||
|
@ -50,59 +60,122 @@ static inline unsigned flow_idx(const struct flow_common *f)
|
||||||
return (union flow *)f - flowtab;
|
return (union flow *)f - flowtab;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** FLOW_IDX - Find the index of a flow
|
/** FLOW_IDX() - Find the index of a flow
|
||||||
* @f_: Flow pointer, either union flow * or protocol specific
|
* @f_: Flow pointer, either union flow * or protocol specific
|
||||||
*
|
*
|
||||||
* Return: index of @f in the flow table
|
* Return: index of @f in the flow table
|
||||||
*/
|
*/
|
||||||
#define FLOW_IDX(f_) (flow_idx(&(f_)->f))
|
#define FLOW_IDX(f_) (flow_idx(&(f_)->f))
|
||||||
|
|
||||||
/** FLOW - Flow entry at a given index
|
/** FLOW() - Flow entry at a given index
|
||||||
* @idx: Flow index
|
* @idx: Flow index
|
||||||
*
|
*
|
||||||
* Return: pointer to entry @idx in the flow table
|
* Return: pointer to entry @idx in the flow table
|
||||||
*/
|
*/
|
||||||
#define FLOW(idx) (&flowtab[(idx)])
|
#define FLOW(idx) (&flowtab[(idx)])
|
||||||
|
|
||||||
/** flow_at_sidx - Flow entry for a given sidx
|
/** flow_at_sidx() - Flow entry for a given sidx
|
||||||
* @sidx: Flow & side index
|
* @sidx: Flow & side index
|
||||||
*
|
*
|
||||||
* Return: pointer to the corresponding flow entry, or NULL
|
* Return: pointer to the corresponding flow entry, or NULL
|
||||||
*/
|
*/
|
||||||
static inline union flow *flow_at_sidx(flow_sidx_t sidx)
|
static inline union flow *flow_at_sidx(flow_sidx_t sidx)
|
||||||
{
|
{
|
||||||
if (sidx.flow >= FLOW_MAX)
|
if (!flow_sidx_valid(sidx))
|
||||||
return NULL;
|
return NULL;
|
||||||
return FLOW(sidx.flow);
|
return FLOW(sidx.flowi);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** flow_sidx_t - Index of one side of a flow from common structure
|
/** pif_at_sidx() - Interface for a given flow and side
|
||||||
|
* @sidx: Flow & side index
|
||||||
|
*
|
||||||
|
* Return: pif for the flow & side given by @sidx
|
||||||
|
*/
|
||||||
|
static inline uint8_t pif_at_sidx(flow_sidx_t sidx)
|
||||||
|
{
|
||||||
|
const union flow *flow = flow_at_sidx(sidx);
|
||||||
|
|
||||||
|
if (!flow)
|
||||||
|
return PIF_NONE;
|
||||||
|
return flow->f.pif[sidx.sidei];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** flowside_at_sidx() - Retrieve a specific flowside
|
||||||
|
* @sidx: Flow & side index
|
||||||
|
*
|
||||||
|
* Return: Flowside for the flow & side given by @sidx
|
||||||
|
*/
|
||||||
|
static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
|
||||||
|
{
|
||||||
|
const union flow *flow = flow_at_sidx(sidx);
|
||||||
|
|
||||||
|
if (!flow)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
return &flow->f.side[sidx.sidei];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** flow_sidx_opposite() - Get the other side of the same flow
|
||||||
|
* @sidx: Flow & side index
|
||||||
|
*
|
||||||
|
* Return: sidx for the other side of the same flow as @sidx
|
||||||
|
*/
|
||||||
|
static inline flow_sidx_t flow_sidx_opposite(flow_sidx_t sidx)
|
||||||
|
{
|
||||||
|
if (!flow_sidx_valid(sidx))
|
||||||
|
return FLOW_SIDX_NONE;
|
||||||
|
|
||||||
|
return (flow_sidx_t){.flowi = sidx.flowi, .sidei = !sidx.sidei};
|
||||||
|
}
|
||||||
|
|
||||||
|
/** flow_sidx() - Index of one side of a flow from common structure
|
||||||
* @f: Common flow fields pointer
|
* @f: Common flow fields pointer
|
||||||
* @side: Which side to refer to (0 or 1)
|
* @sidei: Which side to refer to (0 or 1)
|
||||||
*
|
*
|
||||||
* Return: index of @f and @side in the flow table
|
* Return: index of @f and @sidei in the flow table
|
||||||
*/
|
*/
|
||||||
static inline flow_sidx_t flow_sidx(const struct flow_common *f,
|
static inline flow_sidx_t flow_sidx(const struct flow_common *f,
|
||||||
int side)
|
unsigned sidei)
|
||||||
{
|
{
|
||||||
/* cppcheck-suppress [knownConditionTrueFalse, unmatchedSuppression] */
|
/* cppcheck-suppress [knownConditionTrueFalse, unmatchedSuppression] */
|
||||||
ASSERT(side == !!side);
|
ASSERT(sidei == !!sidei);
|
||||||
|
|
||||||
return (flow_sidx_t){
|
return (flow_sidx_t){
|
||||||
.side = side,
|
.sidei = sidei,
|
||||||
.flow = flow_idx(f),
|
.flowi = flow_idx(f),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/** FLOW_SIDX - Find the index of one side of a flow
|
/** FLOW_SIDX() - Find the index of one side of a flow
|
||||||
* @f_: Flow pointer, either union flow * or protocol specific
|
* @f_: Flow pointer, either union flow * or protocol specific
|
||||||
* @side: Which side to index (0 or 1)
|
* @sidei: Which side to index (0 or 1)
|
||||||
*
|
*
|
||||||
* Return: index of @f and @side in the flow table
|
* Return: index of @f and @sidei in the flow table
|
||||||
*/
|
*/
|
||||||
#define FLOW_SIDX(f_, side) (flow_sidx(&(f_)->f, (side)))
|
#define FLOW_SIDX(f_, sidei) (flow_sidx(&(f_)->f, (sidei)))
|
||||||
|
|
||||||
union flow *flow_alloc(void);
|
union flow *flow_alloc(void);
|
||||||
void flow_alloc_cancel(union flow *flow);
|
void flow_alloc_cancel(union flow *flow);
|
||||||
|
|
||||||
|
const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
|
||||||
|
sa_family_t af,
|
||||||
|
const void *saddr, in_port_t sport,
|
||||||
|
const void *daddr, in_port_t dport);
|
||||||
|
const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||||
|
const union sockaddr_inany *ssa,
|
||||||
|
in_port_t dport);
|
||||||
|
const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
|
||||||
|
sa_family_t af,
|
||||||
|
const void *saddr, in_port_t sport,
|
||||||
|
const void *daddr, in_port_t dport);
|
||||||
|
const struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
||||||
|
uint8_t proto);
|
||||||
|
|
||||||
|
union flow *flow_set_type(union flow *flow, enum flow_type type);
|
||||||
|
#define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_)
|
||||||
|
|
||||||
|
void flow_activate(struct flow_common *f);
|
||||||
|
#define FLOW_ACTIVATE(flow_) \
|
||||||
|
(flow_activate(&(flow_)->f))
|
||||||
|
|
||||||
#endif /* FLOW_TABLE_H */
|
#endif /* FLOW_TABLE_H */
|
||||||
|
|
387
fwd.c
387
fwd.c
|
@ -25,6 +25,81 @@
|
||||||
#include "fwd.h"
|
#include "fwd.h"
|
||||||
#include "passt.h"
|
#include "passt.h"
|
||||||
#include "lineread.h"
|
#include "lineread.h"
|
||||||
|
#include "flow_table.h"
|
||||||
|
|
||||||
|
/* Ephemeral port range: values from RFC 6335 */
|
||||||
|
static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
|
||||||
|
static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
|
||||||
|
|
||||||
|
#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
|
||||||
|
|
||||||
|
/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
|
||||||
|
*
|
||||||
|
* Work out what ports the host thinks are ephemeral and record it for later
|
||||||
|
* use by fwd_port_is_ephemeral(). If we're unable to probe, assume the range
|
||||||
|
* recommended by RFC 6335.
|
||||||
|
*/
|
||||||
|
void fwd_probe_ephemeral(void)
|
||||||
|
{
|
||||||
|
char *line, *tab, *end;
|
||||||
|
struct lineread lr;
|
||||||
|
long min, max;
|
||||||
|
ssize_t len;
|
||||||
|
int fd;
|
||||||
|
|
||||||
|
fd = open(PORT_RANGE_SYSCTL, O_RDONLY | O_CLOEXEC);
|
||||||
|
if (fd < 0) {
|
||||||
|
warn_perror("Unable to open %s", PORT_RANGE_SYSCTL);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
lineread_init(&lr, fd);
|
||||||
|
len = lineread_get(&lr, &line);
|
||||||
|
close(fd);
|
||||||
|
|
||||||
|
if (len < 0)
|
||||||
|
goto parse_err;
|
||||||
|
|
||||||
|
tab = strchr(line, '\t');
|
||||||
|
if (!tab)
|
||||||
|
goto parse_err;
|
||||||
|
*tab = '\0';
|
||||||
|
|
||||||
|
errno = 0;
|
||||||
|
min = strtol(line, &end, 10);
|
||||||
|
if (*end || errno)
|
||||||
|
goto parse_err;
|
||||||
|
|
||||||
|
errno = 0;
|
||||||
|
max = strtol(tab + 1, &end, 10);
|
||||||
|
if (*end || errno)
|
||||||
|
goto parse_err;
|
||||||
|
|
||||||
|
if (min < 0 || min >= (long)NUM_PORTS ||
|
||||||
|
max < 0 || max >= (long)NUM_PORTS)
|
||||||
|
goto parse_err;
|
||||||
|
|
||||||
|
fwd_ephemeral_min = min;
|
||||||
|
fwd_ephemeral_max = max;
|
||||||
|
|
||||||
|
return;
|
||||||
|
|
||||||
|
parse_err:
|
||||||
|
warn("Unable to parse %s", PORT_RANGE_SYSCTL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* fwd_port_is_ephemeral() - Is port number ephemeral?
|
||||||
|
* @port: Port number
|
||||||
|
*
|
||||||
|
* Return: true if @port is ephemeral, that is may be allocated by the kernel as
|
||||||
|
* a local port for outgoing connections or datagrams, but should not be
|
||||||
|
* used for binding services to.
|
||||||
|
*/
|
||||||
|
bool fwd_port_is_ephemeral(in_port_t port)
|
||||||
|
{
|
||||||
|
return (port >= fwd_ephemeral_min) && (port <= fwd_ephemeral_max);
|
||||||
|
}
|
||||||
|
|
||||||
/* See enum in kernel's include/net/tcp_states.h */
|
/* See enum in kernel's include/net/tcp_states.h */
|
||||||
#define UDP_LISTEN 0x07
|
#define UDP_LISTEN 0x07
|
||||||
|
@ -38,7 +113,7 @@
|
||||||
* @exclude: Bitmap of ports to exclude from setting (and clear)
|
* @exclude: Bitmap of ports to exclude from setting (and clear)
|
||||||
*
|
*
|
||||||
* #syscalls:pasta lseek
|
* #syscalls:pasta lseek
|
||||||
* #syscalls:pasta ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
|
* #syscalls:pasta ppc64le:_llseek ppc64:_llseek arm:_llseek
|
||||||
*/
|
*/
|
||||||
static void procfs_scan_listen(int fd, unsigned int lstate,
|
static void procfs_scan_listen(int fd, unsigned int lstate,
|
||||||
uint8_t *map, const uint8_t *exclude)
|
uint8_t *map, const uint8_t *exclude)
|
||||||
|
@ -52,7 +127,7 @@ static void procfs_scan_listen(int fd, unsigned int lstate,
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (lseek(fd, 0, SEEK_SET)) {
|
if (lseek(fd, 0, SEEK_SET)) {
|
||||||
warn("lseek() failed on /proc/net file: %s", strerror(errno));
|
warn_perror("lseek() failed on /proc/net file");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -128,18 +203,18 @@ void fwd_scan_ports_init(struct ctx *c)
|
||||||
|
|
||||||
c->tcp.fwd_in.scan4 = c->tcp.fwd_in.scan6 = -1;
|
c->tcp.fwd_in.scan4 = c->tcp.fwd_in.scan6 = -1;
|
||||||
c->tcp.fwd_out.scan4 = c->tcp.fwd_out.scan6 = -1;
|
c->tcp.fwd_out.scan4 = c->tcp.fwd_out.scan6 = -1;
|
||||||
c->udp.fwd_in.f.scan4 = c->udp.fwd_in.f.scan6 = -1;
|
c->udp.fwd_in.scan4 = c->udp.fwd_in.scan6 = -1;
|
||||||
c->udp.fwd_out.f.scan4 = c->udp.fwd_out.f.scan6 = -1;
|
c->udp.fwd_out.scan4 = c->udp.fwd_out.scan6 = -1;
|
||||||
|
|
||||||
if (c->tcp.fwd_in.mode == FWD_AUTO) {
|
if (c->tcp.fwd_in.mode == FWD_AUTO) {
|
||||||
c->tcp.fwd_in.scan4 = open_in_ns(c, "/proc/net/tcp", flags);
|
c->tcp.fwd_in.scan4 = open_in_ns(c, "/proc/net/tcp", flags);
|
||||||
c->tcp.fwd_in.scan6 = open_in_ns(c, "/proc/net/tcp6", flags);
|
c->tcp.fwd_in.scan6 = open_in_ns(c, "/proc/net/tcp6", flags);
|
||||||
fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
|
fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
|
||||||
}
|
}
|
||||||
if (c->udp.fwd_in.f.mode == FWD_AUTO) {
|
if (c->udp.fwd_in.mode == FWD_AUTO) {
|
||||||
c->udp.fwd_in.f.scan4 = open_in_ns(c, "/proc/net/udp", flags);
|
c->udp.fwd_in.scan4 = open_in_ns(c, "/proc/net/udp", flags);
|
||||||
c->udp.fwd_in.f.scan6 = open_in_ns(c, "/proc/net/udp6", flags);
|
c->udp.fwd_in.scan6 = open_in_ns(c, "/proc/net/udp6", flags);
|
||||||
fwd_scan_ports_udp(&c->udp.fwd_in.f, &c->udp.fwd_out.f,
|
fwd_scan_ports_udp(&c->udp.fwd_in, &c->udp.fwd_out,
|
||||||
&c->tcp.fwd_in, &c->tcp.fwd_out);
|
&c->tcp.fwd_in, &c->tcp.fwd_out);
|
||||||
}
|
}
|
||||||
if (c->tcp.fwd_out.mode == FWD_AUTO) {
|
if (c->tcp.fwd_out.mode == FWD_AUTO) {
|
||||||
|
@ -147,10 +222,298 @@ void fwd_scan_ports_init(struct ctx *c)
|
||||||
c->tcp.fwd_out.scan6 = open("/proc/net/tcp6", flags);
|
c->tcp.fwd_out.scan6 = open("/proc/net/tcp6", flags);
|
||||||
fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
|
fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
|
||||||
}
|
}
|
||||||
if (c->udp.fwd_out.f.mode == FWD_AUTO) {
|
if (c->udp.fwd_out.mode == FWD_AUTO) {
|
||||||
c->udp.fwd_out.f.scan4 = open("/proc/net/udp", flags);
|
c->udp.fwd_out.scan4 = open("/proc/net/udp", flags);
|
||||||
c->udp.fwd_out.f.scan6 = open("/proc/net/udp6", flags);
|
c->udp.fwd_out.scan6 = open("/proc/net/udp6", flags);
|
||||||
fwd_scan_ports_udp(&c->udp.fwd_out.f, &c->udp.fwd_in.f,
|
fwd_scan_ports_udp(&c->udp.fwd_out, &c->udp.fwd_in,
|
||||||
&c->tcp.fwd_out, &c->tcp.fwd_in);
|
&c->tcp.fwd_out, &c->tcp.fwd_in);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* is_dns_flow() - Determine if flow appears to be a DNS request
|
||||||
|
* @proto: Protocol (IP L4 protocol number)
|
||||||
|
* @ini: Flow address information of the initiating side
|
||||||
|
*
|
||||||
|
* Return: true if the flow appears to be directed at a dns server, that is a
|
||||||
|
* TCP or UDP flow to port 53 (domain) or port 853 (domain-s)
|
||||||
|
*/
|
||||||
|
static bool is_dns_flow(uint8_t proto, const struct flowside *ini)
|
||||||
|
{
|
||||||
|
return ((proto == IPPROTO_UDP) || (proto == IPPROTO_TCP)) &&
|
||||||
|
((ini->oport == 53) || (ini->oport == 853));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* fwd_guest_accessible4() - Is IPv4 address guest-accessible
|
||||||
|
* @c: Execution context
|
||||||
|
* @addr: Host visible IPv4 address
|
||||||
|
*
|
||||||
|
* Return: true if @addr on the host is accessible to the guest without
|
||||||
|
* translation, false otherwise
|
||||||
|
*/
|
||||||
|
static bool fwd_guest_accessible4(const struct ctx *c,
|
||||||
|
const struct in_addr *addr)
|
||||||
|
{
|
||||||
|
if (IN4_IS_ADDR_LOOPBACK(addr))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* In socket interfaces 0.0.0.0 generally means "any" or unspecified,
|
||||||
|
* however on the wire it can mean "this host on this network". Since
|
||||||
|
* that has a different meaning for host and guest, we can't let it
|
||||||
|
* through untranslated.
|
||||||
|
*/
|
||||||
|
if (IN4_IS_ADDR_UNSPECIFIED(addr))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* For IPv4, addr_seen is initialised to addr, so is always a valid
|
||||||
|
* address
|
||||||
|
*/
|
||||||
|
if (IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr) ||
|
||||||
|
IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr_seen))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* fwd_guest_accessible6() - Is IPv6 address guest-accessible
|
||||||
|
* @c: Execution context
|
||||||
|
* @addr: Host visible IPv6 address
|
||||||
|
*
|
||||||
|
* Return: true if @addr on the host is accessible to the guest without
|
||||||
|
* translation, false otherwise
|
||||||
|
*/
|
||||||
|
static bool fwd_guest_accessible6(const struct ctx *c,
|
||||||
|
const struct in6_addr *addr)
|
||||||
|
{
|
||||||
|
if (IN6_IS_ADDR_LOOPBACK(addr))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* For IPv6, addr_seen starts unspecified, because we don't know what LL
|
||||||
|
* address the guest will take until we see it. Only check against it
|
||||||
|
* if it has been set to a real address.
|
||||||
|
*/
|
||||||
|
if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen) &&
|
||||||
|
IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr_seen))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* fwd_guest_accessible() - Is IPv[46] address guest-accessible
|
||||||
|
* @c: Execution context
|
||||||
|
* @addr: Host visible IPv[46] address
|
||||||
|
*
|
||||||
|
* Return: true if @addr on the host is accessible to the guest without
|
||||||
|
* translation, false otherwise
|
||||||
|
*/
|
||||||
|
static bool fwd_guest_accessible(const struct ctx *c,
|
||||||
|
const union inany_addr *addr)
|
||||||
|
{
|
||||||
|
const struct in_addr *a4 = inany_v4(addr);
|
||||||
|
|
||||||
|
if (a4)
|
||||||
|
return fwd_guest_accessible4(c, a4);
|
||||||
|
|
||||||
|
return fwd_guest_accessible6(c, &addr->a6);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* fwd_nat_from_tap() - Determine how to forward a flow from the tap interface
|
||||||
|
* @c: Execution context
|
||||||
|
* @proto: Protocol (IP L4 protocol number)
|
||||||
|
* @ini: Flow address information of the initiating side
|
||||||
|
* @tgt: Flow address information on the target side (updated)
|
||||||
|
*
|
||||||
|
* Return: pif of the target interface to forward the flow to, PIF_NONE if the
|
||||||
|
* flow cannot or should not be forwarded at all.
|
||||||
|
*/
|
||||||
|
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
|
||||||
|
const struct flowside *ini, struct flowside *tgt)
|
||||||
|
{
|
||||||
|
if (is_dns_flow(proto, ini) &&
|
||||||
|
inany_equals4(&ini->oaddr, &c->ip4.dns_match))
|
||||||
|
tgt->eaddr = inany_from_v4(c->ip4.dns_host);
|
||||||
|
else if (is_dns_flow(proto, ini) &&
|
||||||
|
inany_equals6(&ini->oaddr, &c->ip6.dns_match))
|
||||||
|
tgt->eaddr.a6 = c->ip6.dns_host;
|
||||||
|
else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
|
||||||
|
tgt->eaddr = inany_loopback4;
|
||||||
|
else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
|
||||||
|
tgt->eaddr = inany_loopback6;
|
||||||
|
else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
|
||||||
|
tgt->eaddr = inany_from_v4(c->ip4.addr);
|
||||||
|
else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
|
||||||
|
tgt->eaddr.a6 = c->ip6.addr;
|
||||||
|
else
|
||||||
|
tgt->eaddr = ini->oaddr;
|
||||||
|
|
||||||
|
tgt->eport = ini->oport;
|
||||||
|
|
||||||
|
/* The relevant addr_out controls the host side source address. This
|
||||||
|
* may be unspecified, which allows the kernel to pick an address.
|
||||||
|
*/
|
||||||
|
if (inany_v4(&tgt->eaddr))
|
||||||
|
tgt->oaddr = inany_from_v4(c->ip4.addr_out);
|
||||||
|
else
|
||||||
|
tgt->oaddr.a6 = c->ip6.addr_out;
|
||||||
|
|
||||||
|
/* Let the kernel pick a host side source port */
|
||||||
|
tgt->oport = 0;
|
||||||
|
if (proto == IPPROTO_UDP) {
|
||||||
|
/* But for UDP we preserve the source port */
|
||||||
|
tgt->oport = ini->eport;
|
||||||
|
}
|
||||||
|
|
||||||
|
return PIF_HOST;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* fwd_nat_from_splice() - Determine how to forward a flow from the splice interface
|
||||||
|
* @c: Execution context
|
||||||
|
* @proto: Protocol (IP L4 protocol number)
|
||||||
|
* @ini: Flow address information of the initiating side
|
||||||
|
* @tgt: Flow address information on the target side (updated)
|
||||||
|
*
|
||||||
|
* Return: pif of the target interface to forward the flow to, PIF_NONE if the
|
||||||
|
* flow cannot or should not be forwarded at all.
|
||||||
|
*/
|
||||||
|
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
||||||
|
const struct flowside *ini, struct flowside *tgt)
|
||||||
|
{
|
||||||
|
if (!inany_is_loopback(&ini->eaddr) ||
|
||||||
|
(!inany_is_loopback(&ini->oaddr) && !inany_is_unspecified(&ini->oaddr))) {
|
||||||
|
char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
|
||||||
|
|
||||||
|
debug("Non loopback address on %s: [%s]:%hu -> [%s]:%hu",
|
||||||
|
pif_name(PIF_SPLICE),
|
||||||
|
inany_ntop(&ini->eaddr, estr, sizeof(estr)), ini->eport,
|
||||||
|
inany_ntop(&ini->oaddr, fstr, sizeof(fstr)), ini->oport);
|
||||||
|
return PIF_NONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inany_v4(&ini->eaddr))
|
||||||
|
tgt->eaddr = inany_loopback4;
|
||||||
|
else
|
||||||
|
tgt->eaddr = inany_loopback6;
|
||||||
|
|
||||||
|
/* Preserve the specific loopback address used, but let the kernel pick
|
||||||
|
* a source port on the target side
|
||||||
|
*/
|
||||||
|
tgt->oaddr = ini->eaddr;
|
||||||
|
tgt->oport = 0;
|
||||||
|
|
||||||
|
tgt->eport = ini->oport;
|
||||||
|
if (proto == IPPROTO_TCP)
|
||||||
|
tgt->eport += c->tcp.fwd_out.delta[tgt->eport];
|
||||||
|
else if (proto == IPPROTO_UDP)
|
||||||
|
tgt->eport += c->udp.fwd_out.delta[tgt->eport];
|
||||||
|
|
||||||
|
/* Let the kernel pick a host side source port */
|
||||||
|
tgt->oport = 0;
|
||||||
|
if (proto == IPPROTO_UDP)
|
||||||
|
/* But for UDP preserve the source port */
|
||||||
|
tgt->oport = ini->eport;
|
||||||
|
|
||||||
|
return PIF_HOST;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* fwd_nat_from_host() - Determine how to forward a flow from the host interface
|
||||||
|
* @c: Execution context
|
||||||
|
* @proto: Protocol (IP L4 protocol number)
|
||||||
|
* @ini: Flow address information of the initiating side
|
||||||
|
* @tgt: Flow address information on the target side (updated)
|
||||||
|
*
|
||||||
|
* Return: pif of the target interface to forward the flow to, PIF_NONE if the
|
||||||
|
* flow cannot or should not be forwarded at all.
|
||||||
|
*/
|
||||||
|
uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
|
||||||
|
const struct flowside *ini, struct flowside *tgt)
|
||||||
|
{
|
||||||
|
/* Common for spliced and non-spliced cases */
|
||||||
|
tgt->eport = ini->oport;
|
||||||
|
if (proto == IPPROTO_TCP)
|
||||||
|
tgt->eport += c->tcp.fwd_in.delta[tgt->eport];
|
||||||
|
else if (proto == IPPROTO_UDP)
|
||||||
|
tgt->eport += c->udp.fwd_in.delta[tgt->eport];
|
||||||
|
|
||||||
|
if (c->mode == MODE_PASTA && inany_is_loopback(&ini->eaddr) &&
|
||||||
|
(proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
|
||||||
|
/* spliceable */
|
||||||
|
|
||||||
|
/* The traffic will go over the guest's 'lo' interface, but by
|
||||||
|
* default use its external address, so we don't inadvertently
|
||||||
|
* expose services that listen only on the guest's loopback
|
||||||
|
* address. That can be overridden by --host-lo-to-ns-lo which
|
||||||
|
* will instead forward to the loopback address in the guest.
|
||||||
|
*
|
||||||
|
* In either case, let the kernel pick the source address to
|
||||||
|
* match.
|
||||||
|
*/
|
||||||
|
if (inany_v4(&ini->eaddr)) {
|
||||||
|
if (c->host_lo_to_ns_lo)
|
||||||
|
tgt->eaddr = inany_loopback4;
|
||||||
|
else
|
||||||
|
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
|
||||||
|
tgt->oaddr = inany_any4;
|
||||||
|
} else {
|
||||||
|
if (c->host_lo_to_ns_lo)
|
||||||
|
tgt->eaddr = inany_loopback6;
|
||||||
|
else
|
||||||
|
tgt->eaddr.a6 = c->ip6.addr_seen;
|
||||||
|
tgt->oaddr = inany_any6;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Let the kernel pick source port */
|
||||||
|
tgt->oport = 0;
|
||||||
|
if (proto == IPPROTO_UDP)
|
||||||
|
/* But for UDP preserve the source port */
|
||||||
|
tgt->oport = ini->eport;
|
||||||
|
|
||||||
|
return PIF_SPLICE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
|
||||||
|
inany_equals4(&ini->eaddr, &in4addr_loopback)) {
|
||||||
|
/* Specifically 127.0.0.1, not 127.0.0.0/8 */
|
||||||
|
tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
|
||||||
|
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
|
||||||
|
inany_equals6(&ini->eaddr, &in6addr_loopback)) {
|
||||||
|
tgt->oaddr.a6 = c->ip6.map_host_loopback;
|
||||||
|
} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
|
||||||
|
inany_equals4(&ini->eaddr, &c->ip4.addr)) {
|
||||||
|
tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
|
||||||
|
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
|
||||||
|
inany_equals6(&ini->eaddr, &c->ip6.addr)) {
|
||||||
|
tgt->oaddr.a6 = c->ip6.map_guest_addr;
|
||||||
|
} else if (!fwd_guest_accessible(c, &ini->eaddr)) {
|
||||||
|
if (inany_v4(&ini->eaddr)) {
|
||||||
|
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
|
||||||
|
/* No source address we can use */
|
||||||
|
return PIF_NONE;
|
||||||
|
tgt->oaddr = inany_from_v4(c->ip4.our_tap_addr);
|
||||||
|
} else {
|
||||||
|
tgt->oaddr.a6 = c->ip6.our_tap_ll;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
tgt->oaddr = ini->eaddr;
|
||||||
|
}
|
||||||
|
tgt->oport = ini->eport;
|
||||||
|
|
||||||
|
if (inany_v4(&tgt->oaddr)) {
|
||||||
|
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
|
||||||
|
} else {
|
||||||
|
if (inany_is_linklocal6(&tgt->oaddr))
|
||||||
|
tgt->eaddr.a6 = c->ip6.addr_ll_seen;
|
||||||
|
else
|
||||||
|
tgt->eaddr.a6 = c->ip6.addr_seen;
|
||||||
|
}
|
||||||
|
|
||||||
|
return PIF_TAP;
|
||||||
|
}
|
||||||
|
|
13
fwd.h
13
fwd.h
|
@ -7,10 +7,16 @@
|
||||||
#ifndef FWD_H
|
#ifndef FWD_H
|
||||||
#define FWD_H
|
#define FWD_H
|
||||||
|
|
||||||
|
struct flowside;
|
||||||
|
|
||||||
/* Number of ports for both TCP and UDP */
|
/* Number of ports for both TCP and UDP */
|
||||||
#define NUM_PORTS (1U << 16)
|
#define NUM_PORTS (1U << 16)
|
||||||
|
|
||||||
|
void fwd_probe_ephemeral(void);
|
||||||
|
bool fwd_port_is_ephemeral(in_port_t port);
|
||||||
|
|
||||||
enum fwd_ports_mode {
|
enum fwd_ports_mode {
|
||||||
|
FWD_UNSET = 0,
|
||||||
FWD_SPEC = 1,
|
FWD_SPEC = 1,
|
||||||
FWD_NONE,
|
FWD_NONE,
|
||||||
FWD_AUTO,
|
FWD_AUTO,
|
||||||
|
@ -41,4 +47,11 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
|
||||||
const struct fwd_ports *tcp_rev);
|
const struct fwd_ports *tcp_rev);
|
||||||
void fwd_scan_ports_init(struct ctx *c);
|
void fwd_scan_ports_init(struct ctx *c);
|
||||||
|
|
||||||
|
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
|
||||||
|
const struct flowside *ini, struct flowside *tgt);
|
||||||
|
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
||||||
|
const struct flowside *ini, struct flowside *tgt);
|
||||||
|
uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
|
||||||
|
const struct flowside *ini, struct flowside *tgt);
|
||||||
|
|
||||||
#endif /* FWD_H */
|
#endif /* FWD_H */
|
||||||
|
|
262
icmp.c
262
icmp.c
|
@ -40,36 +40,38 @@
|
||||||
#include "siphash.h"
|
#include "siphash.h"
|
||||||
#include "inany.h"
|
#include "inany.h"
|
||||||
#include "icmp.h"
|
#include "icmp.h"
|
||||||
|
#include "flow_table.h"
|
||||||
|
|
||||||
#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */
|
#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */
|
||||||
#define ICMP_NUM_IDS (1U << 16)
|
#define ICMP_NUM_IDS (1U << 16)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct icmp_id_sock - Tracking information for single ICMP echo identifier
|
* ping_at_sidx() - Get ping specific flow at given sidx
|
||||||
* @sock: Bound socket for identifier
|
* @sidx: Flow and side to retrieve
|
||||||
* @seq: Last sequence number sent to tap, host order, -1: not sent yet
|
*
|
||||||
* @ts: Last associated activity from tap, seconds
|
* Return: ping specific flow at @sidx, or NULL if @sidx is invalid. Asserts if
|
||||||
|
* the flow at @sidx is not FLOW_PING4 or FLOW_PING6
|
||||||
*/
|
*/
|
||||||
struct icmp_id_sock {
|
static struct icmp_ping_flow *ping_at_sidx(flow_sidx_t sidx)
|
||||||
int sock;
|
{
|
||||||
int seq;
|
union flow *flow = flow_at_sidx(sidx);
|
||||||
time_t ts;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Indexed by ICMP echo identifier */
|
if (!flow)
|
||||||
static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS];
|
return NULL;
|
||||||
|
|
||||||
|
ASSERT(flow->f.type == FLOW_PING4 || flow->f.type == FLOW_PING6);
|
||||||
|
return &flow->ping;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* icmp_sock_handler() - Handle new data from ICMP or ICMPv6 socket
|
* icmp_sock_handler() - Handle new data from ICMP or ICMPv6 socket
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @af: Address family (AF_INET or AF_INET6)
|
|
||||||
* @ref: epoll reference
|
* @ref: epoll reference
|
||||||
*/
|
*/
|
||||||
void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref)
|
void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
|
||||||
{
|
{
|
||||||
struct icmp_id_sock *const id_sock = af == AF_INET
|
struct icmp_ping_flow *pingf = ping_at_sidx(ref.flowside);
|
||||||
? &icmp_id_map[V4][ref.icmp.id] : &icmp_id_map[V6][ref.icmp.id];
|
const struct flowside *ini = &pingf->f.side[INISIDE];
|
||||||
const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6";
|
|
||||||
union sockaddr_inany sr;
|
union sockaddr_inany sr;
|
||||||
socklen_t sl = sizeof(sr);
|
socklen_t sl = sizeof(sr);
|
||||||
char buf[USHRT_MAX];
|
char buf[USHRT_MAX];
|
||||||
|
@ -79,33 +81,33 @@ void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref)
|
||||||
if (c->no_icmp)
|
if (c->no_icmp)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
ASSERT(pingf);
|
||||||
|
|
||||||
n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
|
n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
|
||||||
if (n < 0) {
|
if (n < 0) {
|
||||||
warn("%s: recvfrom() error on ping socket: %s",
|
flow_err(pingf, "recvfrom() error: %s", strerror(errno));
|
||||||
pname, strerror(errno));
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (sr.sa_family != af)
|
|
||||||
goto unexpected;
|
|
||||||
|
|
||||||
if (af == AF_INET) {
|
if (pingf->f.type == FLOW_PING4) {
|
||||||
struct icmphdr *ih4 = (struct icmphdr *)buf;
|
struct icmphdr *ih4 = (struct icmphdr *)buf;
|
||||||
|
|
||||||
if ((size_t)n < sizeof(*ih4) || ih4->type != ICMP_ECHOREPLY)
|
if (sr.sa_family != AF_INET || (size_t)n < sizeof(*ih4) ||
|
||||||
|
ih4->type != ICMP_ECHOREPLY)
|
||||||
goto unexpected;
|
goto unexpected;
|
||||||
|
|
||||||
/* Adjust packet back to guest-side ID */
|
/* Adjust packet back to guest-side ID */
|
||||||
ih4->un.echo.id = htons(ref.icmp.id);
|
ih4->un.echo.id = htons(ini->eport);
|
||||||
seq = ntohs(ih4->un.echo.sequence);
|
seq = ntohs(ih4->un.echo.sequence);
|
||||||
} else if (af == AF_INET6) {
|
} else if (pingf->f.type == FLOW_PING6) {
|
||||||
struct icmp6hdr *ih6 = (struct icmp6hdr *)buf;
|
struct icmp6hdr *ih6 = (struct icmp6hdr *)buf;
|
||||||
|
|
||||||
if ((size_t)n < sizeof(*ih6) ||
|
if (sr.sa_family != AF_INET6 || (size_t)n < sizeof(*ih6) ||
|
||||||
ih6->icmp6_type != ICMPV6_ECHO_REPLY)
|
ih6->icmp6_type != ICMPV6_ECHO_REPLY)
|
||||||
goto unexpected;
|
goto unexpected;
|
||||||
|
|
||||||
/* Adjust packet back to guest-side ID */
|
/* Adjust packet back to guest-side ID */
|
||||||
ih6->icmp6_identifier = htons(ref.icmp.id);
|
ih6->icmp6_identifier = htons(ini->eport);
|
||||||
seq = ntohs(ih6->icmp6_sequence);
|
seq = ntohs(ih6->icmp6_sequence);
|
||||||
} else {
|
} else {
|
||||||
ASSERT(0);
|
ASSERT(0);
|
||||||
|
@ -113,87 +115,111 @@ void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref)
|
||||||
|
|
||||||
/* In PASTA mode, we'll get any reply we send, discard them. */
|
/* In PASTA mode, we'll get any reply we send, discard them. */
|
||||||
if (c->mode == MODE_PASTA) {
|
if (c->mode == MODE_PASTA) {
|
||||||
if (id_sock->seq == seq)
|
if (pingf->seq == seq)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
id_sock->seq = seq;
|
pingf->seq = seq;
|
||||||
}
|
}
|
||||||
|
|
||||||
debug("%s: echo reply to tap, ID: %"PRIu16", seq: %"PRIu16, pname,
|
flow_dbg(pingf, "echo reply to tap, ID: %"PRIu16", seq: %"PRIu16,
|
||||||
ref.icmp.id, seq);
|
ini->eport, seq);
|
||||||
if (af == AF_INET)
|
|
||||||
tap_icmp4_send(c, sr.sa4.sin_addr, tap_ip4_daddr(c), buf, n);
|
if (pingf->f.type == FLOW_PING4) {
|
||||||
else if (af == AF_INET6)
|
const struct in_addr *saddr = inany_v4(&ini->oaddr);
|
||||||
tap_icmp6_send(c, &sr.sa6.sin6_addr,
|
const struct in_addr *daddr = inany_v4(&ini->eaddr);
|
||||||
tap_ip6_daddr(c, &sr.sa6.sin6_addr), buf, n);
|
|
||||||
|
ASSERT(saddr && daddr); /* Must have IPv4 addresses */
|
||||||
|
tap_icmp4_send(c, *saddr, *daddr, buf, n);
|
||||||
|
} else if (pingf->f.type == FLOW_PING6) {
|
||||||
|
const struct in6_addr *saddr = &ini->oaddr.a6;
|
||||||
|
const struct in6_addr *daddr = &ini->eaddr.a6;
|
||||||
|
|
||||||
|
tap_icmp6_send(c, saddr, daddr, buf, n);
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
|
|
||||||
unexpected:
|
unexpected:
|
||||||
warn("%s: Unexpected packet on ping socket", pname);
|
flow_err(pingf, "Unexpected packet on ping socket");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* icmp_ping_close() - Close and clean up a ping socket
|
* icmp_ping_close() - Close and clean up a ping flow
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @id_sock: Socket number and other info
|
* @pingf: ping flow entry to close
|
||||||
*/
|
*/
|
||||||
static void icmp_ping_close(const struct ctx *c, struct icmp_id_sock *id_sock)
|
static void icmp_ping_close(const struct ctx *c,
|
||||||
|
const struct icmp_ping_flow *pingf)
|
||||||
{
|
{
|
||||||
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_sock->sock, NULL);
|
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL);
|
||||||
close(id_sock->sock);
|
close(pingf->sock);
|
||||||
id_sock->sock = -1;
|
flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
|
||||||
id_sock->seq = -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* icmp_ping_new() - Prepare a new ping socket for a new id
|
* icmp_ping_new() - Prepare a new ping socket for a new id
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @id_sock: Socket fd and other information
|
|
||||||
* @af: Address family, AF_INET or AF_INET6
|
* @af: Address family, AF_INET or AF_INET6
|
||||||
* @id: ICMP id for the new socket
|
* @id: ICMP id for the new socket
|
||||||
|
* @saddr: Source address
|
||||||
|
* @daddr: Destination address
|
||||||
*
|
*
|
||||||
* Return: Newly opened ping socket fd, or -1 on failure
|
* Return: Newly opened ping flow, or NULL on failure
|
||||||
*/
|
*/
|
||||||
static int icmp_ping_new(const struct ctx *c, struct icmp_id_sock *id_sock,
|
static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
|
||||||
sa_family_t af, uint16_t id)
|
sa_family_t af, uint16_t id,
|
||||||
|
const void *saddr, const void *daddr)
|
||||||
{
|
{
|
||||||
uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6;
|
uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6;
|
||||||
const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6";
|
uint8_t flowtype = af == AF_INET ? FLOW_PING4 : FLOW_PING6;
|
||||||
union icmp_epoll_ref iref = { .id = id };
|
union epoll_ref ref = { .type = EPOLL_TYPE_PING };
|
||||||
const void *bind_addr;
|
union flow *flow = flow_alloc();
|
||||||
const char *bind_if;
|
struct icmp_ping_flow *pingf;
|
||||||
int s;
|
const struct flowside *tgt;
|
||||||
|
|
||||||
if (af == AF_INET) {
|
if (!flow)
|
||||||
bind_addr = &c->ip4.addr_out;
|
return NULL;
|
||||||
bind_if = c->ip4.ifname_out;
|
|
||||||
} else {
|
flow_initiate_af(flow, PIF_TAP, af, saddr, id, daddr, id);
|
||||||
bind_addr = &c->ip6.addr_out;
|
if (!(tgt = flow_target(c, flow, proto)))
|
||||||
bind_if = c->ip6.ifname_out;
|
goto cancel;
|
||||||
|
|
||||||
|
if (flow->f.pif[TGTSIDE] != PIF_HOST) {
|
||||||
|
flow_err(flow, "No support for forwarding %s from %s to %s",
|
||||||
|
proto == IPPROTO_ICMP ? "ICMP" : "ICMPv6",
|
||||||
|
pif_name(flow->f.pif[INISIDE]),
|
||||||
|
pif_name(flow->f.pif[TGTSIDE]));
|
||||||
|
goto cancel;
|
||||||
}
|
}
|
||||||
|
|
||||||
s = sock_l4(c, af, proto, bind_addr, bind_if, 0, iref.u32);
|
pingf = FLOW_SET_TYPE(flow, flowtype, ping);
|
||||||
|
|
||||||
if (s < 0) {
|
pingf->seq = -1;
|
||||||
|
|
||||||
|
ref.flowside = FLOW_SIDX(flow, TGTSIDE);
|
||||||
|
pingf->sock = flowside_sock_l4(c, EPOLL_TYPE_PING, PIF_HOST,
|
||||||
|
tgt, ref.data);
|
||||||
|
|
||||||
|
if (pingf->sock < 0) {
|
||||||
warn("Cannot open \"ping\" socket. You might need to:");
|
warn("Cannot open \"ping\" socket. You might need to:");
|
||||||
warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\"");
|
warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\"");
|
||||||
warn("...echo requests/replies will fail.");
|
warn("...echo requests/replies will fail.");
|
||||||
goto cancel;
|
goto cancel;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (s > FD_REF_MAX)
|
if (pingf->sock > FD_REF_MAX)
|
||||||
goto cancel;
|
goto cancel;
|
||||||
|
|
||||||
id_sock->sock = s;
|
flow_dbg(pingf, "new socket %i for echo ID %"PRIu16, pingf->sock, id);
|
||||||
|
|
||||||
debug("%s: new socket %i for echo ID %"PRIu16, pname, s, id);
|
flow_hash_insert(c, FLOW_SIDX(pingf, INISIDE));
|
||||||
|
|
||||||
return s;
|
FLOW_ACTIVATE(pingf);
|
||||||
|
|
||||||
|
return pingf;
|
||||||
|
|
||||||
cancel:
|
cancel:
|
||||||
if (s >= 0)
|
flow_alloc_cancel(flow);
|
||||||
close(s);
|
return NULL;
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -212,111 +238,93 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||||
const void *saddr, const void *daddr,
|
const void *saddr, const void *daddr,
|
||||||
const struct pool *p, const struct timespec *now)
|
const struct pool *p, const struct timespec *now)
|
||||||
{
|
{
|
||||||
const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6";
|
struct icmp_ping_flow *pingf;
|
||||||
union sockaddr_inany sa = { .sa_family = af };
|
const struct flowside *tgt;
|
||||||
const socklen_t sl = af == AF_INET ? sizeof(sa.sa4) : sizeof(sa.sa6);
|
union sockaddr_inany sa;
|
||||||
struct icmp_id_sock *id_sock;
|
size_t dlen, l4len;
|
||||||
uint16_t id, seq;
|
uint16_t id, seq;
|
||||||
size_t plen;
|
union flow *flow;
|
||||||
|
uint8_t proto;
|
||||||
|
socklen_t sl;
|
||||||
void *pkt;
|
void *pkt;
|
||||||
int s;
|
|
||||||
|
|
||||||
(void)saddr;
|
(void)saddr;
|
||||||
(void)pif;
|
ASSERT(pif == PIF_TAP);
|
||||||
|
|
||||||
if (af == AF_INET) {
|
if (af == AF_INET) {
|
||||||
const struct icmphdr *ih;
|
const struct icmphdr *ih;
|
||||||
|
|
||||||
if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &plen)))
|
if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen)))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
ih = (struct icmphdr *)pkt;
|
ih = (struct icmphdr *)pkt;
|
||||||
plen += sizeof(*ih);
|
l4len = dlen + sizeof(*ih);
|
||||||
|
|
||||||
if (ih->type != ICMP_ECHO)
|
if (ih->type != ICMP_ECHO)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
proto = IPPROTO_ICMP;
|
||||||
id = ntohs(ih->un.echo.id);
|
id = ntohs(ih->un.echo.id);
|
||||||
id_sock = &icmp_id_map[V4][id];
|
|
||||||
seq = ntohs(ih->un.echo.sequence);
|
seq = ntohs(ih->un.echo.sequence);
|
||||||
sa.sa4.sin_addr = *(struct in_addr *)daddr;
|
|
||||||
} else if (af == AF_INET6) {
|
} else if (af == AF_INET6) {
|
||||||
const struct icmp6hdr *ih;
|
const struct icmp6hdr *ih;
|
||||||
|
|
||||||
if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &plen)))
|
if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen)))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
ih = (struct icmp6hdr *)pkt;
|
ih = (struct icmp6hdr *)pkt;
|
||||||
plen += sizeof(*ih);
|
l4len = dlen + sizeof(*ih);
|
||||||
|
|
||||||
if (ih->icmp6_type != ICMPV6_ECHO_REQUEST)
|
if (ih->icmp6_type != ICMPV6_ECHO_REQUEST)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
proto = IPPROTO_ICMPV6;
|
||||||
id = ntohs(ih->icmp6_identifier);
|
id = ntohs(ih->icmp6_identifier);
|
||||||
id_sock = &icmp_id_map[V6][id];
|
|
||||||
seq = ntohs(ih->icmp6_sequence);
|
seq = ntohs(ih->icmp6_sequence);
|
||||||
sa.sa6.sin6_addr = *(struct in6_addr *)daddr;
|
|
||||||
sa.sa6.sin6_scope_id = c->ifi6;
|
|
||||||
} else {
|
} else {
|
||||||
ASSERT(0);
|
ASSERT(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((s = id_sock->sock) < 0)
|
flow = flow_at_sidx(flow_lookup_af(c, proto, PIF_TAP,
|
||||||
if ((s = icmp_ping_new(c, id_sock, af, id)) < 0)
|
af, saddr, daddr, id, id));
|
||||||
return 1;
|
|
||||||
|
|
||||||
id_sock->ts = now->tv_sec;
|
if (flow)
|
||||||
|
pingf = &flow->ping;
|
||||||
|
else if (!(pingf = icmp_ping_new(c, af, id, saddr, daddr)))
|
||||||
|
return 1;
|
||||||
|
|
||||||
if (sendto(s, pkt, plen, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
|
tgt = &pingf->f.side[TGTSIDE];
|
||||||
debug("%s: failed to relay request to socket: %s",
|
|
||||||
pname, strerror(errno));
|
ASSERT(flow_proto[pingf->f.type] == proto);
|
||||||
|
pingf->ts = now->tv_sec;
|
||||||
|
|
||||||
|
pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
|
||||||
|
if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
|
||||||
|
flow_dbg(pingf, "failed to relay request to socket: %s",
|
||||||
|
strerror(errno));
|
||||||
} else {
|
} else {
|
||||||
debug("%s: echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
|
flow_dbg(pingf,
|
||||||
pname, id, seq);
|
"echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
|
||||||
|
id, seq);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* icmp_timer_one() - Handler for timed events related to a given identifier
|
* icmp_ping_timer() - Handler for timed events related to a given flow
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @id_sock: Socket fd and activity timestamp
|
* @pingf: Ping flow to check for timeout
|
||||||
* @now: Current timestamp
|
* @now: Current timestamp
|
||||||
|
*
|
||||||
|
* Return: true if the flow is ready to free, false otherwise
|
||||||
*/
|
*/
|
||||||
static void icmp_timer_one(const struct ctx *c, struct icmp_id_sock *id_sock,
|
bool icmp_ping_timer(const struct ctx *c, const struct icmp_ping_flow *pingf,
|
||||||
const struct timespec *now)
|
const struct timespec *now)
|
||||||
{
|
{
|
||||||
if (id_sock->sock < 0 || now->tv_sec - id_sock->ts <= ICMP_ECHO_TIMEOUT)
|
if (now->tv_sec - pingf->ts <= ICMP_ECHO_TIMEOUT)
|
||||||
return;
|
return false;
|
||||||
|
|
||||||
icmp_ping_close(c, id_sock);
|
icmp_ping_close(c, pingf);
|
||||||
}
|
return true;
|
||||||
|
|
||||||
/**
|
|
||||||
* icmp_timer() - Scan activity bitmap for identifiers with timed events
|
|
||||||
* @c: Execution context
|
|
||||||
* @now: Current timestamp
|
|
||||||
*/
|
|
||||||
void icmp_timer(const struct ctx *c, const struct timespec *now)
|
|
||||||
{
|
|
||||||
unsigned int i;
|
|
||||||
|
|
||||||
for (i = 0; i < ICMP_NUM_IDS; i++) {
|
|
||||||
icmp_timer_one(c, &icmp_id_map[V4][i], now);
|
|
||||||
icmp_timer_one(c, &icmp_id_map[V6][i], now);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* icmp_init() - Initialise sequences in ID map to -1 (no sequence sent yet)
|
|
||||||
*/
|
|
||||||
void icmp_init(void)
|
|
||||||
{
|
|
||||||
unsigned i;
|
|
||||||
|
|
||||||
for (i = 0; i < ICMP_NUM_IDS; i++) {
|
|
||||||
icmp_id_map[V4][i].seq = icmp_id_map[V6][i].seq = -1;
|
|
||||||
icmp_id_map[V4][i].sock = icmp_id_map[V6][i].sock = -1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
15
icmp.h
15
icmp.h
|
@ -9,25 +9,14 @@
|
||||||
#define ICMP_TIMER_INTERVAL 10000 /* ms */
|
#define ICMP_TIMER_INTERVAL 10000 /* ms */
|
||||||
|
|
||||||
struct ctx;
|
struct ctx;
|
||||||
|
struct icmp_ping_flow;
|
||||||
|
|
||||||
void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref);
|
void icmp_sock_handler(const struct ctx *c, union epoll_ref ref);
|
||||||
int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||||
const void *saddr, const void *daddr,
|
const void *saddr, const void *daddr,
|
||||||
const struct pool *p, const struct timespec *now);
|
const struct pool *p, const struct timespec *now);
|
||||||
void icmp_timer(const struct ctx *c, const struct timespec *now);
|
|
||||||
void icmp_init(void);
|
void icmp_init(void);
|
||||||
|
|
||||||
/**
|
|
||||||
* union icmp_epoll_ref - epoll reference portion for ICMP tracking
|
|
||||||
* @v6: Set for IPv6 sockets or connections
|
|
||||||
* @u32: Opaque u32 value of reference
|
|
||||||
* @id: Associated echo identifier, needed if bind() fails
|
|
||||||
*/
|
|
||||||
union icmp_epoll_ref {
|
|
||||||
uint16_t id;
|
|
||||||
uint32_t u32;
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct icmp_ctx - Execution context for ICMP routines
|
* struct icmp_ctx - Execution context for ICMP routines
|
||||||
* @timer_run: Timestamp of most recent timer run
|
* @timer_run: Timestamp of most recent timer run
|
||||||
|
|
29
icmp_flow.h
Normal file
29
icmp_flow.h
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
* Copyright Red Hat
|
||||||
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||||
|
*
|
||||||
|
* ICMP flow tracking data structures
|
||||||
|
*/
|
||||||
|
#ifndef ICMP_FLOW_H
|
||||||
|
#define ICMP_FLOW_H
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct icmp_ping_flow - Descriptor for a flow of ping requests/replies
|
||||||
|
* @f: Generic flow information
|
||||||
|
* @seq: Last sequence number sent to tap, host order, -1: not sent yet
|
||||||
|
* @sock: "ping" socket
|
||||||
|
* @ts: Last associated activity from tap, seconds
|
||||||
|
*/
|
||||||
|
struct icmp_ping_flow {
|
||||||
|
/* Must be first element */
|
||||||
|
struct flow_common f;
|
||||||
|
|
||||||
|
int seq;
|
||||||
|
int sock;
|
||||||
|
time_t ts;
|
||||||
|
};
|
||||||
|
|
||||||
|
bool icmp_ping_timer(const struct ctx *c, const struct icmp_ping_flow *pingf,
|
||||||
|
const struct timespec *now);
|
||||||
|
|
||||||
|
#endif /* ICMP_FLOW_H */
|
37
inany.c
37
inany.c
|
@ -17,21 +17,8 @@
|
||||||
#include "siphash.h"
|
#include "siphash.h"
|
||||||
#include "inany.h"
|
#include "inany.h"
|
||||||
|
|
||||||
const union inany_addr inany_loopback4 = {
|
const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT);
|
||||||
.v4mapped = {
|
const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
|
||||||
.zero = { 0 },
|
|
||||||
.one = { 0xff, 0xff, },
|
|
||||||
.a4 = IN4ADDR_LOOPBACK_INIT,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
const union inany_addr inany_any4 = {
|
|
||||||
.v4mapped = {
|
|
||||||
.zero = { 0 },
|
|
||||||
.one = { 0xff, 0xff, },
|
|
||||||
.a4 = IN4ADDR_ANY_INIT,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
/** inany_ntop - Convert an IPv[46] address to text format
|
/** inany_ntop - Convert an IPv[46] address to text format
|
||||||
* @src: IPv[46] address
|
* @src: IPv[46] address
|
||||||
|
@ -49,3 +36,23 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
|
||||||
|
|
||||||
return inet_ntop(AF_INET6, &src->a6, dst, size);
|
return inet_ntop(AF_INET6, &src->a6, dst, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** inany_pton - Parse an IPv[46] address from text format
|
||||||
|
* @src: IPv[46] address in text format
|
||||||
|
* @dst: output buffer, filled with parsed address
|
||||||
|
*
|
||||||
|
* Return: On success, 1, if no parseable address is found, 0
|
||||||
|
*/
|
||||||
|
int inany_pton(const char *src, union inany_addr *dst)
|
||||||
|
{
|
||||||
|
if (inet_pton(AF_INET, src, &dst->v4mapped.a4)) {
|
||||||
|
memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero));
|
||||||
|
memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inet_pton(AF_INET6, src, &dst->a6))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
98
inany.h
98
inany.h
|
@ -43,6 +43,17 @@ extern const union inany_addr inany_any4;
|
||||||
#define in4addr_loopback (inany_loopback4.v4mapped.a4)
|
#define in4addr_loopback (inany_loopback4.v4mapped.a4)
|
||||||
#define in4addr_any (inany_any4.v4mapped.a4)
|
#define in4addr_any (inany_any4.v4mapped.a4)
|
||||||
|
|
||||||
|
#define INANY_INIT4(a4init) { \
|
||||||
|
.v4mapped = { \
|
||||||
|
.zero = { 0 }, \
|
||||||
|
.one = { 0xff, 0xff }, \
|
||||||
|
.a4 = a4init, \
|
||||||
|
}, \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define inany_from_v4(a4) \
|
||||||
|
((union inany_addr)INANY_INIT4((a4)))
|
||||||
|
|
||||||
/** union sockaddr_inany - Either a sockaddr_in or a sockaddr_in6
|
/** union sockaddr_inany - Either a sockaddr_in or a sockaddr_in6
|
||||||
* @sa_family: Address family, AF_INET or AF_INET6
|
* @sa_family: Address family, AF_INET or AF_INET6
|
||||||
* @sa: Plain struct sockaddr (useful to avoid casts)
|
* @sa: Plain struct sockaddr (useful to avoid casts)
|
||||||
|
@ -79,16 +90,84 @@ static inline bool inany_equals(const union inany_addr *a,
|
||||||
return IN6_ARE_ADDR_EQUAL(&a->a6, &b->a6);
|
return IN6_ARE_ADDR_EQUAL(&a->a6, &b->a6);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** inany_equals4 - Compare an IPv[46] address to an IPv4 address
|
||||||
|
* @a: IPv[46] address
|
||||||
|
* @b: IPv4 address
|
||||||
|
*
|
||||||
|
* Return: true if @a and @b are the same address
|
||||||
|
*/
|
||||||
|
static inline bool inany_equals4(const union inany_addr *a,
|
||||||
|
const struct in_addr *b)
|
||||||
|
{
|
||||||
|
const struct in_addr *a4 = inany_v4(a);
|
||||||
|
|
||||||
|
return a4 && IN4_ARE_ADDR_EQUAL(a4, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** inany_equals6 - Compare an IPv[46] address to an IPv6 address
|
||||||
|
* @a: IPv[46] address
|
||||||
|
* @b: IPv6 address
|
||||||
|
*
|
||||||
|
* Return: true if @a and @b are the same address
|
||||||
|
*/
|
||||||
|
static inline bool inany_equals6(const union inany_addr *a,
|
||||||
|
const struct in6_addr *b)
|
||||||
|
{
|
||||||
|
return IN6_ARE_ADDR_EQUAL(&a->a6, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** inany_is_loopback4() - Check if address is IPv4 loopback
|
||||||
|
* @a: IPv[46] address
|
||||||
|
*
|
||||||
|
* Return: true if @a is in 127.0.0.0/8
|
||||||
|
*/
|
||||||
|
static inline bool inany_is_loopback4(const union inany_addr *a)
|
||||||
|
{
|
||||||
|
const struct in_addr *v4 = inany_v4(a);
|
||||||
|
|
||||||
|
return v4 && IN4_IS_ADDR_LOOPBACK(v4);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** inany_is_loopback6() - Check if address is IPv6 loopback
|
||||||
|
* @a: IPv[46] address
|
||||||
|
*
|
||||||
|
* Return: true if @a is ::1
|
||||||
|
*/
|
||||||
|
static inline bool inany_is_loopback6(const union inany_addr *a)
|
||||||
|
{
|
||||||
|
return IN6_IS_ADDR_LOOPBACK(&a->a6);
|
||||||
|
}
|
||||||
|
|
||||||
/** inany_is_loopback() - Check if address is loopback
|
/** inany_is_loopback() - Check if address is loopback
|
||||||
* @a: IPv[46] address
|
* @a: IPv[46] address
|
||||||
*
|
*
|
||||||
* Return: true if @a is either ::1 or in 127.0.0.0/8
|
* Return: true if @a is either ::1 or in 127.0.0.0/8
|
||||||
*/
|
*/
|
||||||
static inline bool inany_is_loopback(const union inany_addr *a)
|
static inline bool inany_is_loopback(const union inany_addr *a)
|
||||||
|
{
|
||||||
|
return inany_is_loopback4(a) || inany_is_loopback6(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** inany_is_unspecified4() - Check if address is unspecified IPv4
|
||||||
|
* @a: IPv[46] address
|
||||||
|
*
|
||||||
|
* Return: true if @a is 0.0.0.0
|
||||||
|
*/
|
||||||
|
static inline bool inany_is_unspecified4(const union inany_addr *a)
|
||||||
{
|
{
|
||||||
const struct in_addr *v4 = inany_v4(a);
|
const struct in_addr *v4 = inany_v4(a);
|
||||||
|
|
||||||
return IN6_IS_ADDR_LOOPBACK(&a->a6) || (v4 && IN4_IS_ADDR_LOOPBACK(v4));
|
return v4 && IN4_IS_ADDR_UNSPECIFIED(v4);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** inany_is_unspecified6() - Check if address is unspecified IPv6
|
||||||
|
* @a: IPv[46] address
|
||||||
|
*
|
||||||
|
* Return: true if @a is ::
|
||||||
|
*/
|
||||||
|
static inline bool inany_is_unspecified6(const union inany_addr *a)
|
||||||
|
{
|
||||||
|
return IN6_IS_ADDR_UNSPECIFIED(&a->a6);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** inany_is_unspecified() - Check if address is unspecified
|
/** inany_is_unspecified() - Check if address is unspecified
|
||||||
|
@ -98,10 +177,19 @@ static inline bool inany_is_loopback(const union inany_addr *a)
|
||||||
*/
|
*/
|
||||||
static inline bool inany_is_unspecified(const union inany_addr *a)
|
static inline bool inany_is_unspecified(const union inany_addr *a)
|
||||||
{
|
{
|
||||||
const struct in_addr *v4 = inany_v4(a);
|
return inany_is_unspecified4(a) || inany_is_unspecified6(a);
|
||||||
|
}
|
||||||
|
|
||||||
return IN6_IS_ADDR_UNSPECIFIED(&a->a6) ||
|
/* FIXME: consider handling of IPv4 link-local addresses */
|
||||||
(v4 && IN4_IS_ADDR_UNSPECIFIED(v4));
|
|
||||||
|
/** inany_is_linklocal6() - Check if address is link-local IPv6
|
||||||
|
* @a: IPv[46] address
|
||||||
|
*
|
||||||
|
* Return: true if @a is in fe80::/10 (IPv6 link local unicast)
|
||||||
|
*/
|
||||||
|
static inline bool inany_is_linklocal6(const union inany_addr *a)
|
||||||
|
{
|
||||||
|
return IN6_IS_ADDR_LINKLOCAL(&a->a6);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** inany_is_multicast() - Check if address is multicast or broadcast
|
/** inany_is_multicast() - Check if address is multicast or broadcast
|
||||||
|
@ -123,7 +211,6 @@ static inline bool inany_is_multicast(const union inany_addr *a)
|
||||||
*
|
*
|
||||||
* Return: true if @a is specified and a unicast address
|
* Return: true if @a is specified and a unicast address
|
||||||
*/
|
*/
|
||||||
/* cppcheck-suppress unusedFunction */
|
|
||||||
static inline bool inany_is_unicast(const union inany_addr *a)
|
static inline bool inany_is_unicast(const union inany_addr *a)
|
||||||
{
|
{
|
||||||
return !inany_is_unspecified(a) && !inany_is_multicast(a);
|
return !inany_is_unspecified(a) && !inany_is_multicast(a);
|
||||||
|
@ -183,5 +270,6 @@ static inline void inany_siphash_feed(struct siphash_state *state,
|
||||||
#define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)
|
#define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)
|
||||||
|
|
||||||
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
|
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size);
|
||||||
|
int inany_pton(const char *src, union inany_addr *dst);
|
||||||
|
|
||||||
#endif /* INANY_H */
|
#endif /* INANY_H */
|
||||||
|
|
39
iov.c
39
iov.c
|
@ -156,42 +156,3 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* iov_copy - Copy data from one scatter/gather I/O vector (struct iovec) to
|
|
||||||
* another.
|
|
||||||
*
|
|
||||||
* @dst_iov: Pointer to the destination array of struct iovec describing
|
|
||||||
* the scatter/gather I/O vector to copy to.
|
|
||||||
* @dst_iov_cnt: Number of elements in the destination iov array.
|
|
||||||
* @iov: Pointer to the source array of struct iovec describing
|
|
||||||
* the scatter/gather I/O vector to copy from.
|
|
||||||
* @iov_cnt: Number of elements in the source iov array.
|
|
||||||
* @offset: Offset within the source iov from where copying should start.
|
|
||||||
* @bytes: Total number of bytes to copy from iov to dst_iov.
|
|
||||||
*
|
|
||||||
* Returns: The number of elements successfully copied to the destination
|
|
||||||
* iov array.
|
|
||||||
*/
|
|
||||||
/* cppcheck-suppress unusedFunction */
|
|
||||||
unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt,
|
|
||||||
const struct iovec *iov, size_t iov_cnt,
|
|
||||||
size_t offset, size_t bytes)
|
|
||||||
{
|
|
||||||
unsigned int i, j;
|
|
||||||
|
|
||||||
i = iov_skip_bytes(iov, iov_cnt, offset, &offset);
|
|
||||||
|
|
||||||
/* copying data */
|
|
||||||
for (j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) {
|
|
||||||
size_t len = MIN(bytes, iov[i].iov_len - offset);
|
|
||||||
|
|
||||||
dst_iov[j].iov_base = (char *)iov[i].iov_base + offset;
|
|
||||||
dst_iov[j].iov_len = len;
|
|
||||||
j++;
|
|
||||||
bytes -= len;
|
|
||||||
offset = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
return j;
|
|
||||||
}
|
|
||||||
|
|
6
iov.h
6
iov.h
|
@ -18,6 +18,9 @@
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
#define IOV_OF_LVALUE(lval) \
|
||||||
|
(struct iovec){ .iov_base = &(lval), .iov_len = sizeof(lval) }
|
||||||
|
|
||||||
size_t iov_skip_bytes(const struct iovec *iov, size_t n,
|
size_t iov_skip_bytes(const struct iovec *iov, size_t n,
|
||||||
size_t skip, size_t *offset);
|
size_t skip, size_t *offset);
|
||||||
size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
|
size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
|
||||||
|
@ -25,7 +28,4 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
|
||||||
size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
|
size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
|
||||||
size_t offset, void *buf, size_t bytes);
|
size_t offset, void *buf, size_t bytes);
|
||||||
size_t iov_size(const struct iovec *iov, size_t iov_cnt);
|
size_t iov_size(const struct iovec *iov, size_t iov_cnt);
|
||||||
unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt,
|
|
||||||
const struct iovec *iov, size_t iov_cnt,
|
|
||||||
size_t offset, size_t bytes);
|
|
||||||
#endif /* IOVEC_H */
|
#endif /* IOVEC_H */
|
||||||
|
|
11
ip.h
11
ip.h
|
@ -24,6 +24,11 @@
|
||||||
#define IN4ADDR_ANY_INIT \
|
#define IN4ADDR_ANY_INIT \
|
||||||
{ .s_addr = htonl_constant(INADDR_ANY) }
|
{ .s_addr = htonl_constant(INADDR_ANY) }
|
||||||
|
|
||||||
|
#define IN4_IS_ADDR_LINKLOCAL(a) \
|
||||||
|
((ntohl(((struct in_addr *)(a))->s_addr) >> 16) == 0xa9fe)
|
||||||
|
#define IN4_IS_PREFIX_LINKLOCAL(a, len) \
|
||||||
|
((len) >= 16 && IN4_IS_ADDR_LINKLOCAL(a))
|
||||||
|
|
||||||
#define L2_BUF_IP4_INIT(proto) \
|
#define L2_BUF_IP4_INIT(proto) \
|
||||||
{ \
|
{ \
|
||||||
.version = 4, \
|
.version = 4, \
|
||||||
|
@ -38,7 +43,11 @@
|
||||||
.daddr = 0, \
|
.daddr = 0, \
|
||||||
}
|
}
|
||||||
#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \
|
#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \
|
||||||
(uint32_t)htons_constant(0xff00 | (proto)))
|
(uint32_t)htons(0xff00 | (proto)))
|
||||||
|
|
||||||
|
|
||||||
|
#define IN6_IS_PREFIX_LINKLOCAL(a, len) \
|
||||||
|
((len) >= 10 && IN6_IS_ADDR_LINKLOCAL(a))
|
||||||
|
|
||||||
#define L2_BUF_IP6_INIT(proto) \
|
#define L2_BUF_IP6_INIT(proto) \
|
||||||
{ \
|
{ \
|
||||||
|
|
68
isolation.c
68
isolation.c
|
@ -29,7 +29,8 @@
|
||||||
*
|
*
|
||||||
* Executed immediately after startup, drops capabilities we don't
|
* Executed immediately after startup, drops capabilities we don't
|
||||||
* need at any point during execution (or which we gain back when we
|
* need at any point during execution (or which we gain back when we
|
||||||
* need by joining other namespaces).
|
* need by joining other namespaces), and closes any leaked file we
|
||||||
|
* might have inherited from the parent process.
|
||||||
*
|
*
|
||||||
* 2. isolate_user()
|
* 2. isolate_user()
|
||||||
* =================
|
* =================
|
||||||
|
@ -105,7 +106,7 @@ static void drop_caps_ep_except(uint64_t keep)
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (syscall(SYS_capget, &hdr, data))
|
if (syscall(SYS_capget, &hdr, data))
|
||||||
die("Couldn't get current capabilities: %s", strerror(errno));
|
die_perror("Couldn't get current capabilities");
|
||||||
|
|
||||||
for (i = 0; i < CAP_WORDS; i++) {
|
for (i = 0; i < CAP_WORDS; i++) {
|
||||||
uint32_t mask = keep >> (32 * i);
|
uint32_t mask = keep >> (32 * i);
|
||||||
|
@ -115,7 +116,7 @@ static void drop_caps_ep_except(uint64_t keep)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (syscall(SYS_capset, &hdr, data))
|
if (syscall(SYS_capset, &hdr, data))
|
||||||
die("Couldn't drop capabilities: %s", strerror(errno));
|
die_perror("Couldn't drop capabilities");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -152,30 +153,31 @@ static void clamp_caps(void)
|
||||||
*/
|
*/
|
||||||
if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) &&
|
if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) &&
|
||||||
errno != EINVAL && errno != EPERM)
|
errno != EINVAL && errno != EPERM)
|
||||||
die("Couldn't drop cap %i from bounding set: %s",
|
die_perror("Couldn't drop cap %i from bounding set", i);
|
||||||
i, strerror(errno));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (syscall(SYS_capget, &hdr, data))
|
if (syscall(SYS_capget, &hdr, data))
|
||||||
die("Couldn't get current capabilities: %s", strerror(errno));
|
die_perror("Couldn't get current capabilities");
|
||||||
|
|
||||||
for (i = 0; i < CAP_WORDS; i++)
|
for (i = 0; i < CAP_WORDS; i++)
|
||||||
data[i].inheritable = 0;
|
data[i].inheritable = 0;
|
||||||
|
|
||||||
if (syscall(SYS_capset, &hdr, data))
|
if (syscall(SYS_capset, &hdr, data))
|
||||||
die("Couldn't drop inheritable capabilities: %s",
|
die_perror("Couldn't drop inheritable capabilities");
|
||||||
strerror(errno));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* isolate_initial() - Early, config independent self isolation
|
* isolate_initial() - Early, mostly config independent self isolation
|
||||||
|
* @argc: Argument count
|
||||||
|
* @argv: Command line options: only --fd (if present) is relevant here
|
||||||
*
|
*
|
||||||
* Should:
|
* Should:
|
||||||
* - drop unneeded capabilities
|
* - drop unneeded capabilities
|
||||||
|
* - close all open files except for standard streams and the one from --fd
|
||||||
* Mustn't:
|
* Mustn't:
|
||||||
* - remove filesystem access (we need to access files during setup)
|
* - remove filesystem access (we need to access files during setup)
|
||||||
*/
|
*/
|
||||||
void isolate_initial(void)
|
void isolate_initial(int argc, char **argv)
|
||||||
{
|
{
|
||||||
uint64_t keep;
|
uint64_t keep;
|
||||||
|
|
||||||
|
@ -209,6 +211,8 @@ void isolate_initial(void)
|
||||||
keep |= BIT(CAP_SETFCAP) | BIT(CAP_SYS_PTRACE);
|
keep |= BIT(CAP_SETFCAP) | BIT(CAP_SYS_PTRACE);
|
||||||
|
|
||||||
drop_caps_ep_except(keep);
|
drop_caps_ep_except(keep);
|
||||||
|
|
||||||
|
close_open_files(argc, argv);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -234,34 +238,30 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns,
|
||||||
if (setgroups(0, NULL)) {
|
if (setgroups(0, NULL)) {
|
||||||
/* If we don't have CAP_SETGID, this will EPERM */
|
/* If we don't have CAP_SETGID, this will EPERM */
|
||||||
if (errno != EPERM)
|
if (errno != EPERM)
|
||||||
die("Can't drop supplementary groups: %s",
|
die_perror("Can't drop supplementary groups");
|
||||||
strerror(errno));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (setgid(gid) != 0)
|
if (setgid(gid) != 0)
|
||||||
die("Can't set GID to %u: %s", gid, strerror(errno));
|
die_perror("Can't set GID to %u", gid);
|
||||||
|
|
||||||
if (setuid(uid) != 0)
|
if (setuid(uid) != 0)
|
||||||
die("Can't set UID to %u: %s", uid, strerror(errno));
|
die_perror("Can't set UID to %u", uid);
|
||||||
|
|
||||||
if (*userns) { /* If given a userns, join it */
|
if (*userns) { /* If given a userns, join it */
|
||||||
int ufd;
|
int ufd;
|
||||||
|
|
||||||
ufd = open(userns, O_RDONLY | O_CLOEXEC);
|
ufd = open(userns, O_RDONLY | O_CLOEXEC);
|
||||||
if (ufd < 0)
|
if (ufd < 0)
|
||||||
die("Couldn't open user namespace %s: %s",
|
die_perror("Couldn't open user namespace %s", userns);
|
||||||
userns, strerror(errno));
|
|
||||||
|
|
||||||
if (setns(ufd, CLONE_NEWUSER) != 0)
|
if (setns(ufd, CLONE_NEWUSER) != 0)
|
||||||
die("Couldn't enter user namespace %s: %s",
|
die_perror("Couldn't enter user namespace %s", userns);
|
||||||
userns, strerror(errno));
|
|
||||||
|
|
||||||
close(ufd);
|
close(ufd);
|
||||||
|
|
||||||
} else if (use_userns) { /* Create and join a new userns */
|
} else if (use_userns) { /* Create and join a new userns */
|
||||||
if (unshare(CLONE_NEWUSER) != 0)
|
if (unshare(CLONE_NEWUSER) != 0)
|
||||||
die("Couldn't create user namespace: %s",
|
die_perror("Couldn't create user namespace");
|
||||||
strerror(errno));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Joining a new userns gives us full capabilities; drop the
|
/* Joining a new userns gives us full capabilities; drop the
|
||||||
|
@ -312,38 +312,38 @@ int isolate_prefork(const struct ctx *c)
|
||||||
* PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
|
* PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
|
||||||
* ever gets around seccomp profiles -- there's no harm in passing it.
|
* ever gets around seccomp profiles -- there's no harm in passing it.
|
||||||
*/
|
*/
|
||||||
if (!c->foreground || c->mode == MODE_PASST)
|
if (!c->foreground || c->mode != MODE_PASTA)
|
||||||
flags |= CLONE_NEWPID;
|
flags |= CLONE_NEWPID;
|
||||||
|
|
||||||
if (unshare(flags)) {
|
if (unshare(flags)) {
|
||||||
perror("unshare");
|
err_perror("Failed to detach isolating namespaces");
|
||||||
return -errno;
|
return -errno;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL)) {
|
if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL)) {
|
||||||
perror("mount /");
|
err_perror("Failed to remount /");
|
||||||
return -errno;
|
return -errno;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mount("", TMPDIR, "tmpfs",
|
if (mount("", TMPDIR, "tmpfs",
|
||||||
MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY,
|
MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY,
|
||||||
"nr_inodes=2,nr_blocks=0")) {
|
"nr_inodes=2,nr_blocks=0")) {
|
||||||
perror("mount tmpfs");
|
err_perror("Failed to mount empty tmpfs for pivot_root()");
|
||||||
return -errno;
|
return -errno;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (chdir(TMPDIR)) {
|
if (chdir(TMPDIR)) {
|
||||||
perror("chdir");
|
err_perror("Failed to change directory into empty tmpfs");
|
||||||
return -errno;
|
return -errno;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (syscall(SYS_pivot_root, ".", ".")) {
|
if (syscall(SYS_pivot_root, ".", ".")) {
|
||||||
perror("pivot_root");
|
err_perror("Failed to pivot_root() into empty tmpfs");
|
||||||
return -errno;
|
return -errno;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW)) {
|
if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW)) {
|
||||||
perror("umount2");
|
err_perror("Failed to unmount original root filesystem");
|
||||||
return -errno;
|
return -errno;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -379,17 +379,15 @@ void isolate_postfork(const struct ctx *c)
|
||||||
|
|
||||||
prctl(PR_SET_DUMPABLE, 0);
|
prctl(PR_SET_DUMPABLE, 0);
|
||||||
|
|
||||||
if (c->mode == MODE_PASST) {
|
if (c->mode == MODE_PASTA) {
|
||||||
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
|
|
||||||
prog.filter = filter_passt;
|
|
||||||
} else {
|
|
||||||
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
|
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
|
||||||
prog.filter = filter_pasta;
|
prog.filter = filter_pasta;
|
||||||
|
} else {
|
||||||
|
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
|
||||||
|
prog.filter = filter_passt;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
|
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
|
||||||
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
|
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
|
||||||
perror("prctl");
|
die_perror("Failed to apply seccomp filter");
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#ifndef ISOLATION_H
|
#ifndef ISOLATION_H
|
||||||
#define ISOLATION_H
|
#define ISOLATION_H
|
||||||
|
|
||||||
void isolate_initial(void);
|
void isolate_initial(int argc, char **argv);
|
||||||
void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns,
|
void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns,
|
||||||
enum passt_modes mode);
|
enum passt_modes mode);
|
||||||
int isolate_prefork(const struct ctx *c);
|
int isolate_prefork(const struct ctx *c);
|
||||||
|
|
10
lineread.c
10
lineread.c
|
@ -39,13 +39,11 @@ void lineread_init(struct lineread *lr, int fd)
|
||||||
*
|
*
|
||||||
* Return: length of line in bytes, -1 if no line was found
|
* Return: length of line in bytes, -1 if no line was found
|
||||||
*/
|
*/
|
||||||
static int peek_line(struct lineread *lr, bool eof)
|
static ssize_t peek_line(struct lineread *lr, bool eof)
|
||||||
{
|
{
|
||||||
char *nl;
|
char *nl;
|
||||||
|
|
||||||
/* Sanity checks (which also document invariants) */
|
/* Sanity checks (which also document invariants) */
|
||||||
ASSERT(lr->count >= 0);
|
|
||||||
ASSERT(lr->next_line >= 0);
|
|
||||||
ASSERT(lr->next_line + lr->count >= lr->next_line);
|
ASSERT(lr->next_line + lr->count >= lr->next_line);
|
||||||
ASSERT(lr->next_line + lr->count <= LINEREAD_BUFFER_SIZE);
|
ASSERT(lr->next_line + lr->count <= LINEREAD_BUFFER_SIZE);
|
||||||
|
|
||||||
|
@ -74,13 +72,13 @@ static int peek_line(struct lineread *lr, bool eof)
|
||||||
*
|
*
|
||||||
* Return: Length of line read on success, 0 on EOF, negative on error
|
* Return: Length of line read on success, 0 on EOF, negative on error
|
||||||
*/
|
*/
|
||||||
int lineread_get(struct lineread *lr, char **line)
|
ssize_t lineread_get(struct lineread *lr, char **line)
|
||||||
{
|
{
|
||||||
bool eof = false;
|
bool eof = false;
|
||||||
int line_len;
|
ssize_t line_len;
|
||||||
|
|
||||||
while ((line_len = peek_line(lr, eof)) < 0) {
|
while ((line_len = peek_line(lr, eof)) < 0) {
|
||||||
int rc;
|
ssize_t rc;
|
||||||
|
|
||||||
if ((lr->next_line + lr->count) == LINEREAD_BUFFER_SIZE) {
|
if ((lr->next_line + lr->count) == LINEREAD_BUFFER_SIZE) {
|
||||||
/* No space at end */
|
/* No space at end */
|
||||||
|
|
|
@ -18,14 +18,15 @@
|
||||||
* @buf: Buffer storing data read from file.
|
* @buf: Buffer storing data read from file.
|
||||||
*/
|
*/
|
||||||
struct lineread {
|
struct lineread {
|
||||||
int fd; int next_line;
|
int fd;
|
||||||
int count;
|
ssize_t next_line;
|
||||||
|
ssize_t count;
|
||||||
|
|
||||||
/* One extra byte for possible trailing \0 */
|
/* One extra byte for possible trailing \0 */
|
||||||
char buf[LINEREAD_BUFFER_SIZE+1];
|
char buf[LINEREAD_BUFFER_SIZE+1];
|
||||||
};
|
};
|
||||||
|
|
||||||
void lineread_init(struct lineread *lr, int fd);
|
void lineread_init(struct lineread *lr, int fd);
|
||||||
int lineread_get(struct lineread *lr, char **line);
|
ssize_t lineread_get(struct lineread *lr, char **line);
|
||||||
|
|
||||||
#endif /* _LINEREAD_H */
|
#endif /* _LINEREAD_H */
|
||||||
|
|
144
linux_dep.h
Normal file
144
linux_dep.h
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
* Copyright Red Hat
|
||||||
|
*
|
||||||
|
* Declarations for Linux specific dependencies
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LINUX_DEP_H
|
||||||
|
#define LINUX_DEP_H
|
||||||
|
|
||||||
|
/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
|
||||||
|
*
|
||||||
|
* Largely derived from include/linux/tcp.h in the Linux kernel
|
||||||
|
*
|
||||||
|
* Some fields returned by TCP_INFO have been there for ages and are shared with
|
||||||
|
* BSD. struct tcp_info from netinet/tcp.h has only those fields. There are
|
||||||
|
* also many Linux-specific extensions to the structure, which are only found
|
||||||
|
* in the linux/tcp.h version of struct tcp_info.
|
||||||
|
*
|
||||||
|
* We want to use some of those extension fields, when available. We can test
|
||||||
|
* for availability in the runtime kernel using the length returned from
|
||||||
|
* getsockopt(). However, we won't necessarily be compiled against the same
|
||||||
|
* kernel headers as we'll run with, so compiling directly against linux/tcp.h
|
||||||
|
* means wrapping every field access in an #ifdef whose #else does the same
|
||||||
|
* thing as when the field is missing at runtime. This rapidly gets messy.
|
||||||
|
*
|
||||||
|
* Instead we define here struct tcp_info_linux which includes all the Linux
|
||||||
|
* extensions that we want to use. This is taken from v6.11 of the kernel.
|
||||||
|
*/
|
||||||
|
struct tcp_info_linux {
|
||||||
|
uint8_t tcpi_state;
|
||||||
|
uint8_t tcpi_ca_state;
|
||||||
|
uint8_t tcpi_retransmits;
|
||||||
|
uint8_t tcpi_probes;
|
||||||
|
uint8_t tcpi_backoff;
|
||||||
|
uint8_t tcpi_options;
|
||||||
|
uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
|
||||||
|
uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
|
||||||
|
|
||||||
|
uint32_t tcpi_rto;
|
||||||
|
uint32_t tcpi_ato;
|
||||||
|
uint32_t tcpi_snd_mss;
|
||||||
|
uint32_t tcpi_rcv_mss;
|
||||||
|
|
||||||
|
uint32_t tcpi_unacked;
|
||||||
|
uint32_t tcpi_sacked;
|
||||||
|
uint32_t tcpi_lost;
|
||||||
|
uint32_t tcpi_retrans;
|
||||||
|
uint32_t tcpi_fackets;
|
||||||
|
|
||||||
|
/* Times. */
|
||||||
|
uint32_t tcpi_last_data_sent;
|
||||||
|
uint32_t tcpi_last_ack_sent;
|
||||||
|
uint32_t tcpi_last_data_recv;
|
||||||
|
uint32_t tcpi_last_ack_recv;
|
||||||
|
|
||||||
|
/* Metrics. */
|
||||||
|
uint32_t tcpi_pmtu;
|
||||||
|
uint32_t tcpi_rcv_ssthresh;
|
||||||
|
uint32_t tcpi_rtt;
|
||||||
|
uint32_t tcpi_rttvar;
|
||||||
|
uint32_t tcpi_snd_ssthresh;
|
||||||
|
uint32_t tcpi_snd_cwnd;
|
||||||
|
uint32_t tcpi_advmss;
|
||||||
|
uint32_t tcpi_reordering;
|
||||||
|
|
||||||
|
uint32_t tcpi_rcv_rtt;
|
||||||
|
uint32_t tcpi_rcv_space;
|
||||||
|
|
||||||
|
uint32_t tcpi_total_retrans;
|
||||||
|
|
||||||
|
/* Linux extensions */
|
||||||
|
uint64_t tcpi_pacing_rate;
|
||||||
|
uint64_t tcpi_max_pacing_rate;
|
||||||
|
uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
|
||||||
|
uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
|
||||||
|
uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
|
||||||
|
uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
|
||||||
|
|
||||||
|
uint32_t tcpi_notsent_bytes;
|
||||||
|
uint32_t tcpi_min_rtt;
|
||||||
|
uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
|
||||||
|
uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
|
||||||
|
|
||||||
|
uint64_t tcpi_delivery_rate;
|
||||||
|
|
||||||
|
uint64_t tcpi_busy_time; /* Time (usec) busy sending data */
|
||||||
|
uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */
|
||||||
|
uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
|
||||||
|
|
||||||
|
uint32_t tcpi_delivered;
|
||||||
|
uint32_t tcpi_delivered_ce;
|
||||||
|
|
||||||
|
uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
|
||||||
|
uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */
|
||||||
|
uint32_t tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */
|
||||||
|
uint32_t tcpi_reord_seen; /* reordering events seen */
|
||||||
|
|
||||||
|
uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */
|
||||||
|
|
||||||
|
uint32_t tcpi_snd_wnd; /* peer's advertised receive window after
|
||||||
|
* scaling (bytes)
|
||||||
|
*/
|
||||||
|
uint32_t tcpi_rcv_wnd; /* local advertised receive window after
|
||||||
|
* scaling (bytes)
|
||||||
|
*/
|
||||||
|
|
||||||
|
uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */
|
||||||
|
|
||||||
|
uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including
|
||||||
|
* SYN/SYN-ACK and recurring timeouts.
|
||||||
|
*/
|
||||||
|
uint16_t tcpi_total_rto_recoveries; /* Total number of RTO
|
||||||
|
* recoveries, including any
|
||||||
|
* unfinished recovery.
|
||||||
|
*/
|
||||||
|
uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries
|
||||||
|
* in milliseconds, including any
|
||||||
|
* unfinished recovery.
|
||||||
|
*/
|
||||||
|
};
|
||||||
|
|
||||||
|
#include <linux/falloc.h>
|
||||||
|
|
||||||
|
#ifndef FALLOC_FL_COLLAPSE_RANGE
|
||||||
|
#define FALLOC_FL_COLLAPSE_RANGE 0x08
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <linux/close_range.h>
|
||||||
|
|
||||||
|
/* glibc < 2.34 and musl as of 1.2.5 need these */
|
||||||
|
#ifndef SYS_close_range
|
||||||
|
#define SYS_close_range 436
|
||||||
|
#endif
|
||||||
|
#ifndef CLOSE_RANGE_UNSHARE /* Linux kernel < 5.9 */
|
||||||
|
#define CLOSE_RANGE_UNSHARE (1U << 1)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__attribute__ ((weak))
|
||||||
|
/* cppcheck-suppress funcArgNamesDifferent */
|
||||||
|
int close_range(unsigned int first, unsigned int last, int flags) {
|
||||||
|
return syscall(SYS_close_range, first, last, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* LINUX_DEP_H */
|
421
log.c
421
log.c
|
@ -26,17 +26,14 @@
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <sys/socket.h>
|
#include <sys/socket.h>
|
||||||
|
|
||||||
|
#include "linux_dep.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "passt.h"
|
#include "passt.h"
|
||||||
|
|
||||||
/* LOG_EARLY means we don't know yet: log everything. LOG_EMERG is unused */
|
|
||||||
#define LOG_EARLY LOG_MASK(LOG_EMERG)
|
|
||||||
|
|
||||||
static int log_sock = -1; /* Optional socket to system logger */
|
static int log_sock = -1; /* Optional socket to system logger */
|
||||||
static char log_ident[BUFSIZ]; /* Identifier string for openlog() */
|
static char log_ident[BUFSIZ]; /* Identifier string for openlog() */
|
||||||
static int log_mask = LOG_EARLY; /* Current log priority mask */
|
static int log_mask; /* Current log priority mask */
|
||||||
static int log_opt; /* Options for openlog() */
|
|
||||||
|
|
||||||
static int log_file = -1; /* Optional log file descriptor */
|
static int log_file = -1; /* Optional log file descriptor */
|
||||||
static size_t log_size; /* Maximum log file size in bytes */
|
static size_t log_size; /* Maximum log file size in bytes */
|
||||||
|
@ -44,50 +41,46 @@ static size_t log_written; /* Currently used bytes in log file */
|
||||||
static size_t log_cut_size; /* Bytes to cut at start on rotation */
|
static size_t log_cut_size; /* Bytes to cut at start on rotation */
|
||||||
static char log_header[BUFSIZ]; /* File header, written back on cuts */
|
static char log_header[BUFSIZ]; /* File header, written back on cuts */
|
||||||
|
|
||||||
static time_t log_start; /* Start timestamp */
|
struct timespec log_start; /* Start timestamp */
|
||||||
|
|
||||||
int log_trace; /* --trace mode enabled */
|
int log_trace; /* --trace mode enabled */
|
||||||
int log_to_stdout; /* Print to stdout instead of stderr */
|
bool log_conf_parsed; /* Logging options already parsed */
|
||||||
|
bool log_stderr = true; /* Not daemonised, no shell spawned */
|
||||||
|
|
||||||
void vlogmsg(int pri, const char *format, va_list ap)
|
#define LL_STRLEN (sizeof("-9223372036854775808"))
|
||||||
|
#define LOGTIME_STRLEN (LL_STRLEN + 5)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* logtime() - Get the current time for logging purposes
|
||||||
|
* @ts: Buffer into which to store the timestamp
|
||||||
|
*
|
||||||
|
* Return: pointer to @ts, or NULL if there was an error retrieving the time
|
||||||
|
*/
|
||||||
|
const struct timespec *logtime(struct timespec *ts)
|
||||||
{
|
{
|
||||||
bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
|
if (clock_gettime(CLOCK_MONOTONIC, ts))
|
||||||
bool early_print = LOG_PRI(log_mask) == LOG_EARLY;
|
return NULL;
|
||||||
FILE *out = log_to_stdout ? stdout : stderr;
|
return ts;
|
||||||
struct timespec tp;
|
|
||||||
|
|
||||||
if (debug_print) {
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tp);
|
|
||||||
fprintf(out, "%lli.%04lli: ",
|
|
||||||
(long long int)tp.tv_sec - log_start,
|
|
||||||
(long long int)tp.tv_nsec / (100L * 1000));
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((log_mask & LOG_MASK(LOG_PRI(pri))) || early_print) {
|
|
||||||
va_list ap2;
|
|
||||||
|
|
||||||
va_copy(ap2, ap); /* Don't clobber ap, we need it again */
|
|
||||||
if (log_file != -1)
|
|
||||||
logfile_write(pri, format, ap2);
|
|
||||||
else if (!(log_mask & LOG_MASK(LOG_DEBUG)))
|
|
||||||
passt_vsyslog(pri, format, ap2);
|
|
||||||
|
|
||||||
va_end(ap2);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (debug_print || (early_print && !(log_opt & LOG_PERROR))) {
|
|
||||||
(void)vfprintf(out, format, ap);
|
|
||||||
if (format[strlen(format)] != '\n')
|
|
||||||
fprintf(out, "\n");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void logmsg(int pri, const char *format, ...)
|
/**
|
||||||
|
* logtime_fmt() - Format timestamp into a string for the log
|
||||||
|
* @buf: Buffer into which to format the time
|
||||||
|
* @size: Size of @buf
|
||||||
|
* @ts: Time to format (or NULL on error)
|
||||||
|
*
|
||||||
|
* Return: number of characters written to @buf (excluding \0)
|
||||||
|
*/
|
||||||
|
static int logtime_fmt(char *buf, size_t size, const struct timespec *ts)
|
||||||
{
|
{
|
||||||
va_list ap;
|
if (ts) {
|
||||||
|
int64_t delta = timespec_diff_us(ts, &log_start);
|
||||||
|
|
||||||
va_start(ap, format);
|
return snprintf(buf, size, "%lli.%04lli", delta / 1000000LL,
|
||||||
vlogmsg(pri, format, ap);
|
(delta / 100LL) % 10000);
|
||||||
va_end(ap);
|
}
|
||||||
|
|
||||||
|
return snprintf(buf, size, "<error>");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Prefixes for log file messages, indexed by priority */
|
/* Prefixes for log file messages, indexed by priority */
|
||||||
|
@ -100,127 +93,12 @@ const char *logfile_prefix[] = {
|
||||||
" ", /* LOG_DEBUG */
|
" ", /* LOG_DEBUG */
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* trace_init() - Set log_trace depending on trace (debug) mode
|
|
||||||
* @enable: Tracing debug mode enabled if non-zero
|
|
||||||
*/
|
|
||||||
void trace_init(int enable)
|
|
||||||
{
|
|
||||||
log_trace = enable;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* __openlog() - Non-optional openlog() implementation, for custom vsyslog()
|
|
||||||
* @ident: openlog() identity (program name)
|
|
||||||
* @option: openlog() options
|
|
||||||
* @facility: openlog() facility (LOG_DAEMON)
|
|
||||||
*/
|
|
||||||
void __openlog(const char *ident, int option, int facility)
|
|
||||||
{
|
|
||||||
struct timespec tp;
|
|
||||||
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tp);
|
|
||||||
log_start = tp.tv_sec;
|
|
||||||
|
|
||||||
if (log_sock < 0) {
|
|
||||||
struct sockaddr_un a = { .sun_family = AF_UNIX, };
|
|
||||||
|
|
||||||
log_sock = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
|
|
||||||
if (log_sock < 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
strncpy(a.sun_path, _PATH_LOG, sizeof(a.sun_path));
|
|
||||||
if (connect(log_sock, (const struct sockaddr *)&a, sizeof(a))) {
|
|
||||||
close(log_sock);
|
|
||||||
log_sock = -1;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
log_mask |= facility;
|
|
||||||
strncpy(log_ident, ident, sizeof(log_ident) - 1);
|
|
||||||
log_opt = option;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* __setlogmask() - setlogmask() wrapper, to allow custom vsyslog()
|
|
||||||
* @mask: Same as setlogmask() mask
|
|
||||||
*/
|
|
||||||
void __setlogmask(int mask)
|
|
||||||
{
|
|
||||||
log_mask = mask;
|
|
||||||
setlogmask(mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* passt_vsyslog() - vsyslog() implementation not using heap memory
|
|
||||||
* @pri: Facility and level map, same as priority for vsyslog()
|
|
||||||
* @format: Same as vsyslog() format
|
|
||||||
* @ap: Same as vsyslog() ap
|
|
||||||
*/
|
|
||||||
void passt_vsyslog(int pri, const char *format, va_list ap)
|
|
||||||
{
|
|
||||||
int prefix_len, n;
|
|
||||||
char buf[BUFSIZ];
|
|
||||||
|
|
||||||
/* Send without timestamp, the system logger should add it */
|
|
||||||
n = prefix_len = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
|
|
||||||
|
|
||||||
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
|
||||||
|
|
||||||
if (format[strlen(format)] != '\n')
|
|
||||||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
|
||||||
|
|
||||||
if (log_opt & LOG_PERROR)
|
|
||||||
fprintf(stderr, "%s", buf + prefix_len);
|
|
||||||
|
|
||||||
if (send(log_sock, buf, n, 0) != n)
|
|
||||||
fprintf(stderr, "Failed to send %i bytes to syslog\n", n);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* logfile_init() - Open log file and write header with PID, version, path
|
|
||||||
* @name: Identifier for header: passt or pasta
|
|
||||||
* @path: Path to log file
|
|
||||||
* @size: Maximum size of log file: log_cut_size is calculated here
|
|
||||||
*/
|
|
||||||
void logfile_init(const char *name, const char *path, size_t size)
|
|
||||||
{
|
|
||||||
char nl = '\n', exe[PATH_MAX] = { 0 };
|
|
||||||
int n;
|
|
||||||
|
|
||||||
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) {
|
|
||||||
perror("readlink /proc/self/exe");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC,
|
|
||||||
S_IRUSR | S_IWUSR);
|
|
||||||
if (log_file == -1)
|
|
||||||
die("Couldn't open log file %s: %s", path, strerror(errno));
|
|
||||||
|
|
||||||
log_size = size ? size : LOGFILE_SIZE_DEFAULT;
|
|
||||||
|
|
||||||
n = snprintf(log_header, sizeof(log_header), "%s " VERSION ": %s (%i)",
|
|
||||||
name, exe, getpid());
|
|
||||||
|
|
||||||
if (write(log_file, log_header, n) <= 0 ||
|
|
||||||
write(log_file, &nl, 1) <= 0) {
|
|
||||||
perror("Couldn't write to log file\n");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
|
|
||||||
log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef FALLOC_FL_COLLAPSE_RANGE
|
|
||||||
/**
|
/**
|
||||||
* logfile_rotate_fallocate() - Write header, set log_written after fallocate()
|
* logfile_rotate_fallocate() - Write header, set log_written after fallocate()
|
||||||
* @fd: Log file descriptor
|
* @fd: Log file descriptor
|
||||||
* @now: Current timestamp
|
* @now: Current timestamp
|
||||||
*
|
*
|
||||||
* #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
|
* #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek i686:_llseek
|
||||||
*/
|
*/
|
||||||
static void logfile_rotate_fallocate(int fd, const struct timespec *now)
|
static void logfile_rotate_fallocate(int fd, const struct timespec *now)
|
||||||
{
|
{
|
||||||
|
@ -233,10 +111,8 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
|
||||||
if (read(fd, buf, BUFSIZ) == -1)
|
if (read(fd, buf, BUFSIZ) == -1)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
n = snprintf(buf, BUFSIZ,
|
n = snprintf(buf, BUFSIZ, "%s - log truncated at ", log_header);
|
||||||
"%s - log truncated at %lli.%04lli", log_header,
|
n += logtime_fmt(buf + n, BUFSIZ - n, now);
|
||||||
(long long int)(now->tv_sec - log_start),
|
|
||||||
(long long int)(now->tv_nsec / (100L * 1000)));
|
|
||||||
|
|
||||||
/* Avoid partial lines by padding the header with spaces */
|
/* Avoid partial lines by padding the header with spaces */
|
||||||
nl = memchr(buf + n + 1, '\n', BUFSIZ - n - 1);
|
nl = memchr(buf + n + 1, '\n', BUFSIZ - n - 1);
|
||||||
|
@ -250,14 +126,13 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
|
||||||
|
|
||||||
log_written -= log_cut_size;
|
log_written -= log_cut_size;
|
||||||
}
|
}
|
||||||
#endif /* FALLOC_FL_COLLAPSE_RANGE */
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* logfile_rotate_move() - Fallback: move recent entries toward start, then cut
|
* logfile_rotate_move() - Fallback: move recent entries toward start, then cut
|
||||||
* @fd: Log file descriptor
|
* @fd: Log file descriptor
|
||||||
* @now: Current timestamp
|
* @now: Current timestamp
|
||||||
*
|
*
|
||||||
* #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
|
* #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek
|
||||||
* #syscalls ftruncate
|
* #syscalls ftruncate
|
||||||
*/
|
*/
|
||||||
static void logfile_rotate_move(int fd, const struct timespec *now)
|
static void logfile_rotate_move(int fd, const struct timespec *now)
|
||||||
|
@ -266,10 +141,10 @@ static void logfile_rotate_move(int fd, const struct timespec *now)
|
||||||
char buf[BUFSIZ];
|
char buf[BUFSIZ];
|
||||||
const char *nl;
|
const char *nl;
|
||||||
|
|
||||||
header_len = snprintf(buf, BUFSIZ,
|
header_len = snprintf(buf, BUFSIZ, "%s - log truncated at ",
|
||||||
"%s - log truncated at %lli.%04lli\n", log_header,
|
log_header);
|
||||||
(long long int)(now->tv_sec - log_start),
|
header_len += logtime_fmt(buf + header_len, BUFSIZ - header_len, now);
|
||||||
(long long int)(now->tv_nsec / (100L * 1000)));
|
|
||||||
if (lseek(fd, 0, SEEK_SET) == -1)
|
if (lseek(fd, 0, SEEK_SET) == -1)
|
||||||
return;
|
return;
|
||||||
if (write(fd, buf, header_len) == -1)
|
if (write(fd, buf, header_len) == -1)
|
||||||
|
@ -322,21 +197,17 @@ out:
|
||||||
*
|
*
|
||||||
* Return: 0 on success, negative error code on failure
|
* Return: 0 on success, negative error code on failure
|
||||||
*
|
*
|
||||||
* #syscalls fcntl
|
* #syscalls fcntl fallocate
|
||||||
*
|
|
||||||
* fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there
|
|
||||||
*/
|
*/
|
||||||
static int logfile_rotate(int fd, const struct timespec *now)
|
static int logfile_rotate(int fd, const struct timespec *now)
|
||||||
{
|
{
|
||||||
if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
|
if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */))
|
||||||
return -errno;
|
return -errno;
|
||||||
|
|
||||||
#ifdef FALLOC_FL_COLLAPSE_RANGE
|
|
||||||
/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
|
/* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */
|
||||||
if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
|
if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size))
|
||||||
logfile_rotate_fallocate(fd, now);
|
logfile_rotate_fallocate(fd, now);
|
||||||
else
|
else
|
||||||
#endif
|
|
||||||
logfile_rotate_move(fd, now);
|
logfile_rotate_move(fd, now);
|
||||||
|
|
||||||
if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
|
if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND))
|
||||||
|
@ -347,32 +218,212 @@ static int logfile_rotate(int fd, const struct timespec *now)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* logfile_write() - Write entry to log file, trigger rotation if full
|
* logfile_write() - Write entry to log file, trigger rotation if full
|
||||||
|
* @newline: Append newline at the end of the message, if missing
|
||||||
|
* @cont: Continuation of a previous message, on the same line
|
||||||
* @pri: Facility and level map, same as priority for vsyslog()
|
* @pri: Facility and level map, same as priority for vsyslog()
|
||||||
|
* @now: Timestamp
|
||||||
* @format: Same as vsyslog() format
|
* @format: Same as vsyslog() format
|
||||||
* @ap: Same as vsyslog() ap
|
* @ap: Same as vsyslog() ap
|
||||||
*/
|
*/
|
||||||
void logfile_write(int pri, const char *format, va_list ap)
|
static void logfile_write(bool newline, bool cont, int pri,
|
||||||
|
const struct timespec *now,
|
||||||
|
const char *format, va_list ap)
|
||||||
{
|
{
|
||||||
struct timespec now;
|
|
||||||
char buf[BUFSIZ];
|
char buf[BUFSIZ];
|
||||||
int n;
|
int n = 0;
|
||||||
|
|
||||||
if (clock_gettime(CLOCK_REALTIME, &now))
|
if (!cont) {
|
||||||
return;
|
n += logtime_fmt(buf, BUFSIZ, now);
|
||||||
|
n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]);
|
||||||
n = snprintf(buf, BUFSIZ, "%lli.%04lli: %s",
|
}
|
||||||
(long long int)(now.tv_sec - log_start),
|
|
||||||
(long long int)(now.tv_nsec / (100L * 1000)),
|
|
||||||
logfile_prefix[pri]);
|
|
||||||
|
|
||||||
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
||||||
|
|
||||||
if (format[strlen(format)] != '\n')
|
if (newline && format[strlen(format)] != '\n')
|
||||||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||||
|
|
||||||
if ((log_written + n >= log_size) && logfile_rotate(log_file, &now))
|
if ((log_written + n >= log_size) && logfile_rotate(log_file, now))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if ((n = write(log_file, buf, n)) >= 0)
|
if ((n = write(log_file, buf, n)) >= 0)
|
||||||
log_written += n;
|
log_written += n;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* vlogmsg() - Print or send messages to log or output files as configured
|
||||||
|
* @newline: Append newline at the end of the message, if missing
|
||||||
|
* @cont: Continuation of a previous message, on the same line
|
||||||
|
* @pri: Facility and level map, same as priority for vsyslog()
|
||||||
|
* @format: Message
|
||||||
|
* @ap: Variable argument list
|
||||||
|
*/
|
||||||
|
void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
|
||||||
|
{
|
||||||
|
bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
|
||||||
|
const struct timespec *now;
|
||||||
|
struct timespec ts;
|
||||||
|
|
||||||
|
now = logtime(&ts);
|
||||||
|
|
||||||
|
if (debug_print && !cont) {
|
||||||
|
char timestr[LOGTIME_STRLEN];
|
||||||
|
|
||||||
|
logtime_fmt(timestr, sizeof(timestr), now);
|
||||||
|
FPRINTF(stderr, "%s: ", timestr);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) {
|
||||||
|
va_list ap2;
|
||||||
|
|
||||||
|
va_copy(ap2, ap); /* Don't clobber ap, we need it again */
|
||||||
|
if (log_file != -1)
|
||||||
|
logfile_write(newline, cont, pri, now, format, ap2);
|
||||||
|
else if (!(log_mask & LOG_MASK(LOG_DEBUG)))
|
||||||
|
passt_vsyslog(newline, pri, format, ap2);
|
||||||
|
|
||||||
|
va_end(ap2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (debug_print || !log_conf_parsed ||
|
||||||
|
(log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) {
|
||||||
|
(void)vfprintf(stderr, format, ap);
|
||||||
|
if (newline && format[strlen(format)] != '\n')
|
||||||
|
FPRINTF(stderr, "\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* logmsg() - vlogmsg() wrapper for variable argument lists
|
||||||
|
* @newline: Append newline at the end of the message, if missing
|
||||||
|
* @cont: Continuation of a previous message, on the same line
|
||||||
|
* @pri: Facility and level map, same as priority for vsyslog()
|
||||||
|
* @format: Message
|
||||||
|
*/
|
||||||
|
void logmsg(bool newline, bool cont, int pri, const char *format, ...)
|
||||||
|
{
|
||||||
|
va_list ap;
|
||||||
|
|
||||||
|
va_start(ap, format);
|
||||||
|
vlogmsg(newline, cont, pri, format, ap);
|
||||||
|
va_end(ap);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* logmsg_perror() - vlogmsg() wrapper with perror()-like functionality
|
||||||
|
* @pri: Facility and level map, same as priority for vsyslog()
|
||||||
|
* @format: Message
|
||||||
|
*/
|
||||||
|
void logmsg_perror(int pri, const char *format, ...)
|
||||||
|
{
|
||||||
|
int errno_copy = errno;
|
||||||
|
va_list ap;
|
||||||
|
|
||||||
|
va_start(ap, format);
|
||||||
|
vlogmsg(false, false, pri, format, ap);
|
||||||
|
va_end(ap);
|
||||||
|
|
||||||
|
logmsg(true, true, pri, ": %s", strerror(errno_copy));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* trace_init() - Set log_trace depending on trace (debug) mode
|
||||||
|
* @enable: Tracing debug mode enabled if non-zero
|
||||||
|
*/
|
||||||
|
void trace_init(int enable)
|
||||||
|
{
|
||||||
|
log_trace = enable;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* __openlog() - Non-optional openlog() implementation, for custom vsyslog()
|
||||||
|
* @ident: openlog() identity (program name)
|
||||||
|
* @option: openlog() options, unused
|
||||||
|
* @facility: openlog() facility (LOG_DAEMON)
|
||||||
|
*/
|
||||||
|
void __openlog(const char *ident, int option, int facility)
|
||||||
|
{
|
||||||
|
(void)option;
|
||||||
|
|
||||||
|
if (log_sock < 0) {
|
||||||
|
struct sockaddr_un a = { .sun_family = AF_UNIX, };
|
||||||
|
|
||||||
|
log_sock = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
|
||||||
|
if (log_sock < 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
strncpy(a.sun_path, _PATH_LOG, sizeof(a.sun_path));
|
||||||
|
if (connect(log_sock, (const struct sockaddr *)&a, sizeof(a))) {
|
||||||
|
close(log_sock);
|
||||||
|
log_sock = -1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log_mask |= facility;
|
||||||
|
strncpy(log_ident, ident, sizeof(log_ident) - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* __setlogmask() - setlogmask() wrapper, to allow custom vsyslog()
|
||||||
|
* @mask: Same as setlogmask() mask
|
||||||
|
*/
|
||||||
|
void __setlogmask(int mask)
|
||||||
|
{
|
||||||
|
log_mask = mask;
|
||||||
|
setlogmask(mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* passt_vsyslog() - vsyslog() implementation not using heap memory
|
||||||
|
* @newline: Append newline at the end of the message, if missing
|
||||||
|
* @pri: Facility and level map, same as priority for vsyslog()
|
||||||
|
* @format: Same as vsyslog() format
|
||||||
|
* @ap: Same as vsyslog() ap
|
||||||
|
*/
|
||||||
|
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
|
||||||
|
{
|
||||||
|
char buf[BUFSIZ];
|
||||||
|
int n;
|
||||||
|
|
||||||
|
/* Send without timestamp, the system logger should add it */
|
||||||
|
n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
|
||||||
|
|
||||||
|
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
||||||
|
|
||||||
|
if (newline && format[strlen(format)] != '\n')
|
||||||
|
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||||
|
|
||||||
|
if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
|
||||||
|
FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* logfile_init() - Open log file and write header with PID, version, path
|
||||||
|
* @name: Identifier for header: passt or pasta
|
||||||
|
* @path: Path to log file
|
||||||
|
* @size: Maximum size of log file: log_cut_size is calculated here
|
||||||
|
*/
|
||||||
|
void logfile_init(const char *name, const char *path, size_t size)
|
||||||
|
{
|
||||||
|
char nl = '\n', exe[PATH_MAX] = { 0 };
|
||||||
|
int n;
|
||||||
|
|
||||||
|
if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0)
|
||||||
|
die_perror("Failed to read own /proc/self/exe link");
|
||||||
|
|
||||||
|
log_file = output_file_open(path, O_APPEND | O_RDWR);
|
||||||
|
if (log_file == -1)
|
||||||
|
die_perror("Couldn't open log file %s", path);
|
||||||
|
|
||||||
|
log_size = size ? size : LOGFILE_SIZE_DEFAULT;
|
||||||
|
|
||||||
|
n = snprintf(log_header, sizeof(log_header), "%s " VERSION ": %s (%i)",
|
||||||
|
name, exe, getpid());
|
||||||
|
|
||||||
|
if (write(log_file, log_header, n) <= 0 ||
|
||||||
|
write(log_file, &nl, 1) <= 0)
|
||||||
|
die_perror("Couldn't write to log file");
|
||||||
|
|
||||||
|
/* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */
|
||||||
|
log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE);
|
||||||
|
}
|
||||||
|
|
34
log.h
34
log.h
|
@ -6,20 +6,28 @@
|
||||||
#ifndef LOG_H
|
#ifndef LOG_H
|
||||||
#define LOG_H
|
#define LOG_H
|
||||||
|
|
||||||
|
#include <stdbool.h>
|
||||||
#include <syslog.h>
|
#include <syslog.h>
|
||||||
|
|
||||||
#define LOGFILE_SIZE_DEFAULT (1024 * 1024UL)
|
#define LOGFILE_SIZE_DEFAULT (1024 * 1024UL)
|
||||||
#define LOGFILE_CUT_RATIO 30 /* When full, cut ~30% size */
|
#define LOGFILE_CUT_RATIO 30 /* When full, cut ~30% size */
|
||||||
#define LOGFILE_SIZE_MIN (5UL * MAX(BUFSIZ, PAGE_SIZE))
|
#define LOGFILE_SIZE_MIN (5UL * MAX(BUFSIZ, PAGE_SIZE))
|
||||||
|
|
||||||
void vlogmsg(int pri, const char *format, va_list ap);
|
void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap);
|
||||||
void logmsg(int pri, const char *format, ...)
|
void logmsg(bool newline, bool cont, int pri, const char *format, ...)
|
||||||
|
__attribute__((format(printf, 4, 5)));
|
||||||
|
void logmsg_perror(int pri, const char *format, ...)
|
||||||
__attribute__((format(printf, 2, 3)));
|
__attribute__((format(printf, 2, 3)));
|
||||||
|
|
||||||
#define err(...) logmsg(LOG_ERR, __VA_ARGS__)
|
#define err(...) logmsg(true, false, LOG_ERR, __VA_ARGS__)
|
||||||
#define warn(...) logmsg(LOG_WARNING, __VA_ARGS__)
|
#define warn(...) logmsg(true, false, LOG_WARNING, __VA_ARGS__)
|
||||||
#define info(...) logmsg(LOG_INFO, __VA_ARGS__)
|
#define info(...) logmsg(true, false, LOG_INFO, __VA_ARGS__)
|
||||||
#define debug(...) logmsg(LOG_DEBUG, __VA_ARGS__)
|
#define debug(...) logmsg(true, false, LOG_DEBUG, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define err_perror(...) logmsg_perror( LOG_ERR, __VA_ARGS__)
|
||||||
|
#define warn_perror(...) logmsg_perror( LOG_WARNING, __VA_ARGS__)
|
||||||
|
#define info_perror(...) logmsg_perror( LOG_INFO, __VA_ARGS__)
|
||||||
|
#define debug_perror(...) logmsg_perror( LOG_DEBUG, __VA_ARGS__)
|
||||||
|
|
||||||
#define die(...) \
|
#define die(...) \
|
||||||
do { \
|
do { \
|
||||||
|
@ -27,8 +35,17 @@ void logmsg(int pri, const char *format, ...)
|
||||||
exit(EXIT_FAILURE); \
|
exit(EXIT_FAILURE); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#define die_perror(...) \
|
||||||
|
do { \
|
||||||
|
err_perror(__VA_ARGS__); \
|
||||||
|
exit(EXIT_FAILURE); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
extern int log_trace;
|
extern int log_trace;
|
||||||
extern int log_to_stdout;
|
extern bool log_conf_parsed;
|
||||||
|
extern bool log_stderr;
|
||||||
|
extern struct timespec log_start;
|
||||||
|
|
||||||
void trace_init(int enable);
|
void trace_init(int enable);
|
||||||
#define trace(...) \
|
#define trace(...) \
|
||||||
do { \
|
do { \
|
||||||
|
@ -38,8 +55,7 @@ void trace_init(int enable);
|
||||||
|
|
||||||
void __openlog(const char *ident, int option, int facility);
|
void __openlog(const char *ident, int option, int facility);
|
||||||
void logfile_init(const char *name, const char *path, size_t size);
|
void logfile_init(const char *name, const char *path, size_t size);
|
||||||
void passt_vsyslog(int pri, const char *format, va_list ap);
|
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap);
|
||||||
void logfile_write(int pri, const char *format, va_list ap);
|
|
||||||
void __setlogmask(int mask);
|
void __setlogmask(int mask);
|
||||||
|
|
||||||
#endif /* LOG_H */
|
#endif /* LOG_H */
|
||||||
|
|
320
ndp.c
320
ndp.c
|
@ -38,23 +38,194 @@
|
||||||
#define NS 135
|
#define NS 135
|
||||||
#define NA 136
|
#define NA 136
|
||||||
|
|
||||||
|
enum ndp_option_types {
|
||||||
|
OPT_SRC_L2_ADDR = 1,
|
||||||
|
OPT_TARGET_L2_ADDR = 2,
|
||||||
|
OPT_PREFIX_INFO = 3,
|
||||||
|
OPT_MTU = 5,
|
||||||
|
OPT_RDNSS_TYPE = 25,
|
||||||
|
OPT_DNSSL_TYPE = 31,
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct opt_header - Option header
|
||||||
|
* @type: Option type
|
||||||
|
* @len: Option length, in units of 8 bytes
|
||||||
|
*/
|
||||||
|
struct opt_header {
|
||||||
|
uint8_t type;
|
||||||
|
uint8_t len;
|
||||||
|
} __attribute__((packed));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct opt_l2_addr - Link-layer address
|
||||||
|
* @header: Option header
|
||||||
|
* @mac: MAC address
|
||||||
|
*/
|
||||||
|
struct opt_l2_addr {
|
||||||
|
struct opt_header header;
|
||||||
|
unsigned char mac[ETH_ALEN];
|
||||||
|
} __attribute__((packed));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct ndp_na - NDP Neighbor Advertisement (NA) message
|
||||||
|
* @ih: ICMPv6 header
|
||||||
|
* @target_addr: Target IPv6 address
|
||||||
|
* @target_l2_addr: Target link-layer address
|
||||||
|
*/
|
||||||
|
struct ndp_na {
|
||||||
|
struct icmp6hdr ih;
|
||||||
|
struct in6_addr target_addr;
|
||||||
|
struct opt_l2_addr target_l2_addr;
|
||||||
|
} __attribute__((packed));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct opt_prefix_info - Prefix Information option
|
||||||
|
* @header: Option header
|
||||||
|
* @prefix_len: The number of leading bits in the Prefix that are valid
|
||||||
|
* @prefix_flags: Flags associated with the prefix
|
||||||
|
* @valid_lifetime: Valid lifetime (s)
|
||||||
|
* @pref_lifetime: Preferred lifetime (s)
|
||||||
|
* @reserved: Unused
|
||||||
|
*/
|
||||||
|
struct opt_prefix_info {
|
||||||
|
struct opt_header header;
|
||||||
|
uint8_t prefix_len;
|
||||||
|
uint8_t prefix_flags;
|
||||||
|
uint32_t valid_lifetime;
|
||||||
|
uint32_t pref_lifetime;
|
||||||
|
uint32_t reserved;
|
||||||
|
} __attribute__((packed));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct opt_mtu - Maximum transmission unit (MTU) option
|
||||||
|
* @header: Option header
|
||||||
|
* @reserved: Unused
|
||||||
|
* @value: MTU value, network order
|
||||||
|
*/
|
||||||
|
struct opt_mtu {
|
||||||
|
struct opt_header header;
|
||||||
|
uint16_t reserved;
|
||||||
|
uint32_t value;
|
||||||
|
} __attribute__((packed));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct opt_rdnss - Recursive DNS Server (RDNSS) option
|
||||||
|
* @header: Option header
|
||||||
|
* @reserved: Unused
|
||||||
|
* @lifetime: Validity time (s)
|
||||||
|
* @dns: List of DNS server addresses
|
||||||
|
*/
|
||||||
|
struct opt_rdnss {
|
||||||
|
struct opt_header header;
|
||||||
|
uint16_t reserved;
|
||||||
|
uint32_t lifetime;
|
||||||
|
struct in6_addr dns[MAXNS + 1];
|
||||||
|
} __attribute__((packed));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct opt_dnssl - DNS Search List (DNSSL) option
|
||||||
|
* @header: Option header
|
||||||
|
* @reserved: Unused
|
||||||
|
* @lifetime: Validity time (s)
|
||||||
|
* @domains: List of NULL-separated search domains
|
||||||
|
*/
|
||||||
|
struct opt_dnssl {
|
||||||
|
struct opt_header header;
|
||||||
|
uint16_t reserved;
|
||||||
|
uint32_t lifetime;
|
||||||
|
unsigned char domains[MAXDNSRCH * NS_MAXDNAME];
|
||||||
|
} __attribute__((packed));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct ndp_ra - NDP Router Advertisement (RA) message
|
||||||
|
* @ih: ICMPv6 header
|
||||||
|
* @reachable: Reachability time, after confirmation (ms)
|
||||||
|
* @retrans: Time between retransmitted NS messages (ms)
|
||||||
|
* @prefix_info: Prefix Information option
|
||||||
|
* @prefix: IPv6 prefix
|
||||||
|
* @mtu: MTU option
|
||||||
|
* @source_ll: Source link-layer address
|
||||||
|
* @var: Variable fields
|
||||||
|
*/
|
||||||
|
struct ndp_ra {
|
||||||
|
struct icmp6hdr ih;
|
||||||
|
uint32_t reachable;
|
||||||
|
uint32_t retrans;
|
||||||
|
struct opt_prefix_info prefix_info;
|
||||||
|
struct in6_addr prefix;
|
||||||
|
struct opt_l2_addr source_ll;
|
||||||
|
|
||||||
|
unsigned char var[sizeof(struct opt_mtu) + sizeof(struct opt_rdnss) +
|
||||||
|
sizeof(struct opt_dnssl)];
|
||||||
|
} __attribute__((packed));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct ndp_ns - NDP Neighbor Solicitation (NS) message
|
||||||
|
* @ih: ICMPv6 header
|
||||||
|
* @target_addr: Target IPv6 address
|
||||||
|
*/
|
||||||
|
struct ndp_ns {
|
||||||
|
struct icmp6hdr ih;
|
||||||
|
struct in6_addr target_addr;
|
||||||
|
} __attribute__((packed));
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ndp() - Check for NDP solicitations, reply as needed
|
* ndp() - Check for NDP solicitations, reply as needed
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @ih: ICMPv6 header
|
* @ih: ICMPv6 header
|
||||||
* @saddr Source IPv6 address
|
* @saddr: Source IPv6 address
|
||||||
|
* @p: Packet pool
|
||||||
*
|
*
|
||||||
* Return: 0 if not handled here, 1 if handled, -1 on failure
|
* Return: 0 if not handled here, 1 if handled, -1 on failure
|
||||||
*/
|
*/
|
||||||
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr)
|
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
||||||
|
const struct pool *p)
|
||||||
{
|
{
|
||||||
|
struct ndp_na na = {
|
||||||
|
.ih = {
|
||||||
|
.icmp6_type = NA,
|
||||||
|
.icmp6_code = 0,
|
||||||
|
.icmp6_router = 1,
|
||||||
|
.icmp6_solicited = 1,
|
||||||
|
.icmp6_override = 1,
|
||||||
|
},
|
||||||
|
.target_l2_addr = {
|
||||||
|
.header = {
|
||||||
|
.type = OPT_TARGET_L2_ADDR,
|
||||||
|
.len = 1,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
struct ndp_ra ra = {
|
||||||
|
.ih = {
|
||||||
|
.icmp6_type = RA,
|
||||||
|
.icmp6_code = 0,
|
||||||
|
.icmp6_hop_limit = 255,
|
||||||
|
/* RFC 8319 */
|
||||||
|
.icmp6_rt_lifetime = htons_constant(65535),
|
||||||
|
.icmp6_addrconf_managed = 1,
|
||||||
|
},
|
||||||
|
.prefix_info = {
|
||||||
|
.header = {
|
||||||
|
.type = OPT_PREFIX_INFO,
|
||||||
|
.len = 4,
|
||||||
|
},
|
||||||
|
.prefix_len = 64,
|
||||||
|
.prefix_flags = 0xc0, /* prefix flags: L, A */
|
||||||
|
.valid_lifetime = ~0U,
|
||||||
|
.pref_lifetime = ~0U,
|
||||||
|
},
|
||||||
|
.source_ll = {
|
||||||
|
.header = {
|
||||||
|
.type = OPT_SRC_L2_ADDR,
|
||||||
|
.len = 1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
const struct in6_addr *rsaddr; /* src addr for reply */
|
const struct in6_addr *rsaddr; /* src addr for reply */
|
||||||
char buf[BUFSIZ] = { 0 };
|
unsigned char *ptr = NULL;
|
||||||
struct ipv6hdr *ip6hr;
|
size_t dlen;
|
||||||
struct icmp6hdr *ihr;
|
|
||||||
struct ethhdr *ehr;
|
|
||||||
unsigned char *p;
|
|
||||||
size_t len;
|
|
||||||
|
|
||||||
if (ih->icmp6_type < RS || ih->icmp6_type > NA)
|
if (ih->icmp6_type < RS || ih->icmp6_type > NA)
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -62,28 +233,22 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr)
|
||||||
if (c->no_ndp)
|
if (c->no_ndp)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
ehr = (struct ethhdr *)buf;
|
|
||||||
ip6hr = (struct ipv6hdr *)(ehr + 1);
|
|
||||||
ihr = (struct icmp6hdr *)(ip6hr + 1);
|
|
||||||
|
|
||||||
if (ih->icmp6_type == NS) {
|
if (ih->icmp6_type == NS) {
|
||||||
|
const struct ndp_ns *ns =
|
||||||
|
packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
|
||||||
|
|
||||||
|
if (!ns)
|
||||||
|
return -1;
|
||||||
|
|
||||||
if (IN6_IS_ADDR_UNSPECIFIED(saddr))
|
if (IN6_IS_ADDR_UNSPECIFIED(saddr))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
info("NDP: received NS, sending NA");
|
info("NDP: received NS, sending NA");
|
||||||
ihr->icmp6_type = NA;
|
|
||||||
ihr->icmp6_code = 0;
|
|
||||||
ihr->icmp6_router = 1;
|
|
||||||
ihr->icmp6_solicited = 1;
|
|
||||||
ihr->icmp6_override = 1;
|
|
||||||
|
|
||||||
p = (unsigned char *)(ihr + 1);
|
memcpy(&na.target_addr, &ns->target_addr,
|
||||||
memcpy(p, ih + 1, sizeof(struct in6_addr)); /* target address */
|
sizeof(na.target_addr));
|
||||||
p += 16;
|
memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
|
||||||
*p++ = 2; /* target ll */
|
|
||||||
*p++ = 1; /* length */
|
|
||||||
memcpy(p, c->mac, ETH_ALEN);
|
|
||||||
p += 6;
|
|
||||||
} else if (ih->icmp6_type == RS) {
|
} else if (ih->icmp6_type == RS) {
|
||||||
size_t dns_s_len = 0;
|
size_t dns_s_len = 0;
|
||||||
int i, n;
|
int i, n;
|
||||||
|
@ -92,31 +257,20 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
info("NDP: received RS, sending RA");
|
info("NDP: received RS, sending RA");
|
||||||
ihr->icmp6_type = RA;
|
memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix));
|
||||||
ihr->icmp6_code = 0;
|
|
||||||
ihr->icmp6_hop_limit = 255;
|
|
||||||
ihr->icmp6_rt_lifetime = htons(65535); /* RFC 8319 */
|
|
||||||
ihr->icmp6_addrconf_managed = 1;
|
|
||||||
|
|
||||||
p = (unsigned char *)(ihr + 1);
|
ptr = &ra.var[0];
|
||||||
p += 8; /* reachable, retrans time */
|
|
||||||
*p++ = 3; /* prefix */
|
|
||||||
*p++ = 4; /* length */
|
|
||||||
*p++ = 64; /* prefix length */
|
|
||||||
*p++ = 0xc0; /* prefix flags: L, A */
|
|
||||||
*(uint32_t *)p = (uint32_t)~0U; /* lifetime */
|
|
||||||
p += 4;
|
|
||||||
*(uint32_t *)p = (uint32_t)~0U; /* preferred lifetime */
|
|
||||||
p += 8;
|
|
||||||
memcpy(p, &c->ip6.addr, 8); /* prefix */
|
|
||||||
p += 16;
|
|
||||||
|
|
||||||
if (c->mtu != -1) {
|
if (c->mtu != -1) {
|
||||||
*p++ = 5; /* type */
|
struct opt_mtu *mtu = (struct opt_mtu *)ptr;
|
||||||
*p++ = 1; /* length */
|
*mtu = (struct opt_mtu) {
|
||||||
p += 2; /* reserved */
|
.header = {
|
||||||
*(uint32_t *)p = htonl(c->mtu); /* MTU */
|
.type = OPT_MTU,
|
||||||
p += 4;
|
.len = 1,
|
||||||
|
},
|
||||||
|
.value = htonl(c->mtu),
|
||||||
|
};
|
||||||
|
ptr += sizeof(struct opt_mtu);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c->no_dhcp_dns)
|
if (c->no_dhcp_dns)
|
||||||
|
@ -124,70 +278,78 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr)
|
||||||
|
|
||||||
for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++);
|
for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++);
|
||||||
if (n) {
|
if (n) {
|
||||||
*p++ = 25; /* RDNSS */
|
struct opt_rdnss *rdnss = (struct opt_rdnss *)ptr;
|
||||||
*p++ = 1 + 2 * n; /* length */
|
*rdnss = (struct opt_rdnss) {
|
||||||
p += 2; /* reserved */
|
.header = {
|
||||||
*(uint32_t *)p = (uint32_t)~0U; /* lifetime */
|
.type = OPT_RDNSS_TYPE,
|
||||||
p += 4;
|
.len = 1 + 2 * n,
|
||||||
|
},
|
||||||
|
.lifetime = ~0U,
|
||||||
|
};
|
||||||
for (i = 0; i < n; i++) {
|
for (i = 0; i < n; i++) {
|
||||||
memcpy(p, &c->ip6.dns[i], 16); /* address */
|
memcpy(&rdnss->dns[i], &c->ip6.dns[i],
|
||||||
p += 16;
|
sizeof(rdnss->dns[i]));
|
||||||
}
|
}
|
||||||
|
ptr += offsetof(struct opt_rdnss, dns) +
|
||||||
|
i * sizeof(rdnss->dns[0]);
|
||||||
|
|
||||||
for (n = 0; *c->dns_search[n].n; n++)
|
for (n = 0; *c->dns_search[n].n; n++)
|
||||||
dns_s_len += strlen(c->dns_search[n].n) + 2;
|
dns_s_len += strlen(c->dns_search[n].n) + 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!c->no_dhcp_dns_search && dns_s_len) {
|
if (!c->no_dhcp_dns_search && dns_s_len) {
|
||||||
*p++ = 31; /* DNSSL */
|
struct opt_dnssl *dnssl = (struct opt_dnssl *)ptr;
|
||||||
*p++ = (dns_s_len + 8 - 1) / 8 + 1; /* length */
|
*dnssl = (struct opt_dnssl) {
|
||||||
p += 2; /* reserved */
|
.header = {
|
||||||
*(uint32_t *)p = (uint32_t)~0U; /* lifetime */
|
.type = OPT_DNSSL_TYPE,
|
||||||
p += 4;
|
.len = DIV_ROUND_UP(dns_s_len, 8) + 1,
|
||||||
|
},
|
||||||
|
.lifetime = ~0U,
|
||||||
|
};
|
||||||
|
ptr = dnssl->domains;
|
||||||
|
|
||||||
for (i = 0; i < n; i++) {
|
for (i = 0; i < n; i++) {
|
||||||
|
size_t len;
|
||||||
char *dot;
|
char *dot;
|
||||||
|
|
||||||
*(p++) = '.';
|
*(ptr++) = '.';
|
||||||
|
|
||||||
strncpy((char *)p, c->dns_search[i].n,
|
len = sizeof(dnssl->domains) -
|
||||||
sizeof(buf) -
|
(ptr - dnssl->domains);
|
||||||
((intptr_t)p - (intptr_t)buf));
|
|
||||||
for (dot = (char *)p - 1; *dot; dot++) {
|
strncpy((char *)ptr, c->dns_search[i].n, len);
|
||||||
|
for (dot = (char *)ptr - 1; *dot; dot++) {
|
||||||
if (*dot == '.')
|
if (*dot == '.')
|
||||||
*dot = strcspn(dot + 1, ".");
|
*dot = strcspn(dot + 1, ".");
|
||||||
}
|
}
|
||||||
p += strlen(c->dns_search[i].n);
|
ptr += strlen(c->dns_search[i].n);
|
||||||
*(p++) = 0;
|
*(ptr++) = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
memset(p, 0, 8 - dns_s_len % 8); /* padding */
|
memset(ptr, 0, 8 - dns_s_len % 8); /* padding */
|
||||||
p += 8 - dns_s_len % 8;
|
ptr += 8 - dns_s_len % 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
dns_done:
|
dns_done:
|
||||||
*p++ = 1; /* source ll */
|
memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
|
||||||
*p++ = 1; /* length */
|
|
||||||
memcpy(p, c->mac, ETH_ALEN);
|
|
||||||
p += 6;
|
|
||||||
} else {
|
} else {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
len = (uintptr_t)p - (uintptr_t)ihr - sizeof(*ihr);
|
|
||||||
|
|
||||||
if (IN6_IS_ADDR_LINKLOCAL(saddr))
|
if (IN6_IS_ADDR_LINKLOCAL(saddr))
|
||||||
c->ip6.addr_ll_seen = *saddr;
|
c->ip6.addr_ll_seen = *saddr;
|
||||||
else
|
else
|
||||||
c->ip6.addr_seen = *saddr;
|
c->ip6.addr_seen = *saddr;
|
||||||
|
|
||||||
if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
|
rsaddr = &c->ip6.our_tap_ll;
|
||||||
rsaddr = &c->ip6.gw;
|
|
||||||
else
|
|
||||||
rsaddr = &c->ip6.addr_ll;
|
|
||||||
|
|
||||||
tap_icmp6_send(c, rsaddr, saddr, ihr, len + sizeof(*ihr));
|
if (ih->icmp6_type == NS) {
|
||||||
|
dlen = sizeof(struct ndp_na);
|
||||||
|
tap_icmp6_send(c, rsaddr, saddr, &na, dlen);
|
||||||
|
} else if (ih->icmp6_type == RS) {
|
||||||
|
dlen = ptr - (unsigned char *)&ra;
|
||||||
|
tap_icmp6_send(c, rsaddr, saddr, &ra, dlen);
|
||||||
|
}
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
3
ndp.h
3
ndp.h
|
@ -6,6 +6,7 @@
|
||||||
#ifndef NDP_H
|
#ifndef NDP_H
|
||||||
#define NDP_H
|
#define NDP_H
|
||||||
|
|
||||||
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr);
|
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
|
||||||
|
const struct pool *p);
|
||||||
|
|
||||||
#endif /* NDP_H */
|
#endif /* NDP_H */
|
||||||
|
|
326
netlink.c
326
netlink.c
|
@ -33,8 +33,13 @@
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "passt.h"
|
#include "passt.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
#include "ip.h"
|
||||||
#include "netlink.h"
|
#include "netlink.h"
|
||||||
|
|
||||||
|
/* Same as RTA_NEXT() but for nexthops: RTNH_NEXT() doesn't take 'attrlen' */
|
||||||
|
#define RTNH_NEXT_AND_DEC(rtnh, attrlen) \
|
||||||
|
((attrlen) -= RTNH_ALIGN((rtnh)->rtnh_len), RTNH_NEXT(rtnh))
|
||||||
|
|
||||||
/* Netlink expects a buffer of at least 8kiB or the system page size,
|
/* Netlink expects a buffer of at least 8kiB or the system page size,
|
||||||
* whichever is larger. 32kiB is recommended for better efficiency.
|
* whichever is larger. 32kiB is recommended for better efficiency.
|
||||||
* Since the largest page size on any remotely common Linux setup is
|
* Since the largest page size on any remotely common Linux setup is
|
||||||
|
@ -128,7 +133,7 @@ static uint32_t nl_send(int s, void *req, uint16_t type,
|
||||||
|
|
||||||
n = send(s, req, len, 0);
|
n = send(s, req, len, 0);
|
||||||
if (n < 0)
|
if (n < 0)
|
||||||
die("netlink: Failed to send(): %s", strerror(errno));
|
die_perror("netlink: Failed to send()");
|
||||||
else if (n < len)
|
else if (n < len)
|
||||||
die("netlink: Short send (%zd of %zd bytes)", n, len);
|
die("netlink: Short send (%zd of %zd bytes)", n, len);
|
||||||
|
|
||||||
|
@ -184,7 +189,7 @@ static struct nlmsghdr *nl_next(int s, char *buf, struct nlmsghdr *nh, ssize_t *
|
||||||
|
|
||||||
*n = recv(s, buf, NLBUFSIZ, 0);
|
*n = recv(s, buf, NLBUFSIZ, 0);
|
||||||
if (*n < 0)
|
if (*n < 0)
|
||||||
die("netlink: Failed to recv(): %s", strerror(errno));
|
die_perror("netlink: Failed to recv()");
|
||||||
|
|
||||||
nh = (struct nlmsghdr *)buf;
|
nh = (struct nlmsghdr *)buf;
|
||||||
if (!NLMSG_OK(nh, *n))
|
if (!NLMSG_OK(nh, *n))
|
||||||
|
@ -254,7 +259,8 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
||||||
.rtm.rtm_type = RTN_UNICAST,
|
.rtm.rtm_type = RTN_UNICAST,
|
||||||
.rtm.rtm_family = af,
|
.rtm.rtm_family = af,
|
||||||
};
|
};
|
||||||
unsigned int ifi = 0;
|
unsigned defifi = 0, anyifi = 0;
|
||||||
|
unsigned ndef = 0, nany = 0;
|
||||||
struct nlmsghdr *nh;
|
struct nlmsghdr *nh;
|
||||||
struct rtattr *rta;
|
struct rtattr *rta;
|
||||||
char buf[NLBUFSIZ];
|
char buf[NLBUFSIZ];
|
||||||
|
@ -262,30 +268,80 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
||||||
uint32_t seq;
|
uint32_t seq;
|
||||||
size_t na;
|
size_t na;
|
||||||
|
|
||||||
|
/* Look for an interface with a default route first, failing that, look
|
||||||
|
* for any interface with a route, and pick the first one, if any.
|
||||||
|
*/
|
||||||
seq = nl_send(s, &req, RTM_GETROUTE, NLM_F_DUMP, sizeof(req));
|
seq = nl_send(s, &req, RTM_GETROUTE, NLM_F_DUMP, sizeof(req));
|
||||||
nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWROUTE) {
|
nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWROUTE) {
|
||||||
struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh);
|
struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh);
|
||||||
|
const void *dst = NULL;
|
||||||
|
unsigned thisifi = 0;
|
||||||
|
|
||||||
if (ifi || rtm->rtm_dst_len || rtm->rtm_family != af)
|
if (rtm->rtm_family != af)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
|
for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
|
||||||
rta = RTA_NEXT(rta, na)) {
|
rta = RTA_NEXT(rta, na)) {
|
||||||
if (rta->rta_type == RTA_OIF) {
|
if (rta->rta_type == RTA_OIF) {
|
||||||
ifi = *(unsigned int *)RTA_DATA(rta);
|
thisifi = *(unsigned int *)RTA_DATA(rta);
|
||||||
} else if (rta->rta_type == RTA_MULTIPATH) {
|
} else if (rta->rta_type == RTA_MULTIPATH) {
|
||||||
const struct rtnexthop *rtnh;
|
const struct rtnexthop *rtnh;
|
||||||
|
|
||||||
rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
||||||
ifi = rtnh->rtnh_ifindex;
|
thisifi = rtnh->rtnh_ifindex;
|
||||||
|
} else if (rta->rta_type == RTA_DST) {
|
||||||
|
dst = RTA_DATA(rta);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!thisifi)
|
||||||
|
continue; /* No interface for this route */
|
||||||
|
|
||||||
|
/* Skip routes to link-local addresses */
|
||||||
|
if (af == AF_INET && dst &&
|
||||||
|
IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (af == AF_INET6 && dst &&
|
||||||
|
IN6_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (rtm->rtm_dst_len == 0) {
|
||||||
|
/* Default route */
|
||||||
|
ndef++;
|
||||||
|
if (!defifi)
|
||||||
|
defifi = thisifi;
|
||||||
|
} else {
|
||||||
|
/* Non-default route */
|
||||||
|
nany++;
|
||||||
|
if (!anyifi)
|
||||||
|
anyifi = thisifi;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (status < 0)
|
if (status < 0)
|
||||||
warn("netlink: RTM_GETROUTE failed: %s", strerror(-status));
|
warn("netlink: RTM_GETROUTE failed: %s", strerror(-status));
|
||||||
|
|
||||||
return ifi;
|
if (defifi) {
|
||||||
|
if (ndef > 1) {
|
||||||
|
info("Multiple default %s routes, picked first",
|
||||||
|
af_name(af));
|
||||||
|
}
|
||||||
|
return defifi;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (anyifi) {
|
||||||
|
if (nany > 1) {
|
||||||
|
info("Multiple interfaces with %s routes, picked first",
|
||||||
|
af_name(af));
|
||||||
|
}
|
||||||
|
return anyifi;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!nany)
|
||||||
|
info("No interfaces with usable %s routes", af_name(af));
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -297,12 +353,13 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
||||||
*/
|
*/
|
||||||
bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
||||||
{
|
{
|
||||||
|
int nh_len = RTA_PAYLOAD(rta);
|
||||||
struct rtnexthop *rtnh;
|
struct rtnexthop *rtnh;
|
||||||
bool found = false;
|
bool found = false;
|
||||||
int hops = -1;
|
int hops = -1;
|
||||||
|
|
||||||
for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
||||||
RTNH_OK(rtnh, RTA_PAYLOAD(rta)); rtnh = RTNH_NEXT(rtnh)) {
|
RTNH_OK(rtnh, nh_len); rtnh = RTNH_NEXT_AND_DEC(rtnh, nh_len)) {
|
||||||
size_t len = rtnh->rtnh_len - sizeof(*rtnh);
|
size_t len = rtnh->rtnh_len - sizeof(*rtnh);
|
||||||
struct rtattr *rta_inner;
|
struct rtattr *rta_inner;
|
||||||
|
|
||||||
|
@ -332,7 +389,7 @@ bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
||||||
* @af: Address family
|
* @af: Address family
|
||||||
* @gw: Default gateway to fill on NL_GET
|
* @gw: Default gateway to fill on NL_GET
|
||||||
*
|
*
|
||||||
* Return: 0 on success, negative error code on failure
|
* Return: error on netlink failure, or 0 (gw unset if default route not found)
|
||||||
*/
|
*/
|
||||||
int nl_route_get_def(int s, unsigned int ifi, sa_family_t af, void *gw)
|
int nl_route_get_def(int s, unsigned int ifi, sa_family_t af, void *gw)
|
||||||
{
|
{
|
||||||
|
@ -479,7 +536,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
||||||
.rta.rta_len = RTA_LENGTH(sizeof(unsigned int)),
|
.rta.rta_len = RTA_LENGTH(sizeof(unsigned int)),
|
||||||
.ifi = ifi_src,
|
.ifi = ifi_src,
|
||||||
};
|
};
|
||||||
ssize_t nlmsgs_size, status;
|
ssize_t nlmsgs_size, left, status;
|
||||||
unsigned dup_routes = 0;
|
unsigned dup_routes = 0;
|
||||||
struct nlmsghdr *nh;
|
struct nlmsghdr *nh;
|
||||||
char buf[NLBUFSIZ];
|
char buf[NLBUFSIZ];
|
||||||
|
@ -493,39 +550,83 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
||||||
* routes in the buffer at once.
|
* routes in the buffer at once.
|
||||||
*/
|
*/
|
||||||
nh = nl_next(s_src, buf, NULL, &nlmsgs_size);
|
nh = nl_next(s_src, buf, NULL, &nlmsgs_size);
|
||||||
for (status = nlmsgs_size;
|
for (left = nlmsgs_size;
|
||||||
NLMSG_OK(nh, status) && (status = nl_status(nh, status, seq)) > 0;
|
NLMSG_OK(nh, left) && (status = nl_status(nh, left, seq)) > 0;
|
||||||
nh = NLMSG_NEXT(nh, status)) {
|
nh = NLMSG_NEXT(nh, left)) {
|
||||||
struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh);
|
struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh);
|
||||||
|
bool discard = false;
|
||||||
struct rtattr *rta;
|
struct rtattr *rta;
|
||||||
size_t na;
|
size_t na;
|
||||||
|
|
||||||
if (nh->nlmsg_type != RTM_NEWROUTE)
|
if (nh->nlmsg_type != RTM_NEWROUTE)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
dup_routes++;
|
|
||||||
|
|
||||||
for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
|
for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
|
||||||
rta = RTA_NEXT(rta, na)) {
|
rta = RTA_NEXT(rta, na)) {
|
||||||
|
/* RTA_OIF and RTA_MULTIPATH attributes carry the
|
||||||
|
* identifier of a host interface. If they match the
|
||||||
|
* host interface we're copying from, change them to
|
||||||
|
* match the corresponding identifier in the target
|
||||||
|
* namespace.
|
||||||
|
*
|
||||||
|
* If RTA_OIF doesn't match (NETLINK_GET_STRICT_CHK not
|
||||||
|
* available), or if any interface index in nexthop
|
||||||
|
* objects differs from the host interface, discard the
|
||||||
|
* route altogether.
|
||||||
|
*/
|
||||||
if (rta->rta_type == RTA_OIF) {
|
if (rta->rta_type == RTA_OIF) {
|
||||||
/* The host obviously list's the host interface
|
if (*(unsigned int *)RTA_DATA(rta) != ifi_src) {
|
||||||
* id here, we need to change it to the
|
discard = true;
|
||||||
* namespace's interface id
|
break;
|
||||||
*/
|
}
|
||||||
|
|
||||||
*(unsigned int *)RTA_DATA(rta) = ifi_dst;
|
*(unsigned int *)RTA_DATA(rta) = ifi_dst;
|
||||||
} else if (rta->rta_type == RTA_PREFSRC) {
|
} else if (rta->rta_type == RTA_MULTIPATH) {
|
||||||
/* Host routes might include a preferred source
|
int nh_len = RTA_PAYLOAD(rta);
|
||||||
* address, which must be one of the host's
|
struct rtnexthop *rtnh;
|
||||||
* addresses. However, with -a pasta will use a
|
|
||||||
* different namespace address, making such a
|
for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
|
||||||
* route invalid in the namespace. Strip off
|
RTNH_OK(rtnh, nh_len);
|
||||||
* RTA_PREFSRC attributes to avoid that. */
|
rtnh = RTNH_NEXT_AND_DEC(rtnh, nh_len)) {
|
||||||
|
int src = (int)ifi_src;
|
||||||
|
|
||||||
|
if (rtnh->rtnh_ifindex != src) {
|
||||||
|
discard = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
rtnh->rtnh_ifindex = ifi_dst;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (discard)
|
||||||
|
break;
|
||||||
|
} else if (rta->rta_type == RTA_PREFSRC ||
|
||||||
|
rta->rta_type == RTA_NH_ID) {
|
||||||
|
/* Strip RTA_PREFSRC attributes: host routes
|
||||||
|
* might include a preferred source address,
|
||||||
|
* which must be one of the host's addresses.
|
||||||
|
* However, with -a, pasta will use a different
|
||||||
|
* namespace address, making such a route
|
||||||
|
* invalid in the namespace.
|
||||||
|
*
|
||||||
|
* Strip RTA_NH_ID attributes: host routes set
|
||||||
|
* up via routing protocols (e.g. OSPF) might
|
||||||
|
* contain a nexthop ID (and not nexthop
|
||||||
|
* objects, which are taken care of in the
|
||||||
|
* RTA_MULTIPATH case above) that's not valid
|
||||||
|
* in the target namespace.
|
||||||
|
*/
|
||||||
rta->rta_type = RTA_UNSPEC;
|
rta->rta_type = RTA_UNSPEC;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (discard)
|
||||||
|
nh->nlmsg_type = NLMSG_NOOP;
|
||||||
|
else
|
||||||
|
dup_routes++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!NLMSG_OK(nh, status) || status > 0) {
|
if (!NLMSG_OK(nh, left)) {
|
||||||
/* Process any remaining datagrams in a different
|
/* Process any remaining datagrams in a different
|
||||||
* buffer so we don't overwrite the first one.
|
* buffer so we don't overwrite the first one.
|
||||||
*/
|
*/
|
||||||
|
@ -551,9 +652,9 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
||||||
* to calculate dependencies: let the kernel do that.
|
* to calculate dependencies: let the kernel do that.
|
||||||
*/
|
*/
|
||||||
for (i = 0; i < dup_routes; i++) {
|
for (i = 0; i < dup_routes; i++) {
|
||||||
for (nh = (struct nlmsghdr *)buf, status = nlmsgs_size;
|
for (nh = (struct nlmsghdr *)buf, left = nlmsgs_size;
|
||||||
NLMSG_OK(nh, status);
|
NLMSG_OK(nh, left);
|
||||||
nh = NLMSG_NEXT(nh, status)) {
|
nh = NLMSG_NEXT(nh, left)) {
|
||||||
uint16_t flags = nh->nlmsg_flags;
|
uint16_t flags = nh->nlmsg_flags;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
@ -563,7 +664,8 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
||||||
rc = nl_do(s_dst, nh, RTM_NEWROUTE,
|
rc = nl_do(s_dst, nh, RTM_NEWROUTE,
|
||||||
(flags & ~NLM_F_DUMP_FILTERED) | NLM_F_CREATE,
|
(flags & ~NLM_F_DUMP_FILTERED) | NLM_F_CREATE,
|
||||||
nh->nlmsg_len);
|
nh->nlmsg_len);
|
||||||
if (rc < 0 && rc != -ENETUNREACH && rc != -EEXIST)
|
if (rc < 0 && rc != -EEXIST &&
|
||||||
|
rc != -ENETUNREACH && rc != -EHOSTUNREACH)
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -571,6 +673,63 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* nl_addr_set_ll_nodad() - Set IFA_F_NODAD on IPv6 link-local addresses
|
||||||
|
* @s: Netlink socket
|
||||||
|
* @ifi: Interface index in target namespace
|
||||||
|
*
|
||||||
|
* Return: 0 on success, negative error code on failure
|
||||||
|
*/
|
||||||
|
int nl_addr_set_ll_nodad(int s, unsigned int ifi)
|
||||||
|
{
|
||||||
|
struct req_t {
|
||||||
|
struct nlmsghdr nlh;
|
||||||
|
struct ifaddrmsg ifa;
|
||||||
|
} req = {
|
||||||
|
.ifa.ifa_family = AF_INET6,
|
||||||
|
.ifa.ifa_index = ifi,
|
||||||
|
};
|
||||||
|
uint32_t seq, last_seq = 0;
|
||||||
|
ssize_t status, ret = 0;
|
||||||
|
struct nlmsghdr *nh;
|
||||||
|
char buf[NLBUFSIZ];
|
||||||
|
|
||||||
|
seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req));
|
||||||
|
nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) {
|
||||||
|
struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
|
||||||
|
struct rtattr *rta;
|
||||||
|
size_t na;
|
||||||
|
|
||||||
|
if (ifa->ifa_index != ifi || ifa->ifa_scope != RT_SCOPE_LINK)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
ifa->ifa_flags |= IFA_F_NODAD;
|
||||||
|
|
||||||
|
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
||||||
|
rta = RTA_NEXT(rta, na)) {
|
||||||
|
/* If 32-bit flags are used, add IFA_F_NODAD there */
|
||||||
|
if (rta->rta_type == IFA_FLAGS)
|
||||||
|
*(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD;
|
||||||
|
}
|
||||||
|
|
||||||
|
last_seq = nl_send(s, nh, RTM_NEWADDR, NLM_F_REPLACE,
|
||||||
|
nh->nlmsg_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (status < 0)
|
||||||
|
ret = status;
|
||||||
|
|
||||||
|
for (seq = seq + 1; seq <= last_seq; seq++) {
|
||||||
|
nl_foreach(nh, status, s, buf, seq)
|
||||||
|
warn("netlink: Unexpected response message");
|
||||||
|
|
||||||
|
if (!ret && status < 0)
|
||||||
|
ret = status;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* nl_addr_get() - Get most specific global address, given interface and family
|
* nl_addr_get() - Get most specific global address, given interface and family
|
||||||
* @s: Netlink socket
|
* @s: Netlink socket
|
||||||
|
@ -580,7 +739,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
|
||||||
* @prefix_len: Mask or prefix length, to fill (for IPv4)
|
* @prefix_len: Mask or prefix length, to fill (for IPv4)
|
||||||
* @addr_l: Link-scoped address to fill (for IPv6)
|
* @addr_l: Link-scoped address to fill (for IPv6)
|
||||||
*
|
*
|
||||||
* Return: 9 on success, negative error code on failure
|
* Return: 0 on success, negative error code on failure
|
||||||
*/
|
*/
|
||||||
int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
|
int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
|
||||||
void *addr, int *prefix_len, void *addr_l)
|
void *addr, int *prefix_len, void *addr_l)
|
||||||
|
@ -604,12 +763,13 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
|
||||||
struct rtattr *rta;
|
struct rtattr *rta;
|
||||||
size_t na;
|
size_t na;
|
||||||
|
|
||||||
if (ifa->ifa_index != ifi)
|
if (ifa->ifa_index != ifi || ifa->ifa_flags & IFA_F_DEPRECATED)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
||||||
rta = RTA_NEXT(rta, na)) {
|
rta = RTA_NEXT(rta, na)) {
|
||||||
if (rta->rta_type != IFA_ADDRESS)
|
if ((af == AF_INET && rta->rta_type != IFA_LOCAL) ||
|
||||||
|
(af == AF_INET6 && rta->rta_type != IFA_ADDRESS))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (af == AF_INET && ifa->ifa_prefixlen > prefix_max) {
|
if (af == AF_INET && ifa->ifa_prefixlen > prefix_max) {
|
||||||
|
@ -637,7 +797,54 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* nl_add_set() - Set IP addresses for given interface and address family
|
* nl_addr_get_ll() - Get first IPv6 link-local address for a given interface
|
||||||
|
* @s: Netlink socket
|
||||||
|
* @ifi: Interface index in outer network namespace
|
||||||
|
* @addr: Link-local address to fill
|
||||||
|
*
|
||||||
|
* Return: 0 on success, negative error code on failure
|
||||||
|
*/
|
||||||
|
int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr)
|
||||||
|
{
|
||||||
|
struct req_t {
|
||||||
|
struct nlmsghdr nlh;
|
||||||
|
struct ifaddrmsg ifa;
|
||||||
|
} req = {
|
||||||
|
.ifa.ifa_family = AF_INET6,
|
||||||
|
.ifa.ifa_index = ifi,
|
||||||
|
};
|
||||||
|
struct nlmsghdr *nh;
|
||||||
|
bool found = false;
|
||||||
|
char buf[NLBUFSIZ];
|
||||||
|
ssize_t status;
|
||||||
|
uint32_t seq;
|
||||||
|
|
||||||
|
seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req));
|
||||||
|
nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) {
|
||||||
|
struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
|
||||||
|
struct rtattr *rta;
|
||||||
|
size_t na;
|
||||||
|
|
||||||
|
if (ifa->ifa_index != ifi || ifa->ifa_scope != RT_SCOPE_LINK ||
|
||||||
|
found)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
||||||
|
rta = RTA_NEXT(rta, na)) {
|
||||||
|
if (rta->rta_type != IFA_ADDRESS)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (!found) {
|
||||||
|
memcpy(addr, RTA_DATA(rta), RTA_PAYLOAD(rta));
|
||||||
|
found = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* nl_addr_set() - Set IP addresses for given interface and address family
|
||||||
* @s: Netlink socket
|
* @s: Netlink socket
|
||||||
* @ifi: Interface index
|
* @ifi: Interface index
|
||||||
* @af: Address family
|
* @af: Address family
|
||||||
|
@ -740,10 +947,13 @@ int nl_addr_dup(int s_src, unsigned int ifi_src,
|
||||||
ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
|
ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
|
||||||
|
|
||||||
if (rc < 0 || ifa->ifa_scope == RT_SCOPE_LINK ||
|
if (rc < 0 || ifa->ifa_scope == RT_SCOPE_LINK ||
|
||||||
ifa->ifa_index != ifi_src)
|
ifa->ifa_index != ifi_src ||
|
||||||
|
ifa->ifa_flags & IFA_F_DEPRECATED)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
ifa->ifa_index = ifi_dst;
|
ifa->ifa_index = ifi_dst;
|
||||||
|
/* Same as nl_addr_set(), but here it's more than a default */
|
||||||
|
ifa->ifa_flags |= IFA_F_NODAD;
|
||||||
|
|
||||||
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
|
||||||
rta = RTA_NEXT(rta, na)) {
|
rta = RTA_NEXT(rta, na)) {
|
||||||
|
@ -751,6 +961,10 @@ int nl_addr_dup(int s_src, unsigned int ifi_src,
|
||||||
if (rta->rta_type == IFA_LABEL ||
|
if (rta->rta_type == IFA_LABEL ||
|
||||||
rta->rta_type == IFA_CACHEINFO)
|
rta->rta_type == IFA_CACHEINFO)
|
||||||
rta->rta_type = IFA_UNSPEC;
|
rta->rta_type = IFA_UNSPEC;
|
||||||
|
|
||||||
|
/* If 32-bit flags are used, add IFA_F_NODAD there */
|
||||||
|
if (rta->rta_type == IFA_FLAGS)
|
||||||
|
*(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD;
|
||||||
}
|
}
|
||||||
|
|
||||||
rc = nl_do(s_dst, nh, RTM_NEWADDR,
|
rc = nl_do(s_dst, nh, RTM_NEWADDR,
|
||||||
|
@ -832,14 +1046,14 @@ int nl_link_set_mac(int s, unsigned int ifi, const void *mac)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* nl_link_up() - Bring link up
|
* nl_link_set_mtu() - Set link MTU
|
||||||
* @s: Netlink socket
|
* @s: Netlink socket
|
||||||
* @ifi: Interface index
|
* @ifi: Interface index
|
||||||
* @mtu: If non-zero, set interface MTU
|
* @mtu: Interface MTU
|
||||||
*
|
*
|
||||||
* Return: 0 on success, negative error code on failure
|
* Return: 0 on success, negative error code on failure
|
||||||
*/
|
*/
|
||||||
int nl_link_up(int s, unsigned int ifi, int mtu)
|
int nl_link_set_mtu(int s, unsigned int ifi, int mtu)
|
||||||
{
|
{
|
||||||
struct req_t {
|
struct req_t {
|
||||||
struct nlmsghdr nlh;
|
struct nlmsghdr nlh;
|
||||||
|
@ -849,17 +1063,35 @@ int nl_link_up(int s, unsigned int ifi, int mtu)
|
||||||
} req = {
|
} req = {
|
||||||
.ifm.ifi_family = AF_UNSPEC,
|
.ifm.ifi_family = AF_UNSPEC,
|
||||||
.ifm.ifi_index = ifi,
|
.ifm.ifi_index = ifi,
|
||||||
.ifm.ifi_flags = IFF_UP,
|
|
||||||
.ifm.ifi_change = IFF_UP,
|
|
||||||
.rta.rta_type = IFLA_MTU,
|
.rta.rta_type = IFLA_MTU,
|
||||||
.rta.rta_len = RTA_LENGTH(sizeof(unsigned int)),
|
.rta.rta_len = RTA_LENGTH(sizeof(unsigned int)),
|
||||||
.mtu = mtu,
|
.mtu = mtu,
|
||||||
};
|
};
|
||||||
ssize_t len = sizeof(req);
|
|
||||||
|
|
||||||
if (!mtu)
|
return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
|
||||||
/* Shorten request to drop MTU attribute */
|
}
|
||||||
len = offsetof(struct req_t, rta);
|
|
||||||
|
/**
|
||||||
return nl_do(s, &req, RTM_NEWLINK, 0, len);
|
* nl_link_set_flags() - Set link flags
|
||||||
|
* @s: Netlink socket
|
||||||
|
* @ifi: Interface index
|
||||||
|
* @set: Device flags to set
|
||||||
|
* @change: Mask of device flag changes
|
||||||
|
*
|
||||||
|
* Return: 0 on success, negative error code on failure
|
||||||
|
*/
|
||||||
|
int nl_link_set_flags(int s, unsigned int ifi,
|
||||||
|
unsigned int set, unsigned int change)
|
||||||
|
{
|
||||||
|
struct req_t {
|
||||||
|
struct nlmsghdr nlh;
|
||||||
|
struct ifinfomsg ifm;
|
||||||
|
} req = {
|
||||||
|
.ifm.ifi_family = AF_UNSPEC,
|
||||||
|
.ifm.ifi_index = ifi,
|
||||||
|
.ifm.ifi_flags = set,
|
||||||
|
.ifm.ifi_change = change,
|
||||||
|
};
|
||||||
|
|
||||||
|
return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,10 +19,14 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
|
||||||
void *addr, int *prefix_len, void *addr_l);
|
void *addr, int *prefix_len, void *addr_l);
|
||||||
int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
|
int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
|
||||||
const void *addr, int prefix_len);
|
const void *addr, int prefix_len);
|
||||||
|
int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
|
||||||
|
int nl_addr_set_ll_nodad(int s, unsigned int ifi);
|
||||||
int nl_addr_dup(int s_src, unsigned int ifi_src,
|
int nl_addr_dup(int s_src, unsigned int ifi_src,
|
||||||
int s_dst, unsigned int ifi_dst, sa_family_t af);
|
int s_dst, unsigned int ifi_dst, sa_family_t af);
|
||||||
int nl_link_get_mac(int s, unsigned int ifi, void *mac);
|
int nl_link_get_mac(int s, unsigned int ifi, void *mac);
|
||||||
int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
|
int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
|
||||||
int nl_link_up(int s, unsigned int ifi, int mtu);
|
int nl_link_set_mtu(int s, unsigned int ifi, int mtu);
|
||||||
|
int nl_link_set_flags(int s, unsigned int ifi,
|
||||||
|
unsigned int set, unsigned int change);
|
||||||
|
|
||||||
#endif /* NETLINK_H */
|
#endif /* NETLINK_H */
|
||||||
|
|
216
passt.1
216
passt.1
|
@ -73,6 +73,9 @@ for performance reasons.
|
||||||
|
|
||||||
.SH OPTIONS
|
.SH OPTIONS
|
||||||
|
|
||||||
|
Unless otherwise noted below, \fBif conflicting or multiple options are given,
|
||||||
|
the last one takes effect.\fR
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-d ", " \-\-debug
|
.BR \-d ", " \-\-debug
|
||||||
Be verbose, don't log to the system logger.
|
Be verbose, don't log to the system logger.
|
||||||
|
@ -92,14 +95,18 @@ detached PID namespace after starting, because the PID itself cannot change.
|
||||||
Default is to fork into background.
|
Default is to fork into background.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-e ", " \-\-stderr
|
.BR \-e ", " \-\-stderr " " (DEPRECATED)
|
||||||
Log to standard error too.
|
This option has no effect, and is maintained for compatibility purposes only.
|
||||||
Default is to log to the system logger only, if started from an interactive
|
|
||||||
terminal, and to both system logger and standard error otherwise.
|
Note that this configuration option is \fBdeprecated\fR and will be removed in a
|
||||||
|
future version.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-l ", " \-\-log-file " " \fIPATH\fR
|
.BR \-l ", " \-\-log-file " " \fIPATH\fR
|
||||||
Log to file \fIPATH\fR, not to standard error, and not to the system logger.
|
Log to file \fIPATH\fR, and not to the system logger.
|
||||||
|
|
||||||
|
Specifying this option multiple times does \fInot\fR lead to multiple log files:
|
||||||
|
the last given option takes effect.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-\-log-size " " \fISIZE\fR
|
.BR \-\-log-size " " \fISIZE\fR
|
||||||
|
@ -128,6 +135,9 @@ Show version and exit.
|
||||||
Capture tap-facing (that is, guest-side or namespace-side) network packets to
|
Capture tap-facing (that is, guest-side or namespace-side) network packets to
|
||||||
\fIfile\fR in \fBpcap\fR format.
|
\fIfile\fR in \fBpcap\fR format.
|
||||||
|
|
||||||
|
Specifying this option multiple times does \fInot\fR lead to multiple capture
|
||||||
|
files: the last given option takes effect.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-P ", " \-\-pid " " \fIfile
|
.BR \-P ", " \-\-pid " " \fIfile
|
||||||
Write own PID to \fIfile\fR once initialisation is done, before forking to
|
Write own PID to \fIfile\fR once initialisation is done, before forking to
|
||||||
|
@ -148,7 +158,9 @@ for an IPv6 \fIaddr\fR.
|
||||||
This option can be specified zero (for defaults) to two times (once for IPv4,
|
This option can be specified zero (for defaults) to two times (once for IPv4,
|
||||||
once for IPv6).
|
once for IPv6).
|
||||||
By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces
|
By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces
|
||||||
with the first default route for the corresponding IP version.
|
with the first default route, if any, for the corresponding IP version. If no
|
||||||
|
default routes are available and there is any interface with any route for a
|
||||||
|
given IP version, the first of these interfaces will be chosen instead.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-n ", " \-\-netmask " " \fImask
|
.BR \-n ", " \-\-netmask " " \fImask
|
||||||
|
@ -172,9 +184,11 @@ Assign IPv4 \fIaddr\fR as default gateway via DHCP (option 3), or IPv6
|
||||||
This option can be specified zero (for defaults) to two times (once for IPv4,
|
This option can be specified zero (for defaults) to two times (once for IPv4,
|
||||||
once for IPv6).
|
once for IPv6).
|
||||||
By default, IPv4 and IPv6 gateways are taken from the host interface with the
|
By default, IPv4 and IPv6 gateways are taken from the host interface with the
|
||||||
first default route for the corresponding IP version. If the default route is a
|
first default route, if any, for the corresponding IP version. If the default
|
||||||
multipath one, the gateway is the first nexthop router returned by the kernel
|
route is a multipath one, the gateway is the first nexthop router returned by
|
||||||
which has the highest weight in the set of paths.
|
the kernel which has the highest weight in the set of paths. If no default
|
||||||
|
routes are available and there is any interface with any route, the first such
|
||||||
|
interface will be chosen instead.
|
||||||
|
|
||||||
Note: these addresses are also used as source address for packets directed to
|
Note: these addresses are also used as source address for packets directed to
|
||||||
the guest or to the target namespace having a loopback or local source address,
|
the guest or to the target namespace having a loopback or local source address,
|
||||||
|
@ -185,9 +199,11 @@ to allow mapping of local traffic to guest and target namespace. See the
|
||||||
.BR \-i ", " \-\-interface " " \fIname
|
.BR \-i ", " \-\-interface " " \fIname
|
||||||
Use host interface \fIname\fR to derive addresses and routes.
|
Use host interface \fIname\fR to derive addresses and routes.
|
||||||
Default is to use the interfaces specified by \fB--outbound-if4\fR and
|
Default is to use the interfaces specified by \fB--outbound-if4\fR and
|
||||||
\fB--outbound-if6\fR, for IPv4 and IPv6 addresses and routes, respectively. If
|
\fB--outbound-if6\fR, for IPv4 and IPv6 addresses and routes, respectively.
|
||||||
no interfaces are given, the interface with the first default routes for each IP
|
|
||||||
version is selected.
|
If no interfaces are given, the interface with the first default routes for each
|
||||||
|
IP version is selected. If no default routes are available and there is any
|
||||||
|
interface with any route, the first such interface will be chosen instead.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-o ", " \-\-outbound " " \fIaddr
|
.BR \-o ", " \-\-outbound " " \fIaddr
|
||||||
|
@ -203,30 +219,49 @@ By default, the source address is selected by the routing tables.
|
||||||
Bind IPv4 outbound sockets to host interface \fIname\fR, and, unless another
|
Bind IPv4 outbound sockets to host interface \fIname\fR, and, unless another
|
||||||
interface is specified via \fB-i\fR, \fB--interface\fR, use this interface to
|
interface is specified via \fB-i\fR, \fB--interface\fR, use this interface to
|
||||||
derive IPv4 addresses and routes.
|
derive IPv4 addresses and routes.
|
||||||
By default, the interface given by the default route is selected.
|
|
||||||
|
By default, the interface given by the default route is selected. If no default
|
||||||
|
routes are available and there is any interface with any route, the first such
|
||||||
|
interface will be chosen instead.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-\-outbound-if6 " " \fIname
|
.BR \-\-outbound-if6 " " \fIname
|
||||||
Bind IPv6 outbound sockets to host interface \fIname\fR, and, unless another
|
Bind IPv6 outbound sockets to host interface \fIname\fR, and, unless another
|
||||||
interface is specified via \fB-i\fR, \fB--interface\fR, use this interface to
|
interface is specified via \fB-i\fR, \fB--interface\fR, use this interface to
|
||||||
derive IPv6 addresses and routes.
|
derive IPv6 addresses and routes.
|
||||||
By default, the interface given by the default route is selected.
|
|
||||||
|
By default, the interface given by the default route is selected. If no default
|
||||||
|
routes are available and there is any interface with any route, the first such
|
||||||
|
interface will be chosen instead.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-D ", " \-\-dns " " \fIaddr
|
.BR \-D ", " \-\-dns " " \fIaddr
|
||||||
Use \fIaddr\fR (IPv4 or IPv6) for DHCP, DHCPv6, NDP or DNS forwarding, as
|
Instruct the guest (via DHCP, DHCPv6 or NDP) to use \fIaddr\fR (IPv4
|
||||||
configured (see options \fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR,
|
or IPv6) as a nameserver, as configured (see options
|
||||||
\fB--dns-forward\fR) instead of reading addresses from \fI/etc/resolv.conf\fR.
|
\fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR) instead of reading addresses
|
||||||
This option can be specified multiple times. Specifying \fB-D none\fR disables
|
from \fI/etc/resolv.conf\fR. This option can be specified multiple
|
||||||
usage of DNS addresses altogether.
|
times. Specifying \fB-D none\fR disables usage of DNS addresses
|
||||||
|
altogether. Unlike addresses from \fI/etc/resolv.conf\fR, \fIaddr\fR
|
||||||
|
is given to the guest without remapping. For example \fB--dns
|
||||||
|
127.0.0.1\fR will instruct the guest to use itself as nameserver, not
|
||||||
|
the host.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-\-dns-forward " " \fIaddr
|
.BR \-\-dns-forward " " \fIaddr
|
||||||
Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the first
|
Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the
|
||||||
configured DNS resolver (with corresponding IP version). Mapping is limited to
|
nameserver (with corresponding IP version) specified by the
|
||||||
UDP traffic directed to port 53, and DNS answers are translated back with a
|
\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or
|
||||||
reverse mapping.
|
port 853. Replies are translated back with a reverse mapping. This
|
||||||
This option can be specified zero to two times (once for IPv4, once for IPv6).
|
option can be specified zero to two times (once for IPv4, once for
|
||||||
|
IPv6).
|
||||||
|
|
||||||
|
.TP
|
||||||
|
.BR \-\-dns-host " " \fIaddr
|
||||||
|
Configure the host nameserver which guest or namespace queries to the
|
||||||
|
\fB\-\-dns-forward\fR address will be redirected to. This option can
|
||||||
|
be specified zero to two times (once for IPv4, once for IPv6).
|
||||||
|
By default, the first nameserver from the host's
|
||||||
|
\fI/etc/resolv.conf\fR is used.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-S ", " \-\-search " " \fIlist
|
.BR \-S ", " \-\-search " " \fIlist
|
||||||
|
@ -237,28 +272,28 @@ list altogether (if you need to search a domain called "none" you can use
|
||||||
\fB--search none.\fR).
|
\fB--search none.\fR).
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-\-no-dhcp-dns " " \fIaddr
|
.BR \-\-no-dhcp-dns
|
||||||
In \fIpasst\fR mode, do not assign IPv4 addresses via DHCP (option 23) or IPv6
|
In \fIpasst\fR mode, do not assign IPv4 addresses via DHCP (option 23) or IPv6
|
||||||
addresses via NDP Router Advertisement (option type 25) and DHCPv6 (option 23)
|
addresses via NDP Router Advertisement (option type 25) and DHCPv6 (option 23)
|
||||||
as DNS resolvers.
|
as DNS resolvers.
|
||||||
By default, all the configured addresses are passed.
|
By default, all the configured addresses are passed.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-\-dhcp-dns " " \fIaddr
|
.BR \-\-dhcp-dns
|
||||||
In \fIpasta\fR mode, assign IPv4 addresses via DHCP (option 23) or IPv6
|
In \fIpasta\fR mode, assign IPv4 addresses via DHCP (option 23) or IPv6
|
||||||
addresses via NDP Router Advertisement (option type 25) and DHCPv6 (option 23)
|
addresses via NDP Router Advertisement (option type 25) and DHCPv6 (option 23)
|
||||||
as DNS resolvers.
|
as DNS resolvers.
|
||||||
By default, configured addresses, if any, are not passed.
|
By default, configured addresses, if any, are not passed.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-\-no-dhcp-search " " \fIaddr
|
.BR \-\-no-dhcp-search
|
||||||
In \fIpasst\fR mode, do not send the DNS domain search list addresses via DHCP
|
In \fIpasst\fR mode, do not send the DNS domain search list addresses via DHCP
|
||||||
(option 119), via NDP Router Advertisement (option type 31) and DHCPv6 (option
|
(option 119), via NDP Router Advertisement (option type 31) and DHCPv6 (option
|
||||||
24).
|
24).
|
||||||
By default, the DNS domain search list resulting from configuration is passed.
|
By default, the DNS domain search list resulting from configuration is passed.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-\-dhcp-search " " \fIaddr
|
.BR \-\-dhcp-search
|
||||||
In \fIpasta\fR mode, send the DNS domain search list addresses via DHCP (option
|
In \fIpasta\fR mode, send the DNS domain search list addresses via DHCP (option
|
||||||
119), via NDP Router Advertisement (option type 31) and DHCPv6 (option 24).
|
119), via NDP Router Advertisement (option type 31) and DHCPv6 (option 24).
|
||||||
By default, the DNS domain search list resulting from configuration is not
|
By default, the DNS domain search list resulting from configuration is not
|
||||||
|
@ -301,23 +336,63 @@ namespace will be silently dropped.
|
||||||
Disable Router Advertisements. Router Solicitations coming from guest or target
|
Disable Router Advertisements. Router Solicitations coming from guest or target
|
||||||
namespace will be ignored.
|
namespace will be ignored.
|
||||||
|
|
||||||
|
.TP
|
||||||
|
.BR \-\-freebind
|
||||||
|
Allow any binding address to be specified for \fB-t\fR and \fB-u\fR
|
||||||
|
options. Usually binding addresses must be addresses currently
|
||||||
|
configured on the host. With \fB\-\-freebind\fR, the
|
||||||
|
\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled
|
||||||
|
allowing any address to be used. This is typically used to bind
|
||||||
|
addresses which might be configured on the host in future, at which
|
||||||
|
point the forwarding will immediately start operating.
|
||||||
|
|
||||||
|
.TP
|
||||||
|
.BR \-\-map-host-loopback " " \fIaddr
|
||||||
|
Translate \fIaddr\fR to refer to the host. Packets from the guest to
|
||||||
|
\fIaddr\fR will be redirected to the host. On the host such packets
|
||||||
|
will appear to have both source and destination of 127.0.0.1 or ::1.
|
||||||
|
|
||||||
|
If \fIaddr\fR is 'none', no address is mapped (this implies
|
||||||
|
\fB--no-map-gw\fR). Only one IPv4 and one IPv6 address can be
|
||||||
|
translated; if the option is specified multiple times, the last one
|
||||||
|
takes effect.
|
||||||
|
|
||||||
|
Default is to translate the guest's default gateway address, unless
|
||||||
|
\fB--no-map-gw\fR is given, in which case no address is mapped.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-\-no-map-gw
|
.BR \-\-no-map-gw
|
||||||
Don't remap TCP connections and untracked UDP traffic, with the gateway address
|
Don't remap TCP connections and untracked UDP traffic, with the gateway address
|
||||||
as destination, to the host. Implied if there is no gateway on the selected
|
as destination, to the host. Implied if there is no gateway on the selected
|
||||||
default route for any of the enabled address families.
|
default route, or if there is no default route, for any of the enabled address
|
||||||
|
families.
|
||||||
|
|
||||||
|
.TP
|
||||||
|
.BR \-\-map-guest-addr " " \fIaddr
|
||||||
|
Translate \fIaddr\fR in the guest to be equal to the guest's assigned
|
||||||
|
address on the host. That is, packets from the guest to \fIaddr\fR
|
||||||
|
will be redirected to the address assigned to the guest with \fB-a\fR,
|
||||||
|
or by default the host's global address. This allows the guest to
|
||||||
|
access services available on the host's global address, even though its
|
||||||
|
own address shadows that of the host.
|
||||||
|
|
||||||
|
If \fIaddr\fR is 'none', no address is mapped. Only one IPv4 and one
|
||||||
|
IPv6 address can be translated, and if the option is specified
|
||||||
|
multiple times, the last one for each address type takes effect.
|
||||||
|
|
||||||
|
Default is no mapping.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-4 ", " \-\-ipv4-only
|
.BR \-4 ", " \-\-ipv4-only
|
||||||
Enable IPv4-only operation. IPv6 traffic will be ignored.
|
Enable IPv4-only operation. IPv6 traffic will be ignored.
|
||||||
By default, IPv6 operation is enabled as long as at least an IPv6 default route
|
By default, IPv6 operation is enabled as long as at least an IPv6 route and an
|
||||||
and an interface address are configured on a given host interface.
|
interface address are configured on a given host interface.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-6 ", " \-\-ipv6-only
|
.BR \-6 ", " \-\-ipv6-only
|
||||||
Enable IPv6-only operation. IPv4 traffic will be ignored.
|
Enable IPv6-only operation. IPv4 traffic will be ignored.
|
||||||
By default, IPv4 operation is enabled as long as at least an IPv4 default route
|
By default, IPv4 operation is enabled as long as at least an IPv4 route and an
|
||||||
and an interface address are configured on a given host interface.
|
interface address are configured on a given host interface.
|
||||||
|
|
||||||
.SS \fBpasst\fR-only options
|
.SS \fBpasst\fR-only options
|
||||||
|
|
||||||
|
@ -530,6 +605,13 @@ Configure UDP port forwarding from target namespace to init namespace.
|
||||||
|
|
||||||
Default is \fBauto\fR.
|
Default is \fBauto\fR.
|
||||||
|
|
||||||
|
.TP
|
||||||
|
.BR \-\-host-lo-to-ns-lo " " (DEPRECATED)
|
||||||
|
If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from
|
||||||
|
the host's loopback address will appear on the loopback address in the
|
||||||
|
guest as well. Without this option such forwarded packets will appear
|
||||||
|
to come from the guest's public address.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.BR \-\-userns " " \fIspec
|
.BR \-\-userns " " \fIspec
|
||||||
Target user namespace to join, as a path. If PID is given, without this option,
|
Target user namespace to join, as a path. If PID is given, without this option,
|
||||||
|
@ -566,7 +648,7 @@ or sourced from the host, and bring up the tap interface.
|
||||||
.BR \-\-no-copy-routes " " (DEPRECATED)
|
.BR \-\-no-copy-routes " " (DEPRECATED)
|
||||||
With \-\-config-net, do not copy all the routes associated to the interface we
|
With \-\-config-net, do not copy all the routes associated to the interface we
|
||||||
derive addresses and routes from: set up only the default gateway. Implied by
|
derive addresses and routes from: set up only the default gateway. Implied by
|
||||||
-g, \-\-gateway.
|
-g, \-\-gateway, for the corresponding IP version only.
|
||||||
|
|
||||||
Default is to copy all the routing entries from the interface in the outer
|
Default is to copy all the routing entries from the interface in the outer
|
||||||
namespace to the target namespace, translating the output interface attribute to
|
namespace to the target namespace, translating the output interface attribute to
|
||||||
|
@ -581,7 +663,7 @@ below.
|
||||||
.BR \-\-no-copy-addrs " " (DEPRECATED)
|
.BR \-\-no-copy-addrs " " (DEPRECATED)
|
||||||
With \-\-config-net, do not copy all the addresses associated to the interface
|
With \-\-config-net, do not copy all the addresses associated to the interface
|
||||||
we derive addresses and routes from: set up a single one. Implied by \-a,
|
we derive addresses and routes from: set up a single one. Implied by \-a,
|
||||||
\-\-address.
|
\-\-address, for the corresponding IP version only.
|
||||||
|
|
||||||
Default is to copy all the addresses, except for link-local ones, from the
|
Default is to copy all the addresses, except for link-local ones, from the
|
||||||
interface from the outer namespace to the target namespace.
|
interface from the outer namespace to the target namespace.
|
||||||
|
@ -807,38 +889,41 @@ root@localhost's password:
|
||||||
|
|
||||||
.SH NOTES
|
.SH NOTES
|
||||||
|
|
||||||
.SS Handling of traffic with local destination and source addresses
|
.SS Handling of traffic with loopback destination and source addresses
|
||||||
|
|
||||||
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address,
|
Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback
|
||||||
depending on the configuration. Local destination or source addresses need to be
|
address (127.0.0.0/8 or ::1), depending on the configuration. Loopback
|
||||||
changed before packets are delivered to the guest or target namespace: most
|
destination or source addresses need to be changed before packets are
|
||||||
operating systems would drop packets received from non-loopback interfaces with
|
delivered to the guest or target namespace: most operating systems
|
||||||
local addresses, and it would also be impossible for guest or target namespace
|
would drop packets received with loopback addresses on non-loopback
|
||||||
to route answers back.
|
interfaces, and it would also be impossible for guest or target
|
||||||
|
namespace to route answers back.
|
||||||
|
|
||||||
For convenience, and somewhat arbitrarily, the source address on these packets
|
For convenience, the source address on these packets is translated to
|
||||||
is translated to the address of the default IPv4 or IPv6 gateway -- this is
|
the address specified by the \fB\-\-map-host-loopback\fR option (with
|
||||||
known to be an existing, valid address on the same subnet.
|
some exceptions in pasta mode, see next section below). If not
|
||||||
|
specified this defaults, somewhat arbitrarily, to the address of the
|
||||||
|
default IPv4 or IPv6 gateway (if any) -- this is known to be an
|
||||||
|
existing, valid address on the same subnet. If \fB\-\-no-map-gw\fR or
|
||||||
|
\fB\-\-map-host-loopback none\fR is specified, this translation is
|
||||||
|
disabled and packets with loopback addresses are simply dropped.
|
||||||
|
|
||||||
Loopback destination addresses are instead translated to the observed external
|
Loopback destination addresses are translated to the observed external
|
||||||
address of the guest or target namespace. For IPv6 packets, if usage of a
|
address of the guest or target namespace. For IPv6, the observed
|
||||||
link-local address by guest or namespace has ever been observed, and the
|
link-local address is used if the translated source address is
|
||||||
original destination address is also a link-local address, the observed
|
link-local, otherwise the observed global address is used. For both
|
||||||
link-local address is used. Otherwise, the observed global address is used. For
|
IPv4 and IPv6, if no addresses have been seen yet, the configured
|
||||||
both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses
|
addresses will be used instead.
|
||||||
will be used instead.
|
|
||||||
|
|
||||||
For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
|
For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1,
|
||||||
with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
|
with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while
|
||||||
the last observed source address from guest or namespace is 192.0.2.2, this will
|
the last observed source address from guest or namespace is 192.0.2.2, this will
|
||||||
be translated to a connection from 192.0.2.1 to 192.0.2.2.
|
be translated to a connection from 192.0.2.1 to 192.0.2.2.
|
||||||
|
|
||||||
Similarly, for traffic coming from guest or namespace, packets with destination
|
Similarly, for traffic coming from guest or namespace, packets with
|
||||||
address corresponding to the default gateway will have their destination address
|
destination address corresponding to the \fB\-\-map-host-loopback\fR
|
||||||
translated to a loopback address, if and only if a packet, in the opposite
|
address will have their destination address translated to a loopback
|
||||||
direction, with a loopback destination or source address, port-wise matching for
|
address.
|
||||||
UDP, or connection-wise for TCP, has been recently forwarded to guest or
|
|
||||||
namespace. This behaviour can be disabled with \-\-no\-map\-gw.
|
|
||||||
|
|
||||||
.SS Handling of local traffic in pasta
|
.SS Handling of local traffic in pasta
|
||||||
|
|
||||||
|
@ -854,8 +939,15 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair
|
||||||
of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
|
of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet
|
||||||
transfers.
|
transfers.
|
||||||
|
|
||||||
This bypass only applies to local connections and traffic, because it's not
|
Because it's not possible to bind sockets to foreign addresses, this
|
||||||
possible to bind sockets to foreign addresses.
|
bypass only applies to local connections and traffic. It also means
|
||||||
|
that the address translation differs slightly from passt mode.
|
||||||
|
Connections from loopback to loopback on the host will appear to come
|
||||||
|
from the target namespace's public address within the guest, unless
|
||||||
|
\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will
|
||||||
|
appear to come from loopback in the namespace as well. The latter
|
||||||
|
behaviour used to be the default, but is usually undesirable, since it
|
||||||
|
can unintentionally expose namespace local services to the host.
|
||||||
|
|
||||||
.SS Binding to low numbered ports (well-known or system ports, up to 1023)
|
.SS Binding to low numbered ports (well-known or system ports, up to 1023)
|
||||||
|
|
||||||
|
@ -964,8 +1056,8 @@ https://passt.top/passt/lists.
|
||||||
Copyright (c) 2020-2022 Red Hat GmbH.
|
Copyright (c) 2020-2022 Red Hat GmbH.
|
||||||
|
|
||||||
\fBpasst\fR and \fBpasta\fR are free software: you can redistribute them and/or
|
\fBpasst\fR and \fBpasta\fR are free software: you can redistribute them and/or
|
||||||
modify them under the terms of the GNU Affero General Public License as
|
modify them under the terms of the GNU General Public License as
|
||||||
published by the Free Software Foundation, either version 3 of the License, or
|
published by the Free Software Foundation, either version 2 of the License, or
|
||||||
(at your option) any later version.
|
(at your option) any later version.
|
||||||
|
|
||||||
.SH SEE ALSO
|
.SH SEE ALSO
|
||||||
|
|
146
passt.c
146
passt.c
|
@ -35,6 +35,7 @@
|
||||||
#include <syslog.h>
|
#include <syslog.h>
|
||||||
#include <sys/prctl.h>
|
#include <sys/prctl.h>
|
||||||
#include <netinet/if_ether.h>
|
#include <netinet/if_ether.h>
|
||||||
|
#include <libgen.h>
|
||||||
#ifdef HAS_GETRANDOM
|
#ifdef HAS_GETRANDOM
|
||||||
#include <sys/random.h>
|
#include <sys/random.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -65,9 +66,9 @@ char *epoll_type_str[] = {
|
||||||
[EPOLL_TYPE_TCP_SPLICE] = "connected spliced TCP socket",
|
[EPOLL_TYPE_TCP_SPLICE] = "connected spliced TCP socket",
|
||||||
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
|
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
|
||||||
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
|
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
|
||||||
[EPOLL_TYPE_UDP] = "UDP socket",
|
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
|
||||||
[EPOLL_TYPE_ICMP] = "ICMP socket",
|
[EPOLL_TYPE_UDP_REPLY] = "UDP reply socket",
|
||||||
[EPOLL_TYPE_ICMPV6] = "ICMPv6 socket",
|
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
|
||||||
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
|
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
|
||||||
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
|
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
|
||||||
[EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device",
|
[EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device",
|
||||||
|
@ -84,7 +85,7 @@ static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
|
||||||
*/
|
*/
|
||||||
static void post_handler(struct ctx *c, const struct timespec *now)
|
static void post_handler(struct ctx *c, const struct timespec *now)
|
||||||
{
|
{
|
||||||
#define CALL_PROTO_HANDLER(c, now, lc, uc) \
|
#define CALL_PROTO_HANDLER(lc, uc) \
|
||||||
do { \
|
do { \
|
||||||
extern void \
|
extern void \
|
||||||
lc ## _defer_handler (struct ctx *c) \
|
lc ## _defer_handler (struct ctx *c) \
|
||||||
|
@ -103,11 +104,9 @@ static void post_handler(struct ctx *c, const struct timespec *now)
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||||
CALL_PROTO_HANDLER(c, now, tcp, TCP);
|
CALL_PROTO_HANDLER(tcp, TCP);
|
||||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||||
CALL_PROTO_HANDLER(c, now, udp, UDP);
|
CALL_PROTO_HANDLER(udp, UDP);
|
||||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
|
||||||
CALL_PROTO_HANDLER(c, now, icmp, ICMP);
|
|
||||||
|
|
||||||
flow_defer_handler(c, now);
|
flow_defer_handler(c, now);
|
||||||
#undef CALL_PROTO_HANDLER
|
#undef CALL_PROTO_HANDLER
|
||||||
|
@ -138,14 +137,13 @@ static void secret_init(struct ctx *c)
|
||||||
}
|
}
|
||||||
if (dev_random >= 0)
|
if (dev_random >= 0)
|
||||||
close(dev_random);
|
close(dev_random);
|
||||||
if (random_read < sizeof(c->hash_secret)) {
|
|
||||||
|
if (random_read < sizeof(c->hash_secret))
|
||||||
#else
|
#else
|
||||||
if (getrandom(&c->hash_secret, sizeof(c->hash_secret),
|
if (getrandom(&c->hash_secret, sizeof(c->hash_secret),
|
||||||
GRND_RANDOM) < 0) {
|
GRND_RANDOM) < 0)
|
||||||
#endif /* !HAS_GETRANDOM */
|
#endif /* !HAS_GETRANDOM */
|
||||||
perror("TCP initial sequence getrandom");
|
die_perror("Failed to get random bytes for hash table and TCP");
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -193,28 +191,30 @@ void exit_handler(int signal)
|
||||||
* Return: non-zero on failure
|
* Return: non-zero on failure
|
||||||
*
|
*
|
||||||
* #syscalls read write writev
|
* #syscalls read write writev
|
||||||
* #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close
|
* #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close
|
||||||
* #syscalls recvfrom sendto shutdown
|
* #syscalls bind connect recvfrom sendto shutdown
|
||||||
* #syscalls armv6l:recv armv7l:recv ppc64le:recv
|
* #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
|
||||||
* #syscalls armv6l:send armv7l:send ppc64le:send
|
|
||||||
* #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
|
* #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
|
||||||
* #syscalls clock_gettime armv6l:clock_gettime64 armv7l:clock_gettime64
|
* #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64
|
||||||
*/
|
*/
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
int nfds, i, devnull_fd = -1, pidfile_fd = -1;
|
|
||||||
struct epoll_event events[EPOLL_EVENTS];
|
struct epoll_event events[EPOLL_EVENTS];
|
||||||
char *log_name, argv0[PATH_MAX], *name;
|
int nfds, i, devnull_fd = -1;
|
||||||
|
char argv0[PATH_MAX], *name;
|
||||||
struct ctx c = { 0 };
|
struct ctx c = { 0 };
|
||||||
struct rlimit limit;
|
struct rlimit limit;
|
||||||
struct timespec now;
|
struct timespec now;
|
||||||
struct sigaction sa;
|
struct sigaction sa;
|
||||||
|
|
||||||
|
if (clock_gettime(CLOCK_MONOTONIC, &log_start))
|
||||||
|
die_perror("Failed to get CLOCK_MONOTONIC time");
|
||||||
|
|
||||||
arch_avx2_exec(argv);
|
arch_avx2_exec(argv);
|
||||||
|
|
||||||
isolate_initial();
|
isolate_initial(argc, argv);
|
||||||
|
|
||||||
c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1;
|
c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1;
|
||||||
|
|
||||||
sigemptyset(&sa.sa_mask);
|
sigemptyset(&sa.sa_mask);
|
||||||
sa.sa_flags = 0;
|
sa.sa_flags = 0;
|
||||||
|
@ -229,69 +229,52 @@ int main(int argc, char **argv)
|
||||||
name = basename(argv0);
|
name = basename(argv0);
|
||||||
if (strstr(name, "pasta")) {
|
if (strstr(name, "pasta")) {
|
||||||
sa.sa_handler = pasta_child_handler;
|
sa.sa_handler = pasta_child_handler;
|
||||||
if (sigaction(SIGCHLD, &sa, NULL)) {
|
if (sigaction(SIGCHLD, &sa, NULL))
|
||||||
die("Couldn't install signal handlers: %s",
|
die_perror("Couldn't install signal handlers");
|
||||||
strerror(errno));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
|
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
|
||||||
die("Couldn't set disposition for SIGPIPE: %s",
|
die_perror("Couldn't set disposition for SIGPIPE");
|
||||||
strerror(errno));
|
|
||||||
}
|
|
||||||
|
|
||||||
c.mode = MODE_PASTA;
|
c.mode = MODE_PASTA;
|
||||||
log_name = "pasta";
|
|
||||||
} else if (strstr(name, "passt")) {
|
} else if (strstr(name, "passt")) {
|
||||||
c.mode = MODE_PASST;
|
c.mode = MODE_PASST;
|
||||||
log_name = "passt";
|
|
||||||
} else {
|
} else {
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
|
madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
|
||||||
|
|
||||||
__openlog(log_name, 0, LOG_DAEMON);
|
|
||||||
|
|
||||||
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
|
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
|
||||||
if (c.epollfd == -1) {
|
if (c.epollfd == -1)
|
||||||
perror("epoll_create1");
|
die_perror("Failed to create epoll file descriptor");
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
if (getrlimit(RLIMIT_NOFILE, &limit))
|
||||||
|
die_perror("Failed to get maximum value of open files limit");
|
||||||
|
|
||||||
if (getrlimit(RLIMIT_NOFILE, &limit)) {
|
|
||||||
perror("getrlimit");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
c.nofile = limit.rlim_cur = limit.rlim_max;
|
c.nofile = limit.rlim_cur = limit.rlim_max;
|
||||||
if (setrlimit(RLIMIT_NOFILE, &limit)) {
|
if (setrlimit(RLIMIT_NOFILE, &limit))
|
||||||
perror("setrlimit");
|
die_perror("Failed to set current limit for open files");
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
sock_probe_mem(&c);
|
sock_probe_mem(&c);
|
||||||
|
|
||||||
conf(&c, argc, argv);
|
conf(&c, argc, argv);
|
||||||
trace_init(c.trace);
|
trace_init(c.trace);
|
||||||
|
|
||||||
if (c.force_stderr || isatty(fileno(stdout)))
|
|
||||||
__openlog(log_name, LOG_PERROR, LOG_DAEMON);
|
|
||||||
|
|
||||||
pasta_netns_quit_init(&c);
|
pasta_netns_quit_init(&c);
|
||||||
|
|
||||||
tap_sock_init(&c);
|
tap_sock_init(&c);
|
||||||
|
|
||||||
secret_init(&c);
|
secret_init(&c);
|
||||||
|
|
||||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
if (clock_gettime(CLOCK_MONOTONIC, &now))
|
||||||
|
die_perror("Failed to get CLOCK_MONOTONIC time");
|
||||||
|
|
||||||
flow_init();
|
flow_init();
|
||||||
|
|
||||||
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
|
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
|
|
||||||
if (!c.no_icmp)
|
proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
|
||||||
icmp_init();
|
|
||||||
|
|
||||||
proto_update_l2_buf(c.mac_guest, c.mac);
|
|
||||||
|
|
||||||
if (c.ifi4 && !c.no_dhcp)
|
if (c.ifi4 && !c.no_dhcp)
|
||||||
dhcp_init();
|
dhcp_init();
|
||||||
|
@ -302,53 +285,46 @@ int main(int argc, char **argv)
|
||||||
pcap_init(&c);
|
pcap_init(&c);
|
||||||
|
|
||||||
if (!c.foreground) {
|
if (!c.foreground) {
|
||||||
if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0) {
|
if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0)
|
||||||
perror("/dev/null open");
|
die_perror("Failed to open /dev/null");
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*c.pid_file) {
|
|
||||||
if ((pidfile_fd = open(c.pid_file,
|
|
||||||
O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC,
|
|
||||||
S_IRUSR | S_IWUSR)) < 0) {
|
|
||||||
perror("PID file open");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isolate_prefork(&c))
|
if (isolate_prefork(&c))
|
||||||
die("Failed to sandbox process, exiting");
|
die("Failed to sandbox process, exiting");
|
||||||
|
|
||||||
if (!c.foreground)
|
if (!c.foreground) {
|
||||||
__daemon(pidfile_fd, devnull_fd);
|
__daemon(c.pidfile_fd, devnull_fd);
|
||||||
else
|
log_stderr = false;
|
||||||
write_pidfile(pidfile_fd, getpid());
|
} else {
|
||||||
|
pidfile_write(c.pidfile_fd, getpid());
|
||||||
|
}
|
||||||
|
|
||||||
if (pasta_child_pid)
|
if (pasta_child_pid) {
|
||||||
kill(pasta_child_pid, SIGUSR1);
|
kill(pasta_child_pid, SIGUSR1);
|
||||||
|
log_stderr = false;
|
||||||
|
}
|
||||||
|
|
||||||
isolate_postfork(&c);
|
isolate_postfork(&c);
|
||||||
|
|
||||||
timer_init(&c, &now);
|
timer_init(&c, &now);
|
||||||
|
|
||||||
loop:
|
loop:
|
||||||
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
|
||||||
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
|
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
|
||||||
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
|
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
|
||||||
if (nfds == -1 && errno != EINTR) {
|
/* NOLINTEND(bugprone-branch-clone) */
|
||||||
perror("epoll_wait");
|
if (nfds == -1 && errno != EINTR)
|
||||||
exit(EXIT_FAILURE);
|
die_perror("epoll_wait() failed in main loop");
|
||||||
}
|
|
||||||
|
|
||||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
if (clock_gettime(CLOCK_MONOTONIC, &now))
|
||||||
|
err_perror("Failed to get CLOCK_MONOTONIC time");
|
||||||
|
|
||||||
for (i = 0; i < nfds; i++) {
|
for (i = 0; i < nfds; i++) {
|
||||||
union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
|
union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
|
||||||
uint32_t eventmask = events[i].events;
|
uint32_t eventmask = events[i].events;
|
||||||
|
|
||||||
trace("%s: epoll event on %s %i (events: 0x%08x)",
|
trace("%s: epoll event on %s %i (events: 0x%08x)",
|
||||||
c.mode == MODE_PASST ? "passt" : "pasta",
|
c.mode == MODE_PASTA ? "pasta" : "passt",
|
||||||
EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
|
EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
|
||||||
|
|
||||||
switch (ref.type) {
|
switch (ref.type) {
|
||||||
|
@ -379,14 +355,14 @@ loop:
|
||||||
case EPOLL_TYPE_TCP_TIMER:
|
case EPOLL_TYPE_TCP_TIMER:
|
||||||
tcp_timer_handler(&c, ref);
|
tcp_timer_handler(&c, ref);
|
||||||
break;
|
break;
|
||||||
case EPOLL_TYPE_UDP:
|
case EPOLL_TYPE_UDP_LISTEN:
|
||||||
udp_sock_handler(&c, ref, eventmask, &now);
|
udp_listen_sock_handler(&c, ref, eventmask, &now);
|
||||||
break;
|
break;
|
||||||
case EPOLL_TYPE_ICMP:
|
case EPOLL_TYPE_UDP_REPLY:
|
||||||
icmp_sock_handler(&c, AF_INET, ref);
|
udp_reply_sock_handler(&c, ref, eventmask, &now);
|
||||||
break;
|
break;
|
||||||
case EPOLL_TYPE_ICMPV6:
|
case EPOLL_TYPE_PING:
|
||||||
icmp_sock_handler(&c, AF_INET6, ref);
|
icmp_sock_handler(&c, ref);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
/* Can't happen */
|
/* Can't happen */
|
||||||
|
|
148
passt.h
148
passt.h
|
@ -9,26 +9,6 @@
|
||||||
#define UNIX_SOCK_MAX 100
|
#define UNIX_SOCK_MAX 100
|
||||||
#define UNIX_SOCK_PATH "/tmp/passt_%i.socket"
|
#define UNIX_SOCK_PATH "/tmp/passt_%i.socket"
|
||||||
|
|
||||||
/**
|
|
||||||
* struct tap_msg - Generic message descriptor for arrays of messages
|
|
||||||
* @pkt_buf_offset: Offset from @pkt_buf
|
|
||||||
* @len: Message length, with L2 headers
|
|
||||||
*/
|
|
||||||
struct tap_msg {
|
|
||||||
uint32_t pkt_buf_offset;
|
|
||||||
uint16_t len;
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* struct tap_l4_msg - Layer-4 message descriptor for protocol handlers
|
|
||||||
* @pkt_buf_offset: Offset of message from @pkt_buf
|
|
||||||
* @l4_len: Length of Layer-4 payload, host order
|
|
||||||
*/
|
|
||||||
struct tap_l4_msg {
|
|
||||||
uint32_t pkt_buf_offset;
|
|
||||||
uint16_t l4_len;
|
|
||||||
};
|
|
||||||
|
|
||||||
union epoll_ref;
|
union epoll_ref;
|
||||||
|
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
@ -37,45 +17,21 @@ union epoll_ref;
|
||||||
|
|
||||||
#include "pif.h"
|
#include "pif.h"
|
||||||
#include "packet.h"
|
#include "packet.h"
|
||||||
|
#include "siphash.h"
|
||||||
|
#include "ip.h"
|
||||||
|
#include "inany.h"
|
||||||
#include "flow.h"
|
#include "flow.h"
|
||||||
#include "icmp.h"
|
#include "icmp.h"
|
||||||
#include "fwd.h"
|
#include "fwd.h"
|
||||||
#include "tcp.h"
|
#include "tcp.h"
|
||||||
#include "udp.h"
|
#include "udp.h"
|
||||||
|
|
||||||
/**
|
/* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0
|
||||||
* enum epoll_type - Different types of fds we poll over
|
* (unicast) and bit 1 of byte 1 must be 1 (locally administered). Otherwise
|
||||||
|
* it's arbitrary.
|
||||||
*/
|
*/
|
||||||
enum epoll_type {
|
#define MAC_OUR_LAA \
|
||||||
/* Special value to indicate an invalid type */
|
((uint8_t [ETH_ALEN]){0x9a, 0x55, 0x9a, 0x55, 0x9a, 0x55})
|
||||||
EPOLL_TYPE_NONE = 0,
|
|
||||||
/* Connected TCP sockets */
|
|
||||||
EPOLL_TYPE_TCP,
|
|
||||||
/* Connected TCP sockets (spliced) */
|
|
||||||
EPOLL_TYPE_TCP_SPLICE,
|
|
||||||
/* Listening TCP sockets */
|
|
||||||
EPOLL_TYPE_TCP_LISTEN,
|
|
||||||
/* timerfds used for TCP timers */
|
|
||||||
EPOLL_TYPE_TCP_TIMER,
|
|
||||||
/* UDP sockets */
|
|
||||||
EPOLL_TYPE_UDP,
|
|
||||||
/* IPv4 ICMP sockets */
|
|
||||||
EPOLL_TYPE_ICMP,
|
|
||||||
/* ICMPv6 sockets */
|
|
||||||
EPOLL_TYPE_ICMPV6,
|
|
||||||
/* inotify fd watching for end of netns (pasta) */
|
|
||||||
EPOLL_TYPE_NSQUIT_INOTIFY,
|
|
||||||
/* timer fd watching for end of netns, fallback for inotify (pasta) */
|
|
||||||
EPOLL_TYPE_NSQUIT_TIMER,
|
|
||||||
/* tuntap character device */
|
|
||||||
EPOLL_TYPE_TAP_PASTA,
|
|
||||||
/* socket connected to qemu */
|
|
||||||
EPOLL_TYPE_TAP_PASST,
|
|
||||||
/* socket listening for qemu socket connections */
|
|
||||||
EPOLL_TYPE_TAP_LISTEN,
|
|
||||||
|
|
||||||
EPOLL_NUM_TYPES,
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* union epoll_ref - Breakdown of reference for epoll fd bookkeeping
|
* union epoll_ref - Breakdown of reference for epoll fd bookkeeping
|
||||||
|
@ -99,8 +55,7 @@ union epoll_ref {
|
||||||
uint32_t flow;
|
uint32_t flow;
|
||||||
flow_sidx_t flowside;
|
flow_sidx_t flowside;
|
||||||
union tcp_listen_epoll_ref tcp_listen;
|
union tcp_listen_epoll_ref tcp_listen;
|
||||||
union udp_epoll_ref udp;
|
union udp_listen_epoll_ref udp;
|
||||||
union icmp_epoll_ref icmp;
|
|
||||||
uint32_t data;
|
uint32_t data;
|
||||||
int nsdir_fd;
|
int nsdir_fd;
|
||||||
};
|
};
|
||||||
|
@ -112,7 +67,6 @@ static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
|
||||||
|
|
||||||
#define TAP_BUF_BYTES \
|
#define TAP_BUF_BYTES \
|
||||||
ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
|
ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
|
||||||
#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t))
|
|
||||||
#define TAP_MSGS \
|
#define TAP_MSGS \
|
||||||
DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
|
DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
|
||||||
|
|
||||||
|
@ -144,54 +98,84 @@ enum passt_modes {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct ip4_ctx - IPv4 execution context
|
* struct ip4_ctx - IPv4 execution context
|
||||||
* @addr: IPv4 address for external, routable interface
|
* @addr: IPv4 address assigned to guest
|
||||||
* @addr_seen: Latest IPv4 address seen as source from tap
|
* @addr_seen: Latest IPv4 address seen as source from tap
|
||||||
* @prefix_len: IPv4 prefix length (netmask)
|
* @prefix_len: IPv4 prefix length (netmask)
|
||||||
* @gw: Default IPv4 gateway, network order
|
* @guest_gw: IPv4 gateway as seen by the guest
|
||||||
* @dns: DNS addresses for DHCP, zero-terminated, network order
|
* @map_host_loopback: Outbound connections to this address are NATted to the
|
||||||
* @dns_match: Forward DNS query if sent to this address, network order
|
* host's 127.0.0.1
|
||||||
* @dns_host: Use this DNS on the host for forwarding, network order
|
* @map_guest_addr: Outbound connections to this address are NATted to the
|
||||||
|
* guest's assigned address
|
||||||
|
* @dns: DNS addresses for DHCP, zero-terminated
|
||||||
|
* @dns_match: Forward DNS query if sent to this address
|
||||||
|
* @our_tap_addr: IPv4 address for passt's use on tap
|
||||||
|
* @dns_host: Use this DNS on the host for forwarding
|
||||||
* @addr_out: Optional source address for outbound traffic
|
* @addr_out: Optional source address for outbound traffic
|
||||||
* @ifname_out: Optional interface name to bind outbound sockets to
|
* @ifname_out: Optional interface name to bind outbound sockets to
|
||||||
|
* @no_copy_routes: Don't copy all routes when configuring target namespace
|
||||||
|
* @no_copy_addrs: Don't copy all addresses when configuring namespace
|
||||||
*/
|
*/
|
||||||
struct ip4_ctx {
|
struct ip4_ctx {
|
||||||
|
/* PIF_TAP addresses */
|
||||||
struct in_addr addr;
|
struct in_addr addr;
|
||||||
struct in_addr addr_seen;
|
struct in_addr addr_seen;
|
||||||
int prefix_len;
|
int prefix_len;
|
||||||
struct in_addr gw;
|
struct in_addr guest_gw;
|
||||||
|
struct in_addr map_host_loopback;
|
||||||
|
struct in_addr map_guest_addr;
|
||||||
struct in_addr dns[MAXNS + 1];
|
struct in_addr dns[MAXNS + 1];
|
||||||
struct in_addr dns_match;
|
struct in_addr dns_match;
|
||||||
struct in_addr dns_host;
|
struct in_addr our_tap_addr;
|
||||||
|
|
||||||
|
/* PIF_HOST addresses */
|
||||||
|
struct in_addr dns_host;
|
||||||
struct in_addr addr_out;
|
struct in_addr addr_out;
|
||||||
|
|
||||||
char ifname_out[IFNAMSIZ];
|
char ifname_out[IFNAMSIZ];
|
||||||
|
|
||||||
|
bool no_copy_routes;
|
||||||
|
bool no_copy_addrs;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct ip6_ctx - IPv6 execution context
|
* struct ip6_ctx - IPv6 execution context
|
||||||
* @addr: IPv6 address for external, routable interface
|
* @addr: IPv6 address assigned to guest
|
||||||
* @addr_ll: Link-local IPv6 address on external, routable interface
|
|
||||||
* @addr_seen: Latest IPv6 global/site address seen as source from tap
|
* @addr_seen: Latest IPv6 global/site address seen as source from tap
|
||||||
* @addr_ll_seen: Latest IPv6 link-local address seen as source from tap
|
* @addr_ll_seen: Latest IPv6 link-local address seen as source from tap
|
||||||
* @gw: Default IPv6 gateway
|
* @guest_gw: IPv6 gateway as seen by the guest
|
||||||
|
* @map_host_loopback: Outbound connections to this address are NATted to the
|
||||||
|
* host's [::1]
|
||||||
|
* @map_guest_addr: Outbound connections to this address are NATted to the
|
||||||
|
* guest's assigned address
|
||||||
* @dns: DNS addresses for DHCPv6 and NDP, zero-terminated
|
* @dns: DNS addresses for DHCPv6 and NDP, zero-terminated
|
||||||
* @dns_match: Forward DNS query if sent to this address
|
* @dns_match: Forward DNS query if sent to this address
|
||||||
|
* @our_tap_ll: Link-local IPv6 address for passt's use on tap
|
||||||
* @dns_host: Use this DNS on the host for forwarding
|
* @dns_host: Use this DNS on the host for forwarding
|
||||||
* @addr_out: Optional source address for outbound traffic
|
* @addr_out: Optional source address for outbound traffic
|
||||||
* @ifname_out: Optional interface name to bind outbound sockets to
|
* @ifname_out: Optional interface name to bind outbound sockets to
|
||||||
|
* @no_copy_routes: Don't copy all routes when configuring target namespace
|
||||||
|
* @no_copy_addrs: Don't copy all addresses when configuring namespace
|
||||||
*/
|
*/
|
||||||
struct ip6_ctx {
|
struct ip6_ctx {
|
||||||
|
/* PIF_TAP addresses */
|
||||||
struct in6_addr addr;
|
struct in6_addr addr;
|
||||||
struct in6_addr addr_ll;
|
|
||||||
struct in6_addr addr_seen;
|
struct in6_addr addr_seen;
|
||||||
struct in6_addr addr_ll_seen;
|
struct in6_addr addr_ll_seen;
|
||||||
struct in6_addr gw;
|
struct in6_addr guest_gw;
|
||||||
|
struct in6_addr map_host_loopback;
|
||||||
|
struct in6_addr map_guest_addr;
|
||||||
struct in6_addr dns[MAXNS + 1];
|
struct in6_addr dns[MAXNS + 1];
|
||||||
struct in6_addr dns_match;
|
struct in6_addr dns_match;
|
||||||
struct in6_addr dns_host;
|
struct in6_addr our_tap_ll;
|
||||||
|
|
||||||
|
/* PIF_HOST addresses */
|
||||||
|
struct in6_addr dns_host;
|
||||||
struct in6_addr addr_out;
|
struct in6_addr addr_out;
|
||||||
|
|
||||||
char ifname_out[IFNAMSIZ];
|
char ifname_out[IFNAMSIZ];
|
||||||
|
|
||||||
|
bool no_copy_routes;
|
||||||
|
bool no_copy_addrs;
|
||||||
};
|
};
|
||||||
|
|
||||||
#include <netinet/if_ether.h>
|
#include <netinet/if_ether.h>
|
||||||
|
@ -203,11 +187,11 @@ struct ip6_ctx {
|
||||||
* @trace: Enable tracing (extra debug) mode
|
* @trace: Enable tracing (extra debug) mode
|
||||||
* @quiet: Don't print informational messages
|
* @quiet: Don't print informational messages
|
||||||
* @foreground: Run in foreground, don't log to stderr by default
|
* @foreground: Run in foreground, don't log to stderr by default
|
||||||
* @force_stderr: Force logging to stderr
|
|
||||||
* @nofile: Maximum number of open files (ulimit -n)
|
* @nofile: Maximum number of open files (ulimit -n)
|
||||||
* @sock_path: Path for UNIX domain socket
|
* @sock_path: Path for UNIX domain socket
|
||||||
* @pcap: Path for packet capture file
|
* @pcap: Path for packet capture file
|
||||||
* @pid_file: Path to PID file, empty string if not configured
|
* @pidfile: Path to PID file, empty string if not configured
|
||||||
|
* @pidfile_fd: File descriptor for PID file, -1 if none
|
||||||
* @pasta_netns_fd: File descriptor for network namespace in pasta mode
|
* @pasta_netns_fd: File descriptor for network namespace in pasta mode
|
||||||
* @no_netns_quit: In pasta mode, don't exit if fs-bound namespace is gone
|
* @no_netns_quit: In pasta mode, don't exit if fs-bound namespace is gone
|
||||||
* @netns_base: Base name for fs-bound namespace, if any, in pasta mode
|
* @netns_base: Base name for fs-bound namespace, if any, in pasta mode
|
||||||
|
@ -215,8 +199,8 @@ struct ip6_ctx {
|
||||||
* @epollfd: File descriptor for epoll instance
|
* @epollfd: File descriptor for epoll instance
|
||||||
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
|
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
|
||||||
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
|
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
|
||||||
* @mac: Host MAC address
|
* @our_tap_mac: Pasta/passt's MAC on the tap link
|
||||||
* @mac_guest: MAC address of guest or namespace, seen or configured
|
* @guest_mac: MAC address of guest or namespace, seen or configured
|
||||||
* @hash_secret: 128-bit secret for siphash functions
|
* @hash_secret: 128-bit secret for siphash functions
|
||||||
* @ifi4: Index of template interface for IPv4, 0 if IPv4 disabled
|
* @ifi4: Index of template interface for IPv4, 0 if IPv4 disabled
|
||||||
* @ip: IPv4 configuration
|
* @ip: IPv4 configuration
|
||||||
|
@ -226,8 +210,6 @@ struct ip6_ctx {
|
||||||
* @pasta_ifn: Name of namespace interface for pasta
|
* @pasta_ifn: Name of namespace interface for pasta
|
||||||
* @pasta_ifi: Index of namespace interface for pasta
|
* @pasta_ifi: Index of namespace interface for pasta
|
||||||
* @pasta_conf_ns: Configure namespace after creating it
|
* @pasta_conf_ns: Configure namespace after creating it
|
||||||
* @no_copy_routes: Don't copy all routes when configuring target namespace
|
|
||||||
* @no_copy_addrs: Don't copy all addresses when configuring namespace
|
|
||||||
* @no_tcp: Disable TCP operation
|
* @no_tcp: Disable TCP operation
|
||||||
* @tcp: Context for TCP protocol handler
|
* @tcp: Context for TCP protocol handler
|
||||||
* @no_udp: Disable UDP operation
|
* @no_udp: Disable UDP operation
|
||||||
|
@ -243,7 +225,8 @@ struct ip6_ctx {
|
||||||
* @no_dhcpv6: Disable DHCPv6 server
|
* @no_dhcpv6: Disable DHCPv6 server
|
||||||
* @no_ndp: Disable NDP handler altogether
|
* @no_ndp: Disable NDP handler altogether
|
||||||
* @no_ra: Disable router advertisements
|
* @no_ra: Disable router advertisements
|
||||||
* @no_map_gw: Don't map connections, untracked UDP to gateway to host
|
* @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses
|
||||||
|
* @freebind: Allow binding of non-local addresses for forwarding
|
||||||
* @low_wmem: Low probed net.core.wmem_max
|
* @low_wmem: Low probed net.core.wmem_max
|
||||||
* @low_rmem: Low probed net.core.rmem_max
|
* @low_rmem: Low probed net.core.rmem_max
|
||||||
*/
|
*/
|
||||||
|
@ -253,11 +236,13 @@ struct ctx {
|
||||||
int trace;
|
int trace;
|
||||||
int quiet;
|
int quiet;
|
||||||
int foreground;
|
int foreground;
|
||||||
int force_stderr;
|
|
||||||
int nofile;
|
int nofile;
|
||||||
char sock_path[UNIX_PATH_MAX];
|
char sock_path[UNIX_PATH_MAX];
|
||||||
char pcap[PATH_MAX];
|
char pcap[PATH_MAX];
|
||||||
char pid_file[PATH_MAX];
|
|
||||||
|
char pidfile[PATH_MAX];
|
||||||
|
int pidfile_fd;
|
||||||
|
|
||||||
int one_off;
|
int one_off;
|
||||||
|
|
||||||
int pasta_netns_fd;
|
int pasta_netns_fd;
|
||||||
|
@ -269,8 +254,8 @@ struct ctx {
|
||||||
int epollfd;
|
int epollfd;
|
||||||
int fd_tap_listen;
|
int fd_tap_listen;
|
||||||
int fd_tap;
|
int fd_tap;
|
||||||
unsigned char mac[ETH_ALEN];
|
unsigned char our_tap_mac[ETH_ALEN];
|
||||||
unsigned char mac_guest[ETH_ALEN];
|
unsigned char guest_mac[ETH_ALEN];
|
||||||
uint64_t hash_secret[2];
|
uint64_t hash_secret[2];
|
||||||
|
|
||||||
unsigned int ifi4;
|
unsigned int ifi4;
|
||||||
|
@ -284,8 +269,6 @@ struct ctx {
|
||||||
char pasta_ifn[IF_NAMESIZE];
|
char pasta_ifn[IF_NAMESIZE];
|
||||||
unsigned int pasta_ifi;
|
unsigned int pasta_ifi;
|
||||||
int pasta_conf_ns;
|
int pasta_conf_ns;
|
||||||
int no_copy_routes;
|
|
||||||
int no_copy_addrs;
|
|
||||||
|
|
||||||
int no_tcp;
|
int no_tcp;
|
||||||
struct tcp_ctx tcp;
|
struct tcp_ctx tcp;
|
||||||
|
@ -303,7 +286,8 @@ struct ctx {
|
||||||
int no_dhcpv6;
|
int no_dhcpv6;
|
||||||
int no_ndp;
|
int no_ndp;
|
||||||
int no_ra;
|
int no_ra;
|
||||||
int no_map_gw;
|
int host_lo_to_ns_lo;
|
||||||
|
int freebind;
|
||||||
|
|
||||||
int low_wmem;
|
int low_wmem;
|
||||||
int low_rmem;
|
int low_rmem;
|
||||||
|
|
114
pasta.c
114
pasta.c
|
@ -12,8 +12,8 @@
|
||||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||||
*
|
*
|
||||||
* #syscalls:pasta clone waitid exit exit_group rt_sigprocmask
|
* #syscalls:pasta clone waitid exit exit_group rt_sigprocmask
|
||||||
* #syscalls:pasta rt_sigreturn|sigreturn armv6l:sigreturn armv7l:sigreturn
|
* #syscalls:pasta rt_sigreturn|sigreturn
|
||||||
* #syscalls:pasta ppc64:sigreturn s390x:sigreturn
|
* #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn i686:sigreturn
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
|
@ -50,6 +50,8 @@
|
||||||
#include "netlink.h"
|
#include "netlink.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
|
||||||
|
#define HOSTNAME_PREFIX "pasta-"
|
||||||
|
|
||||||
/* PID of child, in case we created a namespace */
|
/* PID of child, in case we created a namespace */
|
||||||
int pasta_child_pid;
|
int pasta_child_pid;
|
||||||
|
|
||||||
|
@ -59,6 +61,7 @@ int pasta_child_pid;
|
||||||
*/
|
*/
|
||||||
void pasta_child_handler(int signal)
|
void pasta_child_handler(int signal)
|
||||||
{
|
{
|
||||||
|
int errno_save = errno;
|
||||||
siginfo_t infop;
|
siginfo_t infop;
|
||||||
|
|
||||||
(void)signal;
|
(void)signal;
|
||||||
|
@ -83,6 +86,8 @@ void pasta_child_handler(int signal)
|
||||||
|
|
||||||
waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);
|
waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);
|
||||||
waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);
|
waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);
|
||||||
|
|
||||||
|
errno = errno_save;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -97,7 +102,9 @@ static int pasta_wait_for_ns(void *arg)
|
||||||
int flags = O_RDONLY | O_CLOEXEC;
|
int flags = O_RDONLY | O_CLOEXEC;
|
||||||
char ns[PATH_MAX];
|
char ns[PATH_MAX];
|
||||||
|
|
||||||
snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);
|
if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid))
|
||||||
|
die_perror("Can't build netns path");
|
||||||
|
|
||||||
do {
|
do {
|
||||||
while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
|
while ((c->pasta_netns_fd = open(ns, flags)) < 0) {
|
||||||
if (errno != ENOENT)
|
if (errno != ENOENT)
|
||||||
|
@ -138,17 +145,15 @@ void pasta_open_ns(struct ctx *c, const char *netns)
|
||||||
int nfd = -1;
|
int nfd = -1;
|
||||||
|
|
||||||
nfd = open(netns, O_RDONLY | O_CLOEXEC);
|
nfd = open(netns, O_RDONLY | O_CLOEXEC);
|
||||||
if (nfd < 0) {
|
if (nfd < 0)
|
||||||
die("Couldn't open network namespace %s: %s",
|
die_perror("Couldn't open network namespace %s", netns);
|
||||||
netns, strerror(errno));
|
|
||||||
}
|
|
||||||
|
|
||||||
c->pasta_netns_fd = nfd;
|
c->pasta_netns_fd = nfd;
|
||||||
|
|
||||||
NS_CALL(ns_check, c);
|
NS_CALL(ns_check, c);
|
||||||
|
|
||||||
if (c->pasta_netns_fd < 0)
|
if (c->pasta_netns_fd < 0)
|
||||||
die("Couldn't switch to pasta namespaces: %s", strerror(errno));
|
die_perror("Couldn't switch to pasta namespaces");
|
||||||
|
|
||||||
if (!c->no_netns_quit) {
|
if (!c->no_netns_quit) {
|
||||||
char buf[PATH_MAX] = { 0 };
|
char buf[PATH_MAX] = { 0 };
|
||||||
|
@ -176,18 +181,28 @@ struct pasta_spawn_cmd_arg {
|
||||||
*
|
*
|
||||||
* Return: this function never returns
|
* Return: this function never returns
|
||||||
*/
|
*/
|
||||||
|
/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
|
||||||
static int pasta_spawn_cmd(void *arg)
|
static int pasta_spawn_cmd(void *arg)
|
||||||
{
|
{
|
||||||
|
char hostname[HOST_NAME_MAX + 1] = HOSTNAME_PREFIX;
|
||||||
const struct pasta_spawn_cmd_arg *a;
|
const struct pasta_spawn_cmd_arg *a;
|
||||||
sigset_t set;
|
sigset_t set;
|
||||||
|
|
||||||
/* We run in a detached PID and mount namespace: mount /proc over */
|
/* We run in a detached PID and mount namespace: mount /proc over */
|
||||||
if (mount("", "/proc", "proc", 0, NULL))
|
if (mount("", "/proc", "proc", 0, NULL))
|
||||||
warn("Couldn't mount /proc: %s", strerror(errno));
|
warn_perror("Couldn't mount /proc");
|
||||||
|
|
||||||
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
|
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
|
||||||
warn("Cannot set ping_group_range, ICMP requests might fail");
|
warn("Cannot set ping_group_range, ICMP requests might fail");
|
||||||
|
|
||||||
|
if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1,
|
||||||
|
HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) ||
|
||||||
|
errno == ENAMETOOLONG) {
|
||||||
|
hostname[HOST_NAME_MAX] = '\0';
|
||||||
|
if (sethostname(hostname, strlen(hostname)))
|
||||||
|
warn("Unable to set pasta-prefixed hostname");
|
||||||
|
}
|
||||||
|
|
||||||
/* Wait for the parent to be ready: see main() */
|
/* Wait for the parent to be ready: see main() */
|
||||||
sigemptyset(&set);
|
sigemptyset(&set);
|
||||||
sigaddset(&set, SIGUSR1);
|
sigaddset(&set, SIGUSR1);
|
||||||
|
@ -196,8 +211,7 @@ static int pasta_spawn_cmd(void *arg)
|
||||||
a = (const struct pasta_spawn_cmd_arg *)arg;
|
a = (const struct pasta_spawn_cmd_arg *)arg;
|
||||||
execvp(a->exe, a->argv);
|
execvp(a->exe, a->argv);
|
||||||
|
|
||||||
perror("execvp");
|
die_perror("Failed to start command or shell");
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -211,12 +225,13 @@ static int pasta_spawn_cmd(void *arg)
|
||||||
void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
||||||
int argc, char *argv[])
|
int argc, char *argv[])
|
||||||
{
|
{
|
||||||
|
char ns_fn_stack[NS_FN_STACK_SIZE]
|
||||||
|
__attribute__ ((aligned(__alignof__(max_align_t))));
|
||||||
struct pasta_spawn_cmd_arg arg = {
|
struct pasta_spawn_cmd_arg arg = {
|
||||||
.exe = argv[0],
|
.exe = argv[0],
|
||||||
.argv = argv,
|
.argv = argv,
|
||||||
};
|
};
|
||||||
char uidmap[BUFSIZ], gidmap[BUFSIZ];
|
char uidmap[BUFSIZ], gidmap[BUFSIZ];
|
||||||
char ns_fn_stack[NS_FN_STACK_SIZE];
|
|
||||||
char *sh_argv[] = { NULL, NULL };
|
char *sh_argv[] = { NULL, NULL };
|
||||||
char sh_arg0[PATH_MAX + 1];
|
char sh_arg0[PATH_MAX + 1];
|
||||||
sigset_t set;
|
sigset_t set;
|
||||||
|
@ -226,8 +241,11 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
||||||
c->quiet = 1;
|
c->quiet = 1;
|
||||||
|
|
||||||
/* Configure user and group mappings */
|
/* Configure user and group mappings */
|
||||||
snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
|
if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid))
|
||||||
snprintf(gidmap, BUFSIZ, "0 %u 1", gid);
|
die_perror("Can't build uidmap");
|
||||||
|
|
||||||
|
if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid))
|
||||||
|
die_perror("Can't build gidmap");
|
||||||
|
|
||||||
if (write_file("/proc/self/uid_map", uidmap) ||
|
if (write_file("/proc/self/uid_map", uidmap) ||
|
||||||
write_file("/proc/self/setgroups", "deny") ||
|
write_file("/proc/self/setgroups", "deny") ||
|
||||||
|
@ -259,14 +277,12 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
|
||||||
CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD,
|
CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD,
|
||||||
(void *)&arg);
|
(void *)&arg);
|
||||||
|
|
||||||
if (pasta_child_pid == -1) {
|
if (pasta_child_pid == -1)
|
||||||
perror("clone");
|
die_perror("Failed to clone process with detached namespaces");
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
NS_CALL(pasta_wait_for_ns, c);
|
NS_CALL(pasta_wait_for_ns, c);
|
||||||
if (c->pasta_netns_fd < 0)
|
if (c->pasta_netns_fd < 0)
|
||||||
die("Failed to join network namespace: %s", strerror(errno));
|
die_perror("Failed to join network namespace");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -277,25 +293,33 @@ void pasta_ns_conf(struct ctx *c)
|
||||||
{
|
{
|
||||||
int rc = 0;
|
int rc = 0;
|
||||||
|
|
||||||
rc = nl_link_up(nl_sock_ns, 1 /* lo */, 0);
|
rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP);
|
||||||
if (rc < 0)
|
if (rc < 0)
|
||||||
die("Couldn't bring up loopback interface in namespace: %s",
|
die("Couldn't bring up loopback interface in namespace: %s",
|
||||||
strerror(-rc));
|
strerror(-rc));
|
||||||
|
|
||||||
/* Get or set MAC in target namespace */
|
/* Get or set MAC in target namespace */
|
||||||
if (MAC_IS_ZERO(c->mac_guest))
|
if (MAC_IS_ZERO(c->guest_mac))
|
||||||
nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest);
|
nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
|
||||||
else
|
else
|
||||||
rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest);
|
rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
|
||||||
if (rc < 0)
|
if (rc < 0)
|
||||||
die("Couldn't set MAC address in namespace: %s",
|
die("Couldn't set MAC address in namespace: %s",
|
||||||
strerror(-rc));
|
strerror(-rc));
|
||||||
|
|
||||||
if (c->pasta_conf_ns) {
|
if (c->pasta_conf_ns) {
|
||||||
nl_link_up(nl_sock_ns, c->pasta_ifi, c->mtu);
|
unsigned int flags = IFF_UP;
|
||||||
|
|
||||||
|
if (c->mtu != -1)
|
||||||
|
nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);
|
||||||
|
|
||||||
|
if (c->ifi6) /* Avoid duplicate address detection on link up */
|
||||||
|
flags |= IFF_NOARP;
|
||||||
|
|
||||||
|
nl_link_set_flags(nl_sock_ns, c->pasta_ifi, flags, flags);
|
||||||
|
|
||||||
if (c->ifi4) {
|
if (c->ifi4) {
|
||||||
if (c->no_copy_addrs) {
|
if (c->ip4.no_copy_addrs) {
|
||||||
rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
|
rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
|
||||||
AF_INET,
|
AF_INET,
|
||||||
&c->ip4.addr,
|
&c->ip4.addr,
|
||||||
|
@ -311,9 +335,10 @@ void pasta_ns_conf(struct ctx *c)
|
||||||
strerror(-rc));
|
strerror(-rc));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c->no_copy_routes) {
|
if (c->ip4.no_copy_routes) {
|
||||||
rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi,
|
rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi,
|
||||||
AF_INET, &c->ip4.gw);
|
AF_INET,
|
||||||
|
&c->ip4.guest_gw);
|
||||||
} else {
|
} else {
|
||||||
rc = nl_route_dup(nl_sock, c->ifi4, nl_sock_ns,
|
rc = nl_route_dup(nl_sock, c->ifi4, nl_sock_ns,
|
||||||
c->pasta_ifi, AF_INET);
|
c->pasta_ifi, AF_INET);
|
||||||
|
@ -326,7 +351,24 @@ void pasta_ns_conf(struct ctx *c)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c->ifi6) {
|
if (c->ifi6) {
|
||||||
if (c->no_copy_addrs) {
|
rc = nl_addr_get_ll(nl_sock_ns, c->pasta_ifi,
|
||||||
|
&c->ip6.addr_ll_seen);
|
||||||
|
if (rc < 0) {
|
||||||
|
warn("Can't get LL address from namespace: %s",
|
||||||
|
strerror(-rc));
|
||||||
|
}
|
||||||
|
|
||||||
|
rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi);
|
||||||
|
if (rc < 0) {
|
||||||
|
warn("Can't set nodad for LL in namespace: %s",
|
||||||
|
strerror(-rc));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* We dodged DAD: re-enable neighbour solicitations */
|
||||||
|
nl_link_set_flags(nl_sock_ns, c->pasta_ifi,
|
||||||
|
0, IFF_NOARP);
|
||||||
|
|
||||||
|
if (c->ip6.no_copy_addrs) {
|
||||||
rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
|
rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
|
||||||
AF_INET6, &c->ip6.addr, 64);
|
AF_INET6, &c->ip6.addr, 64);
|
||||||
} else {
|
} else {
|
||||||
|
@ -340,9 +382,10 @@ void pasta_ns_conf(struct ctx *c)
|
||||||
strerror(-rc));
|
strerror(-rc));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c->no_copy_routes) {
|
if (c->ip6.no_copy_routes) {
|
||||||
rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi,
|
rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi,
|
||||||
AF_INET6, &c->ip6.gw);
|
AF_INET6,
|
||||||
|
&c->ip6.guest_gw);
|
||||||
} else {
|
} else {
|
||||||
rc = nl_route_dup(nl_sock, c->ifi6,
|
rc = nl_route_dup(nl_sock, c->ifi6,
|
||||||
nl_sock_ns, c->pasta_ifi,
|
nl_sock_ns, c->pasta_ifi,
|
||||||
|
@ -356,7 +399,7 @@ void pasta_ns_conf(struct ctx *c)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
proto_update_l2_buf(c->mac_guest, NULL);
|
proto_update_l2_buf(c->guest_mac, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -370,12 +413,12 @@ static int pasta_netns_quit_timer(void)
|
||||||
struct itimerspec it = { { 1, 0 }, { 1, 0 } }; /* one-second interval */
|
struct itimerspec it = { { 1, 0 }, { 1, 0 } }; /* one-second interval */
|
||||||
|
|
||||||
if (fd == -1) {
|
if (fd == -1) {
|
||||||
err("timerfd_create(): %s", strerror(errno));
|
err_perror("Failed to create timerfd for quit timer");
|
||||||
return -errno;
|
return -errno;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (timerfd_settime(fd, 0, &it, NULL) < 0) {
|
if (timerfd_settime(fd, 0, &it, NULL) < 0) {
|
||||||
err("timerfd_settime(): %s", strerror(errno));
|
err_perror("Failed to set interval for quit timer");
|
||||||
close(fd);
|
close(fd);
|
||||||
return -errno;
|
return -errno;
|
||||||
}
|
}
|
||||||
|
@ -389,12 +432,12 @@ static int pasta_netns_quit_timer(void)
|
||||||
*/
|
*/
|
||||||
void pasta_netns_quit_init(const struct ctx *c)
|
void pasta_netns_quit_init(const struct ctx *c)
|
||||||
{
|
{
|
||||||
union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY };
|
|
||||||
struct epoll_event ev = { .events = EPOLLIN };
|
struct epoll_event ev = { .events = EPOLLIN };
|
||||||
int flags = O_NONBLOCK | O_CLOEXEC;
|
int flags = O_NONBLOCK | O_CLOEXEC;
|
||||||
struct statfs s = { 0 };
|
struct statfs s = { 0 };
|
||||||
bool try_inotify = true;
|
bool try_inotify = true;
|
||||||
int fd = -1, dir_fd;
|
int fd = -1, dir_fd;
|
||||||
|
union epoll_ref ref;
|
||||||
|
|
||||||
if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
|
if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
|
||||||
return;
|
return;
|
||||||
|
@ -425,6 +468,7 @@ void pasta_netns_quit_init(const struct ctx *c)
|
||||||
ref.type = EPOLL_TYPE_NSQUIT_TIMER;
|
ref.type = EPOLL_TYPE_NSQUIT_TIMER;
|
||||||
} else {
|
} else {
|
||||||
close(dir_fd);
|
close(dir_fd);
|
||||||
|
ref.type = EPOLL_TYPE_NSQUIT_INOTIFY;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fd > FD_REF_MAX)
|
if (fd > FD_REF_MAX)
|
||||||
|
@ -468,7 +512,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref)
|
||||||
|
|
||||||
n = read(ref.fd, &expirations, sizeof(expirations));
|
n = read(ref.fd, &expirations, sizeof(expirations));
|
||||||
if (n < 0)
|
if (n < 0)
|
||||||
die("Namespace watch timer read() error: %s", strerror(errno));
|
die_perror("Namespace watch timer read() error");
|
||||||
if ((size_t)n < sizeof(expirations))
|
if ((size_t)n < sizeof(expirations))
|
||||||
warn("Namespace watch timer: short read(): %zi", n);
|
warn("Namespace watch timer: short read(): %zi", n);
|
||||||
|
|
||||||
|
|
64
pcap.c
64
pcap.c
|
@ -72,44 +72,43 @@ struct pcap_pkthdr {
|
||||||
* @iov: IO vector containing frame (with L2 headers and tap headers)
|
* @iov: IO vector containing frame (with L2 headers and tap headers)
|
||||||
* @iovcnt: Number of buffers (@iov entries) in frame
|
* @iovcnt: Number of buffers (@iov entries) in frame
|
||||||
* @offset: Byte offset of the L2 headers within @iov
|
* @offset: Byte offset of the L2 headers within @iov
|
||||||
* @tv: Timestamp
|
* @now: Timestamp
|
||||||
*
|
*
|
||||||
* No return value: an error writing to the file is reported with a debug message
|
* No return value: an error writing to the file is reported with a debug message
|
||||||
*/
|
*/
|
||||||
static void pcap_frame(const struct iovec *iov, size_t iovcnt,
|
static void pcap_frame(const struct iovec *iov, size_t iovcnt,
|
||||||
size_t offset, const struct timeval *tv)
|
size_t offset, const struct timespec *now)
|
||||||
{
|
{
|
||||||
size_t len = iov_size(iov, iovcnt) - offset;
|
size_t l2len = iov_size(iov, iovcnt) - offset;
|
||||||
struct pcap_pkthdr h = {
|
struct pcap_pkthdr h = {
|
||||||
.tv_sec = tv->tv_sec,
|
.tv_sec = now->tv_sec,
|
||||||
.tv_usec = tv->tv_usec,
|
.tv_usec = DIV_ROUND_CLOSEST(now->tv_nsec, 1000),
|
||||||
.caplen = len,
|
.caplen = l2len,
|
||||||
.len = len
|
.len = l2len
|
||||||
};
|
};
|
||||||
struct iovec hiov = { &h, sizeof(h) };
|
|
||||||
|
|
||||||
if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 ||
|
if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 ||
|
||||||
write_remainder(pcap_fd, iov, iovcnt, offset) < 0) {
|
write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
|
||||||
debug("Cannot log packet, length %zu: %s",
|
debug_perror("Cannot log packet, length %zu", l2len);
|
||||||
len, strerror(errno));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* pcap() - Capture a single frame to pcap file
|
* pcap() - Capture a single frame to pcap file
|
||||||
* @pkt: Pointer to data buffer, including L2 headers
|
* @pkt: Pointer to data buffer, including L2 headers
|
||||||
* @len: L2 packet length
|
* @l2len: L2 frame length
|
||||||
*/
|
*/
|
||||||
void pcap(const char *pkt, size_t len)
|
void pcap(const char *pkt, size_t l2len)
|
||||||
{
|
{
|
||||||
struct iovec iov = { (char *)pkt, len };
|
struct iovec iov = { (char *)pkt, l2len };
|
||||||
struct timeval tv;
|
struct timespec now = { 0 };
|
||||||
|
|
||||||
if (pcap_fd == -1)
|
if (pcap_fd == -1)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
gettimeofday(&tv, NULL);
|
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||||
pcap_frame(&iov, 1, 0, &tv);
|
err_perror("Failed to get CLOCK_REALTIME time");
|
||||||
|
|
||||||
|
pcap_frame(&iov, 1, 0, &now);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -122,16 +121,17 @@ void pcap(const char *pkt, size_t len)
|
||||||
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
||||||
size_t offset)
|
size_t offset)
|
||||||
{
|
{
|
||||||
struct timeval tv;
|
struct timespec now = { 0 };
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
|
|
||||||
if (pcap_fd == -1)
|
if (pcap_fd == -1)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
gettimeofday(&tv, NULL);
|
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||||
|
err_perror("Failed to get CLOCK_REALTIME time");
|
||||||
|
|
||||||
for (i = 0; i < n; i++)
|
for (i = 0; i < n; i++)
|
||||||
pcap_frame(iov + i * frame_parts, frame_parts, offset, &tv);
|
pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -141,17 +141,20 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
||||||
* @iov: Pointer to the array of struct iovec describing the I/O vector
|
* @iov: Pointer to the array of struct iovec describing the I/O vector
|
||||||
* containing packet data to write, including L2 header
|
* containing packet data to write, including L2 header
|
||||||
* @iovcnt: Number of buffers (@iov entries)
|
* @iovcnt: Number of buffers (@iov entries)
|
||||||
|
* @offset: Offset of the L2 frame within the full data length
|
||||||
*/
|
*/
|
||||||
/* cppcheck-suppress unusedFunction */
|
/* cppcheck-suppress unusedFunction */
|
||||||
void pcap_iov(const struct iovec *iov, size_t iovcnt)
|
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
|
||||||
{
|
{
|
||||||
struct timeval tv;
|
struct timespec now = { 0 };
|
||||||
|
|
||||||
if (pcap_fd == -1)
|
if (pcap_fd == -1)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
gettimeofday(&tv, NULL);
|
if (clock_gettime(CLOCK_REALTIME, &now))
|
||||||
pcap_frame(iov, iovcnt, 0, &tv);
|
err_perror("Failed to get CLOCK_REALTIME time");
|
||||||
|
|
||||||
|
pcap_frame(iov, iovcnt, offset, &now);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -160,23 +163,20 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt)
|
||||||
*/
|
*/
|
||||||
void pcap_init(struct ctx *c)
|
void pcap_init(struct ctx *c)
|
||||||
{
|
{
|
||||||
int flags = O_WRONLY | O_CREAT | O_TRUNC;
|
|
||||||
|
|
||||||
if (pcap_fd != -1)
|
if (pcap_fd != -1)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!*c->pcap)
|
if (!*c->pcap)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
flags |= c->foreground ? O_CLOEXEC : 0;
|
pcap_fd = output_file_open(c->pcap, O_WRONLY);
|
||||||
pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR);
|
|
||||||
if (pcap_fd == -1) {
|
if (pcap_fd == -1) {
|
||||||
perror("open");
|
err_perror("Couldn't open pcap file %s", c->pcap);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
info("Saving packet capture to %s", c->pcap);
|
info("Saving packet capture to %s", c->pcap);
|
||||||
|
|
||||||
if (write(pcap_fd, &pcap_hdr, sizeof(pcap_hdr)) < 0)
|
if (write(pcap_fd, &pcap_hdr, sizeof(pcap_hdr)) < 0)
|
||||||
warn("Cannot write PCAP header: %s", strerror(errno));
|
warn_perror("Cannot write PCAP header");
|
||||||
}
|
}
|
||||||
|
|
4
pcap.h
4
pcap.h
|
@ -6,10 +6,10 @@
|
||||||
#ifndef PCAP_H
|
#ifndef PCAP_H
|
||||||
#define PCAP_H
|
#define PCAP_H
|
||||||
|
|
||||||
void pcap(const char *pkt, size_t len);
|
void pcap(const char *pkt, size_t l2len);
|
||||||
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
|
||||||
size_t offset);
|
size_t offset);
|
||||||
void pcap_iov(const struct iovec *iov, size_t iovcnt);
|
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset);
|
||||||
void pcap_init(struct ctx *c);
|
void pcap_init(struct ctx *c);
|
||||||
|
|
||||||
#endif /* PCAP_H */
|
#endif /* PCAP_H */
|
||||||
|
|
82
pif.c
82
pif.c
|
@ -7,9 +7,14 @@
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <netinet/in.h>
|
||||||
|
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "pif.h"
|
#include "pif.h"
|
||||||
|
#include "siphash.h"
|
||||||
|
#include "ip.h"
|
||||||
|
#include "inany.h"
|
||||||
|
#include "passt.h"
|
||||||
|
|
||||||
const char *pif_type_str[] = {
|
const char *pif_type_str[] = {
|
||||||
[PIF_NONE] = "<none>",
|
[PIF_NONE] = "<none>",
|
||||||
|
@ -19,3 +24,80 @@ const char *pif_type_str[] = {
|
||||||
};
|
};
|
||||||
static_assert(ARRAY_SIZE(pif_type_str) == PIF_NUM_TYPES,
|
static_assert(ARRAY_SIZE(pif_type_str) == PIF_NUM_TYPES,
|
||||||
"pif_type_str[] doesn't match enum pif_type");
|
"pif_type_str[] doesn't match enum pif_type");
|
||||||
|
|
||||||
|
|
||||||
|
/** pif_sockaddr() - Construct a socket address suitable for an interface
|
||||||
|
* @c: Execution context
|
||||||
|
* @sa: Pointer to sockaddr to fill in
|
||||||
|
* @sl: Updated to relevant length of initialised @sa
|
||||||
|
* @pif: Interface to create the socket address
|
||||||
|
* @addr: IPv[46] address
|
||||||
|
* @port: Port (host byte order)
|
||||||
|
*/
|
||||||
|
void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
|
||||||
|
uint8_t pif, const union inany_addr *addr, in_port_t port)
|
||||||
|
{
|
||||||
|
const struct in_addr *v4 = inany_v4(addr);
|
||||||
|
|
||||||
|
ASSERT(pif_is_socket(pif));
|
||||||
|
|
||||||
|
if (v4) {
|
||||||
|
sa->sa_family = AF_INET;
|
||||||
|
sa->sa4.sin_addr = *v4;
|
||||||
|
sa->sa4.sin_port = htons(port);
|
||||||
|
memset(&sa->sa4.sin_zero, 0, sizeof(sa->sa4.sin_zero));
|
||||||
|
*sl = sizeof(sa->sa4);
|
||||||
|
} else {
|
||||||
|
sa->sa_family = AF_INET6;
|
||||||
|
sa->sa6.sin6_addr = addr->a6;
|
||||||
|
sa->sa6.sin6_port = htons(port);
|
||||||
|
if (pif == PIF_HOST && IN6_IS_ADDR_LINKLOCAL(&addr->a6))
|
||||||
|
sa->sa6.sin6_scope_id = c->ifi6;
|
||||||
|
else
|
||||||
|
sa->sa6.sin6_scope_id = 0;
|
||||||
|
sa->sa6.sin6_flowinfo = 0;
|
||||||
|
*sl = sizeof(sa->sa6);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** pif_sock_l4() - Open a socket bound to an address on a specified interface
|
||||||
|
* @c: Execution context
|
||||||
|
* @type: Socket epoll type
|
||||||
|
* @pif: Interface for this socket
|
||||||
|
* @addr: Address to bind to, or NULL for dual-stack any
|
||||||
|
* @ifname: Interface for binding, NULL for any
|
||||||
|
* @port: Port number to bind to (host byte order)
|
||||||
|
* @data: epoll reference portion for protocol handlers
|
||||||
|
*
|
||||||
|
* NOTE: For namespace pifs, this must be called having already entered the
|
||||||
|
* relevant namespace.
|
||||||
|
*
|
||||||
|
* Return: newly created socket, negative error code on failure
|
||||||
|
*/
|
||||||
|
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||||
|
const union inany_addr *addr, const char *ifname,
|
||||||
|
in_port_t port, uint32_t data)
|
||||||
|
{
|
||||||
|
union sockaddr_inany sa = {
|
||||||
|
.sa6.sin6_family = AF_INET6,
|
||||||
|
.sa6.sin6_addr = in6addr_any,
|
||||||
|
.sa6.sin6_port = htons(port),
|
||||||
|
};
|
||||||
|
socklen_t sl;
|
||||||
|
|
||||||
|
ASSERT(pif_is_socket(pif));
|
||||||
|
|
||||||
|
if (pif == PIF_SPLICE) {
|
||||||
|
/* Sanity checks */
|
||||||
|
ASSERT(!ifname);
|
||||||
|
ASSERT(addr && inany_is_loopback(addr));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!addr)
|
||||||
|
return sock_l4_sa(c, type, &sa, sizeof(sa.sa6),
|
||||||
|
ifname, false, data);
|
||||||
|
|
||||||
|
pif_sockaddr(c, &sa, &sl, pif, addr, port);
|
||||||
|
return sock_l4_sa(c, type, &sa, sl,
|
||||||
|
ifname, sa.sa_family == AF_INET6, data);
|
||||||
|
}
|
||||||
|
|
21
pif.h
21
pif.h
|
@ -7,6 +7,9 @@
|
||||||
#ifndef PIF_H
|
#ifndef PIF_H
|
||||||
#define PIF_H
|
#define PIF_H
|
||||||
|
|
||||||
|
union inany_addr;
|
||||||
|
union sockaddr_inany;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* enum pif_type - Type of passt/pasta interface ("pif")
|
* enum pif_type - Type of passt/pasta interface ("pif")
|
||||||
*
|
*
|
||||||
|
@ -38,10 +41,26 @@ static inline const char *pif_type(enum pif_type pt)
|
||||||
return "?";
|
return "?";
|
||||||
}
|
}
|
||||||
|
|
||||||
/* cppcheck-suppress unusedFunction */
|
|
||||||
static inline const char *pif_name(uint8_t pif)
|
static inline const char *pif_name(uint8_t pif)
|
||||||
{
|
{
|
||||||
return pif_type(pif);
|
return pif_type(pif);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* pif_is_socket() - Is interface implemented via L4 sockets?
|
||||||
|
* @pif: pif to check
|
||||||
|
*
|
||||||
|
* Return: true if @pif is an L4 socket based interface, otherwise false
|
||||||
|
*/
|
||||||
|
static inline bool pif_is_socket(uint8_t pif)
|
||||||
|
{
|
||||||
|
return pif == PIF_HOST || pif == PIF_SPLICE;
|
||||||
|
}
|
||||||
|
|
||||||
|
void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
|
||||||
|
uint8_t pif, const union inany_addr *addr, in_port_t port);
|
||||||
|
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
||||||
|
const union inany_addr *addr, const char *ifname,
|
||||||
|
in_port_t port, uint32_t data);
|
||||||
|
|
||||||
#endif /* PIF_H */
|
#endif /* PIF_H */
|
||||||
|
|
4
qrap.1
4
qrap.1
|
@ -66,8 +66,8 @@ issues to Stefano Brivio <sbrivio@redhat.com>.
|
||||||
Copyright (c) 2020-2021 Red Hat GmbH.
|
Copyright (c) 2020-2021 Red Hat GmbH.
|
||||||
|
|
||||||
\fBqrap\fR is free software: you can redistribute it and/or modify it under the
|
\fBqrap\fR is free software: you can redistribute it and/or modify it under the
|
||||||
terms of the GNU Affero General Public License as published by the Free Software
|
terms of the GNU General Public License as published by the Free Software
|
||||||
Foundation, either version 3 of the License, or (at your option) any later
|
Foundation, either version 2 of the License, or (at your option) any later
|
||||||
version.
|
version.
|
||||||
|
|
||||||
.SH SEE ALSO
|
.SH SEE ALSO
|
||||||
|
|
23
seccomp.sh
23
seccomp.sh
|
@ -20,6 +20,15 @@ OUT="$(mktemp)"
|
||||||
[ -z "${ARCH}" ] && ARCH="$(uname -m)"
|
[ -z "${ARCH}" ] && ARCH="$(uname -m)"
|
||||||
[ -z "${CC}" ] && CC="cc"
|
[ -z "${CC}" ] && CC="cc"
|
||||||
|
|
||||||
|
AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z] \
|
||||||
|
| sed 's/^ARM.*/ARM/' \
|
||||||
|
| sed 's/I[456]86/I386/' \
|
||||||
|
| sed 's/PPC64/PPC/' \
|
||||||
|
| sed 's/PPCLE/PPC64LE/' \
|
||||||
|
| sed 's/MIPS64EL/MIPSEL64/' \
|
||||||
|
| sed 's/HPPA/PARISC/' \
|
||||||
|
| sed 's/SH4/SH/')"
|
||||||
|
|
||||||
HEADER="/* This file was automatically generated by $(basename ${0}) */
|
HEADER="/* This file was automatically generated by $(basename ${0}) */
|
||||||
|
|
||||||
#ifndef AUDIT_ARCH_PPC64LE
|
#ifndef AUDIT_ARCH_PPC64LE
|
||||||
|
@ -29,11 +38,11 @@ HEADER="/* This file was automatically generated by $(basename ${0}) */
|
||||||
# Prefix for each profile: check that 'arch' in seccomp_data is matching
|
# Prefix for each profile: check that 'arch' in seccomp_data is matching
|
||||||
PRE='
|
PRE='
|
||||||
struct sock_filter filter_@PROFILE@[] = {
|
struct sock_filter filter_@PROFILE@[] = {
|
||||||
/* cppcheck-suppress badBitmaskCheck */
|
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
|
||||||
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
||||||
(offsetof(struct seccomp_data, arch))),
|
(offsetof(struct seccomp_data, arch))),
|
||||||
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
|
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@),
|
||||||
/* cppcheck-suppress badBitmaskCheck */
|
/* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
|
||||||
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
||||||
(offsetof(struct seccomp_data, nr))),
|
(offsetof(struct seccomp_data, nr))),
|
||||||
|
|
||||||
|
@ -233,7 +242,8 @@ gen_profile() {
|
||||||
sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
|
sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}"
|
||||||
done
|
done
|
||||||
|
|
||||||
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))"
|
finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \
|
||||||
|
"AUDIT_ARCH:${AUDIT_ARCH}"
|
||||||
}
|
}
|
||||||
|
|
||||||
printf '%s\n' "${HEADER}" > "${OUT}"
|
printf '%s\n' "${HEADER}" > "${OUT}"
|
||||||
|
@ -242,7 +252,10 @@ for __p in ${__profiles}; do
|
||||||
__calls="$(sed -n 's/[\t ]*\*[\t ]*#syscalls\(:'"${__p}"'\|\)[\t ]\{1,\}\(.*\)/\2/p' ${IN})"
|
__calls="$(sed -n 's/[\t ]*\*[\t ]*#syscalls\(:'"${__p}"'\|\)[\t ]\{1,\}\(.*\)/\2/p' ${IN})"
|
||||||
__calls="${__calls} ${EXTRA_SYSCALLS:-}"
|
__calls="${__calls} ${EXTRA_SYSCALLS:-}"
|
||||||
__calls="$(filter ${__calls})"
|
__calls="$(filter ${__calls})"
|
||||||
echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t
|
|
||||||
|
cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
|
||||||
|
case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
|
||||||
|
echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}
|
||||||
|
|
||||||
# Pad here to keep gen_profile() "simple"
|
# Pad here to keep gen_profile() "simple"
|
||||||
__count=0
|
__count=0
|
||||||
|
|
|
@ -115,10 +115,4 @@ static inline uint64_t siphash_final(struct siphash_state *state,
|
||||||
return state->v[0] ^ state->v[1] ^ state->v[2] ^ state->v[3];
|
return state->v[0] ^ state->v[1] ^ state->v[2] ^ state->v[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t siphash_8b(const uint8_t *in, const uint64_t *k);
|
|
||||||
uint64_t siphash_12b(const uint8_t *in, const uint64_t *k);
|
|
||||||
uint64_t siphash_20b(const uint8_t *in, const uint64_t *k);
|
|
||||||
uint64_t siphash_32b(const uint8_t *in, const uint64_t *k);
|
|
||||||
uint64_t siphash_36b(const uint8_t *in, const uint64_t *k);
|
|
||||||
|
|
||||||
#endif /* SIPHASH_H */
|
#endif /* SIPHASH_H */
|
||||||
|
|
76
tap.h
76
tap.h
|
@ -6,74 +6,60 @@
|
||||||
#ifndef TAP_H
|
#ifndef TAP_H
|
||||||
#define TAP_H
|
#define TAP_H
|
||||||
|
|
||||||
|
#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct tap_hdr - L2 and tap specific headers
|
* struct tap_hdr - tap backend specific headers
|
||||||
* @vnet_len: Frame length (for qemu socket transport)
|
* @vnet_len: Frame length (for qemu socket transport)
|
||||||
* @eh: Ethernet header
|
|
||||||
*/
|
*/
|
||||||
struct tap_hdr {
|
struct tap_hdr {
|
||||||
uint32_t vnet_len;
|
uint32_t vnet_len;
|
||||||
struct ethhdr eh;
|
|
||||||
} __attribute__((packed));
|
} __attribute__((packed));
|
||||||
|
|
||||||
#define TAP_HDR_INIT(proto) { .eh.h_proto = htons_constant(proto) }
|
/**
|
||||||
|
* tap_hdr_iov() - struct iovec for a tap header
|
||||||
static inline size_t tap_hdr_len_(const struct ctx *c)
|
* @c: Execution context
|
||||||
|
* @thdr: Pointer to tap specific header buffer
|
||||||
|
*
|
||||||
|
* Return: A struct iovec covering the correct portion of @thdr to use as the
|
||||||
|
* tap specific header in the current configuration.
|
||||||
|
*/
|
||||||
|
static inline struct iovec tap_hdr_iov(const struct ctx *c,
|
||||||
|
struct tap_hdr *thdr)
|
||||||
{
|
{
|
||||||
if (c->mode == MODE_PASST)
|
return (struct iovec){
|
||||||
return sizeof(struct tap_hdr);
|
.iov_base = thdr,
|
||||||
else
|
.iov_len = c->mode == MODE_PASST ? sizeof(*thdr) : 0,
|
||||||
return sizeof(struct ethhdr);
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* tap_iov_base() - Find start of tap frame
|
* tap_hdr_update() - Update the tap specific header for a frame
|
||||||
* @c: Execution context
|
* @thdr: Tap specific header buffer to update
|
||||||
* @taph: Pointer to L2 header buffer
|
* @l2len: Frame length (including L2 headers)
|
||||||
*
|
|
||||||
* Returns: pointer to the start of tap frame - suitable for an
|
|
||||||
* iov_base to be passed to tap_send_frames())
|
|
||||||
*/
|
*/
|
||||||
static inline void *tap_iov_base(const struct ctx *c, struct tap_hdr *taph)
|
static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
|
||||||
{
|
{
|
||||||
return (char *)(taph + 1) - tap_hdr_len_(c);
|
thdr->vnet_len = htonl(l2len);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* tap_iov_len() - Finalize tap frame and return total length
|
|
||||||
* @c: Execution context
|
|
||||||
* @taph: Tap header to finalize
|
|
||||||
* @plen: L2 payload length (excludes L2 and tap specific headers)
|
|
||||||
*
|
|
||||||
* Returns: length of the tap frame including L2 and tap specific
|
|
||||||
* headers - suitable for an iov_len to be passed to
|
|
||||||
* tap_send_frames()
|
|
||||||
*/
|
|
||||||
static inline size_t tap_iov_len(const struct ctx *c, struct tap_hdr *taph,
|
|
||||||
size_t plen)
|
|
||||||
{
|
|
||||||
if (c->mode == MODE_PASST)
|
|
||||||
taph->vnet_len = htonl(plen + sizeof(taph->eh));
|
|
||||||
return plen + tap_hdr_len_(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct in_addr tap_ip4_daddr(const struct ctx *c);
|
|
||||||
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
||||||
struct in_addr dst, in_port_t dport,
|
struct in_addr dst, in_port_t dport,
|
||||||
const void *in, size_t len);
|
const void *in, size_t dlen);
|
||||||
void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
|
void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
|
||||||
const void *in, size_t len);
|
const void *in, size_t l4len);
|
||||||
const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
|
const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
|
||||||
const struct in6_addr *src);
|
const struct in6_addr *src);
|
||||||
void tap_udp6_send(const struct ctx *c,
|
void tap_udp6_send(const struct ctx *c,
|
||||||
const struct in6_addr *src, in_port_t sport,
|
const struct in6_addr *src, in_port_t sport,
|
||||||
const struct in6_addr *dst, in_port_t dport,
|
const struct in6_addr *dst, in_port_t dport,
|
||||||
uint32_t flow, const void *in, size_t len);
|
uint32_t flow, void *in, size_t dlen);
|
||||||
void tap_icmp6_send(const struct ctx *c,
|
void tap_icmp6_send(const struct ctx *c,
|
||||||
const struct in6_addr *src, const struct in6_addr *dst,
|
const struct in6_addr *src, const struct in6_addr *dst,
|
||||||
const void *in, size_t len);
|
const void *in, size_t l4len);
|
||||||
int tap_send(const struct ctx *c, const void *data, size_t len);
|
void tap_send_single(const struct ctx *c, const void *data, size_t l2len);
|
||||||
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n);
|
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
|
||||||
|
size_t bufs_per_frame, size_t nframes);
|
||||||
void eth_update_mac(struct ethhdr *eh,
|
void eth_update_mac(struct ethhdr *eh,
|
||||||
const unsigned char *eth_d, const unsigned char *eth_s);
|
const unsigned char *eth_d, const unsigned char *eth_s);
|
||||||
void tap_listen_handler(struct ctx *c, uint32_t events);
|
void tap_listen_handler(struct ctx *c, uint32_t events);
|
||||||
|
@ -81,6 +67,10 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
|
||||||
const struct timespec *now);
|
const struct timespec *now);
|
||||||
void tap_handler_passt(struct ctx *c, uint32_t events,
|
void tap_handler_passt(struct ctx *c, uint32_t events,
|
||||||
const struct timespec *now);
|
const struct timespec *now);
|
||||||
|
int tap_sock_unix_open(char *sock_path);
|
||||||
void tap_sock_init(struct ctx *c);
|
void tap_sock_init(struct ctx *c);
|
||||||
|
void tap_flush_pools(void);
|
||||||
|
void tap_handler(struct ctx *c, const struct timespec *now);
|
||||||
|
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
|
||||||
|
|
||||||
#endif /* TAP_H */
|
#endif /* TAP_H */
|
||||||
|
|
18
tcp.h
18
tcp.h
|
@ -10,20 +10,24 @@
|
||||||
|
|
||||||
struct ctx;
|
struct ctx;
|
||||||
|
|
||||||
void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
|
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref);
|
||||||
void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
|
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
|
||||||
const struct timespec *now);
|
const struct timespec *now);
|
||||||
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
|
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||||
int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
|
uint32_t events);
|
||||||
|
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||||
const void *saddr, const void *daddr,
|
const void *saddr, const void *daddr,
|
||||||
const struct pool *p, int idx, const struct timespec *now);
|
const struct pool *p, int idx, const struct timespec *now);
|
||||||
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
|
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
|
||||||
const char *ifname, in_port_t port);
|
const char *ifname, in_port_t port);
|
||||||
int tcp_init(struct ctx *c);
|
int tcp_init(struct ctx *c);
|
||||||
void tcp_timer(struct ctx *c, const struct timespec *now);
|
void tcp_timer(struct ctx *c, const struct timespec *now);
|
||||||
void tcp_defer_handler(struct ctx *c);
|
void tcp_defer_handler(struct ctx *c);
|
||||||
|
|
||||||
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
|
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
|
||||||
|
int tcp_set_peek_offset(int s, int offset);
|
||||||
|
|
||||||
|
extern bool peek_offset_cap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* union tcp_epoll_ref - epoll reference portion for TCP connections
|
* union tcp_epoll_ref - epoll reference portion for TCP connections
|
||||||
|
@ -55,16 +59,12 @@ union tcp_listen_epoll_ref {
|
||||||
* @fwd_in: Port forwarding configuration for inbound packets
|
* @fwd_in: Port forwarding configuration for inbound packets
|
||||||
* @fwd_out: Port forwarding configuration for outbound packets
|
* @fwd_out: Port forwarding configuration for outbound packets
|
||||||
* @timer_run: Timestamp of most recent timer run
|
* @timer_run: Timestamp of most recent timer run
|
||||||
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
|
|
||||||
* @pipe_size: Size of pipes for spliced connections
|
* @pipe_size: Size of pipes for spliced connections
|
||||||
*/
|
*/
|
||||||
struct tcp_ctx {
|
struct tcp_ctx {
|
||||||
struct fwd_ports fwd_in;
|
struct fwd_ports fwd_in;
|
||||||
struct fwd_ports fwd_out;
|
struct fwd_ports fwd_out;
|
||||||
struct timespec timer_run;
|
struct timespec timer_run;
|
||||||
#ifdef HAS_SND_WND
|
|
||||||
int kernel_snd_wnd;
|
|
||||||
#endif
|
|
||||||
size_t pipe_size;
|
size_t pipe_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
381
tcp_buf.c
Normal file
381
tcp_buf.c
Normal file
|
@ -0,0 +1,381 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
/* PASST - Plug A Simple Socket Transport
|
||||||
|
* for qemu/UNIX domain socket mode
|
||||||
|
*
|
||||||
|
* PASTA - Pack A Subtle Tap Abstraction
|
||||||
|
* for network namespace/tap device mode
|
||||||
|
*
|
||||||
|
* tcp_buf.c - TCP L2 buffer management functions
|
||||||
|
*
|
||||||
|
* Copyright Red Hat
|
||||||
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <errno.h>
|
||||||
|
|
||||||
|
#include <netinet/ip.h>
|
||||||
|
|
||||||
|
#include <netinet/tcp.h>
|
||||||
|
|
||||||
|
#include "util.h"
|
||||||
|
#include "ip.h"
|
||||||
|
#include "iov.h"
|
||||||
|
#include "passt.h"
|
||||||
|
#include "tap.h"
|
||||||
|
#include "siphash.h"
|
||||||
|
#include "inany.h"
|
||||||
|
#include "tcp_conn.h"
|
||||||
|
#include "tcp_internal.h"
|
||||||
|
#include "tcp_buf.h"
|
||||||
|
|
||||||
|
#define TCP_FRAMES_MEM 128
|
||||||
|
#define TCP_FRAMES \
|
||||||
|
(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
|
||||||
|
|
||||||
|
/* Static buffers */
|
||||||
|
|
||||||
|
/* Ethernet header for IPv4 and IPv6 frames */
|
||||||
|
static struct ethhdr tcp4_eth_src;
|
||||||
|
static struct ethhdr tcp6_eth_src;
|
||||||
|
|
||||||
|
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
|
||||||
|
|
||||||
|
/* IP headers for IPv4 and IPv6 */
|
||||||
|
struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
|
||||||
|
struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
|
||||||
|
|
||||||
|
/* TCP segments with payload for IPv4 and IPv6 frames */
|
||||||
|
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM];
|
||||||
|
|
||||||
|
static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
|
||||||
|
static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");
|
||||||
|
|
||||||
|
/* References tracking the owner connection of frames in the tap outqueue */
|
||||||
|
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
|
||||||
|
static unsigned int tcp_payload_used;
|
||||||
|
|
||||||
|
/* recvmsg()/sendmsg() data for tap */
|
||||||
|
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
|
||||||
|
|
||||||
|
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
||||||
|
|
||||||
|
/**
|
||||||
|
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
|
||||||
|
* @eth_d: Ethernet destination address, NULL if unchanged
|
||||||
|
* @eth_s: Ethernet source address, NULL if unchanged
|
||||||
|
*/
|
||||||
|
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
|
||||||
|
{
|
||||||
|
eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
|
||||||
|
eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 and IPv6 sockets
|
||||||
|
* @c: Execution context
|
||||||
|
*/
|
||||||
|
void tcp_sock_iov_init(const struct ctx *c)
|
||||||
|
{
|
||||||
|
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
|
||||||
|
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
|
||||||
|
int i;
|
||||||
|
|
||||||
|
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
|
||||||
|
tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
|
||||||
|
|
||||||
|
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
|
||||||
|
tcp6_payload_ip[i] = ip6;
|
||||||
|
tcp4_payload_ip[i] = iph;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||||
|
struct iovec *iov = tcp_l2_iov[i];
|
||||||
|
|
||||||
|
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
|
||||||
|
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
|
||||||
|
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
|
||||||
|
* @c: Execution context
|
||||||
|
* @conns: Array of connection pointers corresponding to queued frames
|
||||||
|
* @frames: Two-dimensional array containing queued frames with sub-iovs
|
||||||
|
* @num_frames: Number of entries in the two arrays to be compared
|
||||||
|
*/
|
||||||
|
static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
|
||||||
|
struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < num_frames; i++) {
|
||||||
|
const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
|
||||||
|
struct tcp_tap_conn *conn = conns[i];
|
||||||
|
uint32_t seq = ntohl(th->seq);
|
||||||
|
uint32_t peek_offset;
|
||||||
|
|
||||||
|
if (SEQ_LE(conn->seq_to_tap, seq))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
conn->seq_to_tap = seq;
|
||||||
|
peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||||
|
if (tcp_set_peek_offset(conn->sock, peek_offset))
|
||||||
|
tcp_rst(c, conn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* tcp_payload_flush() - Send out buffers for segments with data or flags
|
||||||
|
* @c: Execution context
|
||||||
|
*/
|
||||||
|
void tcp_payload_flush(const struct ctx *c)
|
||||||
|
{
|
||||||
|
size_t m;
|
||||||
|
|
||||||
|
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
|
||||||
|
tcp_payload_used);
|
||||||
|
if (m != tcp_payload_used) {
|
||||||
|
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
|
||||||
|
tcp_payload_used - m);
|
||||||
|
}
|
||||||
|
tcp_payload_used = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* tcp_buf_send_flag() - Send segment with flags to tap (no payload)
|
||||||
|
* @c: Execution context
|
||||||
|
* @conn: Connection pointer
|
||||||
|
* @flags: TCP flags: if not set, send segment only if ACK is due
|
||||||
|
*
|
||||||
|
* Return: negative error code on connection reset, 0 otherwise
|
||||||
|
*/
|
||||||
|
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||||
|
{
|
||||||
|
struct tcp_payload_t *payload;
|
||||||
|
struct iovec *iov;
|
||||||
|
size_t optlen;
|
||||||
|
size_t l4len;
|
||||||
|
uint32_t seq;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
iov = tcp_l2_iov[tcp_payload_used];
|
||||||
|
if (CONN_V4(conn)) {
|
||||||
|
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
||||||
|
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||||
|
} else {
|
||||||
|
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
||||||
|
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||||
|
}
|
||||||
|
|
||||||
|
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
||||||
|
seq = conn->seq_to_tap;
|
||||||
|
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
|
||||||
|
(struct tcp_syn_opts *)&payload->data, &optlen);
|
||||||
|
if (ret <= 0)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
tcp_payload_used++;
|
||||||
|
l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
|
||||||
|
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||||
|
if (flags & DUP_ACK) {
|
||||||
|
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
|
||||||
|
|
||||||
|
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
|
||||||
|
iov[TCP_IOV_TAP].iov_len);
|
||||||
|
dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
|
||||||
|
dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
|
||||||
|
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
|
||||||
|
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
|
||||||
|
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tcp_payload_used > TCP_FRAMES_MEM - 2)
|
||||||
|
tcp_payload_flush(c);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
|
||||||
|
* @c: Execution context
|
||||||
|
* @conn: Connection pointer
|
||||||
|
* @dlen: TCP payload length
|
||||||
|
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
|
||||||
|
* @seq: Sequence number to be sent
|
||||||
|
*/
|
||||||
|
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
|
ssize_t dlen, int no_csum, uint32_t seq)
|
||||||
|
{
|
||||||
|
struct tcp_payload_t *payload;
|
||||||
|
const uint16_t *check = NULL;
|
||||||
|
struct iovec *iov;
|
||||||
|
size_t l4len;
|
||||||
|
|
||||||
|
conn->seq_to_tap = seq + dlen;
|
||||||
|
tcp_frame_conns[tcp_payload_used] = conn;
|
||||||
|
iov = tcp_l2_iov[tcp_payload_used];
|
||||||
|
if (CONN_V4(conn)) {
|
||||||
|
if (no_csum) {
|
||||||
|
struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
|
||||||
|
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
|
||||||
|
|
||||||
|
check = &iph->check;
|
||||||
|
}
|
||||||
|
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
||||||
|
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
||||||
|
} else if (CONN_V6(conn)) {
|
||||||
|
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
||||||
|
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
||||||
|
}
|
||||||
|
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
||||||
|
payload->th.th_off = sizeof(struct tcphdr) / 4;
|
||||||
|
payload->th.th_x2 = 0;
|
||||||
|
payload->th.th_flags = 0;
|
||||||
|
payload->th.ack = 1;
|
||||||
|
l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
|
||||||
|
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||||
|
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
|
||||||
|
tcp_payload_flush(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
|
||||||
|
* @c: Execution context
|
||||||
|
* @conn: Connection pointer
|
||||||
|
*
|
||||||
|
* Return: negative on connection reset, 0 otherwise
|
||||||
|
*
|
||||||
|
* #syscalls recvmsg
|
||||||
|
*/
|
||||||
|
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||||
|
{
|
||||||
|
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
|
||||||
|
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
|
||||||
|
int len, dlen, i, s = conn->sock;
|
||||||
|
struct msghdr mh_sock = { 0 };
|
||||||
|
uint16_t mss = MSS_GET(conn);
|
||||||
|
uint32_t already_sent, seq;
|
||||||
|
struct iovec *iov;
|
||||||
|
|
||||||
|
/* How much have we read/sent since last received ack ? */
|
||||||
|
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||||
|
|
||||||
|
if (SEQ_LT(already_sent, 0)) {
|
||||||
|
/* RFC 761, section 2.1. */
|
||||||
|
flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
|
||||||
|
conn->seq_ack_from_tap, conn->seq_to_tap);
|
||||||
|
conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||||
|
already_sent = 0;
|
||||||
|
if (tcp_set_peek_offset(s, 0)) {
|
||||||
|
tcp_rst(c, conn);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
||||||
|
conn_flag(c, conn, STALLED);
|
||||||
|
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Set up buffer descriptors we'll fill completely and partially. */
|
||||||
|
fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
|
||||||
|
if (fill_bufs > TCP_FRAMES) {
|
||||||
|
fill_bufs = TCP_FRAMES;
|
||||||
|
iov_rem = 0;
|
||||||
|
} else {
|
||||||
|
iov_rem = (wnd_scaled - already_sent) % mss;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Prepare iov according to kernel capability */
|
||||||
|
if (!peek_offset_cap) {
|
||||||
|
mh_sock.msg_iov = iov_sock;
|
||||||
|
iov_sock[0].iov_base = tcp_buf_discard;
|
||||||
|
iov_sock[0].iov_len = already_sent;
|
||||||
|
mh_sock.msg_iovlen = fill_bufs + 1;
|
||||||
|
} else {
|
||||||
|
mh_sock.msg_iov = &iov_sock[1];
|
||||||
|
mh_sock.msg_iovlen = fill_bufs;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
|
||||||
|
tcp_payload_flush(c);
|
||||||
|
|
||||||
|
/* Silence Coverity CWE-125 false positive */
|
||||||
|
tcp_payload_used = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
|
||||||
|
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
|
||||||
|
iov->iov_len = mss;
|
||||||
|
}
|
||||||
|
if (iov_rem)
|
||||||
|
iov_sock[fill_bufs].iov_len = iov_rem;
|
||||||
|
|
||||||
|
/* Receive into buffers, don't dequeue until acknowledged by guest. */
|
||||||
|
do
|
||||||
|
len = recvmsg(s, &mh_sock, MSG_PEEK);
|
||||||
|
while (len < 0 && errno == EINTR);
|
||||||
|
|
||||||
|
if (len < 0) {
|
||||||
|
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||||
|
tcp_rst(c, conn);
|
||||||
|
return -errno;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!len) {
|
||||||
|
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
|
||||||
|
int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
|
||||||
|
if (ret) {
|
||||||
|
tcp_rst(c, conn);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
conn_event(c, conn, TAP_FIN_SENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!peek_offset_cap)
|
||||||
|
len -= already_sent;
|
||||||
|
|
||||||
|
if (len <= 0) {
|
||||||
|
conn_flag(c, conn, STALLED);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
conn_flag(c, conn, ~STALLED);
|
||||||
|
|
||||||
|
send_bufs = DIV_ROUND_UP(len, mss);
|
||||||
|
last_len = len - (send_bufs - 1) * mss;
|
||||||
|
|
||||||
|
/* Likely, some new data was acked too. */
|
||||||
|
tcp_update_seqack_wnd(c, conn, false, NULL);
|
||||||
|
|
||||||
|
/* Finally, queue to tap */
|
||||||
|
dlen = mss;
|
||||||
|
seq = conn->seq_to_tap;
|
||||||
|
for (i = 0; i < send_bufs; i++) {
|
||||||
|
int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
|
||||||
|
|
||||||
|
if (i == send_bufs - 1)
|
||||||
|
dlen = last_len;
|
||||||
|
|
||||||
|
tcp_data_to_tap(c, conn, dlen, no_csum, seq);
|
||||||
|
seq += dlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
14
tcp_buf.h
Normal file
14
tcp_buf.h
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
* Copyright (c) 2021 Red Hat GmbH
|
||||||
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef TCP_BUF_H
|
||||||
|
#define TCP_BUF_H
|
||||||
|
|
||||||
|
void tcp_sock_iov_init(const struct ctx *c);
|
||||||
|
void tcp_payload_flush(const struct ctx *c);
|
||||||
|
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
|
||||||
|
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
|
||||||
|
|
||||||
|
#endif /* TCP_BUF_H */
|
65
tcp_conn.h
65
tcp_conn.h
|
@ -13,19 +13,16 @@
|
||||||
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
|
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
|
||||||
* @f: Generic flow information
|
* @f: Generic flow information
|
||||||
* @in_epoll: Is the connection in the epoll set?
|
* @in_epoll: Is the connection in the epoll set?
|
||||||
|
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
|
||||||
|
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
||||||
|
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
||||||
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
||||||
* @sock: Socket descriptor number
|
* @sock: Socket descriptor number
|
||||||
* @events: Connection events, implying connection states
|
* @events: Connection events, implying connection states
|
||||||
* @timer: timerfd descriptor for timeout events
|
* @timer: timerfd descriptor for timeout events
|
||||||
* @flags: Connection flags representing internal attributes
|
* @flags: Connection flags representing internal attributes
|
||||||
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
|
|
||||||
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
|
||||||
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
|
||||||
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
|
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
|
||||||
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
|
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
|
||||||
* @faddr: Guest side forwarding address (guest's remote address)
|
|
||||||
* @eport: Guest side endpoint port (guest's local port)
|
|
||||||
* @fport: Guest side forwarding port (guest's remote port)
|
|
||||||
* @wnd_from_tap: Last window size from tap, unscaled (as received)
|
* @wnd_from_tap: Last window size from tap, unscaled (as received)
|
||||||
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
|
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
|
||||||
* @seq_to_tap: Next sequence for packets to tap
|
* @seq_to_tap: Next sequence for packets to tap
|
||||||
|
@ -49,6 +46,10 @@ struct tcp_tap_conn {
|
||||||
unsigned int ws_from_tap :TCP_WS_BITS;
|
unsigned int ws_from_tap :TCP_WS_BITS;
|
||||||
unsigned int ws_to_tap :TCP_WS_BITS;
|
unsigned int ws_to_tap :TCP_WS_BITS;
|
||||||
|
|
||||||
|
#define TCP_MSS_BITS 14
|
||||||
|
unsigned int tap_mss :TCP_MSS_BITS;
|
||||||
|
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
|
||||||
|
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
|
||||||
|
|
||||||
int sock :FD_REF_BITS;
|
int sock :FD_REF_BITS;
|
||||||
|
|
||||||
|
@ -77,13 +78,6 @@ struct tcp_tap_conn {
|
||||||
#define ACK_TO_TAP_DUE BIT(3)
|
#define ACK_TO_TAP_DUE BIT(3)
|
||||||
#define ACK_FROM_TAP_DUE BIT(4)
|
#define ACK_FROM_TAP_DUE BIT(4)
|
||||||
|
|
||||||
|
|
||||||
#define TCP_MSS_BITS 14
|
|
||||||
unsigned int tap_mss :TCP_MSS_BITS;
|
|
||||||
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
|
|
||||||
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
|
|
||||||
|
|
||||||
|
|
||||||
#define SNDBUF_BITS 24
|
#define SNDBUF_BITS 24
|
||||||
unsigned int sndbuf :SNDBUF_BITS;
|
unsigned int sndbuf :SNDBUF_BITS;
|
||||||
#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
|
#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
|
||||||
|
@ -91,11 +85,6 @@ struct tcp_tap_conn {
|
||||||
|
|
||||||
uint8_t seq_dup_ack_approx;
|
uint8_t seq_dup_ack_approx;
|
||||||
|
|
||||||
|
|
||||||
union inany_addr faddr;
|
|
||||||
in_port_t eport;
|
|
||||||
in_port_t fport;
|
|
||||||
|
|
||||||
uint16_t wnd_from_tap;
|
uint16_t wnd_from_tap;
|
||||||
uint16_t wnd_to_tap;
|
uint16_t wnd_to_tap;
|
||||||
|
|
||||||
|
@ -106,47 +95,41 @@ struct tcp_tap_conn {
|
||||||
uint32_t seq_init_from_tap;
|
uint32_t seq_init_from_tap;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define SIDES 2
|
|
||||||
/**
|
/**
|
||||||
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
|
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
|
||||||
* @f: Generic flow information
|
* @f: Generic flow information
|
||||||
* @in_epoll: Is the connection in the epoll set?
|
|
||||||
* @s: File descriptor for sockets
|
* @s: File descriptor for sockets
|
||||||
* @pipe: File descriptors for pipes
|
* @pipe: File descriptors for pipes
|
||||||
* @events: Events observed/actions performed on connection
|
|
||||||
* @flags: Connection flags (attributes, not events)
|
|
||||||
* @read: Bytes read (not fully written to other side in one shot)
|
* @read: Bytes read (not fully written to other side in one shot)
|
||||||
* @written: Bytes written (not fully written from one other side read)
|
* @written: Bytes written (not fully written from one other side read)
|
||||||
*/
|
* @events: Events observed/actions performed on connection
|
||||||
|
* @flags: Connection flags (attributes, not events)
|
||||||
|
* @in_epoll: Is the connection in the epoll set?
|
||||||
|
*/
|
||||||
struct tcp_splice_conn {
|
struct tcp_splice_conn {
|
||||||
/* Must be first element */
|
/* Must be first element */
|
||||||
struct flow_common f;
|
struct flow_common f;
|
||||||
|
|
||||||
bool in_epoll :1;
|
|
||||||
int s[SIDES];
|
int s[SIDES];
|
||||||
int pipe[SIDES][2];
|
int pipe[SIDES][2];
|
||||||
|
|
||||||
|
uint32_t read[SIDES];
|
||||||
|
uint32_t written[SIDES];
|
||||||
|
|
||||||
uint8_t events;
|
uint8_t events;
|
||||||
#define SPLICE_CLOSED 0
|
#define SPLICE_CLOSED 0
|
||||||
#define SPLICE_CONNECT BIT(0)
|
#define SPLICE_CONNECT BIT(0)
|
||||||
#define SPLICE_ESTABLISHED BIT(1)
|
#define SPLICE_ESTABLISHED BIT(1)
|
||||||
#define OUT_WAIT_0 BIT(2)
|
#define OUT_WAIT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
|
||||||
#define OUT_WAIT_1 BIT(3)
|
#define FIN_RCVD(sidei_) ((sidei_) ? BIT(5) : BIT(4))
|
||||||
#define FIN_RCVD_0 BIT(4)
|
#define FIN_SENT(sidei_) ((sidei_) ? BIT(7) : BIT(6))
|
||||||
#define FIN_RCVD_1 BIT(5)
|
|
||||||
#define FIN_SENT_0 BIT(6)
|
|
||||||
#define FIN_SENT_1 BIT(7)
|
|
||||||
|
|
||||||
uint8_t flags;
|
uint8_t flags;
|
||||||
#define SPLICE_V6 BIT(0)
|
#define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0))
|
||||||
#define RCVLOWAT_SET_0 BIT(1)
|
#define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
|
||||||
#define RCVLOWAT_SET_1 BIT(2)
|
#define CLOSING BIT(4)
|
||||||
#define RCVLOWAT_ACT_0 BIT(3)
|
|
||||||
#define RCVLOWAT_ACT_1 BIT(4)
|
|
||||||
#define CLOSING BIT(5)
|
|
||||||
|
|
||||||
uint32_t read[SIDES];
|
bool in_epoll :1;
|
||||||
uint32_t written[SIDES];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Socket pools */
|
/* Socket pools */
|
||||||
|
@ -155,9 +138,9 @@ struct tcp_splice_conn {
|
||||||
extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
|
extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
|
||||||
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
|
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
|
||||||
|
|
||||||
bool tcp_flow_defer(union flow *flow);
|
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
|
||||||
bool tcp_splice_flow_defer(union flow *flow);
|
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
|
||||||
void tcp_splice_timer(const struct ctx *c, union flow *flow);
|
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
|
||||||
int tcp_conn_pool_sock(int pool[]);
|
int tcp_conn_pool_sock(int pool[]);
|
||||||
int tcp_conn_sock(const struct ctx *c, sa_family_t af);
|
int tcp_conn_sock(const struct ctx *c, sa_family_t af);
|
||||||
int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
|
int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
|
||||||
|
|
175
tcp_internal.h
Normal file
175
tcp_internal.h
Normal file
|
@ -0,0 +1,175 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
* Copyright (c) 2021 Red Hat GmbH
|
||||||
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef TCP_INTERNAL_H
|
||||||
|
#define TCP_INTERNAL_H
|
||||||
|
|
||||||
|
#define MAX_WS 8
|
||||||
|
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
|
||||||
|
|
||||||
|
#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
|
||||||
|
sizeof(struct tcphdr) - \
|
||||||
|
sizeof(struct iphdr), \
|
||||||
|
sizeof(uint32_t))
|
||||||
|
#define MSS6 ROUND_DOWN(IP_MAX_MTU - \
|
||||||
|
sizeof(struct tcphdr) - \
|
||||||
|
sizeof(struct ipv6hdr), \
|
||||||
|
sizeof(uint32_t))
|
||||||
|
|
||||||
|
#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
|
||||||
|
#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
|
||||||
|
#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
|
||||||
|
#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
|
||||||
|
|
||||||
|
#define FIN (1 << 0)
|
||||||
|
#define SYN (1 << 1)
|
||||||
|
#define RST (1 << 2)
|
||||||
|
#define ACK (1 << 4)
|
||||||
|
|
||||||
|
/* Flags for internal usage */
|
||||||
|
#define DUP_ACK (1 << 5)
|
||||||
|
#define OPT_EOL 0
|
||||||
|
#define OPT_NOP 1
|
||||||
|
#define OPT_MSS 2
|
||||||
|
#define OPT_WS 3
|
||||||
|
#define OPT_SACKP 4
|
||||||
|
#define OPT_SACK 5
|
||||||
|
#define OPT_TS 8
|
||||||
|
|
||||||
|
#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
|
||||||
|
#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
|
||||||
|
#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
|
||||||
|
|
||||||
|
#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr))
|
||||||
|
#define CONN_V6(conn) (!CONN_V4(conn))
|
||||||
|
|
||||||
|
/**
|
||||||
|
* enum tcp_iov_parts - I/O vector parts for one TCP frame
|
||||||
|
* @TCP_IOV_TAP tap backend specific header
|
||||||
|
* @TCP_IOV_ETH Ethernet header
|
||||||
|
* @TCP_IOV_IP IP (v4/v6) header
|
||||||
|
* @TCP_IOV_PAYLOAD IP payload (TCP header + data)
|
||||||
|
* @TCP_NUM_IOVS the number of entries in the iovec array
|
||||||
|
*/
|
||||||
|
enum tcp_iov_parts {
|
||||||
|
TCP_IOV_TAP = 0,
|
||||||
|
TCP_IOV_ETH = 1,
|
||||||
|
TCP_IOV_IP = 2,
|
||||||
|
TCP_IOV_PAYLOAD = 3,
|
||||||
|
TCP_NUM_IOVS
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct tcp_payload_t - TCP header and data to send segments with payload
|
||||||
|
* @th: TCP header
|
||||||
|
* @data: TCP data
|
||||||
|
*/
|
||||||
|
struct tcp_payload_t {
|
||||||
|
struct tcphdr th;
|
||||||
|
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
|
||||||
|
#ifdef __AVX2__
|
||||||
|
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
|
||||||
|
#else
|
||||||
|
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/** struct tcp_opt_nop - TCP NOP option
|
||||||
|
* @kind: Option kind (OPT_NOP = 1)
|
||||||
|
*/
|
||||||
|
struct tcp_opt_nop {
|
||||||
|
uint8_t kind;
|
||||||
|
} __attribute__ ((packed));
|
||||||
|
#define TCP_OPT_NOP ((struct tcp_opt_nop){ .kind = OPT_NOP, })
|
||||||
|
|
||||||
|
/** struct tcp_opt_mss - TCP MSS option
|
||||||
|
* @kind: Option kind (OPT_MSS == 2)
|
||||||
|
* @len: Option length (4)
|
||||||
|
* @mss: Maximum Segment Size
|
||||||
|
*/
|
||||||
|
struct tcp_opt_mss {
|
||||||
|
uint8_t kind;
|
||||||
|
uint8_t len;
|
||||||
|
uint16_t mss;
|
||||||
|
} __attribute__ ((packed));
|
||||||
|
#define TCP_OPT_MSS(mss_) \
|
||||||
|
((struct tcp_opt_mss) { \
|
||||||
|
.kind = OPT_MSS, \
|
||||||
|
.len = sizeof(struct tcp_opt_mss), \
|
||||||
|
.mss = htons(mss_), \
|
||||||
|
})
|
||||||
|
|
||||||
|
/** struct tcp_opt_ws - TCP Window Scaling option
|
||||||
|
* @kind: Option kind (OPT_WS == 3)
|
||||||
|
* @len: Option length (3)
|
||||||
|
* @shift: Window scaling shift
|
||||||
|
*/
|
||||||
|
struct tcp_opt_ws {
|
||||||
|
uint8_t kind;
|
||||||
|
uint8_t len;
|
||||||
|
uint8_t shift;
|
||||||
|
} __attribute__ ((packed));
|
||||||
|
#define TCP_OPT_WS(shift_) \
|
||||||
|
((struct tcp_opt_ws) { \
|
||||||
|
.kind = OPT_WS, \
|
||||||
|
.len = sizeof(struct tcp_opt_ws), \
|
||||||
|
.shift = (shift_), \
|
||||||
|
})
|
||||||
|
|
||||||
|
/** struct tcp_syn_opts - TCP options we apply to SYN packets
|
||||||
|
* @mss: Maximum Segment Size (MSS) option
|
||||||
|
* @nop: NOP opt (for alignment)
|
||||||
|
* @ws: Window Scaling (WS) option
|
||||||
|
*/
|
||||||
|
struct tcp_syn_opts {
|
||||||
|
struct tcp_opt_mss mss;
|
||||||
|
struct tcp_opt_nop nop;
|
||||||
|
struct tcp_opt_ws ws;
|
||||||
|
} __attribute__ ((packed));
|
||||||
|
#define TCP_SYN_OPTS(mss_, ws_) \
|
||||||
|
((struct tcp_syn_opts){ \
|
||||||
|
.mss = TCP_OPT_MSS(mss_), \
|
||||||
|
.nop = TCP_OPT_NOP, \
|
||||||
|
.ws = TCP_OPT_WS(ws_), \
|
||||||
|
})
|
||||||
|
|
||||||
|
extern char tcp_buf_discard [MAX_WINDOW];
|
||||||
|
|
||||||
|
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
|
unsigned long flag);
|
||||||
|
#define conn_flag(c, conn, flag) \
|
||||||
|
do { \
|
||||||
|
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
|
||||||
|
conn_flag_do(c, conn, flag); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
|
unsigned long event);
|
||||||
|
#define conn_event(c, conn, event) \
|
||||||
|
do { \
|
||||||
|
flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
|
||||||
|
conn_event_do(c, conn, event); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
|
||||||
|
#define tcp_rst(c, conn) \
|
||||||
|
do { \
|
||||||
|
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
|
||||||
|
tcp_rst_do(c, conn); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
struct tcp_info_linux;
|
||||||
|
|
||||||
|
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
||||||
|
struct iovec *iov, size_t dlen,
|
||||||
|
const uint16_t *check, uint32_t seq,
|
||||||
|
bool no_tcp_csum);
|
||||||
|
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
|
bool force_seq, struct tcp_info_linux *tinfo);
|
||||||
|
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
|
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
|
||||||
|
size_t *optlen);
|
||||||
|
|
||||||
|
#endif /* TCP_INTERNAL_H */
|
299
tcp_splice.c
299
tcp_splice.c
|
@ -28,7 +28,7 @@
|
||||||
* - FIN_SENT_0: FIN (write shutdown) sent to accepted socket
|
* - FIN_SENT_0: FIN (write shutdown) sent to accepted socket
|
||||||
* - FIN_SENT_1: FIN (write shutdown) sent to target socket
|
* - FIN_SENT_1: FIN (write shutdown) sent to target socket
|
||||||
*
|
*
|
||||||
* #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
|
* #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
|
@ -73,10 +73,7 @@ static int ns_sock_pool6 [TCP_SOCK_POOL_SIZE];
|
||||||
/* Pool of pre-opened pipes */
|
/* Pool of pre-opened pipes */
|
||||||
static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2];
|
static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2];
|
||||||
|
|
||||||
#define CONN_V6(x) (x->flags & SPLICE_V6)
|
#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set))
|
||||||
#define CONN_V4(x) (!CONN_V6(x))
|
|
||||||
#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
|
|
||||||
#define CONN(idx) (&FLOW(idx)->tcp_splice)
|
|
||||||
|
|
||||||
/* Display strings for connection events */
|
/* Display strings for connection events */
|
||||||
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
|
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
|
||||||
|
@ -94,6 +91,24 @@ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
|
||||||
static int tcp_sock_refill_ns(void *arg);
|
static int tcp_sock_refill_ns(void *arg);
|
||||||
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
|
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* conn_at_sidx() - Get spliced TCP connection specific flow at given sidx
|
||||||
|
* @sidx: Flow and side to retrieve
|
||||||
|
*
|
||||||
|
* Return: Spliced TCP connection at @sidx, or NULL if @sidx is invalid.
|
||||||
|
* Asserts if the flow at @sidx is not FLOW_TCP_SPLICE.
|
||||||
|
*/
|
||||||
|
static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
{
	union flow *entry = flow_at_sidx(sidx);

	if (entry) {
		/* Callers may only ask for flows that are spliced TCP */
		ASSERT(entry->f.type == FLOW_TCP_SPLICE);
		return &entry->tcp_splice;
	}

	/* No flow at this index and side */
	return NULL;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* tcp_splice_conn_epoll_events() - epoll events masks for given state
|
* tcp_splice_conn_epoll_events() - epoll events masks for given state
|
||||||
* @events: Connection event flags
|
* @events: Connection event flags
|
||||||
|
@ -102,19 +117,22 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
|
||||||
static void tcp_splice_conn_epoll_events(uint16_t events,
|
static void tcp_splice_conn_epoll_events(uint16_t events,
|
||||||
struct epoll_event ev[])
|
struct epoll_event ev[])
|
||||||
{
|
{
|
||||||
ev[0].events = ev[1].events = 0;
|
unsigned sidei;
|
||||||
|
|
||||||
|
flow_foreach_sidei(sidei)
|
||||||
|
ev[sidei].events = 0;
|
||||||
|
|
||||||
if (events & SPLICE_ESTABLISHED) {
|
if (events & SPLICE_ESTABLISHED) {
|
||||||
if (!(events & FIN_SENT_1))
|
flow_foreach_sidei(sidei) {
|
||||||
ev[0].events = EPOLLIN | EPOLLRDHUP;
|
if (!(events & FIN_SENT(!sidei)))
|
||||||
if (!(events & FIN_SENT_0))
|
ev[sidei].events = EPOLLIN | EPOLLRDHUP;
|
||||||
ev[1].events = EPOLLIN | EPOLLRDHUP;
|
}
|
||||||
} else if (events & SPLICE_CONNECT) {
|
} else if (events & SPLICE_CONNECT) {
|
||||||
ev[1].events = EPOLLOUT;
|
ev[1].events = EPOLLOUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
ev[0].events |= (events & OUT_WAIT_0) ? EPOLLOUT : 0;
|
flow_foreach_sidei(sidei)
|
||||||
ev[1].events |= (events & OUT_WAIT_1) ? EPOLLOUT : 0;
|
ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -235,32 +253,31 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed)
|
* tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed)
|
||||||
* @flow: Flow table entry for this connection
|
* @conn: Connection entry to handle
|
||||||
*
|
*
|
||||||
* Return: true if the flow is ready to free, false otherwise
|
* Return: true if the flow is ready to free, false otherwise
|
||||||
*/
|
*/
|
||||||
bool tcp_splice_flow_defer(union flow *flow)
|
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn)
|
||||||
{
|
{
|
||||||
struct tcp_splice_conn *conn = &flow->tcp_splice;
|
unsigned sidei;
|
||||||
unsigned side;
|
|
||||||
|
|
||||||
if (!(flow->tcp_splice.flags & CLOSING))
|
if (!(conn->flags & CLOSING))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
for (side = 0; side < SIDES; side++) {
|
flow_foreach_sidei(sidei) {
|
||||||
/* Flushing might need to block: don't recycle them. */
|
/* Flushing might need to block: don't recycle them. */
|
||||||
if (conn->pipe[side][0] >= 0) {
|
if (conn->pipe[sidei][0] >= 0) {
|
||||||
close(conn->pipe[side][0]);
|
close(conn->pipe[sidei][0]);
|
||||||
close(conn->pipe[side][1]);
|
close(conn->pipe[sidei][1]);
|
||||||
conn->pipe[side][0] = conn->pipe[side][1] = -1;
|
conn->pipe[sidei][0] = conn->pipe[sidei][1] = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (conn->s[side] >= 0) {
|
if (conn->s[sidei] >= 0) {
|
||||||
close(conn->s[side]);
|
close(conn->s[sidei]);
|
||||||
conn->s[side] = -1;
|
conn->s[sidei] = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
conn->read[side] = conn->written[side] = 0;
|
conn->read[sidei] = conn->written[sidei] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
conn->events = SPLICE_CLOSED;
|
conn->events = SPLICE_CLOSED;
|
||||||
|
@ -280,33 +297,33 @@ bool tcp_splice_flow_defer(union flow *flow)
|
||||||
static int tcp_splice_connect_finish(const struct ctx *c,
|
static int tcp_splice_connect_finish(const struct ctx *c,
|
||||||
struct tcp_splice_conn *conn)
|
struct tcp_splice_conn *conn)
|
||||||
{
|
{
|
||||||
unsigned side;
|
unsigned sidei;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
|
||||||
for (side = 0; side < SIDES; side++) {
|
flow_foreach_sidei(sidei) {
|
||||||
for (; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
|
for (; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
|
||||||
if (splice_pipe_pool[i][0] >= 0) {
|
if (splice_pipe_pool[i][0] >= 0) {
|
||||||
SWAP(conn->pipe[side][0],
|
SWAP(conn->pipe[sidei][0],
|
||||||
splice_pipe_pool[i][0]);
|
splice_pipe_pool[i][0]);
|
||||||
SWAP(conn->pipe[side][1],
|
SWAP(conn->pipe[sidei][1],
|
||||||
splice_pipe_pool[i][1]);
|
splice_pipe_pool[i][1]);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (conn->pipe[side][0] < 0) {
|
if (conn->pipe[sidei][0] < 0) {
|
||||||
if (pipe2(conn->pipe[side], O_NONBLOCK | O_CLOEXEC)) {
|
if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
|
||||||
flow_err(conn, "cannot create %d->%d pipe: %s",
|
flow_err(conn, "cannot create %d->%d pipe: %s",
|
||||||
side, !side, strerror(errno));
|
sidei, !sidei, strerror(errno));
|
||||||
conn_flag(c, conn, CLOSING);
|
conn_flag(c, conn, CLOSING);
|
||||||
return -EIO;
|
return -EIO;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fcntl(conn->pipe[side][0], F_SETPIPE_SZ,
|
if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
|
||||||
c->tcp.pipe_size)) {
|
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
|
||||||
flow_trace(conn,
|
flow_trace(conn,
|
||||||
"cannot set %d->%d pipe size to %zu",
|
"cannot set %d->%d pipe size to %zu",
|
||||||
side, !side, c->tcp.pipe_size);
|
sidei, !sidei, c->tcp.pipe_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -321,31 +338,20 @@ static int tcp_splice_connect_finish(const struct ctx *c,
|
||||||
* tcp_splice_connect() - Create and connect socket for new spliced connection
|
* tcp_splice_connect() - Create and connect socket for new spliced connection
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @conn: Connection pointer
|
* @conn: Connection pointer
|
||||||
* @af: Address family
|
|
||||||
* @pif: pif on which to create socket
|
|
||||||
* @port: Destination port, host order
|
|
||||||
*
|
*
|
||||||
* Return: 0 for connect() succeeded or in progress, negative value on error
|
* Return: 0 for connect() succeeded or in progress, negative value on error
|
||||||
*/
|
*/
|
||||||
static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
|
static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
||||||
sa_family_t af, uint8_t pif, in_port_t port)
|
|
||||||
{
|
{
|
||||||
struct sockaddr_in6 addr6 = {
|
const struct flowside *tgt = &conn->f.side[TGTSIDE];
|
||||||
.sin6_family = AF_INET6,
|
sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6;
|
||||||
.sin6_port = htons(port),
|
uint8_t tgtpif = conn->f.pif[TGTSIDE];
|
||||||
.sin6_addr = IN6ADDR_LOOPBACK_INIT,
|
union sockaddr_inany sa;
|
||||||
};
|
|
||||||
struct sockaddr_in addr4 = {
|
|
||||||
.sin_family = AF_INET,
|
|
||||||
.sin_port = htons(port),
|
|
||||||
.sin_addr = IN4ADDR_LOOPBACK_INIT,
|
|
||||||
};
|
|
||||||
const struct sockaddr *sa;
|
|
||||||
socklen_t sl;
|
socklen_t sl;
|
||||||
|
|
||||||
if (pif == PIF_HOST)
|
if (tgtpif == PIF_HOST)
|
||||||
conn->s[1] = tcp_conn_sock(c, af);
|
conn->s[1] = tcp_conn_sock(c, af);
|
||||||
else if (pif == PIF_SPLICE)
|
else if (tgtpif == PIF_SPLICE)
|
||||||
conn->s[1] = tcp_conn_sock_ns(c, af);
|
conn->s[1] = tcp_conn_sock_ns(c, af);
|
||||||
else
|
else
|
||||||
ASSERT(0);
|
ASSERT(0);
|
||||||
|
@ -359,15 +365,9 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
|
||||||
conn->s[1]);
|
conn->s[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (CONN_V6(conn)) {
|
pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
|
||||||
sa = (struct sockaddr *)&addr6;
|
|
||||||
sl = sizeof(addr6);
|
|
||||||
} else {
|
|
||||||
sa = (struct sockaddr *)&addr4;
|
|
||||||
sl = sizeof(addr4);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (connect(conn->s[1], sa, sl)) {
|
if (connect(conn->s[1], &sa.sa, sl)) {
|
||||||
if (errno != EINPROGRESS) {
|
if (errno != EINPROGRESS) {
|
||||||
flow_trace(conn, "Couldn't connect socket for splice: %s",
|
flow_trace(conn, "Couldn't connect socket for splice: %s",
|
||||||
strerror(errno));
|
strerror(errno));
|
||||||
|
@ -414,67 +414,19 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af)
|
||||||
/**
|
/**
|
||||||
* tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection
|
* tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @pif0: pif id of side 0
|
|
||||||
* @dstport: Side 0 destination port of connection
|
|
||||||
* @flow: flow to initialise
|
* @flow: flow to initialise
|
||||||
* @s0: Accepted (side 0) socket
|
* @s0: Accepted (side 0) socket
|
||||||
* @sa: Peer address of connection
|
* @sa: Peer address of connection
|
||||||
*
|
*
|
||||||
* Return: true if able to create a spliced connection, false otherwise
|
|
||||||
* #syscalls:pasta setsockopt
|
* #syscalls:pasta setsockopt
|
||||||
*/
|
*/
|
||||||
bool tcp_splice_conn_from_sock(const struct ctx *c,
|
void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0)
|
||||||
uint8_t pif0, in_port_t dstport,
|
|
||||||
union flow *flow, int s0,
|
|
||||||
const union sockaddr_inany *sa)
|
|
||||||
{
|
{
|
||||||
struct tcp_splice_conn *conn;
|
struct tcp_splice_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE,
|
||||||
union inany_addr src;
|
tcp_splice);
|
||||||
in_port_t srcport;
|
|
||||||
sa_family_t af;
|
|
||||||
uint8_t pif1;
|
|
||||||
|
|
||||||
if (c->mode != MODE_PASTA)
|
ASSERT(c->mode == MODE_PASTA);
|
||||||
return false;
|
|
||||||
|
|
||||||
inany_from_sockaddr(&src, &srcport, sa);
|
|
||||||
af = inany_v4(&src) ? AF_INET : AF_INET6;
|
|
||||||
|
|
||||||
switch (pif0) {
|
|
||||||
case PIF_SPLICE:
|
|
||||||
if (!inany_is_loopback(&src)) {
|
|
||||||
char str[INANY_ADDRSTRLEN];
|
|
||||||
|
|
||||||
/* We can't use flow_err() etc. because we haven't set
|
|
||||||
* the flow type yet
|
|
||||||
*/
|
|
||||||
warn("Bad source address %s for splice, closing",
|
|
||||||
inany_ntop(&src, str, sizeof(str)));
|
|
||||||
|
|
||||||
/* We *don't* want to fall back to tap */
|
|
||||||
flow_alloc_cancel(flow);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
pif1 = PIF_HOST;
|
|
||||||
dstport += c->tcp.fwd_out.delta[dstport];
|
|
||||||
break;
|
|
||||||
|
|
||||||
case PIF_HOST:
|
|
||||||
if (!inany_is_loopback(&src))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
pif1 = PIF_SPLICE;
|
|
||||||
dstport += c->tcp.fwd_in.delta[dstport];
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0);
|
|
||||||
|
|
||||||
conn->flags = af == AF_INET ? 0 : SPLICE_V6;
|
|
||||||
conn->s[0] = s0;
|
conn->s[0] = s0;
|
||||||
conn->s[1] = -1;
|
conn->s[1] = -1;
|
||||||
conn->pipe[0][0] = conn->pipe[0][1] = -1;
|
conn->pipe[0][0] = conn->pipe[0][1] = -1;
|
||||||
|
@ -483,10 +435,10 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
|
||||||
if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int)))
|
if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int)))
|
||||||
flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0);
|
flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0);
|
||||||
|
|
||||||
if (tcp_splice_connect(c, conn, af, pif1, dstport))
|
if (tcp_splice_connect(c, conn))
|
||||||
conn_flag(c, conn, CLOSING);
|
conn_flag(c, conn, CLOSING);
|
||||||
|
|
||||||
return true;
|
FLOW_ACTIVATE(conn);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -500,8 +452,8 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
|
||||||
void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
||||||
uint32_t events)
|
uint32_t events)
|
||||||
{
|
{
|
||||||
struct tcp_splice_conn *conn = CONN(ref.flowside.flow);
|
struct tcp_splice_conn *conn = conn_at_sidx(ref.flowside);
|
||||||
unsigned side = ref.flowside.side, fromside;
|
unsigned evsidei = ref.flowside.sidei, fromsidei;
|
||||||
uint8_t lowat_set_flag, lowat_act_flag;
|
uint8_t lowat_set_flag, lowat_act_flag;
|
||||||
int eof, never_read;
|
int eof, never_read;
|
||||||
|
|
||||||
|
@ -533,30 +485,31 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (events & EPOLLOUT) {
|
if (events & EPOLLOUT) {
|
||||||
fromside = !side;
|
fromsidei = !evsidei;
|
||||||
conn_event(c, conn, side == 0 ? ~OUT_WAIT_0 : ~OUT_WAIT_1);
|
conn_event(c, conn, ~OUT_WAIT(evsidei));
|
||||||
} else {
|
} else {
|
||||||
fromside = side;
|
fromsidei = evsidei;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (events & EPOLLRDHUP)
|
if (events & EPOLLRDHUP)
|
||||||
/* For side 0 this is fake, but implied */
|
/* For side 0 this is fake, but implied */
|
||||||
conn_event(c, conn, side == 0 ? FIN_RCVD_0 : FIN_RCVD_1);
|
conn_event(c, conn, FIN_RCVD(evsidei));
|
||||||
|
|
||||||
swap:
|
swap:
|
||||||
eof = 0;
|
eof = 0;
|
||||||
never_read = 1;
|
never_read = 1;
|
||||||
|
|
||||||
lowat_set_flag = fromside == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1;
|
lowat_set_flag = RCVLOWAT_SET(fromsidei);
|
||||||
lowat_act_flag = fromside == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1;
|
lowat_act_flag = RCVLOWAT_ACT(fromsidei);
|
||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
ssize_t readlen, to_write = 0, written;
|
ssize_t readlen, written, pending;
|
||||||
int more = 0;
|
int more = 0;
|
||||||
|
|
||||||
retry:
|
retry:
|
||||||
readlen = splice(conn->s[fromside], NULL,
|
readlen = splice(conn->s[fromsidei], NULL,
|
||||||
conn->pipe[fromside][1], NULL, c->tcp.pipe_size,
|
conn->pipe[fromsidei][1], NULL,
|
||||||
|
c->tcp.pipe_size,
|
||||||
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
|
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
|
||||||
flow_trace(conn, "%zi from read-side call", readlen);
|
flow_trace(conn, "%zi from read-side call", readlen);
|
||||||
if (readlen < 0) {
|
if (readlen < 0) {
|
||||||
|
@ -565,14 +518,11 @@ retry:
|
||||||
|
|
||||||
if (errno != EAGAIN)
|
if (errno != EAGAIN)
|
||||||
goto close;
|
goto close;
|
||||||
|
|
||||||
to_write = c->tcp.pipe_size;
|
|
||||||
} else if (!readlen) {
|
} else if (!readlen) {
|
||||||
eof = 1;
|
eof = 1;
|
||||||
to_write = c->tcp.pipe_size;
|
|
||||||
} else {
|
} else {
|
||||||
never_read = 0;
|
never_read = 0;
|
||||||
to_write += readlen;
|
|
||||||
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
|
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
|
||||||
more = SPLICE_F_MORE;
|
more = SPLICE_F_MORE;
|
||||||
|
|
||||||
|
@ -581,11 +531,11 @@ retry:
|
||||||
}
|
}
|
||||||
|
|
||||||
eintr:
|
eintr:
|
||||||
written = splice(conn->pipe[fromside][0], NULL,
|
written = splice(conn->pipe[fromsidei][0], NULL,
|
||||||
conn->s[!fromside], NULL, to_write,
|
conn->s[!fromsidei], NULL, c->tcp.pipe_size,
|
||||||
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
|
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
|
||||||
flow_trace(conn, "%zi from write-side call (passed %zi)",
|
flow_trace(conn, "%zi from write-side call (passed %zi)",
|
||||||
written, to_write);
|
written, c->tcp.pipe_size);
|
||||||
|
|
||||||
/* Most common case: skip updating counters. */
|
/* Most common case: skip updating counters. */
|
||||||
if (readlen > 0 && readlen == written) {
|
if (readlen > 0 && readlen == written) {
|
||||||
|
@ -596,18 +546,23 @@ eintr:
|
||||||
readlen > (long)c->tcp.pipe_size / 10) {
|
readlen > (long)c->tcp.pipe_size / 10) {
|
||||||
int lowat = c->tcp.pipe_size / 4;
|
int lowat = c->tcp.pipe_size / 4;
|
||||||
|
|
||||||
setsockopt(conn->s[fromside], SOL_SOCKET,
|
if (setsockopt(conn->s[fromsidei], SOL_SOCKET,
|
||||||
SO_RCVLOWAT, &lowat, sizeof(lowat));
|
SO_RCVLOWAT,
|
||||||
|
&lowat, sizeof(lowat))) {
|
||||||
conn_flag(c, conn, lowat_set_flag);
|
flow_trace(conn,
|
||||||
conn_flag(c, conn, lowat_act_flag);
|
"Setting SO_RCVLOWAT %i: %s",
|
||||||
|
lowat, strerror(errno));
|
||||||
|
} else {
|
||||||
|
conn_flag(c, conn, lowat_set_flag);
|
||||||
|
conn_flag(c, conn, lowat_act_flag);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
conn->read[fromside] += readlen > 0 ? readlen : 0;
|
conn->read[fromsidei] += readlen > 0 ? readlen : 0;
|
||||||
conn->written[fromside] += written > 0 ? written : 0;
|
conn->written[fromsidei] += written > 0 ? written : 0;
|
||||||
|
|
||||||
if (written < 0) {
|
if (written < 0) {
|
||||||
if (errno == EINTR)
|
if (errno == EINTR)
|
||||||
|
@ -616,47 +571,43 @@ eintr:
|
||||||
if (errno != EAGAIN)
|
if (errno != EAGAIN)
|
||||||
goto close;
|
goto close;
|
||||||
|
|
||||||
if (never_read)
|
if (conn->read[fromsidei] == conn->written[fromsidei])
|
||||||
break;
|
break;
|
||||||
|
|
||||||
conn_event(c, conn,
|
conn_event(c, conn, OUT_WAIT(!fromsidei));
|
||||||
fromside == 0 ? OUT_WAIT_1 : OUT_WAIT_0);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (never_read && written == (long)(c->tcp.pipe_size))
|
if (never_read && written == (long)(c->tcp.pipe_size))
|
||||||
goto retry;
|
goto retry;
|
||||||
|
|
||||||
if (!never_read && written < to_write) {
|
pending = conn->read[fromsidei] - conn->written[fromsidei];
|
||||||
to_write -= written;
|
if (!never_read && written > 0 && written < pending)
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
|
||||||
|
|
||||||
if (eof)
|
if (eof)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((conn->events & FIN_RCVD_0) && !(conn->events & FIN_SENT_1)) {
|
if (conn->read[fromsidei] == conn->written[fromsidei] && eof) {
|
||||||
if (conn->read[fromside] == conn->written[fromside] && eof) {
|
unsigned sidei;
|
||||||
shutdown(conn->s[1], SHUT_WR);
|
|
||||||
conn_event(c, conn, FIN_SENT_1);
|
flow_foreach_sidei(sidei) {
|
||||||
|
if ((conn->events & FIN_RCVD(sidei)) &&
|
||||||
|
!(conn->events & FIN_SENT(!sidei))) {
|
||||||
|
shutdown(conn->s[!sidei], SHUT_WR);
|
||||||
|
conn_event(c, conn, FIN_SENT(!sidei));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((conn->events & FIN_RCVD_1) && !(conn->events & FIN_SENT_0)) {
|
if (CONN_HAS(conn, FIN_SENT(0) | FIN_SENT(1)))
|
||||||
if (conn->read[fromside] == conn->written[fromside] && eof) {
|
|
||||||
shutdown(conn->s[0], SHUT_WR);
|
|
||||||
conn_event(c, conn, FIN_SENT_0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (CONN_HAS(conn, FIN_SENT_0 | FIN_SENT_1))
|
|
||||||
goto close;
|
goto close;
|
||||||
|
|
||||||
if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
|
if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
|
||||||
events = EPOLLIN;
|
events = EPOLLIN;
|
||||||
|
|
||||||
fromside = !fromside;
|
fromsidei = !fromsidei;
|
||||||
goto swap;
|
goto swap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -721,7 +672,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
|
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
|
||||||
c->tcp.pipe_size)) {
|
c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
|
||||||
trace("TCP (spliced): cannot set pool pipe size to %zu",
|
trace("TCP (spliced): cannot set pool pipe size to %zu",
|
||||||
c->tcp.pipe_size);
|
c->tcp.pipe_size);
|
||||||
}
|
}
|
||||||
|
@ -734,6 +685,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
|
||||||
*
|
*
|
||||||
* Return: 0
|
* Return: 0
|
||||||
*/
|
*/
|
||||||
|
/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
|
||||||
static int tcp_sock_refill_ns(void *arg)
|
static int tcp_sock_refill_ns(void *arg)
|
||||||
{
|
{
|
||||||
const struct ctx *c = (const struct ctx *)arg;
|
const struct ctx *c = (const struct ctx *)arg;
|
||||||
|
@ -786,29 +738,26 @@ void tcp_splice_init(struct ctx *c)
|
||||||
/**
|
/**
|
||||||
* tcp_splice_timer() - Timer for spliced connections
|
* tcp_splice_timer() - Timer for spliced connections
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @flow: Flow table entry
|
* @conn: Connection to handle
|
||||||
*/
|
*/
|
||||||
void tcp_splice_timer(const struct ctx *c, union flow *flow)
|
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
|
||||||
{
|
{
|
||||||
struct tcp_splice_conn *conn = &flow->tcp_splice;
|
unsigned sidei;
|
||||||
int side;
|
|
||||||
|
|
||||||
ASSERT(!(conn->flags & CLOSING));
|
ASSERT(!(conn->flags & CLOSING));
|
||||||
|
|
||||||
for (side = 0; side < SIDES; side++) {
|
flow_foreach_sidei(sidei) {
|
||||||
uint8_t set = side == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1;
|
if ((conn->flags & RCVLOWAT_SET(sidei)) &&
|
||||||
uint8_t act = side == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1;
|
!(conn->flags & RCVLOWAT_ACT(sidei))) {
|
||||||
|
if (setsockopt(conn->s[sidei], SOL_SOCKET, SO_RCVLOWAT,
|
||||||
if ((conn->flags & set) && !(conn->flags & act)) {
|
|
||||||
if (setsockopt(conn->s[side], SOL_SOCKET, SO_RCVLOWAT,
|
|
||||||
&((int){ 1 }), sizeof(int))) {
|
&((int){ 1 }), sizeof(int))) {
|
||||||
flow_trace(conn, "can't set SO_RCVLOWAT on %d",
|
flow_trace(conn, "can't set SO_RCVLOWAT on %d",
|
||||||
conn->s[side]);
|
conn->s[sidei]);
|
||||||
}
|
}
|
||||||
conn_flag(c, conn, ~set);
|
conn_flag(c, conn, ~RCVLOWAT_SET(sidei));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
conn_flag(c, conn, ~RCVLOWAT_ACT_0);
|
flow_foreach_sidei(sidei)
|
||||||
conn_flag(c, conn, ~RCVLOWAT_ACT_1);
|
conn_flag(c, conn, ~RCVLOWAT_ACT(sidei));
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,10 +11,7 @@ union sockaddr_inany;
|
||||||
|
|
||||||
void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
||||||
uint32_t events);
|
uint32_t events);
|
||||||
bool tcp_splice_conn_from_sock(const struct ctx *c,
|
void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0);
|
||||||
uint8_t pif0, in_port_t dstport,
|
|
||||||
union flow *flow, int s0,
|
|
||||||
const union sockaddr_inany *sa);
|
|
||||||
void tcp_splice_init(struct ctx *c);
|
void tcp_splice_init(struct ctx *c);
|
||||||
|
|
||||||
#endif /* TCP_SPLICE_H */
|
#endif /* TCP_SPLICE_H */
|
||||||
|
|
1
test/.gitignore
vendored
1
test/.gitignore
vendored
|
@ -1,5 +1,6 @@
|
||||||
test_logs/
|
test_logs/
|
||||||
mbuto/
|
mbuto/
|
||||||
|
podman/
|
||||||
*.img
|
*.img
|
||||||
QEMU_EFI.fd
|
QEMU_EFI.fd
|
||||||
*.qcow2
|
*.qcow2
|
||||||
|
|
|
@ -8,7 +8,6 @@
|
||||||
WGET = wget -c
|
WGET = wget -c
|
||||||
|
|
||||||
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
|
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
|
||||||
debian-9-nocloud-amd64-daily-20200210-166.qcow2 \
|
|
||||||
debian-10-nocloud-amd64.qcow2 \
|
debian-10-nocloud-amd64.qcow2 \
|
||||||
debian-10-generic-arm64.qcow2 \
|
debian-10-generic-arm64.qcow2 \
|
||||||
debian-10-generic-ppc64el-20220911-1135.qcow2 \
|
debian-10-generic-ppc64el-20220911-1135.qcow2 \
|
||||||
|
@ -42,8 +41,7 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \
|
||||||
openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
|
openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \
|
||||||
openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
|
openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \
|
||||||
openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
|
openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \
|
||||||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \
|
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
|
||||||
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
|
|
||||||
|
|
||||||
UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
|
UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \
|
||||||
trusty-server-cloudimg-i386-disk1.img \
|
trusty-server-cloudimg-i386-disk1.img \
|
||||||
|
@ -52,10 +50,10 @@ UBUNTU_NEW_IMGS = xenial-server-cloudimg-powerpc-disk1.img \
|
||||||
jammy-server-cloudimg-s390x.img
|
jammy-server-cloudimg-s390x.img
|
||||||
UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)
|
UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)
|
||||||
|
|
||||||
DOWNLOAD_ASSETS = mbuto \
|
DOWNLOAD_ASSETS = mbuto podman \
|
||||||
$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
|
$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
|
||||||
TESTDATA_ASSETS = small.bin big.bin medium.bin
|
TESTDATA_ASSETS = small.bin big.bin medium.bin
|
||||||
LOCAL_ASSETS = mbuto.img mbuto.mem.img QEMU_EFI.fd \
|
LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \
|
||||||
$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
|
$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
|
||||||
$(UBUNTU_NEW_IMGS:%=prepared-%) \
|
$(UBUNTU_NEW_IMGS:%=prepared-%) \
|
||||||
nstool guest-key guest-key.pub \
|
nstool guest-key guest-key.pub \
|
||||||
|
@ -67,13 +65,27 @@ CFLAGS = -Wall -Werror -Wextra -pedantic -std=c99
|
||||||
|
|
||||||
assets: $(ASSETS)
|
assets: $(ASSETS)
|
||||||
|
|
||||||
|
.PHONY: pull-%
|
||||||
|
pull-%: %
|
||||||
|
git -C $* pull
|
||||||
|
|
||||||
mbuto:
|
mbuto:
|
||||||
git clone git://mbuto.sh/mbuto
|
git clone git://mbuto.sh/mbuto
|
||||||
|
|
||||||
|
mbuto/mbuto: pull-mbuto
|
||||||
|
|
||||||
|
podman:
|
||||||
|
git clone https://github.com/containers/podman.git
|
||||||
|
|
||||||
|
# To successfully build podman, you will need gpgme and systemd
|
||||||
|
# development packages
|
||||||
|
podman/bin/podman: pull-podman
|
||||||
|
$(MAKE) -C podman
|
||||||
|
|
||||||
guest-key guest-key.pub:
|
guest-key guest-key.pub:
|
||||||
ssh-keygen -f guest-key -N ''
|
ssh-keygen -f guest-key -N ''
|
||||||
|
|
||||||
mbuto.img: passt.mbuto mbuto guest-key.pub $(TESTDATA_ASSETS)
|
mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS)
|
||||||
./mbuto/mbuto -p ./$< -c lz4 -f $@
|
./mbuto/mbuto -p ./$< -c lz4 -f $@
|
||||||
|
|
||||||
mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2
|
mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2
|
||||||
|
@ -121,9 +133,6 @@ realclean: clean
|
||||||
debian-8.11.0-openstack-%.qcow2:
|
debian-8.11.0-openstack-%.qcow2:
|
||||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
|
$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
|
||||||
|
|
||||||
debian-9-nocloud-%-daily-20200210-166.qcow2:
|
|
||||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2
|
|
||||||
|
|
||||||
debian-10-nocloud-%.qcow2:
|
debian-10-nocloud-%.qcow2:
|
||||||
$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
|
$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
|
||||||
|
|
||||||
|
@ -189,9 +198,6 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
|
||||||
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
|
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
|
||||||
$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
|
$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
|
||||||
|
|
||||||
openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2:
|
|
||||||
$(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2
|
|
||||||
|
|
||||||
# Ubuntu downloads
|
# Ubuntu downloads
|
||||||
trusty-server-cloudimg-%-disk1.img:
|
trusty-server-cloudimg-%-disk1.img:
|
||||||
$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img
|
$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img
|
||||||
|
|
|
@ -28,10 +28,11 @@ on a system, i.e. common utilities such as a shell are not included here.
|
||||||
|
|
||||||
Example for Debian, and possibly most Debian-based distributions:
|
Example for Debian, and possibly most Debian-based distributions:
|
||||||
|
|
||||||
build-essential git jq strace iperf3 qemu-system-x86 tmux sipcalc bats bc
|
bats bc build-essential catatonit clang-tidy conmon cppcheck crun fakeroot
|
||||||
catatonit clang-tidy cppcheck go isc-dhcp-common psmisc linux-cpupower socat
|
git go iperf3 isc-dhcp-common jq libgpgme-dev libseccomp-dev linux-cpupower
|
||||||
netcat-openbsd fakeroot lz4 lm-sensors qemu-system-arm qemu-system-ppc
|
lm-sensors lz4 netavark netcat-openbsd psmisc qemu-efi-aarch64
|
||||||
qemu-system-misc qemu-system-x86 valgrind
|
qemu-system-arm qemu-system-misc qemu-system-ppc qemu-system-x86
|
||||||
|
qemu-system-x86 sipcalc socat strace tmux uidmap valgrind
|
||||||
|
|
||||||
NOTE: the tests need a qemu version >= 7.2, or one that contains commit
|
NOTE: the tests need a qemu version >= 7.2, or one that contains commit
|
||||||
13c6be96618c ("net: stream: add unix socket"): this change introduces support
|
13c6be96618c ("net: stream: add unix socket"): this change introduces support
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
|
|
||||||
# layout_pasta() - Panes for host, pasta, and separate one for namespace
|
# layout_pasta() - Panes for host, pasta, and separate one for namespace
|
||||||
layout_pasta() {
|
layout_pasta() {
|
||||||
sleep 3
|
sleep 1
|
||||||
|
|
||||||
tmux kill-pane -a -t 0
|
tmux kill-pane -a -t 0
|
||||||
cmd_write 0 clear
|
cmd_write 0 clear
|
||||||
|
@ -46,7 +46,7 @@ layout_pasta() {
|
||||||
|
|
||||||
# layout_passt() - Panes for host, passt, and guest
|
# layout_passt() - Panes for host, passt, and guest
|
||||||
layout_passt() {
|
layout_passt() {
|
||||||
sleep 3
|
sleep 1
|
||||||
|
|
||||||
tmux kill-pane -a -t 0
|
tmux kill-pane -a -t 0
|
||||||
cmd_write 0 clear
|
cmd_write 0 clear
|
||||||
|
@ -77,7 +77,7 @@ layout_passt() {
|
||||||
|
|
||||||
# layout_passt_in_pasta() - Host, passt within pasta, namespace and guest
|
# layout_passt_in_pasta() - Host, passt within pasta, namespace and guest
|
||||||
layout_passt_in_pasta() {
|
layout_passt_in_pasta() {
|
||||||
sleep 3
|
sleep 1
|
||||||
|
|
||||||
tmux kill-pane -a -t 0
|
tmux kill-pane -a -t 0
|
||||||
cmd_write 0 clear
|
cmd_write 0 clear
|
||||||
|
@ -113,7 +113,7 @@ layout_passt_in_pasta() {
|
||||||
|
|
||||||
# layout_two_guests() - Two guest panes, two passt panes, plus host and log
|
# layout_two_guests() - Two guest panes, two passt panes, plus host and log
|
||||||
layout_two_guests() {
|
layout_two_guests() {
|
||||||
sleep 3
|
sleep 1
|
||||||
|
|
||||||
tmux kill-pane -a -t 0
|
tmux kill-pane -a -t 0
|
||||||
cmd_write 0 clear
|
cmd_write 0 clear
|
||||||
|
@ -152,7 +152,7 @@ layout_two_guests() {
|
||||||
|
|
||||||
# layout_demo_pasta() - Four panes for pasta demo
|
# layout_demo_pasta() - Four panes for pasta demo
|
||||||
layout_demo_pasta() {
|
layout_demo_pasta() {
|
||||||
sleep 3
|
sleep 1
|
||||||
|
|
||||||
cmd_write 0 cd ${BASEPATH}
|
cmd_write 0 cd ${BASEPATH}
|
||||||
cmd_write 0 clear
|
cmd_write 0 clear
|
||||||
|
@ -188,7 +188,7 @@ layout_demo_pasta() {
|
||||||
|
|
||||||
# layout_demo_passt() - Four panes for passt demo
|
# layout_demo_passt() - Four panes for passt demo
|
||||||
layout_demo_passt() {
|
layout_demo_passt() {
|
||||||
sleep 3
|
sleep 1
|
||||||
|
|
||||||
cmd_write 0 cd ${BASEPATH}
|
cmd_write 0 cd ${BASEPATH}
|
||||||
cmd_write 0 clear
|
cmd_write 0 clear
|
||||||
|
@ -224,7 +224,7 @@ layout_demo_passt() {
|
||||||
|
|
||||||
# layout_demo_podman() - Four panes for pasta demo with Podman
|
# layout_demo_podman() - Four panes for pasta demo with Podman
|
||||||
layout_demo_podman() {
|
layout_demo_podman() {
|
||||||
sleep 3
|
sleep 1
|
||||||
|
|
||||||
cmd_write 0 cd ${BASEPATH}
|
cmd_write 0 cd ${BASEPATH}
|
||||||
cmd_write 0 clear
|
cmd_write 0 clear
|
||||||
|
|
|
@ -18,7 +18,7 @@ PERF_LINK_COUNT=0
|
||||||
PERF_JS="${LOGDIR}/web/perf.js"
|
PERF_JS="${LOGDIR}/web/perf.js"
|
||||||
|
|
||||||
PERF_TEMPLATE_HTML="document.write('"'
|
PERF_TEMPLATE_HTML="document.write('"'
|
||||||
Throughput in Gbps, latency in µs. Threads are <span style="font-family: monospace;">iperf3</span> processes, <i>passt</i> and <i>pasta</i> are currently single-threaded.<br/>
|
Throughput in Gbps, latency in µs. Threads are <span style="font-family: monospace;">iperf3</span> threads, <i>passt</i> and <i>pasta</i> are currently single-threaded.<br/>
|
||||||
Click on numbers to show test execution. Measured at head, commit <span style="font-family: monospace;">__commit__</span>.
|
Click on numbers to show test execution. Measured at head, commit <span style="font-family: monospace;">__commit__</span>.
|
||||||
|
|
||||||
<style type="text/CSS">
|
<style type="text/CSS">
|
||||||
|
@ -56,7 +56,7 @@ table.pasta_local th { text-align: center; font-weight: bold; }
|
||||||
table.pasta_local tr:not(:first-of-type) td:not(:first-of-type) { font-family: monospace; font-weight: bolder; }
|
table.pasta_local tr:not(:first-of-type) td:not(:first-of-type) { font-family: monospace; font-weight: bolder; }
|
||||||
table.pasta_local tr:nth-child(3n+0) { background-color: #112315; }
|
table.pasta_local tr:nth-child(3n+0) { background-color: #112315; }
|
||||||
table.pasta_local tr:not(:nth-child(3n+0)) td { background-color: #101010; }
|
table.pasta_local tr:not(:nth-child(3n+0)) td { background-color: #101010; }
|
||||||
table.pasta_local td:nth-child(3n+2) { background-color: #603302; }
|
table.pasta_local td:nth-child(4n+2) { background-color: #603302; }
|
||||||
table.pasta_local tr:nth-child(1) { background-color: #363e61; }
|
table.pasta_local tr:nth-child(1) { background-color: #363e61; }
|
||||||
table.pasta td { border: 0px solid; padding: 6px; line-height: 1; }
|
table.pasta td { border: 0px solid; padding: 6px; line-height: 1; }
|
||||||
table.pasta td { text-align: right; }
|
table.pasta td { text-align: right; }
|
||||||
|
|
|
@ -17,6 +17,8 @@ INITRAMFS="${BASEPATH}/mbuto.img"
|
||||||
VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )"
|
VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )"
|
||||||
__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
|
__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
|
||||||
VMEM="$((${__mem_kib} / 1024 / 4))"
|
VMEM="$((${__mem_kib} / 1024 / 4))"
|
||||||
|
QEMU_ARCH="$(uname -m)"
|
||||||
|
[ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386
|
||||||
|
|
||||||
# setup_build() - Set up pane layout for build tests
|
# setup_build() - Set up pane layout for build tests
|
||||||
setup_build() {
|
setup_build() {
|
||||||
|
@ -53,10 +55,10 @@ setup_passt() {
|
||||||
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
||||||
|
|
||||||
GUEST_CID=94557
|
GUEST_CID=94557
|
||||||
context_run_bg qemu 'qemu-system-$(uname -m)' \
|
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
|
||||||
' -machine accel=kvm' \
|
' -machine accel=kvm' \
|
||||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
' -kernel '"${KERNEL}" \
|
||||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||||
' -nodefaults' \
|
' -nodefaults' \
|
||||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||||
|
@ -124,7 +126,12 @@ setup_passt_in_ns() {
|
||||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||||
|
|
||||||
context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold"
|
__map_host4=192.0.2.1
|
||||||
|
__map_host6=2001:db8:9a55::1
|
||||||
|
__map_ns4=192.0.2.2
|
||||||
|
__map_ns6=2001:db8:9a55::2
|
||||||
|
|
||||||
|
context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --map-host-loopback ${__map_host4} --map-host-loopback ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold"
|
||||||
wait_for [ -f "${STATESETUP}/pasta.pid" ]
|
wait_for [ -f "${STATESETUP}/pasta.pid" ]
|
||||||
|
|
||||||
context_setup_nstool qemu ${STATESETUP}/ns.hold
|
context_setup_nstool qemu ${STATESETUP}/ns.hold
|
||||||
|
@ -139,20 +146,20 @@ setup_passt_in_ns() {
|
||||||
if [ ${VALGRIND} -eq 1 ]; then
|
if [ ${VALGRIND} -eq 1 ]; then
|
||||||
context_run passt "make clean"
|
context_run passt "make clean"
|
||||||
context_run passt "make valgrind"
|
context_run passt "make valgrind"
|
||||||
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid"
|
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
|
||||||
else
|
else
|
||||||
context_run passt "make clean"
|
context_run passt "make clean"
|
||||||
context_run passt "make"
|
context_run passt "make"
|
||||||
context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid"
|
context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
|
||||||
fi
|
fi
|
||||||
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
||||||
|
|
||||||
GUEST_CID=94557
|
GUEST_CID=94557
|
||||||
context_run_bg qemu 'qemu-system-$(uname -m)' \
|
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
|
||||||
' -machine accel=kvm' \
|
' -machine accel=kvm' \
|
||||||
' -M accel=kvm:tcg' \
|
' -M accel=kvm:tcg' \
|
||||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
' -kernel '"${KERNEL}" \
|
||||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||||
' -nodefaults' \
|
' -nodefaults' \
|
||||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||||
|
@ -220,10 +227,10 @@ setup_two_guests() {
|
||||||
wait_for [ -f "${STATESETUP}/passt_2.pid" ]
|
wait_for [ -f "${STATESETUP}/passt_2.pid" ]
|
||||||
|
|
||||||
GUEST_1_CID=94557
|
GUEST_1_CID=94557
|
||||||
context_run_bg qemu_1 'qemu-system-$(uname -m)' \
|
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
|
||||||
' -M accel=kvm:tcg' \
|
' -M accel=kvm:tcg' \
|
||||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
' -kernel '"${KERNEL}" \
|
||||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||||
' -nodefaults' \
|
' -nodefaults' \
|
||||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||||
|
@ -233,10 +240,10 @@ setup_two_guests() {
|
||||||
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
|
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
|
||||||
|
|
||||||
GUEST_2_CID=94558
|
GUEST_2_CID=94558
|
||||||
context_run_bg qemu_2 'qemu-system-$(uname -m)' \
|
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
|
||||||
' -M accel=kvm:tcg' \
|
' -M accel=kvm:tcg' \
|
||||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||||
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
|
' -kernel '"${KERNEL}" \
|
||||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||||
' -nodefaults' \
|
' -nodefaults' \
|
||||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||||
|
|
|
@ -31,8 +31,8 @@ PR_DELAY_INIT=100 # ms
|
||||||
# $@: Message to print
|
# $@: Message to print
|
||||||
info() {
|
info() {
|
||||||
tmux select-pane -t ${PANE_INFO}
|
tmux select-pane -t ${PANE_INFO}
|
||||||
echo "${@}" >> $STATEBASE/log_pipe
|
printf "${@}\n" >> $STATEBASE/log_pipe
|
||||||
echo "${@}" >> "${LOGFILE}"
|
printf "${@}\n" >> "${LOGFILE}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# info_n() - Highlight, print message to pane and to log file without newline
|
# info_n() - Highlight, print message to pane and to log file without newline
|
||||||
|
@ -47,13 +47,13 @@ info_n() {
|
||||||
# $@: Message to print
|
# $@: Message to print
|
||||||
info_nolog() {
|
info_nolog() {
|
||||||
tmux select-pane -t ${PANE_INFO}
|
tmux select-pane -t ${PANE_INFO}
|
||||||
echo "${@}" >> $STATEBASE/log_pipe
|
printf "${@}\n" >> $STATEBASE/log_pipe
|
||||||
}
|
}
|
||||||
|
|
||||||
# log() - Print message to log file
|
# log() - Print message to log file
|
||||||
# $@: Message to print
|
# $@: Message to print
|
||||||
log() {
|
log() {
|
||||||
echo "${@}" >> "${LOGFILE}"
|
printf "${@}\n" >> "${LOGFILE}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# info_nolog_n() - Send message to pane without highlighting it, without newline
|
# info_nolog_n() - Send message to pane without highlighting it, without newline
|
||||||
|
@ -97,7 +97,6 @@ display_delay() {
|
||||||
switch_pane() {
|
switch_pane() {
|
||||||
tmux select-pane -t ${1}
|
tmux select-pane -t ${1}
|
||||||
PR_DELAY=${PR_DELAY_INIT}
|
PR_DELAY=${PR_DELAY_INIT}
|
||||||
display_delay "0.2"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# cmd_write() - Write a command to a pane, letter by letter, and execute it
|
# cmd_write() - Write a command to a pane, letter by letter, and execute it
|
||||||
|
@ -199,7 +198,7 @@ pane_run() {
|
||||||
# $1: Pane name
|
# $1: Pane name
|
||||||
pane_wait() {
|
pane_wait() {
|
||||||
__lc="$(echo "${1}" | tr [A-Z] [a-z])"
|
__lc="$(echo "${1}" | tr [A-Z] [a-z])"
|
||||||
sleep 0.1 || sleep 1
|
sleep 0.01 || sleep 1
|
||||||
|
|
||||||
__done=0
|
__done=0
|
||||||
while
|
while
|
||||||
|
@ -207,7 +206,7 @@ pane_wait() {
|
||||||
case ${__l} in
|
case ${__l} in
|
||||||
*"$ " | *"# ") return ;;
|
*"$ " | *"# ") return ;;
|
||||||
esac
|
esac
|
||||||
do sleep 0.1 || sleep 1; done
|
do sleep 0.01 || sleep 1; done
|
||||||
}
|
}
|
||||||
|
|
||||||
# pane_parse() - Print last line, @EMPTY@ if command had no output
|
# pane_parse() - Print last line, @EMPTY@ if command had no output
|
||||||
|
@ -231,7 +230,7 @@ pane_status() {
|
||||||
|
|
||||||
__status="$(pane_parse "${1}")"
|
__status="$(pane_parse "${1}")"
|
||||||
while ! [ "${__status}" -eq "${__status}" ] 2>/dev/null; do
|
while ! [ "${__status}" -eq "${__status}" ] 2>/dev/null; do
|
||||||
sleep 1
|
sleep 0.01 || sleep 1
|
||||||
pane_run "${1}" 'echo $?'
|
pane_run "${1}" 'echo $?'
|
||||||
pane_wait "${1}"
|
pane_wait "${1}"
|
||||||
__status="$(pane_parse "${1}")"
|
__status="$(pane_parse "${1}")"
|
||||||
|
@ -383,6 +382,16 @@ info_check_failed() {
|
||||||
printf " < failed.\n" >> "${LOGFILE}"
|
printf " < failed.\n" >> "${LOGFILE}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# status_bar_blink() - Make status bar blink
|
||||||
|
status_bar_blink() {
|
||||||
|
for i in `seq 1 3`; do
|
||||||
|
tmux set status-right-style 'bg=colour1 fg=colour196 bold'
|
||||||
|
sleep 0.1 || sleep 1
|
||||||
|
tmux set status-right-style 'bg=colour1 fg=colour233 bold'
|
||||||
|
sleep 0.1 || sleep 1
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
# info_passed() - Display, log, and make status bar blink when a test passes
|
# info_passed() - Display, log, and make status bar blink when a test passes
|
||||||
info_passed() {
|
info_passed() {
|
||||||
switch_pane ${PANE_INFO}
|
switch_pane ${PANE_INFO}
|
||||||
|
@ -391,12 +400,7 @@ info_passed() {
|
||||||
log "...passed."
|
log "...passed."
|
||||||
log
|
log
|
||||||
|
|
||||||
for i in `seq 1 3`; do
|
[ ${FAST} -eq 1 ] || status_bar_blink
|
||||||
tmux set status-right-style 'bg=colour1 fg=colour2 bold'
|
|
||||||
sleep "0.1"
|
|
||||||
tmux set status-right-style 'bg=colour1 fg=colour233 bold'
|
|
||||||
sleep "0.1"
|
|
||||||
done
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# info_failed() - Display, log, and make status bar blink when a test passes
|
# info_failed() - Display, log, and make status bar blink when a test passes
|
||||||
|
@ -407,12 +411,7 @@ info_failed() {
|
||||||
log "...failed."
|
log "...failed."
|
||||||
log
|
log
|
||||||
|
|
||||||
for i in `seq 1 3`; do
|
[ ${FAST} -eq 1 ] || status_bar_blink
|
||||||
tmux set status-right-style 'bg=colour1 fg=colour196 bold'
|
|
||||||
sleep "0.1"
|
|
||||||
tmux set status-right-style 'bg=colour1 fg=colour233 bold'
|
|
||||||
sleep "0.1"
|
|
||||||
done
|
|
||||||
|
|
||||||
pause_continue \
|
pause_continue \
|
||||||
"Press any key to pause test session" \
|
"Press any key to pause test session" \
|
||||||
|
@ -665,7 +664,7 @@ pause_continue() {
|
||||||
|
|
||||||
# run_term() - Start tmux session, running entry point, with recording if needed
|
# run_term() - Start tmux session, running entry point, with recording if needed
|
||||||
run_term() {
|
run_term() {
|
||||||
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG"
|
TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL"
|
||||||
|
|
||||||
if [ ${CI} -eq 1 ]; then
|
if [ ${CI} -eq 1 ]; then
|
||||||
printf '\e[8;50;240t'
|
printf '\e[8;50;240t'
|
||||||
|
|
|
@ -15,18 +15,13 @@
|
||||||
|
|
||||||
# test_iperf3s() - Start iperf3 server
|
# test_iperf3s() - Start iperf3 server
|
||||||
# $1: Destination/server context
|
# $1: Destination/server context
|
||||||
# $2: Port number, ${i} is translated to process index
|
# $2: Port number
|
||||||
# $3: Number of processes to run in parallel
|
|
||||||
test_iperf3s() {
|
test_iperf3s() {
|
||||||
__sctx="${1}"
|
__sctx="${1}"
|
||||||
__port="${2}"
|
__port="${2}"
|
||||||
__procs="$((${3} - 1))"
|
|
||||||
|
|
||||||
pane_or_context_run_bg "${__sctx}" \
|
pane_or_context_run_bg "${__sctx}" \
|
||||||
'for i in $(seq 0 '${__procs}'); do' \
|
'iperf3 -s -p'${__port}' & echo $! > s.pid' \
|
||||||
' iperf3 -s -p'${__port}' &' \
|
|
||||||
' echo $! > s${i}.pid; ' \
|
|
||||||
'done' \
|
|
||||||
|
|
||||||
sleep 1 # Wait for server to be ready
|
sleep 1 # Wait for server to be ready
|
||||||
}
|
}
|
||||||
|
@ -36,9 +31,9 @@ test_iperf3s() {
|
||||||
test_iperf3k() {
|
test_iperf3k() {
|
||||||
__sctx="${1}"
|
__sctx="${1}"
|
||||||
|
|
||||||
pane_or_context_run "${__sctx}" 'kill -INT $(cat s*.pid); rm s*.pid'
|
pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
|
||||||
|
|
||||||
sleep 3 # Wait for kernel to free up ports
|
sleep 1 # Wait for kernel to free up ports
|
||||||
}
|
}
|
||||||
|
|
||||||
# test_iperf3() - Ugly helper for iperf3 directive
|
# test_iperf3() - Ugly helper for iperf3 directive
|
||||||
|
@ -46,37 +41,29 @@ test_iperf3k() {
|
||||||
# $2: Source/client context
|
# $2: Source/client context
|
||||||
# $3: Destination name or address for client
|
# $3: Destination name or address for client
|
||||||
# $4: Port number, ${i} is translated to process index
|
# $4: Port number, ${i} is translated to process index
|
||||||
# $5: Number of processes to run in parallel
|
# $5: Run time, in seconds
|
||||||
# $6: Run time, in seconds
|
|
||||||
# $@: Client options
|
# $@: Client options
|
||||||
test_iperf3() {
|
test_iperf3() {
|
||||||
__var="${1}"; shift
|
__var="${1}"; shift
|
||||||
__cctx="${1}"; shift
|
__cctx="${1}"; shift
|
||||||
__dest="${1}"; shift
|
__dest="${1}"; shift
|
||||||
__port="${1}"; shift
|
__port="${1}"; shift
|
||||||
__procs="$((${1} - 1))"; shift
|
|
||||||
__time="${1}"; shift
|
__time="${1}"; shift
|
||||||
|
|
||||||
pane_or_context_run "${__cctx}" 'rm -f c*.json'
|
pane_or_context_run "${__cctx}" 'rm -f c.json'
|
||||||
|
|
||||||
# A 1s wait for connection on what's basically a local link
|
# A 1s wait for connection on what's basically a local link
|
||||||
# indicates something is pretty wrong
|
# indicates something is pretty wrong
|
||||||
__timeout=1000
|
__timeout=1000
|
||||||
pane_or_context_run "${__cctx}" \
|
pane_or_context_run "${__cctx}" \
|
||||||
'(' \
|
'iperf3 -J -c '${__dest}' -p '${__port} \
|
||||||
' for i in $(seq 0 '${__procs}'); do' \
|
' --connect-timeout '${__timeout} \
|
||||||
' iperf3 -J -c '${__dest}' -p '${__port} \
|
' -t'${__time}' -i0 '"${@}"' > c.json' \
|
||||||
' --connect-timeout '${__timeout} \
|
|
||||||
' -t'${__time}' -i0 -T c${i} '"${@}" \
|
|
||||||
' > c${i}.json &' \
|
|
||||||
' done;' \
|
|
||||||
' wait' \
|
|
||||||
')'
|
|
||||||
|
|
||||||
__jval=".end.sum_received.bits_per_second"
|
__jval=".end.sum_received.bits_per_second"
|
||||||
|
|
||||||
__bw=$(pane_or_context_output "${__cctx}" \
|
__bw=$(pane_or_context_output "${__cctx}" \
|
||||||
'cat c*.json | jq -rMs "map('${__jval}') | add"')
|
'cat c.json | jq -rMs "map('${__jval}') | add"')
|
||||||
|
|
||||||
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
|
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,7 +44,7 @@ endef
|
||||||
def start_stop_diff
|
def start_stop_diff
|
||||||
guest sed /proc/slabinfo -ne 's/^\([^ ]* *[^ ]* *[^ ]* *[^ ]*\).*/\\\1/p' > /tmp/slabinfo.before
|
guest sed /proc/slabinfo -ne 's/^\([^ ]* *[^ ]* *[^ ]* *[^ ]*\).*/\\\1/p' > /tmp/slabinfo.before
|
||||||
guest cat /proc/meminfo > /tmp/meminfo.before
|
guest cat /proc/meminfo > /tmp/meminfo.before
|
||||||
guest /bin/passt.avx2 -l /tmp/log -s /tmp/sock -P /tmp/pid __OPTS__ --netns-only
|
guest /bin/passt.avx2 -l /tmp/log -s /tmp/sock -P /tmp/pid __OPTS__
|
||||||
sleep 2
|
sleep 2
|
||||||
guest cat /proc/meminfo > /tmp/meminfo.after
|
guest cat /proc/meminfo > /tmp/meminfo.after
|
||||||
guest sed /proc/slabinfo -ne 's/^\([^ ]* *[^ ]* *[^ ]* *[^ ]*\).*/\\\1/p' > /tmp/slabinfo.after
|
guest sed /proc/slabinfo -ne 's/^\([^ ]* *[^ ]* *[^ ]* *[^ ]*\).*/\\\1/p' > /tmp/slabinfo.after
|
||||||
|
@ -78,9 +78,16 @@ guest mount -o bind /proc /test/proc
|
||||||
guest mount -o bind /dev /test/dev
|
guest mount -o bind /dev /test/dev
|
||||||
guest cp -Lr /bin /lib /lib64 /usr /sbin /test/
|
guest cp -Lr /bin /lib /lib64 /usr /sbin /test/
|
||||||
|
|
||||||
|
guest exec switch_root /test /bin/sh
|
||||||
|
|
||||||
guest ulimit -Hn 300000
|
guest ulimit -Hn 300000
|
||||||
guest unshare -rUm -R /test
|
guest unshare -rUn
|
||||||
guest chroot .
|
guest ip link add eth0 type dummy
|
||||||
|
guest ip link set eth0 up
|
||||||
|
guest ip address add 192.0.2.2/24 dev eth0
|
||||||
|
guest ip address add 2001:db8::2/64 dev eth0
|
||||||
|
guest ip route add default via 192.0.2.1
|
||||||
|
guest ip -6 route add default via 2001:db8::1 dev eth0
|
||||||
|
|
||||||
guest meminfo_size() { grep "^$2:" $1 | tr -s ' ' | cut -f2 -d ' '; }
|
guest meminfo_size() { grep "^$2:" $1 | tr -s ' ' | cut -f2 -d ' '; }
|
||||||
guest meminfo_diff() { echo $(( $(meminfo_size $2 $3) - $(meminfo_size $1 $3) )); }
|
guest meminfo_diff() { echo $(( $(meminfo_size $2 $3) - $(meminfo_size $1 $3) )); }
|
||||||
|
@ -103,27 +110,17 @@ info
|
||||||
th symbol MiB
|
th symbol MiB
|
||||||
set WHAT tcp_buf_discard
|
set WHAT tcp_buf_discard
|
||||||
nm_row
|
nm_row
|
||||||
set WHAT tcp6_l2_buf
|
set WHAT flowtab
|
||||||
nm_row
|
nm_row
|
||||||
set WHAT tcp4_l2_buf
|
set WHAT tcp6_payload
|
||||||
nm_row
|
nm_row
|
||||||
set WHAT tc
|
set WHAT tcp4_payload
|
||||||
nm_row
|
nm_row
|
||||||
set WHAT pkt_buf
|
set WHAT pkt_buf
|
||||||
nm_row
|
nm_row
|
||||||
set WHAT udp_splice_map
|
set WHAT udp_payload
|
||||||
nm_row
|
nm_row
|
||||||
set WHAT udp6_l2_buf
|
set WHAT flow_hashtab
|
||||||
nm_row
|
|
||||||
set WHAT udp4_l2_buf
|
|
||||||
nm_row
|
|
||||||
set WHAT udp_tap_map
|
|
||||||
nm_row
|
|
||||||
set WHAT icmp_id_map
|
|
||||||
nm_row
|
|
||||||
set WHAT udp_splice_buf
|
|
||||||
nm_row
|
|
||||||
set WHAT tc_hash
|
|
||||||
nm_row
|
nm_row
|
||||||
set WHAT pool_tap6_storage
|
set WHAT pool_tap6_storage
|
||||||
nm_row
|
nm_row
|
||||||
|
@ -142,8 +139,6 @@ set WHAT pid
|
||||||
slab_row
|
slab_row
|
||||||
set WHAT dentry
|
set WHAT dentry
|
||||||
slab_row
|
slab_row
|
||||||
set WHAT Acpi-Parse
|
|
||||||
slab_row
|
|
||||||
set WHAT kmalloc-64
|
set WHAT kmalloc-64
|
||||||
slab_row
|
slab_row
|
||||||
set WHAT kmalloc-32
|
set WHAT kmalloc-32
|
||||||
|
|
|
@ -31,10 +31,15 @@
|
||||||
|
|
||||||
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
|
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
|
||||||
|
|
||||||
#define die(...) \
|
#define die(...) \
|
||||||
do { \
|
do { \
|
||||||
fprintf(stderr, __VA_ARGS__); \
|
fprintf(stderr, "nstool: " __VA_ARGS__); \
|
||||||
exit(1); \
|
exit(1); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define err(...) \
|
||||||
|
do { \
|
||||||
|
fprintf(stderr, "nstool: " __VA_ARGS__); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
struct ns_type {
|
struct ns_type {
|
||||||
|
@ -156,6 +161,9 @@ static int connect_ctl(const char *sockpath, bool wait,
|
||||||
|
|
||||||
static void cmd_hold(int argc, char *argv[])
|
static void cmd_hold(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
|
struct sigaction sa = {
|
||||||
|
.sa_handler = SIG_IGN,
|
||||||
|
};
|
||||||
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
|
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
|
||||||
struct sockaddr_un addr;
|
struct sockaddr_un addr;
|
||||||
const char *sockpath = argv[1];
|
const char *sockpath = argv[1];
|
||||||
|
@ -185,6 +193,10 @@ static void cmd_hold(int argc, char *argv[])
|
||||||
if (!getcwd(info.cwd, sizeof(info.cwd)))
|
if (!getcwd(info.cwd, sizeof(info.cwd)))
|
||||||
die("getcwd(): %s\n", strerror(errno));
|
die("getcwd(): %s\n", strerror(errno));
|
||||||
|
|
||||||
|
rc = sigaction(SIGPIPE, &sa, NULL);
|
||||||
|
if (rc)
|
||||||
|
die("sigaction(SIGPIPE): %s\n", strerror(errno));
|
||||||
|
|
||||||
do {
|
do {
|
||||||
int afd = accept(fd, NULL, NULL);
|
int afd = accept(fd, NULL, NULL);
|
||||||
char buf;
|
char buf;
|
||||||
|
@ -193,17 +205,21 @@ static void cmd_hold(int argc, char *argv[])
|
||||||
die("accept(): %s\n", strerror(errno));
|
die("accept(): %s\n", strerror(errno));
|
||||||
|
|
||||||
rc = write(afd, &info, sizeof(info));
|
rc = write(afd, &info, sizeof(info));
|
||||||
if (rc < 0)
|
if (rc < 0) {
|
||||||
die("write(): %s\n", strerror(errno));
|
err("holder write() to control socket: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
}
|
||||||
if ((size_t)rc < sizeof(info))
|
if ((size_t)rc < sizeof(info))
|
||||||
die("short write() on control socket\n");
|
err("holder short write() on control socket\n");
|
||||||
|
|
||||||
rc = read(afd, &buf, sizeof(buf));
|
rc = read(afd, &buf, sizeof(buf));
|
||||||
if (rc < 0)
|
if (rc < 0) {
|
||||||
die("read(): %s\n", strerror(errno));
|
err("holder read() on control socket: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
}
|
||||||
|
|
||||||
close(afd);
|
close(afd);
|
||||||
} while (rc == 0);
|
} while (rc <= 0);
|
||||||
|
|
||||||
unlink(sockpath);
|
unlink(sockpath);
|
||||||
}
|
}
|
||||||
|
@ -345,21 +361,43 @@ static int openns(const char *fmt, ...)
|
||||||
return fd;
|
return fd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static pid_t sig_pid;
|
||||||
|
static void sig_propagate(int signum)
|
||||||
|
{
|
||||||
|
int err;
|
||||||
|
|
||||||
|
err = kill(sig_pid, signum);
|
||||||
|
if (err)
|
||||||
|
die("Propagating %s: %s\n", strsignal(signum), strerror(errno));
|
||||||
|
}
|
||||||
|
|
||||||
static void wait_for_child(pid_t pid)
|
static void wait_for_child(pid_t pid)
|
||||||
{
|
{
|
||||||
int status;
|
struct sigaction sa = {
|
||||||
|
.sa_handler = sig_propagate,
|
||||||
|
.sa_flags = SA_RESETHAND,
|
||||||
|
};
|
||||||
|
int status, err;
|
||||||
|
|
||||||
|
sig_pid = pid;
|
||||||
|
err = sigaction(SIGTERM, &sa, NULL);
|
||||||
|
if (err)
|
||||||
|
die("sigaction(SIGTERM): %s\n", strerror(errno));
|
||||||
|
|
||||||
/* Match the child's exit status, if possible */
|
/* Match the child's exit status, if possible */
|
||||||
for (;;) {
|
for (;;) {
|
||||||
pid_t rc;
|
pid_t rc;
|
||||||
|
|
||||||
rc = waitpid(pid, &status, WUNTRACED);
|
rc = waitpid(pid, &status, WUNTRACED);
|
||||||
if (rc < 0)
|
if (rc < 0) {
|
||||||
|
if (errno == EINTR)
|
||||||
|
continue;
|
||||||
die("waitpid() on %d: %s\n", pid, strerror(errno));
|
die("waitpid() on %d: %s\n", pid, strerror(errno));
|
||||||
|
}
|
||||||
if (rc != pid)
|
if (rc != pid)
|
||||||
die("waitpid() on %d returned %d", pid, rc);
|
die("waitpid() on %d returned %d", pid, rc);
|
||||||
if (WIFSTOPPED(status)) {
|
if (WIFSTOPPED(status)) {
|
||||||
/* Stop the parent to patch */
|
/* Stop the parent to match */
|
||||||
kill(getpid(), SIGSTOP);
|
kill(getpid(), SIGSTOP);
|
||||||
/* We must have resumed, resume the child */
|
/* We must have resumed, resume the child */
|
||||||
kill(pid, SIGCONT);
|
kill(pid, SIGCONT);
|
||||||
|
@ -508,7 +546,7 @@ static void cmd_exec(int argc, char *argv[])
|
||||||
/* CHILD */
|
/* CHILD */
|
||||||
if (argc > optind + 1) {
|
if (argc > optind + 1) {
|
||||||
exe = argv[optind + 1];
|
exe = argv[optind + 1];
|
||||||
xargs = (const char * const*)(argv + optind + 1);
|
xargs = (const char *const *)(argv + optind + 1);
|
||||||
} else {
|
} else {
|
||||||
exe = getenv("SHELL");
|
exe = getenv("SHELL");
|
||||||
if (!exe)
|
if (!exe)
|
||||||
|
|
|
@ -15,6 +15,14 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
|
||||||
sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
|
sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
|
||||||
nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}"
|
nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}"
|
||||||
|
|
||||||
|
# OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
|
||||||
|
# sshd-session the per-session program. We need the latter as well, and the path
|
||||||
|
# depends on the distribution. It doesn't exist on older versions.
|
||||||
|
for bin in /usr/lib/openssh/sshd-session /usr/lib/ssh/sshd-session \
|
||||||
|
/usr/libexec/openssh/sshd-session; do
|
||||||
|
command -v "${bin}" >/dev/null && PROGS="${PROGS} ${bin}"
|
||||||
|
done
|
||||||
|
|
||||||
KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}"
|
KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}"
|
||||||
|
|
||||||
LINKS="${LINKS:-
|
LINKS="${LINKS:-
|
||||||
|
@ -54,7 +62,7 @@ EOF
|
||||||
ln -s /run /var/run
|
ln -s /run /var/run
|
||||||
:> /etc/fstab
|
:> /etc/fstab
|
||||||
|
|
||||||
# sshd(dropbear) via vsock
|
# sshd via vsock
|
||||||
cat > /etc/passwd << EOF
|
cat > /etc/passwd << EOF
|
||||||
root:x:0:0:root:/root:/bin/sh
|
root:x:0:0:root:/root:/bin/sh
|
||||||
sshd:x:100:100:Privilege-separated SSH:/var/empty/sshd:/sbin/nologin
|
sshd:x:100:100:Privilege-separated SSH:/var/empty/sshd:/sbin/nologin
|
||||||
|
@ -64,7 +72,9 @@ root:::0:99999:7:::
|
||||||
EOF
|
EOF
|
||||||
chmod 000 /etc/shadow
|
chmod 000 /etc/shadow
|
||||||
|
|
||||||
:> /etc/ssh/sshd_config
|
cat > /etc/ssh/sshd_config << EOF
|
||||||
|
Subsystem sftp internal-sftp
|
||||||
|
EOF
|
||||||
ssh-keygen -A
|
ssh-keygen -A
|
||||||
chmod 700 /root/.ssh
|
chmod 700 /root/.ssh
|
||||||
chmod 700 /run/sshd
|
chmod 700 /run/sshd
|
||||||
|
@ -76,7 +86,7 @@ EOF
|
||||||
EOF
|
EOF
|
||||||
chmod 600 /root/.ssh/authorized_keys
|
chmod 600 /root/.ssh/authorized_keys
|
||||||
chmod 700 /root
|
chmod 700 /root
|
||||||
socat VSOCK-LISTEN:22,fork EXEC:"sshd -i -e" 2> /var/log/vsock-ssh.log &
|
socat VSOCK-LISTEN:22,fork EXEC:"/sbin/sshd -i -e" 2> /var/log/vsock-ssh.log &
|
||||||
sh +m
|
sh +m
|
||||||
'
|
'
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
|
|
||||||
PROGS="${PROGS:-ash,dash,bash chmod ip mount insmod mkdir ln cat chmod modprobe
|
PROGS="${PROGS:-ash,dash,bash chmod ip mount insmod mkdir ln cat chmod modprobe
|
||||||
grep mknod sed chown sleep bc ls ps mount unshare chroot cp kill diff
|
grep mknod sed chown sleep bc ls ps mount unshare chroot cp kill diff
|
||||||
head tail sort tr tee cut nm which}"
|
head tail sort tr tee cut nm which switch_root}"
|
||||||
|
|
||||||
KMODS="${KMODS:- dummy}"
|
KMODS="${KMODS:- dummy}"
|
||||||
|
|
||||||
|
@ -29,13 +29,6 @@ COPIES="${COPIES} ../passt.avx2,/bin/passt.avx2"
|
||||||
FIXUP="${FIXUP}"'
|
FIXUP="${FIXUP}"'
|
||||||
ln -s /bin /usr/bin
|
ln -s /bin /usr/bin
|
||||||
chmod 777 /tmp
|
chmod 777 /tmp
|
||||||
ip link add eth0 type dummy
|
|
||||||
ip link set eth0 up
|
|
||||||
ip address add 192.0.2.2/24 dev eth0
|
|
||||||
ip address add 2001:db8::2/64 dev eth0
|
|
||||||
ip route add default via 192.0.2.1
|
|
||||||
ip -6 route add default via 2001:db8::1 dev eth0
|
|
||||||
sleep 2
|
|
||||||
sh +m
|
sh +m
|
||||||
'
|
'
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,7 @@ check [ __MTU__ = 65520 ]
|
||||||
test DHCP: DNS
|
test DHCP: DNS
|
||||||
gout DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
gout DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
hout HOST_DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | head -n3 | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
hout HOST_DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | head -n3 | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
check [ "__DNS__" = "__HOST_DNS__" ] || [ "__DNS__" = "__HOST_GW__" -a "__HOST_DNS__" = "127.0.0.1" ]
|
check [ "__DNS__" = "__HOST_DNS__" ] || ( [ "__DNS__" = "__HOST_GW__" ] && expr "__HOST_DNS__" : "127[.]" )
|
||||||
|
|
||||||
# FQDNs should be terminated by dots, but the guest DHCP client might omit them:
|
# FQDNs should be terminated by dots, but the guest DHCP client might omit them:
|
||||||
# strip them first
|
# strip them first
|
||||||
|
@ -49,8 +49,10 @@ check [ "__SEARCH__" = "__HOST_SEARCH__" ]
|
||||||
|
|
||||||
test DHCPv6: address
|
test DHCPv6: address
|
||||||
guest /sbin/dhclient -6 __IFNAME__
|
guest /sbin/dhclient -6 __IFNAME__
|
||||||
|
# Wait for DAD to complete
|
||||||
|
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]'
|
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||||
check [ "__ADDR6__" = "__HOST_ADDR6__" ]
|
check [ "__ADDR6__" = "__HOST_ADDR6__" ]
|
||||||
|
|
||||||
test DHCPv6: route
|
test DHCPv6: route
|
||||||
|
|
|
@ -16,14 +16,16 @@ htools ip jq sipcalc grep cut
|
||||||
|
|
||||||
test Interface name
|
test Interface name
|
||||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||||
guest ip link set dev __IFNAME__ up && sleep 2
|
guest ip link set dev __IFNAME__ up
|
||||||
|
# Wait for DAD to complete
|
||||||
|
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||||
check [ -n "__IFNAME__" ]
|
check [ -n "__IFNAME__" ]
|
||||||
|
|
||||||
test SLAAC: prefix
|
test SLAAC: prefix
|
||||||
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
|
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
|
||||||
gout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
|
gout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
|
||||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]'
|
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||||
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
|
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||||
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
|
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
|
||||||
|
|
||||||
|
|
75
test/passt_in_ns/dhcp
Normal file
75
test/passt_in_ns/dhcp
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
#
|
||||||
|
# PASST - Plug A Simple Socket Transport
|
||||||
|
# for qemu/UNIX domain socket mode
|
||||||
|
#
|
||||||
|
# PASTA - Pack A Subtle Tap Abstraction
|
||||||
|
# for network namespace/tap device mode
|
||||||
|
#
|
||||||
|
# test/passt/dhcp - Check DHCP and DHCPv6 functionality in passt mode
|
||||||
|
#
|
||||||
|
# Copyright (c) 2021 Red Hat GmbH
|
||||||
|
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||||
|
|
||||||
|
gtools ip jq dhclient sed tr
|
||||||
|
htools ip jq sed tr head
|
||||||
|
|
||||||
|
set MAP_NS4 192.0.2.2
|
||||||
|
set MAP_NS6 2001:db8:9a55::2
|
||||||
|
|
||||||
|
test Interface name
|
||||||
|
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||||
|
hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||||
|
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||||
|
check [ -n "__IFNAME__" ]
|
||||||
|
|
||||||
|
test DHCP: address
|
||||||
|
guest /sbin/dhclient -4 __IFNAME__
|
||||||
|
gout ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
||||||
|
hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
|
||||||
|
check [ "__ADDR__" = "__HOST_ADDR__" ]
|
||||||
|
|
||||||
|
test DHCP: route
|
||||||
|
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||||
|
hout HOST_GW ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]'
|
||||||
|
check [ "__GW__" = "__HOST_GW__" ]
|
||||||
|
|
||||||
|
test DHCP: MTU
|
||||||
|
gout MTU ip -j link show | jq -rM '.[] | select(.ifname == "__IFNAME__").mtu'
|
||||||
|
check [ __MTU__ = 65520 ]
|
||||||
|
|
||||||
|
test DHCP: DNS
|
||||||
|
gout DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
|
hout HOST_DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | head -n3 | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
|
check [ "__DNS__" = "__HOST_DNS__" ] || ( [ "__DNS__" = "__MAP_NS4__" ] && expr "__HOST_DNS__" : "127[.]" )
|
||||||
|
|
||||||
|
# FQDNs should be terminated by dots, but the guest DHCP client might omit them:
|
||||||
|
# strip them first
|
||||||
|
test DHCP: search list
|
||||||
|
gout SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
|
hout HOST_SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
|
check [ "__SEARCH__" = "__HOST_SEARCH__" ]
|
||||||
|
|
||||||
|
test DHCPv6: address
|
||||||
|
guest /sbin/dhclient -6 __IFNAME__
|
||||||
|
# Wait for DAD to complete
|
||||||
|
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||||
|
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||||
|
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||||
|
check [ "__ADDR6__" = "__HOST_ADDR6__" ]
|
||||||
|
|
||||||
|
test DHCPv6: route
|
||||||
|
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
||||||
|
hout HOST_GW6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]'
|
||||||
|
check [ "__GW6__" = "__HOST_GW6__" ]
|
||||||
|
|
||||||
|
# Strip interface specifier: interface names might differ between host and guest
|
||||||
|
test DHCPv6: DNS
|
||||||
|
gout DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
|
hout HOST_DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
|
check [ "__DNS6__" = "__HOST_DNS6__" ] || [ "__DNS6__" = "__MAP_NS6__" -a "__HOST_DNS6__" = "::1" ]
|
||||||
|
|
||||||
|
test DHCPv6: search list
|
||||||
|
gout SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
|
hout HOST_SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
|
||||||
|
check [ "__SEARCH6__" = "__HOST_SEARCH6__" ]
|
|
@ -15,6 +15,11 @@ gtools socat ip jq
|
||||||
htools socat ip jq
|
htools socat ip jq
|
||||||
nstools socat ip jq
|
nstools socat ip jq
|
||||||
|
|
||||||
|
set MAP_HOST4 192.0.2.1
|
||||||
|
set MAP_HOST6 2001:db8:9a55::1
|
||||||
|
set MAP_NS4 192.0.2.2
|
||||||
|
set MAP_NS6 2001:db8:9a55::2
|
||||||
|
|
||||||
set TEMP_BIG __STATEDIR__/test_big.bin
|
set TEMP_BIG __STATEDIR__/test_big.bin
|
||||||
set TEMP_SMALL __STATEDIR__/test_small.bin
|
set TEMP_SMALL __STATEDIR__/test_small.bin
|
||||||
set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
|
set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
|
||||||
|
@ -27,7 +32,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10001
|
||||||
guestw
|
guestw
|
||||||
guest cmp test_big.bin /root/big.bin
|
guest cmp test_big.bin /root/big.bin
|
||||||
|
|
||||||
test TCP/IPv4: host to ns: big transfer
|
test TCP/IPv4: host to ns (spliced): big transfer
|
||||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
|
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
|
||||||
|
@ -36,16 +41,15 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
||||||
|
|
||||||
test TCP/IPv4: guest to host: big transfer
|
test TCP/IPv4: guest to host: big transfer
|
||||||
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
||||||
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/big.bin TCP4:__GW__:10003
|
guest socat -u OPEN:/root/big.bin TCP4:__MAP_HOST4__:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||||
|
|
||||||
test TCP/IPv4: guest to ns: big transfer
|
test TCP/IPv4: guest to ns: big transfer
|
||||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/big.bin TCP4:__GW__:10002
|
guest socat -u OPEN:/root/big.bin TCP4:__MAP_NS4__:10002
|
||||||
nsw
|
nsw
|
||||||
check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
||||||
|
|
||||||
|
@ -59,7 +63,7 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||||
test TCP/IPv4: ns to host (via tap): big transfer
|
test TCP/IPv4: ns to host (via tap): big transfer
|
||||||
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
|
ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__MAP_HOST4__:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||||
|
|
||||||
|
@ -86,7 +90,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10001
|
||||||
guestw
|
guestw
|
||||||
guest cmp test_small.bin /root/small.bin
|
guest cmp test_small.bin /root/small.bin
|
||||||
|
|
||||||
test TCP/IPv4: host to ns: small transfer
|
test TCP/IPv4: host to ns (spliced): small transfer
|
||||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
|
host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
|
||||||
|
@ -95,16 +99,15 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
||||||
|
|
||||||
test TCP/IPv4: guest to host: small transfer
|
test TCP/IPv4: guest to host: small transfer
|
||||||
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
||||||
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/small.bin TCP4:__GW__:10003
|
guest socat -u OPEN:/root/small.bin TCP4:__MAP_HOST4__:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||||
|
|
||||||
test TCP/IPv4: guest to ns: small transfer
|
test TCP/IPv4: guest to ns: small transfer
|
||||||
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/small.bin TCP4:__GW__:10002
|
guest socat -u OPEN:/root/small.bin TCP4:__MAP_NS4__:10002
|
||||||
nsw
|
nsw
|
||||||
check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
||||||
|
|
||||||
|
@ -118,7 +121,7 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||||
test TCP/IPv4: ns to host (via tap): small transfer
|
test TCP/IPv4: ns to host (via tap): small transfer
|
||||||
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
|
ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__MAP_HOST4__:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||||
|
|
||||||
|
@ -143,7 +146,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10001
|
||||||
guestw
|
guestw
|
||||||
guest cmp test_big.bin /root/big.bin
|
guest cmp test_big.bin /root/big.bin
|
||||||
|
|
||||||
test TCP/IPv6: host to ns: big transfer
|
test TCP/IPv6: host to ns (spliced): big transfer
|
||||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
|
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
|
||||||
|
@ -152,17 +155,15 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
||||||
|
|
||||||
test TCP/IPv6: guest to host: big transfer
|
test TCP/IPv6: guest to host: big transfer
|
||||||
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
||||||
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10003
|
guest socat -u OPEN:/root/big.bin TCP6:[__MAP_HOST6__]:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||||
|
|
||||||
test TCP/IPv6: guest to ns: big transfer
|
test TCP/IPv6: guest to ns: big transfer
|
||||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10002
|
guest socat -u OPEN:/root/big.bin TCP6:[__MAP_NS6__]:10002
|
||||||
nsw
|
nsw
|
||||||
check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
|
||||||
|
|
||||||
|
@ -175,9 +176,8 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||||
|
|
||||||
test TCP/IPv6: ns to host (via tap): big transfer
|
test TCP/IPv6: ns to host (via tap): big transfer
|
||||||
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
|
||||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
|
||||||
sleep 1
|
sleep 1
|
||||||
ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
|
ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__MAP_HOST6__]:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
|
||||||
|
|
||||||
|
@ -190,6 +190,7 @@ guest cmp test_big.bin /root/big.bin
|
||||||
|
|
||||||
test TCP/IPv6: ns to guest (using namespace address): big transfer
|
test TCP/IPv6: ns to guest (using namespace address): big transfer
|
||||||
guestb socat -u TCP6-LISTEN:10001 OPEN:test_big.bin,create,trunc
|
guestb socat -u TCP6-LISTEN:10001 OPEN:test_big.bin,create,trunc
|
||||||
|
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||||
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
||||||
sleep 1
|
sleep 1
|
||||||
ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__ADDR6__]:10001
|
ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__ADDR6__]:10001
|
||||||
|
@ -203,7 +204,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10001
|
||||||
guestw
|
guestw
|
||||||
guest cmp test_small.bin /root/small.bin
|
guest cmp test_small.bin /root/small.bin
|
||||||
|
|
||||||
test TCP/IPv6: host to ns: small transfer
|
test TCP/IPv6: host to ns (spliced): small transfer
|
||||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
|
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
|
||||||
|
@ -212,17 +213,15 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
||||||
|
|
||||||
test TCP/IPv6: guest to host: small transfer
|
test TCP/IPv6: guest to host: small transfer
|
||||||
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
||||||
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10003
|
guest socat -u OPEN:/root/small.bin TCP6:[__MAP_HOST6__]:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||||
|
|
||||||
test TCP/IPv6: guest to ns: small transfer
|
test TCP/IPv6: guest to ns: small transfer
|
||||||
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__
|
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10002
|
guest socat -u OPEN:/root/small.bin TCP6:[__MAP_NS6__]:10002
|
||||||
nsw
|
nsw
|
||||||
check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
|
||||||
|
|
||||||
|
@ -235,9 +234,8 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||||
|
|
||||||
test TCP/IPv6: ns to host (via tap): small transfer
|
test TCP/IPv6: ns to host (via tap): small transfer
|
||||||
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
|
||||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
|
||||||
sleep 1
|
sleep 1
|
||||||
ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__GW6__%__IFNAME__]:10003
|
ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__MAP_HOST6__]:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
|
||||||
|
|
||||||
|
|
|
@ -15,6 +15,11 @@ gtools socat ip jq
|
||||||
nstools socat ip jq
|
nstools socat ip jq
|
||||||
htools socat ip jq
|
htools socat ip jq
|
||||||
|
|
||||||
|
set MAP_HOST4 192.0.2.1
|
||||||
|
set MAP_HOST6 2001:db8:9a55::1
|
||||||
|
set MAP_NS4 192.0.2.2
|
||||||
|
set MAP_NS6 2001:db8:9a55::2
|
||||||
|
|
||||||
set TEMP __STATEDIR__/test.bin
|
set TEMP __STATEDIR__/test.bin
|
||||||
set TEMP_NS __STATEDIR__/test_ns.bin
|
set TEMP_NS __STATEDIR__/test_ns.bin
|
||||||
|
|
||||||
|
@ -25,7 +30,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10001,shut-null
|
||||||
guestw
|
guestw
|
||||||
guest cmp test.bin /root/medium.bin
|
guest cmp test.bin /root/medium.bin
|
||||||
|
|
||||||
test UDP/IPv4: host to ns
|
test UDP/IPv4: host to ns (recvmmsg/sendmmsg)
|
||||||
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
|
host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
|
||||||
|
@ -34,16 +39,15 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
||||||
|
|
||||||
test UDP/IPv4: guest to host
|
test UDP/IPv4: guest to host
|
||||||
hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
||||||
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10003,shut-null
|
guest socat -u OPEN:/root/medium.bin UDP4:__MAP_HOST4__:10003,shut-null
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP__ __BASEPATH__/medium.bin
|
check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||||
|
|
||||||
test UDP/IPv4: guest to ns
|
test UDP/IPv4: guest to ns
|
||||||
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10002,shut-null
|
guest socat -u OPEN:/root/medium.bin UDP4:__MAP_NS4__:10002,shut-null
|
||||||
nsw
|
nsw
|
||||||
check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
||||||
|
|
||||||
|
@ -57,7 +61,7 @@ check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||||
test UDP/IPv4: ns to host (via tap)
|
test UDP/IPv4: ns to host (via tap)
|
||||||
hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
|
ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__MAP_HOST4__:10003,shut-null
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP__ __BASEPATH__/medium.bin
|
check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||||
|
|
||||||
|
@ -84,7 +88,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10001,shut-null
|
||||||
guestw
|
guestw
|
||||||
guest cmp test.bin /root/medium.bin
|
guest cmp test.bin /root/medium.bin
|
||||||
|
|
||||||
test UDP/IPv6: host to ns
|
test UDP/IPv6: host to ns (recvmmsg/sendmmsg)
|
||||||
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
|
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
|
||||||
|
@ -93,17 +97,15 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
||||||
|
|
||||||
test UDP/IPv6: guest to host
|
test UDP/IPv6: guest to host
|
||||||
hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
||||||
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null
|
guest socat -u OPEN:/root/medium.bin UDP6:[__MAP_HOST6__]:10003,shut-null
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP__ __BASEPATH__/medium.bin
|
check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||||
|
|
||||||
test UDP/IPv6: guest to ns
|
test UDP/IPv6: guest to ns
|
||||||
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||||
sleep 1
|
sleep 1
|
||||||
guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10002,shut-null
|
guest socat -u OPEN:/root/medium.bin UDP6:[__MAP_NS6__]:10002,shut-null
|
||||||
nsw
|
nsw
|
||||||
check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
check cmp __TEMP_NS__ __BASEPATH__/medium.bin
|
||||||
|
|
||||||
|
@ -116,9 +118,8 @@ check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||||
|
|
||||||
test UDP/IPv6: ns to host (via tap)
|
test UDP/IPv6: ns to host (via tap)
|
||||||
hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
|
||||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
|
||||||
sleep 1
|
sleep 1
|
||||||
ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null
|
ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__MAP_HOST6__]:10003,shut-null
|
||||||
hostw
|
hostw
|
||||||
check cmp __TEMP__ __BASEPATH__/medium.bin
|
check cmp __TEMP__ __BASEPATH__/medium.bin
|
||||||
|
|
||||||
|
@ -131,6 +132,7 @@ guest cmp test.bin /root/medium.bin
|
||||||
|
|
||||||
test UDP/IPv6: ns to guest (using namespace address)
|
test UDP/IPv6: ns to guest (using namespace address)
|
||||||
guestb socat -u UDP6-LISTEN:10001,null-eof OPEN:test.bin,create,trunc
|
guestb socat -u UDP6-LISTEN:10001,null-eof OPEN:test.bin,create,trunc
|
||||||
|
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||||
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
||||||
sleep 1
|
sleep 1
|
||||||
ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__ADDR6__]:10001,shut-null
|
ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__ADDR6__]:10001,shut-null
|
||||||
|
|
|
@ -35,9 +35,11 @@ check [ __MTU__ = 65520 ]
|
||||||
|
|
||||||
test DHCPv6: address
|
test DHCPv6: address
|
||||||
ns /sbin/dhclient -6 --no-pid __IFNAME__
|
ns /sbin/dhclient -6 --no-pid __IFNAME__
|
||||||
|
# Wait for DAD to complete
|
||||||
|
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||||
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
|
||||||
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
|
||||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]'
|
hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||||
check [ __ADDR6__ = __HOST_ADDR6__ ]
|
check [ __ADDR6__ = __HOST_ADDR6__ ]
|
||||||
|
|
||||||
test DHCPv6: route
|
test DHCPv6: route
|
||||||
|
|
|
@ -18,12 +18,13 @@ test Interface name
|
||||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||||
check [ -n "__IFNAME__" ]
|
check [ -n "__IFNAME__" ]
|
||||||
ns ip link set dev __IFNAME__ up
|
ns ip link set dev __IFNAME__ up
|
||||||
sleep 2
|
# Wait for DAD to complete
|
||||||
|
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||||
|
|
||||||
test SLAAC: prefix
|
test SLAAC: prefix
|
||||||
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]'
|
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
|
||||||
nsout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4
|
nsout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
|
||||||
hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]'
|
hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
|
||||||
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
|
hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
|
||||||
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
|
check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
|
||||||
|
|
||||||
|
|
|
@ -19,8 +19,8 @@ set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
|
||||||
set TEMP_SMALL __STATEDIR__/test_small.bin
|
set TEMP_SMALL __STATEDIR__/test_small.bin
|
||||||
set TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin
|
set TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin
|
||||||
|
|
||||||
test TCP/IPv4: host to ns: big transfer
|
test TCP/IPv4: host to ns (spliced): big transfer
|
||||||
nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_BIG__,create,trunc
|
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||||
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
|
host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002
|
||||||
nsw
|
nsw
|
||||||
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
|
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
|
||||||
|
@ -38,8 +38,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __BASEPATH__/big.bin __TEMP_BIG__
|
check cmp __BASEPATH__/big.bin __TEMP_BIG__
|
||||||
|
|
||||||
test TCP/IPv4: host to ns: small transfer
|
test TCP/IPv4: host to ns (spliced): small transfer
|
||||||
nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_SMALL__,create,trunc
|
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||||
host socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
|
host socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002
|
||||||
nsw
|
nsw
|
||||||
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
|
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
|
||||||
|
@ -57,8 +57,8 @@ ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __BASEPATH__/small.bin __TEMP_SMALL__
|
check cmp __BASEPATH__/small.bin __TEMP_SMALL__
|
||||||
|
|
||||||
test TCP/IPv6: host to ns: big transfer
|
test TCP/IPv6: host to ns (spliced): big transfer
|
||||||
nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_BIG__,create,trunc
|
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
|
||||||
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
|
host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002
|
||||||
nsw
|
nsw
|
||||||
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
|
check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__
|
||||||
|
@ -77,8 +77,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
|
||||||
hostw
|
hostw
|
||||||
check cmp __BASEPATH__/big.bin __TEMP_BIG__
|
check cmp __BASEPATH__/big.bin __TEMP_BIG__
|
||||||
|
|
||||||
test TCP/IPv6: host to ns: small transfer
|
test TCP/IPv6: host to ns (spliced): small transfer
|
||||||
nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_SMALL__,create,trunc
|
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
|
||||||
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
|
host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002
|
||||||
nsw
|
nsw
|
||||||
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
|
check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__
|
||||||
|
|
|
@ -17,8 +17,8 @@ htools dd socat ip jq
|
||||||
set TEMP __STATEDIR__/test.bin
|
set TEMP __STATEDIR__/test.bin
|
||||||
set TEMP_NS __STATEDIR__/test_ns.bin
|
set TEMP_NS __STATEDIR__/test_ns.bin
|
||||||
|
|
||||||
test UDP/IPv4: host to ns
|
test UDP/IPv4: host to ns (recvmmsg/sendmmsg)
|
||||||
nsb socat -u UDP4-LISTEN:10002,bind=127.0.0.1,null-eof OPEN:__TEMP_NS__,create,trunc
|
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||||
host socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
|
host socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null
|
||||||
nsw
|
nsw
|
||||||
check cmp __BASEPATH__/medium.bin __TEMP_NS__
|
check cmp __BASEPATH__/medium.bin __TEMP_NS__
|
||||||
|
@ -37,8 +37,8 @@ ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
|
||||||
hostw
|
hostw
|
||||||
check cmp __BASEPATH__/medium.bin __TEMP__
|
check cmp __BASEPATH__/medium.bin __TEMP__
|
||||||
|
|
||||||
test UDP/IPv6: host to ns
|
test UDP/IPv6: host to ns (recvmmsg/sendmmsg)
|
||||||
nsb socat -u UDP6-LISTEN:10002,bind=[::1],null-eof OPEN:__TEMP_NS__,create,trunc
|
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
|
||||||
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
|
host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null
|
||||||
nsw
|
nsw
|
||||||
check cmp __BASEPATH__/medium.bin __TEMP_NS__
|
check cmp __BASEPATH__/medium.bin __TEMP_NS__
|
||||||
|
|
|
@ -19,7 +19,7 @@ sleep 1
|
||||||
endef
|
endef
|
||||||
|
|
||||||
def flood_log_client
|
def flood_log_client
|
||||||
host tcp_crr --nolog -P 10001 -C 10002 -6 -c -H ::1
|
host tcp_crr --nolog -l1 -P 10001 -C 10002 -6 -c -H ::1
|
||||||
endef
|
endef
|
||||||
|
|
||||||
def check_log_size_mountns
|
def check_log_size_mountns
|
||||||
|
@ -33,19 +33,16 @@ test Log creation
|
||||||
set PORTS -t 10001,10002 -u 10001,10002
|
set PORTS -t 10001,10002 -u 10001,10002
|
||||||
set LOG_FILE __STATEDIR__/pasta.log
|
set LOG_FILE __STATEDIR__/pasta.log
|
||||||
|
|
||||||
passt ./pasta -l __LOG_FILE__
|
passt ./pasta -l __LOG_FILE__ -- /bin/true
|
||||||
passtb exit
|
|
||||||
sleep 1
|
|
||||||
check [ -s __LOG_FILE__ ]
|
check [ -s __LOG_FILE__ ]
|
||||||
|
|
||||||
test Log truncated on creation
|
test Log truncated on creation
|
||||||
passt ./pasta -l __LOG_FILE__
|
passt ./pasta -l __LOG_FILE__ -- /bin/true & wait
|
||||||
passtb exit
|
pout PID2 echo $!
|
||||||
sleep 1
|
check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$'
|
||||||
check [ $(cat __LOG_FILE__ | wc -l) -eq 1 ]
|
|
||||||
|
|
||||||
test Maximum log size
|
test Maximum log size
|
||||||
passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -P 10001 -C 10002 -6; done'
|
passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done'
|
||||||
sleep 1
|
sleep 1
|
||||||
|
|
||||||
flood_log_client
|
flood_log_client
|
||||||
|
|
|
@ -11,11 +11,16 @@
|
||||||
# Copyright (c) 2022 Red Hat GmbH
|
# Copyright (c) 2022 Red Hat GmbH
|
||||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||||
|
|
||||||
htools git make go bats catatonit ip jq socat
|
htools git make go bats ip jq socat ./test/podman/bin/podman
|
||||||
|
|
||||||
|
set PODMAN test/podman/bin/podman
|
||||||
|
hout WD pwd
|
||||||
|
|
||||||
|
test Podman pasta path
|
||||||
|
|
||||||
|
hout PASTA_BIN CONTAINERS_HELPER_BINARY_DIR="__WD__" __PODMAN__ info --format "{{.Host.Pasta.Executable}}"
|
||||||
|
check [ "__PASTA_BIN__" = "__WD__/pasta" ]
|
||||||
|
|
||||||
test Podman system test with bats
|
test Podman system test with bats
|
||||||
|
|
||||||
host git -C __STATEDIR__ clone https://github.com/containers/podman.git
|
host PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats test/podman/test/system/505-networking-pasta.bats
|
||||||
host make -C __STATEDIR__/podman
|
|
||||||
hout WD pwd
|
|
||||||
host PODMAN="__STATEDIR__/podman/bin/podman" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats __STATEDIR__/podman/test/system/505-networking-pasta.bats
|
|
||||||
|
|
|
@ -15,6 +15,9 @@ gtools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr # From neper
|
||||||
nstools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr
|
nstools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr
|
||||||
htools bc head sed seq
|
htools bc head sed seq
|
||||||
|
|
||||||
|
set MAP_NS4 192.0.2.2
|
||||||
|
set MAP_NS6 2001:db8:9a55::2
|
||||||
|
|
||||||
test passt: throughput and latency
|
test passt: throughput and latency
|
||||||
|
|
||||||
guest /sbin/sysctl -w net.core.rmem_max=536870912
|
guest /sbin/sysctl -w net.core.rmem_max=536870912
|
||||||
|
@ -29,42 +32,39 @@ ns /sbin/sysctl -w net.ipv4.tcp_rmem="4096 524288 134217728"
|
||||||
ns /sbin/sysctl -w net.ipv4.tcp_wmem="4096 524288 134217728"
|
ns /sbin/sysctl -w net.ipv4.tcp_wmem="4096 524288 134217728"
|
||||||
ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0
|
ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0
|
||||||
|
|
||||||
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||||
|
|
||||||
hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
|
hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
|
||||||
hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
|
hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
|
||||||
hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
|
hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
|
||||||
|
|
||||||
set THREADS 1
|
set THREADS 4
|
||||||
set STREAMS 8
|
set TIME 1
|
||||||
set TIME 10
|
|
||||||
set OMIT 0.1
|
set OMIT 0.1
|
||||||
set OPTS -Z -P __STREAMS__ -l 1M -O__OMIT__
|
set OPTS -Z -P __THREADS__ -l 1M -O__OMIT__
|
||||||
|
|
||||||
info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz, __STREAMS__ streams
|
info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
|
||||||
report passt tcp __THREADS__ __FREQ__
|
report passt tcp __THREADS__ __FREQ__
|
||||||
|
|
||||||
th MTU 256B 576B 1280B 1500B 9000B 65520B
|
th MTU 256B 576B 1280B 1500B 9000B 65520B
|
||||||
|
|
||||||
|
|
||||||
tr TCP throughput over IPv6: guest to host
|
tr TCP throughput over IPv6: guest to host
|
||||||
iperf3s ns 100${i}2 __THREADS__
|
iperf3s ns 10002
|
||||||
|
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
guest ip link set dev __IFNAME__ mtu 1280
|
guest ip link set dev __IFNAME__ mtu 1280
|
||||||
iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 4M
|
iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 4M
|
||||||
bw __BW__ 1.2 1.5
|
bw __BW__ 1.2 1.5
|
||||||
guest ip link set dev __IFNAME__ mtu 1500
|
guest ip link set dev __IFNAME__ mtu 1500
|
||||||
iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 4M
|
iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 4M
|
||||||
bw __BW__ 1.6 1.8
|
bw __BW__ 1.6 1.8
|
||||||
guest ip link set dev __IFNAME__ mtu 9000
|
guest ip link set dev __IFNAME__ mtu 9000
|
||||||
iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 8M
|
iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 8M
|
||||||
bw __BW__ 4.0 5.0
|
bw __BW__ 4.0 5.0
|
||||||
guest ip link set dev __IFNAME__ mtu 65520
|
guest ip link set dev __IFNAME__ mtu 65520
|
||||||
iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 16M
|
iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M
|
||||||
bw __BW__ 7.0 8.0
|
bw __BW__ 7.0 8.0
|
||||||
|
|
||||||
iperf3k ns
|
iperf3k ns
|
||||||
|
@ -76,7 +76,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
nsb tcp_rr --nolog -6
|
nsb tcp_rr --nolog -6
|
||||||
gout LAT tcp_rr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
gout LAT tcp_rr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 200 150
|
lat __LAT__ 200 150
|
||||||
|
|
||||||
tl TCP CRR latency over IPv6: guest to host
|
tl TCP CRR latency over IPv6: guest to host
|
||||||
|
@ -86,33 +86,39 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
nsb tcp_crr --nolog -6
|
nsb tcp_crr --nolog -6
|
||||||
gout LAT tcp_crr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
gout LAT tcp_crr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 500 400
|
lat __LAT__ 500 400
|
||||||
|
|
||||||
tr TCP throughput over IPv4: guest to host
|
tr TCP throughput over IPv4: guest to host
|
||||||
iperf3s ns 100${i}2 __THREADS__
|
iperf3s ns 10002
|
||||||
|
|
||||||
guest ip link set dev __IFNAME__ mtu 256
|
guest ip link set dev __IFNAME__ mtu 256
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 1M
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 1M
|
||||||
bw __BW__ 0.2 0.3
|
bw __BW__ 0.2 0.3
|
||||||
guest ip link set dev __IFNAME__ mtu 576
|
guest ip link set dev __IFNAME__ mtu 576
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 1M
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 1M
|
||||||
bw __BW__ 0.5 0.8
|
bw __BW__ 0.5 0.8
|
||||||
guest ip link set dev __IFNAME__ mtu 1280
|
guest ip link set dev __IFNAME__ mtu 1280
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 4M
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M
|
||||||
bw __BW__ 1.2 1.5
|
bw __BW__ 1.2 1.5
|
||||||
guest ip link set dev __IFNAME__ mtu 1500
|
guest ip link set dev __IFNAME__ mtu 1500
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 4M
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M
|
||||||
bw __BW__ 1.6 1.8
|
bw __BW__ 1.6 1.8
|
||||||
guest ip link set dev __IFNAME__ mtu 9000
|
guest ip link set dev __IFNAME__ mtu 9000
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 8M
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M
|
||||||
bw __BW__ 4.0 5.0
|
bw __BW__ 4.0 5.0
|
||||||
guest ip link set dev __IFNAME__ mtu 65520
|
guest ip link set dev __IFNAME__ mtu 65520
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 16M
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M
|
||||||
bw __BW__ 7.0 8.0
|
bw __BW__ 7.0 8.0
|
||||||
|
|
||||||
iperf3k ns
|
iperf3k ns
|
||||||
|
|
||||||
|
# Reducing MTU below 1280 deconfigures IPv6, get our address back
|
||||||
|
guest dhclient -6 -x
|
||||||
|
guest dhclient -6 __IFNAME__
|
||||||
|
# Wait for DAD to complete
|
||||||
|
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
|
||||||
|
|
||||||
tl TCP RR latency over IPv4: guest to host
|
tl TCP RR latency over IPv4: guest to host
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
|
@ -120,7 +126,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
nsb tcp_rr --nolog -4
|
nsb tcp_rr --nolog -4
|
||||||
gout LAT tcp_rr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
gout LAT tcp_rr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 200 150
|
lat __LAT__ 200 150
|
||||||
|
|
||||||
tl TCP CRR latency over IPv4: guest to host
|
tl TCP CRR latency over IPv4: guest to host
|
||||||
|
@ -130,18 +136,18 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
nsb tcp_crr --nolog -4
|
nsb tcp_crr --nolog -4
|
||||||
gout LAT tcp_crr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
gout LAT tcp_crr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 500 400
|
lat __LAT__ 500 400
|
||||||
|
|
||||||
tr TCP throughput over IPv6: host to guest
|
tr TCP throughput over IPv6: host to guest
|
||||||
iperf3s guest 100${i}1 __THREADS__
|
iperf3s guest 10001
|
||||||
|
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
iperf3 BW ns ::1 100${i}1 __THREADS__ __TIME__ __OPTS__
|
iperf3 BW ns ::1 10001 __TIME__ __OPTS__
|
||||||
bw __BW__ 6.0 6.8
|
bw __BW__ 6.0 6.8
|
||||||
|
|
||||||
iperf3k guest
|
iperf3k guest
|
||||||
|
@ -154,7 +160,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
guestb tcp_rr --nolog -P 10001 -C 10011 -6
|
guestb tcp_rr --nolog -P 10001 -C 10011 -6
|
||||||
sleep 1
|
sleep 1
|
||||||
nsout LAT tcp_rr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 200 150
|
lat __LAT__ 200 150
|
||||||
|
|
||||||
tl TCP CRR latency over IPv6: host to guest
|
tl TCP CRR latency over IPv6: host to guest
|
||||||
|
@ -165,19 +171,19 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
guestb tcp_crr --nolog -P 10001 -C 10011 -6
|
guestb tcp_crr --nolog -P 10001 -C 10011 -6
|
||||||
sleep 1
|
sleep 1
|
||||||
nsout LAT tcp_crr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 500 350
|
lat __LAT__ 500 350
|
||||||
|
|
||||||
|
|
||||||
tr TCP throughput over IPv4: host to guest
|
tr TCP throughput over IPv4: host to guest
|
||||||
iperf3s guest 100${i}1 __THREADS__
|
iperf3s guest 10001
|
||||||
|
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__
|
iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__
|
||||||
bw __BW__ 6.0 6.8
|
bw __BW__ 6.0 6.8
|
||||||
|
|
||||||
iperf3k guest
|
iperf3k guest
|
||||||
|
@ -190,7 +196,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
guestb tcp_rr --nolog -P 10001 -C 10011 -4
|
guestb tcp_rr --nolog -P 10001 -C 10011 -4
|
||||||
sleep 1
|
sleep 1
|
||||||
nsout LAT tcp_rr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 200 150
|
lat __LAT__ 200 150
|
||||||
|
|
||||||
tl TCP CRR latency over IPv6: host to guest
|
tl TCP CRR latency over IPv6: host to guest
|
||||||
|
@ -201,7 +207,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
guestb tcp_crr --nolog -P 10001 -C 10011 -4
|
guestb tcp_crr --nolog -P 10001 -C 10011 -4
|
||||||
sleep 1
|
sleep 1
|
||||||
nsout LAT tcp_crr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 500 300
|
lat __LAT__ 500 300
|
||||||
|
|
||||||
te
|
te
|
||||||
|
|
|
@ -15,6 +15,9 @@ gtools /sbin/sysctl ip jq nproc sleep iperf3 udp_rr # From neper
|
||||||
nstools ip jq sleep iperf3 udp_rr
|
nstools ip jq sleep iperf3 udp_rr
|
||||||
htools bc head sed
|
htools bc head sed
|
||||||
|
|
||||||
|
set MAP_NS4 192.0.2.2
|
||||||
|
set MAP_NS6 2001:db8:9a55::2
|
||||||
|
|
||||||
test passt: throughput and latency
|
test passt: throughput and latency
|
||||||
|
|
||||||
guest /sbin/sysctl -w net.core.rmem_max=16777216
|
guest /sbin/sysctl -w net.core.rmem_max=16777216
|
||||||
|
@ -22,38 +25,33 @@ guest /sbin/sysctl -w net.core.wmem_max=16777216
|
||||||
guest /sbin/sysctl -w net.core.rmem_default=16777216
|
guest /sbin/sysctl -w net.core.rmem_default=16777216
|
||||||
guest /sbin/sysctl -w net.core.wmem_default=16777216
|
guest /sbin/sysctl -w net.core.wmem_default=16777216
|
||||||
|
|
||||||
gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
|
||||||
|
|
||||||
hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
|
hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
|
||||||
hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
|
hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
|
||||||
hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
|
hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
|
||||||
|
|
||||||
set THREADS 4
|
set THREADS 2
|
||||||
set STREAMS 1
|
set TIME 1
|
||||||
set TIME 10
|
set OPTS -u -P __THREADS__ --pacing-timer 1000
|
||||||
set OPTS -u -P __STREAMS__ --pacing-timer 1000
|
|
||||||
|
|
||||||
info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz, one stream each
|
info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
|
||||||
|
|
||||||
report passt udp __THREADS__ __FREQ__
|
report passt udp __THREADS__ __FREQ__
|
||||||
|
|
||||||
th pktlen 256B 576B 1280B 1500B 9000B 65520B
|
th pktlen 256B 576B 1280B 1500B 9000B 65520B
|
||||||
|
|
||||||
tr UDP throughput over IPv6: guest to host
|
tr UDP throughput over IPv6: guest to host
|
||||||
iperf3s ns 100${i}2 __THREADS__
|
iperf3s ns 10002
|
||||||
# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header
|
# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header
|
||||||
|
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1232
|
iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 3G -l 1232
|
||||||
bw __BW__ 0.8 1.2
|
bw __BW__ 0.8 1.2
|
||||||
iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1452
|
iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 4G -l 1452
|
||||||
bw __BW__ 1.0 1.5
|
bw __BW__ 1.0 1.5
|
||||||
iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 5G -l 8952
|
iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 8G -l 8952
|
||||||
bw __BW__ 4.0 5.0
|
bw __BW__ 4.0 5.0
|
||||||
iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 7G -l 64372
|
iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 15G -l 64372
|
||||||
bw __BW__ 4.0 5.0
|
bw __BW__ 4.0 5.0
|
||||||
|
|
||||||
iperf3k ns
|
iperf3k ns
|
||||||
|
@ -65,25 +63,25 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
nsb udp_rr --nolog -6
|
nsb udp_rr --nolog -6
|
||||||
gout LAT udp_rr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
gout LAT udp_rr --nolog -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 200 150
|
lat __LAT__ 200 150
|
||||||
|
|
||||||
|
|
||||||
tr UDP throughput over IPv4: guest to host
|
tr UDP throughput over IPv4: guest to host
|
||||||
iperf3s ns 100${i}2 __THREADS__
|
iperf3s ns 10002
|
||||||
# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header
|
# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header
|
||||||
|
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 500M -l 228
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 1G -l 228
|
||||||
bw __BW__ 0.0 0.0
|
bw __BW__ 0.0 0.0
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 1G -l 548
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 2G -l 548
|
||||||
bw __BW__ 0.4 0.6
|
bw __BW__ 0.4 0.6
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1252
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 3G -l 1252
|
||||||
bw __BW__ 0.8 1.2
|
bw __BW__ 0.8 1.2
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1472
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 4G -l 1472
|
||||||
bw __BW__ 1.0 1.5
|
bw __BW__ 1.0 1.5
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 6G -l 8972
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 8G -l 8972
|
||||||
bw __BW__ 4.0 5.0
|
bw __BW__ 4.0 5.0
|
||||||
iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 7G -l 65492
|
iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 15G -l 65492
|
||||||
bw __BW__ 4.0 5.0
|
bw __BW__ 4.0 5.0
|
||||||
|
|
||||||
iperf3k ns
|
iperf3k ns
|
||||||
|
@ -95,23 +93,23 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
nsb udp_rr --nolog -4
|
nsb udp_rr --nolog -4
|
||||||
gout LAT udp_rr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
gout LAT udp_rr --nolog -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
lat __LAT__ 200 150
|
lat __LAT__ 200 150
|
||||||
|
|
||||||
|
|
||||||
tr UDP throughput over IPv6: host to guest
|
tr UDP throughput over IPv6: host to guest
|
||||||
iperf3s guest 100${i}1 __THREADS__
|
iperf3s guest 10001
|
||||||
# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header
|
# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header
|
||||||
|
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
iperf3 BW ns ::1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1232
|
iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 3G -l 1232
|
||||||
bw __BW__ 0.8 1.2
|
bw __BW__ 0.8 1.2
|
||||||
iperf3 BW ns ::1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1452
|
iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 4G -l 1452
|
||||||
bw __BW__ 1.0 1.5
|
bw __BW__ 1.0 1.5
|
||||||
iperf3 BW ns ::1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 8952
|
iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 8G -l 8952
|
||||||
bw __BW__ 3.0 4.0
|
bw __BW__ 3.0 4.0
|
||||||
iperf3 BW ns ::1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 64372
|
iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 15G -l 64372
|
||||||
bw __BW__ 3.0 4.0
|
bw __BW__ 3.0 4.0
|
||||||
|
|
||||||
iperf3k guest
|
iperf3k guest
|
||||||
|
@ -129,20 +127,20 @@ lat __LAT__ 200 150
|
||||||
|
|
||||||
|
|
||||||
tr UDP throughput over IPv4: host to guest
|
tr UDP throughput over IPv4: host to guest
|
||||||
iperf3s guest 100${i}1 __THREADS__
|
iperf3s guest 10001
|
||||||
# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header
|
# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header
|
||||||
|
|
||||||
iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 1G -l 228
|
iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 1G -l 228
|
||||||
bw __BW__ 0.0 0.0
|
bw __BW__ 0.0 0.0
|
||||||
iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 1G -l 548
|
iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 2G -l 548
|
||||||
bw __BW__ 0.4 0.6
|
bw __BW__ 0.4 0.6
|
||||||
iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1252
|
iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 3G -l 1252
|
||||||
bw __BW__ 0.8 1.2
|
bw __BW__ 0.8 1.2
|
||||||
iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1472
|
iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 4G -l 1472
|
||||||
bw __BW__ 1.0 1.5
|
bw __BW__ 1.0 1.5
|
||||||
iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 8972
|
iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 8G -l 8972
|
||||||
bw __BW__ 3.0 4.0
|
bw __BW__ 3.0 4.0
|
||||||
iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 65492
|
iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 15G -l 65492
|
||||||
bw __BW__ 3.0 4.0
|
bw __BW__ 3.0 4.0
|
||||||
|
|
||||||
iperf3k guest
|
iperf3k guest
|
||||||
|
|
|
@ -14,6 +14,9 @@
|
||||||
htools head ip seq bc sleep iperf3 tcp_rr tcp_crr jq sed
|
htools head ip seq bc sleep iperf3 tcp_rr tcp_crr jq sed
|
||||||
nstools /sbin/sysctl nproc ip seq sleep iperf3 tcp_rr tcp_crr jq sed
|
nstools /sbin/sysctl nproc ip seq sleep iperf3 tcp_rr tcp_crr jq sed
|
||||||
|
|
||||||
|
set MAP_HOST4 192.0.2.1
|
||||||
|
set MAP_HOST6 2001:db8:9a55::1
|
||||||
|
|
||||||
test pasta: throughput and latency (local connections)
|
test pasta: throughput and latency (local connections)
|
||||||
|
|
||||||
ns /sbin/sysctl -w net.ipv4.tcp_rmem="131072 524288 134217728"
|
ns /sbin/sysctl -w net.ipv4.tcp_rmem="131072 524288 134217728"
|
||||||
|
@ -21,101 +24,100 @@ ns /sbin/sysctl -w net.ipv4.tcp_wmem="131072 524288 134217728"
|
||||||
ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0
|
ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0
|
||||||
|
|
||||||
|
|
||||||
set THREADS 2
|
set THREADS 4
|
||||||
set STREAMS 2
|
set TIME 1
|
||||||
set TIME 10
|
|
||||||
set OMIT 0.1
|
set OMIT 0.1
|
||||||
set OPTS -Z -w 4M -l 1M -P __STREAMS__ -O__OMIT__
|
set OPTS -Z -w 4M -l 1M -P __THREADS__ -O__OMIT__
|
||||||
|
|
||||||
hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
|
hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
|
||||||
hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
|
hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
|
||||||
hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
|
hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
|
||||||
|
|
||||||
|
|
||||||
info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz, __STREAMS__ streams each
|
info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
|
||||||
report pasta lo_tcp __THREADS__ __FREQ__
|
report pasta lo_tcp __THREADS__ __FREQ__
|
||||||
|
|
||||||
th MTU 65535B
|
th MTU 65535B
|
||||||
|
|
||||||
tr TCP throughput over IPv6: ns to host
|
tr TCP throughput over IPv6: ns to host
|
||||||
iperf3s host 100${i}3 __THREADS__
|
iperf3s host 10003
|
||||||
|
|
||||||
iperf3 BW ns ::1 100${i}3 __THREADS__ __TIME__ __OPTS__
|
iperf3 BW ns ::1 10003 __THREADS__ __TIME__ __OPTS__
|
||||||
bw __BW__ 15.0 20.0
|
bw __BW__ 15.0 20.0
|
||||||
|
|
||||||
iperf3k host
|
iperf3k host
|
||||||
|
|
||||||
tl TCP RR latency over IPv6: ns to host
|
tl TCP RR latency over IPv6: ns to host
|
||||||
hostb tcp_rr --nolog -P 10003 -C 10013 -6
|
hostb tcp_rr --nolog -P 10003 -C 10013 -6
|
||||||
nsout LAT tcp_rr --nolog -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
hostw
|
hostw
|
||||||
lat __LAT__ 150 100
|
lat __LAT__ 150 100
|
||||||
|
|
||||||
tl TCP CRR latency over IPv6: ns to host
|
tl TCP CRR latency over IPv6: ns to host
|
||||||
hostb tcp_crr --nolog -P 10003 -C 10013 -6
|
hostb tcp_crr --nolog -P 10003 -C 10013 -6
|
||||||
nsout LAT tcp_crr --nolog -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
hostw
|
hostw
|
||||||
lat __LAT__ 500 350
|
lat __LAT__ 500 350
|
||||||
|
|
||||||
|
|
||||||
tr TCP throughput over IPv4: ns to host
|
tr TCP throughput over IPv4: ns to host
|
||||||
iperf3s host 100${i}3 __THREADS__
|
iperf3s host 10003
|
||||||
|
|
||||||
iperf3 BW ns 127.0.0.1 100${i}3 __THREADS__ __TIME__ __OPTS__
|
iperf3 BW ns 127.0.0.1 10003 __THREADS__ __TIME__ __OPTS__
|
||||||
bw __BW__ 15.0 20.0
|
bw __BW__ 15.0 20.0
|
||||||
|
|
||||||
iperf3k host
|
iperf3k host
|
||||||
|
|
||||||
tl TCP RR latency over IPv4: ns to host
|
tl TCP RR latency over IPv4: ns to host
|
||||||
hostb tcp_rr --nolog -P 10003 -C 10013 -4
|
hostb tcp_rr --nolog -P 10003 -C 10013 -4
|
||||||
nsout LAT tcp_rr --nolog -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
hostw
|
hostw
|
||||||
lat __LAT__ 150 100
|
lat __LAT__ 150 100
|
||||||
|
|
||||||
tl TCP CRR latency over IPv4: ns to host
|
tl TCP CRR latency over IPv4: ns to host
|
||||||
hostb tcp_crr --nolog -P 10003 -C 10013 -4
|
hostb tcp_crr --nolog -P 10003 -C 10013 -4
|
||||||
nsout LAT tcp_crr --nolog -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
hostw
|
hostw
|
||||||
lat __LAT__ 500 350
|
lat __LAT__ 500 350
|
||||||
|
|
||||||
tr TCP throughput over IPv6: host to ns
|
tr TCP throughput over IPv6: host to ns
|
||||||
iperf3s ns 100${i}2 __THREADS__
|
iperf3s ns 10002
|
||||||
|
|
||||||
iperf3 BW host ::1 100${i}2 __THREADS__ __TIME__ __OPTS__
|
iperf3 BW host ::1 10002 __TIME__ __OPTS__
|
||||||
bw __BW__ 15.0 20.0
|
bw __BW__ 15.0 20.0
|
||||||
|
|
||||||
iperf3k ns
|
iperf3k ns
|
||||||
|
|
||||||
tl TCP RR latency over IPv6: host to ns
|
tl TCP RR latency over IPv6: host to ns
|
||||||
nsb tcp_rr --nolog -P 10002 -C 10012 -6
|
nsb tcp_rr --nolog -P 10002 -C 10012 -6
|
||||||
hout LAT tcp_rr --nolog -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
nsw
|
nsw
|
||||||
lat __LAT__ 150 100
|
lat __LAT__ 150 100
|
||||||
|
|
||||||
tl TCP CRR latency over IPv6: host to ns
|
tl TCP CRR latency over IPv6: host to ns
|
||||||
nsb tcp_crr --nolog -P 10002 -C 10012 -6
|
nsb tcp_crr --nolog -P 10002 -C 10012 -6
|
||||||
hout LAT tcp_crr --nolog -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
nsw
|
nsw
|
||||||
lat __LAT__ 1000 700
|
lat __LAT__ 1000 700
|
||||||
|
|
||||||
|
|
||||||
tr TCP throughput over IPv4: host to ns
|
tr TCP throughput over IPv4: host to ns
|
||||||
iperf3s ns 100${i}2 __THREADS__
|
iperf3s ns 10002
|
||||||
|
|
||||||
iperf3 BW host 127.0.0.1 100${i}2 __THREADS__ __TIME__ __OPTS__
|
iperf3 BW host 127.0.0.1 10002 __TIME__ __OPTS__
|
||||||
bw __BW__ 15.0 20.0
|
bw __BW__ 15.0 20.0
|
||||||
|
|
||||||
iperf3k ns
|
iperf3k ns
|
||||||
|
|
||||||
tl TCP RR latency over IPv4: host to ns
|
tl TCP RR latency over IPv4: host to ns
|
||||||
nsb tcp_rr --nolog -P 10002 -C 10012 -4
|
nsb tcp_rr --nolog -P 10002 -C 10012 -4
|
||||||
hout LAT tcp_rr --nolog -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
nsw
|
nsw
|
||||||
lat __LAT__ 150 100
|
lat __LAT__ 150 100
|
||||||
|
|
||||||
tl TCP CRR latency over IPv4: host to ns
|
tl TCP CRR latency over IPv4: host to ns
|
||||||
nsb tcp_crr --nolog -P 10002 -C 10012 -4
|
nsb tcp_crr --nolog -P 10002 -C 10012 -4
|
||||||
hout LAT tcp_crr --nolog -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
nsw
|
nsw
|
||||||
lat __LAT__ 1000 700
|
lat __LAT__ 1000 700
|
||||||
|
|
||||||
|
@ -123,32 +125,29 @@ te
|
||||||
|
|
||||||
test pasta: throughput and latency (connections via tap)
|
test pasta: throughput and latency (connections via tap)
|
||||||
|
|
||||||
nsout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
nsout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
|
|
||||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||||
set THREADS 1
|
set THREADS 2
|
||||||
set STREAMS 2
|
set OPTS -Z -P __THREADS__ -i1 -O__OMIT__
|
||||||
set OPTS -Z -P __STREAMS__ -i1 -O__OMIT__
|
|
||||||
|
|
||||||
info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz, __STREAMS__ streams
|
info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
|
||||||
report pasta tap_tcp __THREADS__ __FREQ__
|
report pasta tap_tcp __THREADS__ __FREQ__
|
||||||
|
|
||||||
th MTU 1500B 4000B 16384B 65520B
|
th MTU 1500B 4000B 16384B 65520B
|
||||||
|
|
||||||
tr TCP throughput over IPv6: ns to host
|
tr TCP throughput over IPv6: ns to host
|
||||||
iperf3s host 100${i}3 __THREADS__
|
iperf3s host 10003
|
||||||
|
|
||||||
ns ip link set dev __IFNAME__ mtu 1500
|
ns ip link set dev __IFNAME__ mtu 1500
|
||||||
iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 512k
|
iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 512k
|
||||||
bw __BW__ 0.2 0.4
|
bw __BW__ 0.2 0.4
|
||||||
ns ip link set dev __IFNAME__ mtu 4000
|
ns ip link set dev __IFNAME__ mtu 4000
|
||||||
iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 1M
|
iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 1M
|
||||||
bw __BW__ 0.3 0.5
|
bw __BW__ 0.3 0.5
|
||||||
ns ip link set dev __IFNAME__ mtu 16384
|
ns ip link set dev __IFNAME__ mtu 16384
|
||||||
iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 8M
|
iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 8M
|
||||||
bw __BW__ 1.5 2.0
|
bw __BW__ 1.5 2.0
|
||||||
ns ip link set dev __IFNAME__ mtu 65520
|
ns ip link set dev __IFNAME__ mtu 65520
|
||||||
iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 8M
|
iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 8M
|
||||||
bw __BW__ 2.0 2.5
|
bw __BW__ 2.0 2.5
|
||||||
|
|
||||||
iperf3k host
|
iperf3k host
|
||||||
|
@ -158,7 +157,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
hostb tcp_rr --nolog -P 10003 -C 10013 -6
|
hostb tcp_rr --nolog -P 10003 -C 10013 -6
|
||||||
nsout LAT tcp_rr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
hostw
|
hostw
|
||||||
lat __LAT__ 150 100
|
lat __LAT__ 150 100
|
||||||
|
|
||||||
|
@ -167,25 +166,25 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
hostb tcp_crr --nolog -P 10003 -C 10013 -6
|
hostb tcp_crr --nolog -P 10003 -C 10013 -6
|
||||||
nsout LAT tcp_crr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
hostw
|
hostw
|
||||||
lat __LAT__ 1500 500
|
lat __LAT__ 1500 500
|
||||||
|
|
||||||
|
|
||||||
tr TCP throughput over IPv4: ns to host
|
tr TCP throughput over IPv4: ns to host
|
||||||
iperf3s host 100${i}3 __THREADS__
|
iperf3s host 10003
|
||||||
|
|
||||||
ns ip link set dev __IFNAME__ mtu 1500
|
ns ip link set dev __IFNAME__ mtu 1500
|
||||||
iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 512k
|
iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 512k
|
||||||
bw __BW__ 0.2 0.4
|
bw __BW__ 0.2 0.4
|
||||||
ns ip link set dev __IFNAME__ mtu 4000
|
ns ip link set dev __IFNAME__ mtu 4000
|
||||||
iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 1M
|
iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 1M
|
||||||
bw __BW__ 0.3 0.5
|
bw __BW__ 0.3 0.5
|
||||||
ns ip link set dev __IFNAME__ mtu 16384
|
ns ip link set dev __IFNAME__ mtu 16384
|
||||||
iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 8M
|
iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 8M
|
||||||
bw __BW__ 1.5 2.0
|
bw __BW__ 1.5 2.0
|
||||||
ns ip link set dev __IFNAME__ mtu 65520
|
ns ip link set dev __IFNAME__ mtu 65520
|
||||||
iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 8M
|
iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 8M
|
||||||
bw __BW__ 2.0 2.5
|
bw __BW__ 2.0 2.5
|
||||||
|
|
||||||
iperf3k host
|
iperf3k host
|
||||||
|
@ -195,7 +194,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
hostb tcp_rr --nolog -P 10003 -C 10013 -4
|
hostb tcp_rr --nolog -P 10003 -C 10013 -4
|
||||||
nsout LAT tcp_rr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
hostw
|
hostw
|
||||||
lat __LAT__ 150 100
|
lat __LAT__ 150 100
|
||||||
|
|
||||||
|
@ -204,19 +203,19 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
hostb tcp_crr --nolog -P 10003 -C 10013 -4
|
hostb tcp_crr --nolog -P 10003 -C 10013 -4
|
||||||
nsout LAT tcp_crr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
hostw
|
hostw
|
||||||
lat __LAT__ 1500 500
|
lat __LAT__ 1500 500
|
||||||
|
|
||||||
tr TCP throughput over IPv6: host to ns
|
tr TCP throughput over IPv6: host to ns
|
||||||
iperf3s ns 100${i}2 __THREADS__
|
iperf3s ns 10002
|
||||||
|
|
||||||
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
|
||||||
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
|
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
iperf3 BW host __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__
|
iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__
|
||||||
bw __BW__ 8.0 10.0
|
bw __BW__ 8.0 10.0
|
||||||
|
|
||||||
iperf3k ns
|
iperf3k ns
|
||||||
|
@ -226,7 +225,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
nsb tcp_rr --nolog -P 10002 -C 10012 -6
|
nsb tcp_rr --nolog -P 10002 -C 10012 -6
|
||||||
hout LAT tcp_rr --nolog -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
nsw
|
nsw
|
||||||
lat __LAT__ 150 100
|
lat __LAT__ 150 100
|
||||||
|
|
||||||
|
@ -236,19 +235,19 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
sleep 1
|
sleep 1
|
||||||
nsb tcp_crr --nolog -P 10002 -C 10012 -6
|
nsb tcp_crr --nolog -P 10002 -C 10012 -6
|
||||||
hout LAT tcp_crr --nolog -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
nsw
|
nsw
|
||||||
lat __LAT__ 5000 10000
|
lat __LAT__ 5000 10000
|
||||||
|
|
||||||
|
|
||||||
tr TCP throughput over IPv4: host to ns
|
tr TCP throughput over IPv4: host to ns
|
||||||
iperf3s ns 100${i}2 __THREADS__
|
iperf3s ns 10002
|
||||||
|
|
||||||
nsout ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
nsout ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
bw -
|
bw -
|
||||||
iperf3 BW host __ADDR__ 100${i}2 __THREADS__ __TIME__ __OPTS__
|
iperf3 BW host __ADDR__ 10002 __TIME__ __OPTS__
|
||||||
bw __BW__ 8.0 10.0
|
bw __BW__ 8.0 10.0
|
||||||
|
|
||||||
iperf3k ns
|
iperf3k ns
|
||||||
|
@ -258,7 +257,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
lat -
|
lat -
|
||||||
nsb tcp_rr --nolog -P 10002 -C 10012 -4
|
nsb tcp_rr --nolog -P 10002 -C 10012 -4
|
||||||
hout LAT tcp_rr --nolog -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
nsw
|
nsw
|
||||||
lat __LAT__ 150 100
|
lat __LAT__ 150 100
|
||||||
|
|
||||||
|
@ -268,7 +267,7 @@ lat -
|
||||||
lat -
|
lat -
|
||||||
sleep 1
|
sleep 1
|
||||||
nsb tcp_crr --nolog -P 10002 -C 10012 -4
|
nsb tcp_crr --nolog -P 10002 -C 10012 -4
|
||||||
hout LAT tcp_crr --nolog -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p'
|
||||||
nsw
|
nsw
|
||||||
lat __LAT__ 5000 10000
|
lat __LAT__ 5000 10000
|
||||||
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue