Mirror of https://passt.top/passt, synced 2025-05-31 21:35:34 +02:00
Compare commits: 2025_02_17 ... master (144 commits)
Commits in range (SHA1):
3262c9b088, b915375a42, 2fd0944f21, 2046976866, 4234ace84c, 2d3d69c5c3, 0f7bf10b0a, a6b9832e49, 570e7b4454, 8ec134109e,
92d5d68013, eea8a76caf, 587980ca1e, f0021f9e1d, 93394f4ef0, 11be695f5c, 6a96cd97a5, ea0a1240df, aa1cc89228, 436afc3044,
08e617ec2b, 4668e91378, 9128f6e8f4, 2340bbf867, cfc0ee145a, f107a86cc0, 04984578b0, 3f995586b3, 1bb8145c22, baf049f8e0,
50249086a9, bbff3653d6, 59cc89f4cc, 695c62396e, f4b0dd8b06, 6693fa1158, d3f33f3b8e, ffbef85e97, 06ef64cdb7, 9725e79888,
9eb5406260, bd6a41ee76, 159beefa36, fd844a90bc, fc6ee68ad3, 0304dd9c34, 5221e177e1, 3a0881dfd0, 84ab1305fa, 1d7bbb101a,
d74b5a7c10, a7775e9550, 06784d7fc6, 684870a766, 76e554d9ec, 8aa2d90c8d, 3d41e4d838, dec3d73e1e, 6bfc60b095, 8e32881ef1,
2ed2d59def, 3de5af6e41, 025a3c2686, 42a854a52b, 65cca54be8, 664c588be7, 77883fbdd1, 37d78c9ef3, f67c488b81, 269cf6a12a,
d924b7dfc4, 5a977c2f4e, 89b203b851, cf4d3f05c9, 0857515c94, 9153aca15b, 38bcce9977, 961aa6a0eb, 37d9f374d9, c48331ca51,
9866d146e6, a41d6d125e, e43e00719d, 4592719a74, 32f6212551, 07c2d584b3, ebdd46367c, c250ffc5c1, cfb3740568, 28772ee91a,
51f3c071a7, cb5b593563, 96fe5548cb, 78f1f0fdfc, 26df8a3608, 9d1a6b3eba, b6945e0553, c4bfa3339c, 1eda8de438, c43972ad67,
74cd82adc8, 4b17d042c7, bb00a0499f, c8b520c062, 0470170247, 2b58b22845, a83c806d17, 27395e67c2, 12d5b36b2f, e36c35c952,
57d2db370b, 68b04182e0, 87e6a46442, 55431f0077, 82a839be98, 1924e25f07, 672d786de1, 1f236817ea, 008175636c, 52419a64f2,
b2708218a6, 56ce03ed0a, 39f85bce1a, 7b92f2e852, 87471731e6, be86232f72, ea69ca6a20, 4dac2351fa, 16553c8280, 183bedf478,
1cc5d4c9fe, 3dc7da68a2, 65e317a8fc, b79a22d360, 7ffca35fdd, adb46c11d0, ba0823f8a0, 854bc7b1a3, e56c8038fc, 5a07eb3ccc,
6b4065153c, 354bc0bab1, 0a51060f7a, bcc4908c2b
68 changed files with 2625 additions and 1447 deletions:
Makefile, checksum.c, checksum.h, conf.c, conf.h, contrib, dhcp.c, dhcpv6.c, doc/platform-requirements, epoll_type.h, flow.c, flow.h, flow_table.h, fwd.c, fwd.h, icmp.c, inany.h, iov.c, ip.h, isolation.c, log.c, log.h, migrate.c, ndp.c, netlink.c, packet.c, packet.h, passt-repair.1, passt-repair.c, passt.c, passt.h, pasta.c, pcap.c, repair.c, repair.h, seccomp.sh, tap.c, tap.h, tcp.c, tcp.h, tcp_buf.c, tcp_conn.h, tcp_internal.h, tcp_splice.c, tcp_vu.c, test, udp.c, udp.h, udp_flow.c, udp_flow.h, udp_internal.h, udp_vu.c, udp_vu.h, util.c, util.h, vhost_user.c, vhost_user.h, virtio.c, virtio.h, vu_common.c
Makefile (9 changed lines)
|
@ -20,6 +20,7 @@ $(if $(TARGET),,$(error Failed to get target architecture))
|
|||
# Get 'uname -m'-like architecture description for target
|
||||
TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
|
||||
TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))
|
||||
TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH))
|
||||
TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH))
|
||||
|
||||
# On some systems enabling optimization also enables source fortification,
|
||||
|
@ -29,7 +30,7 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /
|
|||
FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
|
||||
endif
|
||||
|
||||
FLAGS := -Wall -Wextra -Wno-format-zero-length
|
||||
FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security
|
||||
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
|
||||
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
|
||||
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
|
||||
|
@ -109,9 +110,9 @@ passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h
|
|||
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS)
|
||||
|
||||
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
|
||||
rt_sigreturn getpid gettid kill clock_gettime mmap \
|
||||
mmap2 munmap open unlink gettimeofday futex statx \
|
||||
readlink
|
||||
rt_sigreturn getpid gettid kill clock_gettime \
|
||||
mmap|mmap2 munmap open unlink gettimeofday futex \
|
||||
statx readlink
|
||||
valgrind: FLAGS += -g -DVALGRIND
|
||||
valgrind: all
|
||||
|
||||
|
|
checksum.c (34 changed lines)
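The hunks below make sum_16b() and csum_fold() static and move csum() so that it sits next to csum_unfolded(). For context, the fold-and-complement arithmetic those functions implement can be sketched standalone; the values below are made up for illustration and nothing here is taken from the passt sources:

```c
#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit partial sum into 16 bits, as csum_fold() does below */
static uint16_t fold16(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	/* Hypothetical 16-bit words of a header, not real packet data */
	const uint16_t words[] = { 0x4500, 0x003c, 0x1c46, 0x4000 };
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < sizeof(words) / sizeof(words[0]); i++)
		sum += words[i];

	/* Complement of the folded sum, which is what csum() returns */
	printf("checksum: 0x%04x\n", (uint16_t)~fold16(sum));
	return 0;
}
```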
|
@ -85,7 +85,7 @@
|
|||
*/
|
||||
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
|
||||
__attribute__((optimize("-fno-strict-aliasing")))
|
||||
uint32_t sum_16b(const void *buf, size_t len)
|
||||
static uint32_t sum_16b(const void *buf, size_t len)
|
||||
{
|
||||
const uint16_t *p = buf;
|
||||
uint32_t sum = 0;
|
||||
|
@ -107,7 +107,7 @@ uint32_t sum_16b(const void *buf, size_t len)
|
|||
*
|
||||
* Return: 16-bit folded sum
|
||||
*/
|
||||
uint16_t csum_fold(uint32_t sum)
|
||||
static uint16_t csum_fold(uint32_t sum)
|
||||
{
|
||||
while (sum >> 16)
|
||||
sum = (sum & 0xffff) + (sum >> 16);
|
||||
|
@ -161,6 +161,21 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
|
|||
return psum;
|
||||
}
|
||||
|
||||
/**
|
||||
* csum() - Compute TCP/IP-style checksum
|
||||
* @buf: Input buffer
|
||||
* @len: Input length
|
||||
* @init: Initial 32-bit checksum, 0 for no pre-computed checksum
|
||||
*
|
||||
* Return: 16-bit folded, complemented checksum
|
||||
*/
|
||||
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
|
||||
__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */
|
||||
static uint16_t csum(const void *buf, size_t len, uint32_t init)
|
||||
{
|
||||
return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
|
||||
}
|
||||
|
||||
/**
|
||||
* csum_udp4() - Calculate and set checksum for a UDP over IPv4 packet
|
||||
* @udp4hr: UDP header, initialised apart from checksum
|
||||
|
@ -482,21 +497,6 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
|
|||
}
|
||||
#endif /* !__AVX2__ */
|
||||
|
||||
/**
|
||||
* csum() - Compute TCP/IP-style checksum
|
||||
* @buf: Input buffer
|
||||
* @len: Input length
|
||||
* @init: Initial 32-bit checksum, 0 for no pre-computed checksum
|
||||
*
|
||||
* Return: 16-bit folded, complemented checksum
|
||||
*/
|
||||
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
|
||||
__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */
|
||||
uint16_t csum(const void *buf, size_t len, uint32_t init)
|
||||
{
|
||||
return (uint16_t)~csum_fold(csum_unfolded(buf, len, init));
|
||||
}
|
||||
|
||||
/**
|
||||
* csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector
|
||||
* @tail: IO vector tail to checksum
|
||||
|
|
|
@ -11,8 +11,6 @@ struct icmphdr;
|
|||
struct icmp6hdr;
|
||||
struct iov_tail;
|
||||
|
||||
uint32_t sum_16b(const void *buf, size_t len);
|
||||
uint16_t csum_fold(uint32_t sum);
|
||||
uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
|
||||
struct in_addr saddr, struct in_addr daddr);
|
||||
|
@ -32,7 +30,6 @@ void csum_icmp6(struct icmp6hdr *icmp6hr,
|
|||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
const void *payload, size_t dlen);
|
||||
uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum(const void *buf, size_t len, uint32_t init);
|
||||
uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init);
|
||||
|
||||
#endif /* CHECKSUM_H */
|
||||
|
|
conf.c (422 changed lines)
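A large part of this change factors the per-port socket setup loops into the new conf_ports_range_except(), which forwards a range of ports while skipping any port marked in an exclusion bitmap. The bitmap_set()/bitmap_isset() helpers it calls come from passt's util.c and are not shown in this diff; the sketch below uses illustrative helpers of the same shape, not passt's implementation:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_PORTS (1U << 16)

/* One bit per port; a plain byte-indexed bit array, illustrative only */
static uint8_t port_map[NUM_PORTS / 8];

static void map_set(uint8_t *map, unsigned bit)
{
	map[bit / 8] |= 1 << (bit % 8);
}

static bool map_isset(const uint8_t *map, unsigned bit)
{
	return map[bit / 8] & (1 << (bit % 8));
}

int main(void)
{
	uint8_t exclude[NUM_PORTS / 8] = { 0 };
	unsigned i, mapped = 0;

	map_set(exclude, 22);		/* e.g. keep ssh out of the range */

	/* Forward ports 20..25 minus the exclusions, as the helper does */
	for (i = 20; i <= 25; i++) {
		if (map_isset(exclude, i))
			continue;
		map_set(port_map, i);
		mapped++;
	}

	printf("mapped %u ports\n", mapped);
	return 0;
}
```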
|
@ -16,6 +16,7 @@
|
|||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <getopt.h>
|
||||
#include <libgen.h>
|
||||
#include <string.h>
|
||||
#include <sched.h>
|
||||
#include <sys/types.h>
|
||||
|
@ -123,6 +124,75 @@ static int parse_port_range(const char *s, char **endptr,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* conf_ports_range_except() - Set up forwarding for a range of ports minus a
|
||||
* bitmap of exclusions
|
||||
* @c: Execution context
|
||||
* @optname: Short option name, t, T, u, or U
|
||||
* @optarg: Option argument (port specification)
|
||||
* @fwd: Pointer to @fwd_ports to be updated
|
||||
* @addr: Listening address
|
||||
* @ifname: Listening interface
|
||||
* @first: First port to forward
|
||||
* @last: Last port to forward
|
||||
* @exclude: Bitmap of ports to exclude
|
||||
* @to: Port to translate @first to when forwarding
|
||||
* @weak: Ignore errors, as long as at least one port is mapped
|
||||
*/
|
||||
static void conf_ports_range_except(const struct ctx *c, char optname,
|
||||
const char *optarg, struct fwd_ports *fwd,
|
||||
const union inany_addr *addr,
|
||||
const char *ifname,
|
||||
uint16_t first, uint16_t last,
|
||||
const uint8_t *exclude, uint16_t to,
|
||||
bool weak)
|
||||
{
|
||||
bool bound_one = false;
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
if (first == 0) {
|
||||
die("Can't forward port 0 for option '-%c %s'",
|
||||
optname, optarg);
|
||||
}
|
||||
|
||||
for (i = first; i <= last; i++) {
|
||||
if (bitmap_isset(exclude, i))
|
||||
continue;
|
||||
|
||||
if (bitmap_isset(fwd->map, i)) {
|
||||
warn(
|
||||
"Altering mapping of already mapped port number: %s", optarg);
|
||||
}
|
||||
|
||||
bitmap_set(fwd->map, i);
|
||||
fwd->delta[i] = to - first;
|
||||
|
||||
if (optname == 't')
|
||||
ret = tcp_sock_init(c, addr, ifname, i);
|
||||
else if (optname == 'u')
|
||||
ret = udp_sock_init(c, 0, addr, ifname, i);
|
||||
else
|
||||
/* No way to check in advance for -T and -U */
|
||||
ret = 0;
|
||||
|
||||
if (ret == -ENFILE || ret == -EMFILE) {
|
||||
die("Can't open enough sockets for port specifier: %s",
|
||||
optarg);
|
||||
}
|
||||
|
||||
if (!ret) {
|
||||
bound_one = true;
|
||||
} else if (!weak) {
|
||||
die("Failed to bind port %u (%s) for option '-%c %s'",
|
||||
i, strerror_(-ret), optname, optarg);
|
||||
}
|
||||
}
|
||||
|
||||
if (!bound_one)
|
||||
die("Failed to bind any port for '-%c %s'", optname, optarg);
|
||||
}
|
||||
|
||||
/**
|
||||
* conf_ports() - Parse port configuration options, initialise UDP/TCP sockets
|
||||
* @c: Execution context
|
||||
|
@ -135,10 +205,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
|
|||
{
|
||||
union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
|
||||
char buf[BUFSIZ], *spec, *ifname = NULL, *p;
|
||||
bool exclude_only = true, bound_one = false;
|
||||
uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
|
||||
bool exclude_only = true;
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
if (!strcmp(optarg, "none")) {
|
||||
if (fwd->mode)
|
||||
|
@ -173,32 +242,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
|
|||
|
||||
fwd->mode = FWD_ALL;
|
||||
|
||||
/* Skip port 0. It has special meaning for many socket APIs, so
|
||||
* trying to bind it is not really safe.
|
||||
*/
|
||||
for (i = 1; i < NUM_PORTS; i++) {
|
||||
/* Exclude ephemeral ports */
|
||||
for (i = 0; i < NUM_PORTS; i++)
|
||||
if (fwd_port_is_ephemeral(i))
|
||||
continue;
|
||||
|
||||
bitmap_set(fwd->map, i);
|
||||
if (optname == 't') {
|
||||
ret = tcp_sock_init(c, NULL, NULL, i);
|
||||
if (ret == -ENFILE || ret == -EMFILE)
|
||||
goto enfile;
|
||||
if (!ret)
|
||||
bound_one = true;
|
||||
} else if (optname == 'u') {
|
||||
ret = udp_sock_init(c, 0, NULL, NULL, i);
|
||||
if (ret == -ENFILE || ret == -EMFILE)
|
||||
goto enfile;
|
||||
if (!ret)
|
||||
bound_one = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!bound_one)
|
||||
goto bind_all_fail;
|
||||
bitmap_set(exclude, i);
|
||||
|
||||
conf_ports_range_except(c, optname, optarg, fwd,
|
||||
NULL, NULL,
|
||||
1, NUM_PORTS - 1, exclude,
|
||||
1, true);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -275,37 +327,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
|
|||
} while ((p = next_chunk(p, ',')));
|
||||
|
||||
if (exclude_only) {
|
||||
/* Skip port 0. It has special meaning for many socket APIs, so
|
||||
* trying to bind it is not really safe.
|
||||
*/
|
||||
for (i = 1; i < NUM_PORTS; i++) {
|
||||
if (fwd_port_is_ephemeral(i) ||
|
||||
bitmap_isset(exclude, i))
|
||||
continue;
|
||||
|
||||
bitmap_set(fwd->map, i);
|
||||
|
||||
if (optname == 't') {
|
||||
ret = tcp_sock_init(c, addr, ifname, i);
|
||||
if (ret == -ENFILE || ret == -EMFILE)
|
||||
goto enfile;
|
||||
if (!ret)
|
||||
bound_one = true;
|
||||
} else if (optname == 'u') {
|
||||
ret = udp_sock_init(c, 0, addr, ifname, i);
|
||||
if (ret == -ENFILE || ret == -EMFILE)
|
||||
goto enfile;
|
||||
if (!ret)
|
||||
bound_one = true;
|
||||
} else {
|
||||
/* No way to check in advance for -T and -U */
|
||||
bound_one = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!bound_one)
|
||||
goto bind_all_fail;
|
||||
/* Exclude ephemeral ports */
|
||||
for (i = 0; i < NUM_PORTS; i++)
|
||||
if (fwd_port_is_ephemeral(i))
|
||||
bitmap_set(exclude, i);
|
||||
|
||||
conf_ports_range_except(c, optname, optarg, fwd,
|
||||
addr, ifname,
|
||||
1, NUM_PORTS - 1, exclude,
|
||||
1, true);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -334,40 +364,18 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
|
|||
if ((*p != '\0') && (*p != ',')) /* Garbage after the ranges */
|
||||
goto bad;
|
||||
|
||||
for (i = orig_range.first; i <= orig_range.last; i++) {
|
||||
if (bitmap_isset(fwd->map, i))
|
||||
warn(
|
||||
"Altering mapping of already mapped port number: %s", optarg);
|
||||
|
||||
if (bitmap_isset(exclude, i))
|
||||
continue;
|
||||
|
||||
bitmap_set(fwd->map, i);
|
||||
|
||||
fwd->delta[i] = mapped_range.first - orig_range.first;
|
||||
|
||||
ret = 0;
|
||||
if (optname == 't')
|
||||
ret = tcp_sock_init(c, addr, ifname, i);
|
||||
else if (optname == 'u')
|
||||
ret = udp_sock_init(c, 0, addr, ifname, i);
|
||||
if (ret)
|
||||
goto bind_fail;
|
||||
}
|
||||
conf_ports_range_except(c, optname, optarg, fwd,
|
||||
addr, ifname,
|
||||
orig_range.first, orig_range.last,
|
||||
exclude,
|
||||
mapped_range.first, false);
|
||||
} while ((p = next_chunk(p, ',')));
|
||||
|
||||
return;
|
||||
enfile:
|
||||
die("Can't open enough sockets for port specifier: %s", optarg);
|
||||
bad:
|
||||
die("Invalid port specifier %s", optarg);
|
||||
mode_conflict:
|
||||
die("Port forwarding mode '%s' conflicts with previous mode", optarg);
|
||||
bind_fail:
|
||||
die("Failed to bind port %u (%s) for option '-%c %s', exiting",
|
||||
i, strerror_(-ret), optname, optarg);
|
||||
bind_all_fail:
|
||||
die("Failed to bind any port for '-%c %s', exiting", optname, optarg);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -406,6 +414,76 @@ static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr,
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* add_dns_resolv4() - Possibly add one IPv4 nameserver from host's resolv.conf
|
||||
* @c: Execution context
|
||||
* @ns: Nameserver address
|
||||
* @idx: Pointer to index of current IPv4 resolver entry, set on return
|
||||
*/
|
||||
static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx)
|
||||
{
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
|
||||
c->ip4.dns_host = *ns;
|
||||
|
||||
/* Special handling if guest or container can only access local
|
||||
* addresses via redirect, or if the host gateway is also a resolver and
|
||||
* we shadow its address
|
||||
*/
|
||||
if (IN4_IS_ADDR_LOOPBACK(ns) ||
|
||||
IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) {
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) {
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
|
||||
return; /* Address unreachable */
|
||||
|
||||
*ns = c->ip4.map_host_loopback;
|
||||
c->ip4.dns_match = c->ip4.map_host_loopback;
|
||||
} else {
|
||||
/* No general host mapping, but requested for DNS
|
||||
* (--dns-forward and --no-map-gw): advertise resolver
|
||||
* address from --dns-forward, and map that to loopback
|
||||
*/
|
||||
*ns = c->ip4.dns_match;
|
||||
}
|
||||
}
|
||||
|
||||
*idx += add_dns4(c, ns, *idx);
|
||||
}
|
||||
|
||||
/**
|
||||
* add_dns_resolv6() - Possibly add one IPv6 nameserver from host's resolv.conf
|
||||
* @c: Execution context
|
||||
* @ns: Nameserver address
|
||||
* @idx: Pointer to index of current IPv6 resolver entry, set on return
|
||||
*/
|
||||
static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx)
|
||||
{
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
|
||||
c->ip6.dns_host = *ns;
|
||||
|
||||
/* Special handling if guest or container can only access local
|
||||
* addresses via redirect, or if the host gateway is also a resolver and
|
||||
* we shadow its address
|
||||
*/
|
||||
if (IN6_IS_ADDR_LOOPBACK(ns) ||
|
||||
IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) {
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) {
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
|
||||
return; /* Address unreachable */
|
||||
|
||||
*ns = c->ip6.map_host_loopback;
|
||||
c->ip6.dns_match = c->ip6.map_host_loopback;
|
||||
} else {
|
||||
/* No general host mapping, but requested for DNS
|
||||
* (--dns-forward and --no-map-gw): advertise resolver
|
||||
* address from --dns-forward, and map that to loopback
|
||||
*/
|
||||
*ns = c->ip6.dns_match;
|
||||
}
|
||||
}
|
||||
|
||||
*idx += add_dns6(c, ns, *idx);
|
||||
}
|
||||
|
||||
/**
|
||||
* add_dns_resolv() - Possibly add ns from host resolv.conf to configuration
|
||||
* @c: Execution context
|
||||
|
@ -422,48 +500,11 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver,
|
|||
struct in6_addr ns6;
|
||||
struct in_addr ns4;
|
||||
|
||||
if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) {
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
|
||||
c->ip4.dns_host = ns4;
|
||||
if (idx4 && inet_pton(AF_INET, nameserver, &ns4))
|
||||
add_dns_resolv4(c, &ns4, idx4);
|
||||
|
||||
/* Special handling if guest or container can only access local
|
||||
* addresses via redirect, or if the host gateway is also a
|
||||
* resolver and we shadow its address
|
||||
*/
|
||||
if (IN4_IS_ADDR_LOOPBACK(&ns4) ||
|
||||
IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) {
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
|
||||
return;
|
||||
|
||||
ns4 = c->ip4.map_host_loopback;
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
|
||||
c->ip4.dns_match = c->ip4.map_host_loopback;
|
||||
}
|
||||
|
||||
*idx4 += add_dns4(c, &ns4, *idx4);
|
||||
}
|
||||
|
||||
if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) {
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
|
||||
c->ip6.dns_host = ns6;
|
||||
|
||||
/* Special handling if guest or container can only access local
|
||||
* addresses via redirect, or if the host gateway is also a
|
||||
* resolver and we shadow its address
|
||||
*/
|
||||
if (IN6_IS_ADDR_LOOPBACK(&ns6) ||
|
||||
IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) {
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
|
||||
return;
|
||||
|
||||
ns6 = c->ip6.map_host_loopback;
|
||||
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
|
||||
c->ip6.dns_match = c->ip6.map_host_loopback;
|
||||
}
|
||||
|
||||
*idx6 += add_dns6(c, &ns6, *idx6);
|
||||
}
|
||||
if (idx6 && inet_pton(AF_INET6, nameserver, &ns6))
|
||||
add_dns_resolv6(c, &ns6, idx6);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -991,6 +1032,45 @@ pasta_opts:
|
|||
_exit(status);
|
||||
}
|
||||
|
||||
/**
|
||||
* conf_mode() - Determine passt/pasta's operating mode from command line
|
||||
* @argc: Argument count
|
||||
* @argv: Command line arguments
|
||||
*
|
||||
* Return: mode to operate in, PASTA or PASST
|
||||
*/
|
||||
enum passt_modes conf_mode(int argc, char *argv[])
|
||||
{
|
||||
int vhost_user = 0;
|
||||
const struct option optvu[] = {
|
||||
{"vhost-user", no_argument, &vhost_user, 1 },
|
||||
{ 0 },
|
||||
};
|
||||
char argv0[PATH_MAX], *basearg0;
|
||||
int name;
|
||||
|
||||
optind = 0;
|
||||
do {
|
||||
name = getopt_long(argc, argv, "-:", optvu, NULL);
|
||||
} while (name != -1);
|
||||
|
||||
if (vhost_user)
|
||||
return MODE_VU;
|
||||
|
||||
if (argc < 1)
|
||||
die("Cannot determine argv[0]");
|
||||
|
||||
strncpy(argv0, argv[0], PATH_MAX - 1);
|
||||
basearg0 = basename(argv0);
|
||||
if (strstr(basearg0, "pasta"))
|
||||
return MODE_PASTA;
|
||||
|
||||
if (strstr(basearg0, "passt"))
|
||||
return MODE_PASST;
|
||||
|
||||
die("Cannot determine mode, invoke as \"passt\" or \"pasta\"");
|
||||
}
|
||||
|
||||
/**
|
||||
* conf_print() - Print fundamental configuration parameters
|
||||
* @c: Execution context
|
||||
|
@ -1225,6 +1305,8 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
|
|||
*addr6 = in6addr_any;
|
||||
if (no_map_gw)
|
||||
*no_map_gw = 1;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (inet_pton(AF_INET6, arg, addr6) &&
|
||||
|
@ -1388,16 +1470,17 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
{"repair-path", required_argument, NULL, 28 },
|
||||
{ 0 },
|
||||
};
|
||||
const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
|
||||
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
|
||||
char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
|
||||
bool copy_addrs_opt = false, copy_routes_opt = false;
|
||||
enum fwd_ports_mode fwd_default = FWD_NONE;
|
||||
bool v4_only = false, v6_only = false;
|
||||
unsigned dns4_idx = 0, dns6_idx = 0;
|
||||
unsigned long max_mtu = IP_MAX_MTU;
|
||||
struct fqdn *dnss = c->dns_search;
|
||||
unsigned int ifi4 = 0, ifi6 = 0;
|
||||
const char *logfile = NULL;
|
||||
const char *optstring;
|
||||
size_t logsize = 0;
|
||||
char *runas = NULL;
|
||||
long fd_tap_opt;
|
||||
|
@ -1408,11 +1491,11 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
if (c->mode == MODE_PASTA) {
|
||||
c->no_dhcp_dns = c->no_dhcp_dns_search = 1;
|
||||
fwd_default = FWD_AUTO;
|
||||
optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:";
|
||||
} else {
|
||||
optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
|
||||
}
|
||||
|
||||
if (tap_l2_max_len(c) - ETH_HLEN < max_mtu)
|
||||
max_mtu = tap_l2_max_len(c) - ETH_HLEN;
|
||||
c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t));
|
||||
c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
|
||||
c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
|
||||
memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
|
||||
|
@ -1580,9 +1663,8 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
|
||||
die("Invalid host nameserver address: %s", optarg);
|
||||
case 25:
|
||||
if (c->mode == MODE_PASTA)
|
||||
die("--vhost-user is for passt mode only");
|
||||
c->mode = MODE_VU;
|
||||
/* Already handled in conf_mode() */
|
||||
ASSERT(c->mode == MODE_VU);
|
||||
break;
|
||||
case 26:
|
||||
vu_print_capabilities();
|
||||
|
@ -1593,7 +1675,14 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
die("Invalid FQDN: %s", optarg);
|
||||
break;
|
||||
case 28:
|
||||
/* Handle this once we checked --vhost-user */
|
||||
if (c->mode != MODE_VU && strcmp(optarg, "none"))
|
||||
die("--repair-path is for vhost-user mode only");
|
||||
|
||||
if (snprintf_check(c->repair_path,
|
||||
sizeof(c->repair_path), "%s",
|
||||
optarg))
|
||||
die("Invalid passt-repair path: %s", optarg);
|
||||
|
||||
break;
|
||||
case 'd':
|
||||
c->debug = 1;
|
||||
|
@ -1613,6 +1702,9 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
c->foreground = 1;
|
||||
break;
|
||||
case 's':
|
||||
if (c->mode == MODE_PASTA)
|
||||
die("-s is for passt / vhost-user mode only");
|
||||
|
||||
ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s",
|
||||
optarg);
|
||||
if (ret <= 0 || ret >= (int)sizeof(c->sock_path))
|
||||
|
@ -1625,7 +1717,8 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
fd_tap_opt = strtol(optarg, NULL, 0);
|
||||
|
||||
if (errno ||
|
||||
fd_tap_opt <= STDERR_FILENO || fd_tap_opt > INT_MAX)
|
||||
(fd_tap_opt != STDIN_FILENO && fd_tap_opt <= STDERR_FILENO) ||
|
||||
fd_tap_opt > INT_MAX)
|
||||
die("Invalid --fd: %s", optarg);
|
||||
|
||||
c->fd_tap = fd_tap_opt;
|
||||
|
@ -1633,6 +1726,9 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
*c->sock_path = 0;
|
||||
break;
|
||||
case 'I':
|
||||
if (c->mode != MODE_PASTA)
|
||||
die("-I is for pasta mode only");
|
||||
|
||||
ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s",
|
||||
optarg);
|
||||
if (ret <= 0 || ret >= IFNAMSIZ)
|
||||
|
@ -1652,20 +1748,24 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
die("Invalid PID file: %s", optarg);
|
||||
|
||||
break;
|
||||
case 'm':
|
||||
case 'm': {
|
||||
unsigned long mtu;
|
||||
char *e;
|
||||
|
||||
errno = 0;
|
||||
c->mtu = strtol(optarg, NULL, 0);
|
||||
mtu = strtoul(optarg, &e, 0);
|
||||
|
||||
if (!c->mtu) {
|
||||
c->mtu = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
if (c->mtu < ETH_MIN_MTU || c->mtu > (int)ETH_MAX_MTU ||
|
||||
errno)
|
||||
if (errno || *e)
|
||||
die("Invalid MTU: %s", optarg);
|
||||
|
||||
if (mtu > max_mtu) {
|
||||
die("MTU %lu too large (max %lu)",
|
||||
mtu, max_mtu);
|
||||
}
|
||||
|
||||
c->mtu = mtu;
|
||||
break;
|
||||
}
|
||||
case 'a':
|
||||
if (inet_pton(AF_INET6, optarg, &c->ip6.addr) &&
|
||||
!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr) &&
|
||||
|
@ -1785,11 +1885,16 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
break;
|
||||
case 't':
|
||||
case 'u':
|
||||
case 'T':
|
||||
case 'U':
|
||||
case 'D':
|
||||
/* Handle these later, once addresses are configured */
|
||||
break;
|
||||
case 'T':
|
||||
case 'U':
|
||||
if (c->mode != MODE_PASTA)
|
||||
die("-%c is for pasta mode only", name);
|
||||
|
||||
/* Handle properly later, once addresses are configured */
|
||||
break;
|
||||
case 'h':
|
||||
usage(argv[0], stdout, EXIT_SUCCESS);
|
||||
break;
|
||||
|
@ -1837,9 +1942,21 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
c->ifi4 = conf_ip4(ifi4, &c->ip4);
|
||||
if (!v4_only)
|
||||
c->ifi6 = conf_ip6(ifi6, &c->ip6);
|
||||
|
||||
if (c->ifi4 && c->mtu < IPV4_MIN_MTU) {
|
||||
warn("MTU %"PRIu16" is too small for IPv4 (minimum %u)",
|
||||
c->mtu, IPV4_MIN_MTU);
|
||||
}
|
||||
if (c->ifi6 && c->mtu < IPV6_MIN_MTU) {
|
||||
warn("MTU %"PRIu16" is too small for IPv6 (minimum %u)",
|
||||
c->mtu, IPV6_MIN_MTU);
|
||||
}
|
||||
|
||||
if ((*c->ip4.ifname_out && !c->ifi4) ||
|
||||
(*c->ip6.ifname_out && !c->ifi6))
|
||||
die("External interface not usable");
|
||||
|
||||
|
||||
if (!c->ifi4 && !c->ifi6) {
|
||||
info("No external interface as template, switch to local mode");
|
||||
|
||||
|
@ -1866,8 +1983,8 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
|
||||
c->no_dhcp = 1;
|
||||
|
||||
/* Inbound port options, DNS, and --repair-path can be parsed now, after
|
||||
* IPv4/IPv6 settings and --vhost-user.
|
||||
/* Inbound port options and DNS can be parsed now, after IPv4/IPv6
|
||||
* settings
|
||||
*/
|
||||
fwd_probe_ephemeral();
|
||||
udp_portmap_clear();
|
||||
|
@ -1913,16 +2030,6 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
}
|
||||
|
||||
die("Cannot use DNS address %s", optarg);
|
||||
} else if (name == 28) {
|
||||
if (c->mode != MODE_VU && strcmp(optarg, "none"))
|
||||
die("--repair-path is for vhost-user mode only");
|
||||
|
||||
if (snprintf_check(c->repair_path,
|
||||
sizeof(c->repair_path), "%s",
|
||||
optarg))
|
||||
die("Invalid passt-repair path: %s", optarg);
|
||||
|
||||
break;
|
||||
}
|
||||
} while (name != -1);
|
||||
|
||||
|
@ -1971,9 +2078,6 @@ void conf(struct ctx *c, int argc, char **argv)
|
|||
c->no_dhcpv6 = 1;
|
||||
}
|
||||
|
||||
if (!c->mtu)
|
||||
c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
|
||||
|
||||
get_dns(c);
|
||||
|
||||
if (!*c->pasta_ifn) {
|
||||
|
|
conf.h (1 changed line)
|
@ -6,6 +6,7 @@
|
|||
#ifndef CONF_H
|
||||
#define CONF_H
|
||||
|
||||
enum passt_modes conf_mode(int argc, char *argv[]);
|
||||
void conf(struct ctx *c, int argc, char **argv);
|
||||
|
||||
#endif /* CONF_H */
|
||||
|
|
|
@ -44,7 +44,7 @@ Requires(preun): %{name}
|
|||
Requires(preun): policycoreutils
|
||||
|
||||
%description selinux
|
||||
This package adds SELinux enforcement to passt(1) and pasta(1).
|
||||
This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
|
||||
|
||||
%prep
|
||||
%setup -q -n passt-%{git_hash}
|
||||
|
@ -82,6 +82,7 @@ make -f %{_datadir}/selinux/devel/Makefile
|
|||
install -p -m 644 -D passt.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
|
||||
install -p -m 644 -D passt.if %{buildroot}%{_datadir}/selinux/devel/include/distributed/passt.if
|
||||
install -p -m 644 -D pasta.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
|
||||
install -p -m 644 -D passt-repair.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
popd
|
||||
|
||||
%pre selinux
|
||||
|
@ -90,11 +91,13 @@ popd
|
|||
%post selinux
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
|
||||
%postun selinux
|
||||
if [ $1 -eq 0 ]; then
|
||||
%selinux_modules_uninstall -s %{selinuxtype} passt
|
||||
%selinux_modules_uninstall -s %{selinuxtype} pasta
|
||||
%selinux_modules_uninstall -s %{selinuxtype} passt-repair
|
||||
fi
|
||||
|
||||
%posttrans selinux
|
||||
|
@ -124,6 +127,7 @@ fi
|
|||
%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
|
||||
%{_datadir}/selinux/devel/include/distributed/passt.if
|
||||
%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
|
||||
%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
|
||||
%changelog
|
||||
{{{ passt_git_changelog }}}
|
||||
|
|
|
@ -28,12 +28,22 @@ require {
|
|||
type console_device_t;
|
||||
type user_devpts_t;
|
||||
type user_tmp_t;
|
||||
|
||||
# Workaround: passt-repair needs to access socket files
|
||||
# that passt, started by libvirt, might create under different
|
||||
# labels, depending on whether passt is started as root or not.
|
||||
#
|
||||
# However, libvirt doesn't maintain its own policy, which makes
|
||||
# updates particularly complicated. To avoid breakage in the short
|
||||
# term, deal with that in passt's own policy.
|
||||
type qemu_var_run_t;
|
||||
type virt_var_run_t;
|
||||
}
|
||||
|
||||
type passt_repair_t;
|
||||
domain_type(passt_repair_t);
|
||||
type passt_repair_exec_t;
|
||||
files_type(passt_repair_exec_t);
|
||||
corecmd_executable_file(passt_repair_exec_t);
|
||||
|
||||
role unconfined_r types passt_repair_t;
|
||||
|
||||
|
@ -41,7 +51,8 @@ allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans en
|
|||
type_transition unconfined_t passt_repair_exec_t:process passt_repair_t;
|
||||
allow unconfined_t passt_repair_t:process transition;
|
||||
|
||||
allow passt_repair_t self:capability { dac_override net_admin net_raw };
|
||||
allow passt_repair_t self:capability { dac_override dac_read_search net_admin net_raw };
|
||||
allow passt_repair_t self:capability2 bpf;
|
||||
|
||||
allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl };
|
||||
allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl };
|
||||
|
@ -50,9 +61,27 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
|
|||
allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
|
||||
allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
|
||||
|
||||
allow passt_repair_t unconfined_t:sock_file { read write };
|
||||
allow passt_repair_t passt_t:sock_file { read write };
|
||||
allow passt_repair_t user_tmp_t:sock_file { read write };
|
||||
allow passt_repair_t user_tmp_t:dir { getattr read search watch };
|
||||
|
||||
allow passt_repair_t unconfined_t:sock_file { getattr read write };
|
||||
allow passt_repair_t passt_t:sock_file { getattr read write };
|
||||
allow passt_repair_t user_tmp_t:sock_file { getattr read write };
|
||||
|
||||
allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
|
||||
allow passt_repair_t passt_t:tcp_socket { read setopt write };
|
||||
|
||||
# Workaround: passt-repair needs to access socket files
|
||||
# that passt, started by libvirt, might create under different
|
||||
# labels, depending on whether passt is started as root or not.
|
||||
#
|
||||
# However, libvirt doesn't maintain its own policy, which makes
|
||||
# updates particularly complicated. To avoid breakage in the short
|
||||
# term, deal with that in passt's own policy.
|
||||
allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
|
||||
allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
|
||||
|
||||
allow passt_repair_t qemu_var_run_t:dir { getattr read search watch };
|
||||
allow passt_repair_t virt_var_run_t:dir { getattr read search watch };
|
||||
|
||||
allow passt_repair_t qemu_var_run_t:sock_file { getattr read write };
|
||||
allow passt_repair_t virt_var_run_t:sock_file { getattr read write };
|
||||
|
|
|
@ -29,6 +29,9 @@ require {
|
|||
# particularly complicated. To avoid breakage in the short term,
|
||||
# deal with it in passt's own policy.
|
||||
type svirt_image_t;
|
||||
type svirt_tmpfs_t;
|
||||
type svirt_t;
|
||||
type null_device_t;
|
||||
|
||||
class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map };
|
||||
class dir { search write add_name remove_name mounton };
|
||||
|
@ -45,8 +48,8 @@ require {
|
|||
type net_conf_t;
|
||||
type proc_net_t;
|
||||
type node_t;
|
||||
class tcp_socket { create accept listen name_bind name_connect getattr };
|
||||
class udp_socket { create accept listen };
|
||||
class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
|
||||
class udp_socket { create accept listen getattr };
|
||||
class icmp_socket { bind create name_bind node_bind setopt read write };
|
||||
class sock_file { create unlink write };
|
||||
|
||||
|
@ -129,8 +132,8 @@ corenet_udp_sendrecv_all_ports(passt_t)
|
|||
allow passt_t node_t:icmp_socket { name_bind node_bind };
|
||||
allow passt_t port_t:icmp_socket name_bind;
|
||||
|
||||
allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr };
|
||||
allow passt_t self:udp_socket { create getopt setopt connect bind read write };
|
||||
allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
|
||||
allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr };
|
||||
allow passt_t self:icmp_socket { bind create setopt read write };
|
||||
|
||||
allow passt_t user_tmp_t:dir { add_name write };
|
||||
|
@ -143,3 +146,5 @@ allow passt_t unconfined_t:unix_stream_socket { read write };
|
|||
# particularly complicated. To avoid breakage in the short term,
|
||||
# deal with it in passt's own policy.
|
||||
allow passt_t svirt_image_t:file { read write map };
|
||||
allow passt_t svirt_tmpfs_t:file { read write map };
|
||||
allow passt_t null_device_t:chr_file map;
|
||||
|
|
dhcp.c (11 changed lines)
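The OPT_MAX change below is arithmetic on RFC 2131's 576-byte minimum datagram: 576 minus the 268-byte offset of the options area minus one byte for the end option leaves 307 bytes for options proper, and the buffer itself keeps one extra byte so the end option always fits. A compile-time sketch of that reasoning, where DHCP_MIN_DGRAM and DHCP_OPT_OFFSET are illustrative names (only OPT_MAX appears in the hunk), could read:

```c
#include <assert.h>
#include <stdint.h>

#define DHCP_MIN_DGRAM	576	/* RFC 2131 minimum datagram size */
#define DHCP_OPT_OFFSET	268	/* Fixed fields before the options area */
#define OPT_MAX		307	/* Maximum option bytes, end option excluded */

/* Options buffer as in struct msg: OPT_MAX plus one byte for the end option */
static uint8_t options[OPT_MAX + 1];

static_assert(OPT_MAX == DHCP_MIN_DGRAM - DHCP_OPT_OFFSET - 1,
	      "OPT_MAX must leave exactly one byte for the end option");
static_assert(sizeof(options) == DHCP_MIN_DGRAM - DHCP_OPT_OFFSET,
	      "options buffer fills the remainder of a 576-byte datagram");

int main(void)
{
	options[OPT_MAX] = 255;	/* End option: code 255, no length byte */
	return 0;
}
```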
|
@ -64,9 +64,9 @@ static struct opt opts[255];
|
|||
#define OPT_MIN 60 /* RFC 951 */
|
||||
|
||||
/* Total option size (excluding end option) is 576 (RFC 2131), minus
|
||||
* offset of options (268), minus end option and its length (2).
|
||||
* offset of options (268), minus end option (1).
|
||||
*/
|
||||
#define OPT_MAX 306
|
||||
#define OPT_MAX 307
|
||||
|
||||
/**
|
||||
* dhcp_init() - Initialise DHCP options
|
||||
|
@ -127,7 +127,7 @@ struct msg {
|
|||
uint8_t sname[64];
|
||||
uint8_t file[128];
|
||||
uint32_t magic;
|
||||
uint8_t o[OPT_MAX + 2 /* End option and its length */ ];
|
||||
uint8_t o[OPT_MAX + 1 /* End option */ ];
|
||||
} __attribute__((__packed__));
|
||||
|
||||
/**
|
||||
|
@ -143,7 +143,7 @@ static bool fill_one(struct msg *m, int o, int *offset)
|
|||
size_t slen = opts[o].slen;
|
||||
|
||||
/* If we don't have space to write the option, then just skip */
|
||||
if (*offset + 1 /* length of option */ + slen > OPT_MAX)
|
||||
if (*offset + 2 /* code and length of option */ + slen > OPT_MAX)
|
||||
return true;
|
||||
|
||||
m->o[*offset] = o;
|
||||
|
@ -194,7 +194,6 @@ static int fill(struct msg *m)
|
|||
}
|
||||
|
||||
m->o[offset++] = 255;
|
||||
m->o[offset++] = 0;
|
||||
|
||||
if (offset < OPT_MIN) {
|
||||
memset(&m->o[offset], 0, OPT_MIN - offset);
|
||||
|
@ -418,7 +417,7 @@ int dhcp(const struct ctx *c, const struct pool *p)
|
|||
&c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
|
||||
}
|
||||
|
||||
if (c->mtu != -1) {
|
||||
if (c->mtu) {
|
||||
opts[26].slen = 2;
|
||||
opts[26].s[0] = c->mtu / 256;
|
||||
opts[26].s[1] = c->mtu % 256;
|
||||
|
|
dhcpv6.c (4 changed lines)
|
@ -144,7 +144,9 @@ struct opt_ia_addr {
|
|||
struct opt_status_code {
|
||||
struct opt_hdr hdr;
|
||||
uint16_t code;
|
||||
char status_msg[sizeof(STR_NOTONLINK) - 1];
|
||||
/* "nonstring" is only supported since clang 23 */
|
||||
/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
|
||||
__attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1];
|
||||
} __attribute__((packed));
|
||||
|
||||
/**
|
||||
|
|
doc/platform-requirements/.gitignore (vendored, 1 changed line)
|
@ -1,3 +1,4 @@
|
|||
/listen-vs-repair
|
||||
/reuseaddr-priority
|
||||
/recv-zero
|
||||
/udp-close-dup
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
# Copyright Red Hat
|
||||
# Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
|
||||
TARGETS = reuseaddr-priority recv-zero udp-close-dup
|
||||
SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
|
||||
TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair
|
||||
SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c
|
||||
CFLAGS = -Wall
|
||||
|
||||
all: cppcheck clang-tidy $(TARGETS:%=check-%)
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
__attribute__((format(printf, 1, 2), noreturn))
|
||||
static inline void die(const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
|
doc/platform-requirements/listen-vs-repair.c (new file, 128 lines)
|
@ -0,0 +1,128 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
/* listen-vs-repair.c
|
||||
*
|
||||
* Do listening sockets have address conflicts with sockets under repair
|
||||
* ====================================================================
|
||||
*
|
||||
* When we accept() an incoming connection the accept()ed socket will have the
|
||||
* same local address as the listening socket. This can be a complication on
|
||||
* migration. On the migration target we've already set up listening sockets
|
||||
* according to the command line. However to restore connections that we're
|
||||
* migrating in we need to bind the new sockets to the same address, which would
|
||||
* be an address conflict on the face of it. This test program verifies that
|
||||
* enabling repair mode before bind() correctly suppresses that conflict.
|
||||
*
|
||||
* Copyright Red Hat
|
||||
* Author: David Gibson <david@gibson.dropbear.id.au>
|
||||
*/
|
||||
|
||||
/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
|
||||
#define _GNU_SOURCE
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <errno.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <net/if.h>
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <sched.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define PORT 13256U
|
||||
#define CPORT 13257U
|
||||
|
||||
/* 127.0.0.1:PORT */
|
||||
static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT);
|
||||
|
||||
/* 127.0.0.1:CPORT */
|
||||
static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT);
|
||||
|
||||
/* Put ourselves into a network sandbox */
|
||||
static void net_sandbox(void)
|
||||
{
|
||||
/* NOLINTNEXTLINE(altera-struct-pack-align) */
|
||||
const struct req_t {
|
||||
struct nlmsghdr nlh;
|
||||
struct ifinfomsg ifm;
|
||||
} __attribute__((packed)) req = {
|
||||
.nlh.nlmsg_type = RTM_NEWLINK,
|
||||
.nlh.nlmsg_flags = NLM_F_REQUEST,
|
||||
.nlh.nlmsg_len = sizeof(req),
|
||||
.nlh.nlmsg_seq = 1,
|
||||
.ifm.ifi_family = AF_UNSPEC,
|
||||
.ifm.ifi_index = 1,
|
||||
.ifm.ifi_flags = IFF_UP,
|
||||
.ifm.ifi_change = IFF_UP,
|
||||
};
|
||||
int nl;
|
||||
|
||||
if (unshare(CLONE_NEWUSER | CLONE_NEWNET))
|
||||
die("unshare(): %s\n", strerror(errno));
|
||||
|
||||
/* Bring up lo in the new netns */
|
||||
nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
|
||||
if (nl < 0)
|
||||
die("Can't create netlink socket: %s\n", strerror(errno));
|
||||
|
||||
if (send(nl, &req, sizeof(req), 0) < 0)
|
||||
die("Netlink send(): %s\n", strerror(errno));
|
||||
close(nl);
|
||||
}
|
||||
|
||||
static void check(void)
|
||||
{
|
||||
int s1, s2, op;
|
||||
|
||||
s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
|
||||
if (s1 < 0)
|
||||
die("socket() 1: %s\n", strerror(errno));
|
||||
|
||||
if (bind(s1, (struct sockaddr *)&addr, sizeof(addr)))
|
||||
die("bind() 1: %s\n", strerror(errno));
|
||||
|
||||
if (listen(s1, 0))
|
||||
die("listen(): %s\n", strerror(errno));
|
||||
|
||||
s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
|
||||
if (s2 < 0)
|
||||
die("socket() 2: %s\n", strerror(errno));
|
||||
|
||||
op = TCP_REPAIR_ON;
|
||||
if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
|
||||
die("TCP_REPAIR: %s\n", strerror(errno));
|
||||
|
||||
if (bind(s2, (struct sockaddr *)&addr, sizeof(addr)))
|
||||
die("bind() 2: %s\n", strerror(errno));
|
||||
|
||||
if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr)))
|
||||
die("connect(): %s\n", strerror(errno));
|
||||
|
||||
op = TCP_REPAIR_OFF_NO_WP;
|
||||
if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
|
||||
die("TCP_REPAIR: %s\n", strerror(errno));
|
||||
|
||||
close(s1);
|
||||
close(s2);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
net_sandbox();
|
||||
|
||||
check();
|
||||
|
||||
printf("Repair mode appears to properly suppress conflicts with listening sockets\n");
|
||||
|
||||
exit(0);
|
||||
}
|
|
@ -46,13 +46,13 @@
|
|||
/* Different cases for receiving socket configuration */
|
||||
enum sock_type {
|
||||
/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
|
||||
SOCK_BOUND_ANY = 0,
|
||||
SOCK_BOUND_ANY,
|
||||
|
||||
/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
|
||||
SOCK_BOUND_LO = 1,
|
||||
SOCK_BOUND_LO,
|
||||
|
||||
/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
|
||||
SOCK_CONNECTED = 2,
|
||||
SOCK_CONNECTED,
|
||||
|
||||
NUM_SOCK_TYPES,
|
||||
};
|
||||
|
|
|
@ -22,8 +22,8 @@ enum epoll_type {
|
|||
EPOLL_TYPE_TCP_TIMER,
|
||||
/* UDP "listening" sockets */
|
||||
EPOLL_TYPE_UDP_LISTEN,
|
||||
/* UDP socket for replies on a specific flow */
|
||||
EPOLL_TYPE_UDP_REPLY,
|
||||
/* UDP socket for a specific flow */
|
||||
EPOLL_TYPE_UDP,
|
||||
/* ICMP/ICMPv6 ping sockets */
|
||||
EPOLL_TYPE_PING,
|
||||
/* inotify fd watching for end of netns (pasta) */
|
||||
|
|
flow.c (264 changed lines)
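The first hunk drops the open-coded foreach_flow() macro family in favour of flow_foreach() and flow_foreach_slot(); those macros live in flow_table.h and are not part of this diff. The idea they rely on, visible in the free-cluster comment further down, is that the first slot of a free cluster stores how many free slots follow, so iteration can hop over the whole cluster in one step. A toy model of that walk, with simplified structures rather than the real flow table types:

```c
#include <stdio.h>

#define TABLE_MAX 8

/* Simplified slot: a free slot's first entry records the cluster length */
struct slot {
	int active;		/* 1 if in use, 0 if part of a free cluster */
	unsigned free_n;	/* valid in the first slot of a free cluster */
};

int main(void)
{
	struct slot table[TABLE_MAX] = {
		{ 1, 0 }, { 0, 3 }, { 0, 0 }, { 0, 0 },
		{ 1, 0 }, { 1, 0 }, { 0, 2 }, { 0, 0 },
	};
	unsigned i;

	/* Visit only active slots, skipping each free cluster in one hop */
	for (i = 0; i < TABLE_MAX; i++) {
		if (!table[i].active) {
			i += table[i].free_n - 1;	/* loop adds one more */
			continue;
		}
		printf("active slot %u\n", i);
	}
	return 0;
}
```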
|
@ -53,30 +53,8 @@ const uint8_t flow_proto[] = {
|
|||
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
||||
"flow_proto[] doesn't match enum flow_type");
|
||||
|
||||
#define foreach_flow(i, flow, bound) \
|
||||
for ((i) = 0, (flow) = &flowtab[(i)]; \
|
||||
(i) < (bound); \
|
||||
(i)++, (flow) = &flowtab[(i)]) \
|
||||
if ((flow)->f.state == FLOW_STATE_FREE) \
|
||||
(i) += (flow)->free.n - 1; \
|
||||
else
|
||||
|
||||
#define foreach_active_flow(i, flow, bound) \
|
||||
foreach_flow((i), (flow), (bound)) \
|
||||
if ((flow)->f.state != FLOW_STATE_ACTIVE) \
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone) */ \
|
||||
continue; \
|
||||
else
|
||||
|
||||
#define foreach_tcp_flow(i, flow, bound) \
|
||||
foreach_active_flow((i), (flow), (bound)) \
|
||||
if ((flow)->f.type != FLOW_TCP) \
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone) */ \
|
||||
continue; \
|
||||
else
|
||||
|
||||
#define foreach_established_tcp_flow(i, flow, bound) \
|
||||
foreach_tcp_flow((i), (flow), (bound)) \
|
||||
#define foreach_established_tcp_flow(flow) \
|
||||
flow_foreach_of_type((flow), FLOW_TCP) \
|
||||
if (!tcp_flow_is_established(&(flow)->tcp)) \
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone) */ \
|
||||
continue; \
|
||||
|
@ -103,7 +81,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
|||
*
|
||||
* Free cluster list
|
||||
* flow_first_free gives the index of the first (lowest index) free cluster.
|
||||
* Each free cluster has the index of the next free cluster, or MAX_FLOW if
|
||||
* Each free cluster has the index of the next free cluster, or FLOW_MAX if
|
||||
* it is the last free cluster. Together these form a linked list of free
|
||||
* clusters, in strictly increasing order of index.
|
||||
*
|
||||
|
@ -289,11 +267,13 @@ int flowside_connect(const struct ctx *c, int s,
|
|||
|
||||
/** flow_log_ - Log flow-related message
|
||||
* @f: flow the message is related to
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @pri: Log priority
|
||||
* @fmt: Format string
|
||||
* @...: printf-arguments
|
||||
*/
|
||||
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
||||
void flow_log_(const struct flow_common *f, bool newline, int pri,
|
||||
const char *fmt, ...)
|
||||
{
|
||||
const char *type_or_state;
|
||||
char msg[BUFSIZ];
|
||||
|
@ -309,7 +289,7 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
|||
else
|
||||
type_or_state = FLOW_TYPE(f);
|
||||
|
||||
logmsg(true, false, pri,
|
||||
logmsg(newline, false, pri,
|
||||
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
|
||||
}
|
||||
|
||||
|
@ -329,7 +309,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
|
|||
const struct flowside *tgt = &f->side[TGTSIDE];
|
||||
|
||||
if (state >= FLOW_STATE_TGT)
|
||||
flow_log_(f, pri,
|
||||
flow_log_(f, true, pri,
|
||||
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||
|
@ -342,7 +322,7 @@ void flow_log_details_(const struct flow_common *f, int pri,
|
|||
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
|
||||
tgt->eport);
|
||||
else if (state >= FLOW_STATE_INI)
|
||||
flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
|
||||
flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
|
||||
pif_name(f->pif[INISIDE]),
|
||||
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
||||
ini->eport,
|
||||
|
@ -363,7 +343,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
|
|||
ASSERT(oldstate < FLOW_NUM_STATES);
|
||||
|
||||
f->state = state;
|
||||
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
|
||||
flow_log_(f, true, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
|
||||
FLOW_STATE(f));
|
||||
|
||||
flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
|
||||
|
@ -416,18 +396,27 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
|
|||
* @flow: Flow to change state
|
||||
* @pif: pif of the initiating side
|
||||
* @ssa: Source socket address
|
||||
* @daddr: Destination address (may be NULL)
|
||||
* @dport: Destination port
|
||||
*
|
||||
* Return: pointer to the initiating flowside information
|
||||
*/
|
||||
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||
const union sockaddr_inany *ssa,
|
||||
const union inany_addr *daddr,
|
||||
in_port_t dport)
|
||||
{
|
||||
struct flowside *ini = &flow->f.side[INISIDE];
|
||||
|
||||
inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
|
||||
if (inany_v4(&ini->eaddr))
|
||||
if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) {
|
||||
char str[SOCKADDR_STRLEN];
|
||||
|
||||
ASSERT_WITH_MSG(0, "Bad socket address %s",
|
||||
sockaddr_ntop(ssa, str, sizeof(str)));
|
||||
}
|
||||
if (daddr)
|
||||
ini->oaddr = *daddr;
|
||||
else if (inany_v4(&ini->eaddr))
|
||||
ini->oaddr = inany_any4;
|
||||
else
|
||||
ini->oaddr = inany_any6;
|
||||
|
@ -771,19 +760,30 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
|
|||
* @proto: Protocol of the flow (IP L4 protocol number)
|
||||
* @pif: Interface of the flow
|
||||
* @esa: Socket address of the endpoint
|
||||
* @oaddr: Our address (may be NULL)
|
||||
* @oport: Our port number
|
||||
*
|
||||
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
||||
*/
|
||||
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||
const void *esa, in_port_t oport)
|
||||
const void *esa,
|
||||
const union inany_addr *oaddr, in_port_t oport)
|
||||
{
|
||||
struct flowside side = {
|
||||
.oport = oport,
|
||||
};
|
||||
|
||||
inany_from_sockaddr(&side.eaddr, &side.eport, esa);
|
||||
if (inany_v4(&side.eaddr))
|
||||
if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) {
|
||||
char str[SOCKADDR_STRLEN];
|
||||
|
||||
warn("Flow lookup on bad socket address %s",
|
||||
sockaddr_ntop(esa, str, sizeof(str)));
|
||||
return FLOW_SIDX_NONE;
|
||||
}
|
||||
|
||||
if (oaddr)
|
||||
side.oaddr = *oaddr;
|
||||
else if (inany_v4(&side.eaddr))
|
||||
side.oaddr = inany_any4;
|
||||
else
|
||||
side.oaddr = inany_any6;
|
||||
|
@ -800,8 +800,9 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
{
|
||||
struct flow_free_cluster *free_head = NULL;
|
||||
unsigned *last_next = &flow_first_free;
|
||||
bool to_free[FLOW_MAX] = { 0 };
|
||||
bool timer = false;
|
||||
unsigned idx;
|
||||
union flow *flow;
|
||||
|
||||
if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
|
||||
timer = true;
|
||||
|
@ -810,49 +811,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
|
||||
ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
|
||||
|
||||
for (idx = 0; idx < FLOW_MAX; idx++) {
|
||||
union flow *flow = &flowtab[idx];
|
||||
/* Check which flows we might need to close first, but don't free them
|
||||
* yet as it's not safe to do that in the middle of flow_foreach().
|
||||
*/
|
||||
flow_foreach(flow) {
|
||||
bool closed = false;
|
||||
|
||||
switch (flow->f.state) {
|
||||
case FLOW_STATE_FREE: {
|
||||
unsigned skip = flow->free.n;
|
||||
|
||||
/* First entry of a free cluster must have n >= 1 */
|
||||
ASSERT(skip);
|
||||
|
||||
if (free_head) {
|
||||
/* Merge into preceding free cluster */
|
||||
free_head->n += flow->free.n;
|
||||
flow->free.n = flow->free.next = 0;
|
||||
} else {
|
||||
/* New free cluster, add to chain */
|
||||
free_head = &flow->free;
|
||||
*last_next = idx;
|
||||
last_next = &free_head->next;
|
||||
}
|
||||
|
||||
/* Skip remaining empty entries */
|
||||
idx += skip - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
case FLOW_STATE_NEW:
|
||||
case FLOW_STATE_INI:
|
||||
case FLOW_STATE_TGT:
|
||||
case FLOW_STATE_TYPED:
|
||||
/* Incomplete flow at end of cycle */
|
||||
ASSERT(false);
|
||||
break;
|
||||
|
||||
case FLOW_STATE_ACTIVE:
|
||||
/* Nothing to do */
|
||||
break;
|
||||
|
||||
default:
|
||||
ASSERT(false);
|
||||
}
|
||||
|
||||
switch (flow->f.type) {
|
||||
case FLOW_TYPE_NONE:
|
||||
ASSERT(false);
|
||||
|
@ -871,7 +835,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
closed = icmp_ping_timer(c, &flow->ping, now);
|
||||
break;
|
||||
case FLOW_UDP:
|
||||
closed = udp_flow_defer(&flow->udp);
|
||||
closed = udp_flow_defer(c, &flow->udp, now);
|
||||
if (!closed && timer)
|
||||
closed = udp_flow_timer(c, &flow->udp, now);
|
||||
break;
|
||||
|
@ -880,24 +844,67 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
;
|
||||
}
|
||||
|
||||
if (closed) {
|
||||
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
||||
memset(flow, 0, sizeof(*flow));
|
||||
to_free[FLOW_IDX(flow)] = closed;
|
||||
}
|
||||
|
||||
/* Second step: actually free the flows */
|
||||
flow_foreach_slot(flow) {
|
||||
switch (flow->f.state) {
|
||||
case FLOW_STATE_FREE: {
|
||||
unsigned skip = flow->free.n;
|
||||
|
||||
/* First entry of a free cluster must have n >= 1 */
|
||||
ASSERT(skip);
|
||||
|
||||
if (free_head) {
|
||||
/* Add slot to current free cluster */
|
||||
ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
|
||||
free_head->n++;
|
||||
/* Merge into preceding free cluster */
|
||||
free_head->n += flow->free.n;
|
||||
flow->free.n = flow->free.next = 0;
|
||||
} else {
|
||||
/* Create new free cluster */
|
||||
/* New free cluster, add to chain */
|
||||
free_head = &flow->free;
|
||||
free_head->n = 1;
|
||||
*last_next = idx;
|
||||
*last_next = FLOW_IDX(flow);
|
||||
last_next = &free_head->next;
|
||||
}
|
||||
} else {
|
||||
free_head = NULL;
|
||||
|
||||
/* Skip remaining empty entries */
|
||||
flow += skip - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
case FLOW_STATE_NEW:
|
||||
case FLOW_STATE_INI:
|
||||
case FLOW_STATE_TGT:
|
||||
case FLOW_STATE_TYPED:
|
||||
/* Incomplete flow at end of cycle */
|
||||
ASSERT(false);
|
||||
break;
|
||||
|
||||
case FLOW_STATE_ACTIVE:
|
||||
if (to_free[FLOW_IDX(flow)]) {
|
||||
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
||||
memset(flow, 0, sizeof(*flow));
|
||||
|
||||
if (free_head) {
|
||||
/* Add slot to current free cluster */
|
||||
ASSERT(FLOW_IDX(flow) ==
|
||||
FLOW_IDX(free_head) + free_head->n);
|
||||
free_head->n++;
|
||||
flow->free.n = flow->free.next = 0;
|
||||
} else {
|
||||
/* Create new free cluster */
|
||||
free_head = &flow->free;
|
||||
free_head->n = 1;
|
||||
*last_next = FLOW_IDX(flow);
|
||||
last_next = &free_head->next;
|
||||
}
|
||||
} else {
|
||||
free_head = NULL;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -907,22 +914,23 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
|||
/**
|
||||
* flow_migrate_source_rollback() - Disable repair mode, return failure
|
||||
* @c: Execution context
|
||||
* @max_flow: Maximum index of affected flows
|
||||
* @bound: No need to roll back flow indices >= @bound
|
||||
* @ret: Negative error code
|
||||
*
|
||||
* Return: @ret
|
||||
*/
|
||||
static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
|
||||
int ret)
|
||||
static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
|
||||
{
|
||||
union flow *flow;
|
||||
unsigned i;
|
||||
|
||||
debug("...roll back migration");
|
||||
|
||||
foreach_established_tcp_flow(i, flow, max_flow)
|
||||
foreach_established_tcp_flow(flow) {
|
||||
if (FLOW_IDX(flow) >= bound)
|
||||
break;
|
||||
if (tcp_flow_repair_off(c, &flow->tcp))
|
||||
die("Failed to roll back TCP_REPAIR mode");
|
||||
}
|
||||
|
||||
if (repair_flush(c))
|
||||
die("Failed to roll back TCP_REPAIR mode");
|
||||
|
@ -930,6 +938,21 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
|
|||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_migrate_need_repair() - Do we need to set repair mode for any flow?
|
||||
*
|
||||
* Return: true if repair mode is needed, false otherwise
|
||||
*/
|
||||
static bool flow_migrate_need_repair(void)
|
||||
{
|
||||
union flow *flow;
|
||||
|
||||
foreach_established_tcp_flow(flow)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* flow_migrate_repair_all() - Turn repair mode on or off for all flows
|
||||
* @c: Execution context
|
||||
|
@@ -940,10 +963,13 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
|
|||
static int flow_migrate_repair_all(struct ctx *c, bool enable)
|
||||
{
|
||||
union flow *flow;
|
||||
unsigned i;
|
||||
int rc;
|
||||
|
||||
foreach_established_tcp_flow(i, flow, FLOW_MAX) {
|
||||
/* If we don't have a repair helper, there's nothing we can do */
|
||||
if (c->fd_repair < 0)
|
||||
return 0;
|
||||
|
||||
foreach_established_tcp_flow(flow) {
|
||||
if (enable)
|
||||
rc = tcp_flow_repair_on(c, &flow->tcp);
|
||||
else
|
||||
|
@@ -952,14 +978,15 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable)
|
|||
if (rc) {
|
||||
debug("Can't %s repair mode: %s",
|
||||
enable ? "enable" : "disable", strerror_(-rc));
|
||||
return flow_migrate_source_rollback(c, i, rc);
|
||||
return flow_migrate_source_rollback(c, FLOW_IDX(flow),
|
||||
rc);
|
||||
}
|
||||
}
|
||||
|
||||
if ((rc = repair_flush(c))) {
|
||||
debug("Can't %s repair mode: %s",
|
||||
enable ? "enable" : "disable", strerror_(-rc));
|
||||
return flow_migrate_source_rollback(c, i, rc);
|
||||
return flow_migrate_source_rollback(c, FLOW_IDX(flow), rc);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@@ -981,6 +1008,9 @@ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
|
|||
(void)stage;
|
||||
(void)fd;
|
||||
|
||||
if (flow_migrate_need_repair())
|
||||
repair_wait(c);
|
||||
|
||||
if ((rc = flow_migrate_repair_all(c, true)))
|
||||
return -rc;
|
||||
|
||||
|
@@ -1001,14 +1031,16 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
|||
uint32_t count = 0;
|
||||
bool first = true;
|
||||
union flow *flow;
|
||||
unsigned i;
|
||||
int rc;
|
||||
|
||||
(void)c;
|
||||
(void)stage;
|
||||
|
||||
foreach_established_tcp_flow(i, flow, FLOW_MAX)
|
||||
count++;
|
||||
/* If we don't have a repair helper, we can't migrate TCP flows */
|
||||
if (c->fd_repair >= 0) {
|
||||
foreach_established_tcp_flow(flow)
|
||||
count++;
|
||||
}
|
||||
|
||||
count = htonl(count);
|
||||
if (write_all_buf(fd, &count, sizeof(count))) {
|
||||
|
@@ -1019,6 +1051,9 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
|||
|
||||
debug("Sending %u flows", ntohl(count));
|
||||
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
/* Dump and send information that can be stored in the flow table.
|
||||
*
|
||||
* Limited rollback options here: if we fail to transfer any data (that
|
||||
|
@@ -1026,10 +1061,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
|||
* stream might now be inconsistent, and we might have closed listening
|
||||
* TCP sockets, so just terminate.
|
||||
*/
|
||||
foreach_established_tcp_flow(i, flow, FLOW_MAX) {
|
||||
foreach_established_tcp_flow(flow) {
|
||||
rc = tcp_flow_migrate_source(fd, &flow->tcp);
|
||||
if (rc) {
|
||||
err("Can't send data, flow %u: %s", i, strerror_(-rc));
|
||||
flow_err(flow, "Can't send data: %s",
|
||||
strerror_(-rc));
|
||||
if (!first)
|
||||
die("Inconsistent migration state, exiting");
|
||||
|
||||
|
@@ -1052,10 +1088,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
|||
* failures but not if the stream might be inconsistent (reported here
|
||||
* as EIO).
|
||||
*/
|
||||
foreach_established_tcp_flow(i, flow, FLOW_MAX) {
|
||||
rc = tcp_flow_migrate_source_ext(fd, i, &flow->tcp);
|
||||
foreach_established_tcp_flow(flow) {
|
||||
rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
|
||||
if (rc) {
|
||||
err("Extended data for flow %u: %s", i, strerror_(-rc));
|
||||
flow_err(flow, "Can't send extended data: %s",
|
||||
strerror_(-rc));
|
||||
|
||||
if (rc == -EIO)
|
||||
die("Inconsistent migration state, exiting");
|
||||
|
@@ -1088,6 +1125,11 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
|
|||
count = ntohl(count);
|
||||
debug("Receiving %u flows", count);
|
||||
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
repair_wait(c);
|
||||
|
||||
if ((rc = flow_migrate_repair_all(c, true)))
|
||||
return -rc;
|
||||
|
||||
|
@@ -1097,8 +1139,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
|
|||
for (i = 0; i < count; i++) {
|
||||
rc = tcp_flow_migrate_target(c, fd);
|
||||
if (rc) {
|
||||
debug("Migration data failure at flow %u: %s, abort",
|
||||
i, strerror_(-rc));
|
||||
flow_dbg(FLOW(i), "Migration data failure, abort: %s",
|
||||
strerror_(-rc));
|
||||
return -rc;
|
||||
}
|
||||
}
|
||||
|
@@ -1106,10 +1148,10 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
|
|||
repair_flush(c);
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
rc = tcp_flow_migrate_target_ext(c, flowtab + i, fd);
|
||||
rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
|
||||
if (rc) {
|
||||
debug("Migration data failure at flow %u: %s, abort",
|
||||
i, strerror_(-rc));
|
||||
flow_dbg(FLOW(i), "Migration data failure, abort: %s",
|
||||
strerror_(-rc));
|
||||
return -rc;
|
||||
}
|
||||
}
|
||||
|
|
flow.h | 21 lines changed
|
@@ -243,7 +243,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
|
|||
const void *eaddr, const void *oaddr,
|
||||
in_port_t eport, in_port_t oport);
|
||||
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||
const void *esa, in_port_t oport);
|
||||
const void *esa,
|
||||
const union inany_addr *oaddr, in_port_t oport);
|
||||
|
||||
union flow;
|
||||
|
||||
|
@@ -258,11 +259,11 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
|||
int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
|
||||
int fd);
|
||||
|
||||
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
||||
__attribute__((format(printf, 3, 4)));
|
||||
|
||||
#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, (pri), __VA_ARGS__)
|
||||
void flow_log_(const struct flow_common *f, bool newline, int pri,
|
||||
const char *fmt, ...)
|
||||
__attribute__((format(printf, 4, 5)));
|
||||
|
||||
#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, true, (pri), __VA_ARGS__)
|
||||
#define flow_dbg(f, ...) flow_log((f), LOG_DEBUG, __VA_ARGS__)
|
||||
#define flow_err(f, ...) flow_log((f), LOG_ERR, __VA_ARGS__)
|
||||
|
||||
|
@@ -272,6 +273,16 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
|||
flow_dbg((f), __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define flow_log_perror_(f, pri, ...) \
|
||||
do { \
|
||||
int errno_ = errno; \
|
||||
flow_log_((f), false, (pri), __VA_ARGS__); \
|
||||
logmsg(true, true, (pri), ": %s", strerror_(errno_)); \
|
||||
} while (0)
|
||||
|
||||
#define flow_dbg_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_DEBUG, __VA_ARGS__)
|
||||
#define flow_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_ERR, __VA_ARGS__)
|
||||
|
||||
void flow_log_details_(const struct flow_common *f, int pri,
|
||||
enum flow_state state);
|
||||
#define flow_log_details(f_, pri) \
|
||||
|
|
flow_table.h | 38 lines changed
|
@@ -50,6 +50,42 @@ extern union flow flowtab[];
|
|||
#define flow_foreach_sidei(sidei_) \
|
||||
for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++)
|
||||
|
||||
|
||||
/**
|
||||
* flow_foreach_slot() - Step through each flow table entry
|
||||
* @flow: Takes values of pointer to each flow table entry
|
||||
*
|
||||
* Includes FREE slots.
|
||||
*/
|
||||
#define flow_foreach_slot(flow) \
|
||||
for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++)
|
||||
|
||||
/**
|
||||
* flow_foreach() - Step through each active flow
|
||||
* @flow: Takes values of pointer to each active flow
|
||||
*/
|
||||
#define flow_foreach(flow) \
|
||||
flow_foreach_slot((flow)) \
|
||||
if ((flow)->f.state == FLOW_STATE_FREE) \
|
||||
(flow) += (flow)->free.n - 1; \
|
||||
else if ((flow)->f.state != FLOW_STATE_ACTIVE) { \
|
||||
flow_err((flow), "Bad flow state during traversal"); \
|
||||
continue; \
|
||||
} else
|
||||
|
||||
/**
|
||||
* flow_foreach_of_type() - Step through each active flow of given type
|
||||
* @flow: Takes values of pointer to each flow
|
||||
* @type_: Type of flow to traverse
|
||||
*/
|
||||
#define flow_foreach_of_type(flow, type_) \
|
||||
flow_foreach((flow)) \
|
||||
if ((flow)->f.type != (type_)) \
|
||||
/* NOLINTNEXTLINE(bugprone-branch-clone) */ \
|
||||
continue; \
|
||||
else
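A short usage sketch for the new traversal macros; the helper below is hypothetical and only assumes flow_table.h plus the existing FLOW_TCP flow type:

/* Hypothetical helper: count active TCP flows via the new macros */
static unsigned flow_count_tcp(void)
{
	union flow *flow;
	unsigned count = 0;

	flow_foreach_of_type(flow, FLOW_TCP)
		count++;

	return count;
}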
|
||||
|
||||
|
||||
/** flow_idx() - Index of flow from common structure
|
||||
* @f: Common flow fields pointer
|
||||
*
|
||||
|
@@ -57,6 +93,7 @@ extern union flow flowtab[];
|
|||
*/
|
||||
static inline unsigned flow_idx(const struct flow_common *f)
|
||||
{
|
||||
/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
|
||||
return (union flow *)f - flowtab;
|
||||
}
|
||||
|
||||
|
@@ -163,6 +200,7 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
|
|||
const void *daddr, in_port_t dport);
|
||||
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
||||
const union sockaddr_inany *ssa,
|
||||
const union inany_addr *daddr,
|
||||
in_port_t dport);
|
||||
const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
|
||||
sa_family_t af,
|
||||
|
|
fwd.c | 89 lines changed
|
@@ -323,6 +323,30 @@ static bool fwd_guest_accessible(const struct ctx *c,
|
|||
return fwd_guest_accessible6(c, &addr->a6);
|
||||
}
|
||||
|
||||
/**
|
||||
* nat_outbound() - Apply address translation for outbound (TAP to HOST)
|
||||
* @c: Execution context
|
||||
* @addr: Input address (as seen on TAP interface)
|
||||
* @translated: Output address (as seen on HOST interface)
|
||||
*
|
||||
* Only handles translations that depend *only* on the address. Anything
|
||||
* related to specific ports or flows is handled elsewhere.
|
||||
*/
|
||||
static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
|
||||
union inany_addr *translated)
|
||||
{
|
||||
if (inany_equals4(addr, &c->ip4.map_host_loopback))
|
||||
*translated = inany_loopback4;
|
||||
else if (inany_equals6(addr, &c->ip6.map_host_loopback))
|
||||
*translated = inany_loopback6;
|
||||
else if (inany_equals4(addr, &c->ip4.map_guest_addr))
|
||||
*translated = inany_from_v4(c->ip4.addr);
|
||||
else if (inany_equals6(addr, &c->ip6.map_guest_addr))
|
||||
translated->a6 = c->ip6.addr;
|
||||
else
|
||||
*translated = *addr;
|
||||
}
|
||||
|
||||
/**
|
||||
* fwd_nat_from_tap() - Determine to forward a flow from the tap interface
|
||||
* @c: Execution context
|
||||
|
@@ -342,16 +366,8 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
|
|||
else if (is_dns_flow(proto, ini) &&
|
||||
inany_equals6(&ini->oaddr, &c->ip6.dns_match))
|
||||
tgt->eaddr.a6 = c->ip6.dns_host;
|
||||
else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
|
||||
tgt->eaddr = inany_loopback4;
|
||||
else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
|
||||
tgt->eaddr = inany_loopback6;
|
||||
else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
|
||||
tgt->eaddr = inany_from_v4(c->ip4.addr);
|
||||
else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
|
||||
tgt->eaddr.a6 = c->ip6.addr;
|
||||
else
|
||||
tgt->eaddr = ini->oaddr;
|
||||
nat_outbound(c, &ini->oaddr, &tgt->eaddr);
|
||||
|
||||
tgt->eport = ini->oport;
|
||||
|
||||
|
@@ -402,7 +418,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
|||
else
|
||||
tgt->eaddr = inany_loopback6;
|
||||
|
||||
/* Preserve the specific loopback adddress used, but let the kernel pick
|
||||
/* Preserve the specific loopback address used, but let the kernel pick
|
||||
* a source port on the target side
|
||||
*/
|
||||
tgt->oaddr = ini->eaddr;
|
||||
|
@@ -423,6 +439,42 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
|||
return PIF_HOST;
|
||||
}
|
||||
|
||||
/**
|
||||
* nat_inbound() - Apply address translation for inbound (HOST to TAP)
|
||||
* @c: Execution context
|
||||
* @addr: Input address (as seen on HOST interface)
|
||||
* @translated: Output address (as seen on TAP interface)
|
||||
*
|
||||
* Return: true on success, false if it couldn't translate the address
|
||||
*
|
||||
* Only handles translations that depend *only* on the address. Anything
|
||||
* related to specific ports or flows is handled elsewhere.
|
||||
*/
|
||||
bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
|
||||
union inany_addr *translated)
|
||||
{
|
||||
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
|
||||
inany_equals4(addr, &in4addr_loopback)) {
|
||||
/* Specifically 127.0.0.1, not 127.0.0.0/8 */
|
||||
*translated = inany_from_v4(c->ip4.map_host_loopback);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
|
||||
inany_equals6(addr, &in6addr_loopback)) {
|
||||
translated->a6 = c->ip6.map_host_loopback;
|
||||
} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
|
||||
inany_equals4(addr, &c->ip4.addr)) {
|
||||
*translated = inany_from_v4(c->ip4.map_guest_addr);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
|
||||
inany_equals6(addr, &c->ip6.addr)) {
|
||||
translated->a6 = c->ip6.map_guest_addr;
|
||||
} else if (fwd_guest_accessible(c, addr)) {
|
||||
*translated = *addr;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
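A hedged sketch of how the two helpers pair up for the host-loopback mapping; it would have to live in fwd.c, since nat_outbound() is static there, it assumes map_host_loopback is configured, and it uses ASSERT() and the inany helpers from the rest of the tree:

/* Illustration only: the guest-visible map_host_loopback address translates
 * to 127.0.0.1 on the way out, and 127.0.0.1 translates back on the way in.
 */
static void nat_loopback_roundtrip(const struct ctx *c)
{
	union inany_addr guest_view, host_view, back;

	guest_view = inany_from_v4(c->ip4.map_host_loopback);

	nat_outbound(c, &guest_view, &host_view);	/* -> 127.0.0.1 */

	if (nat_inbound(c, &host_view, &back))
		ASSERT(inany_equals4(&back, &c->ip4.map_host_loopback));
}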
|
||||
|
||||
/**
|
||||
* fwd_nat_from_host() - Determine to forward a flow from the host interface
|
||||
* @c: Execution context
|
||||
|
@@ -479,20 +531,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
|
|||
return PIF_SPLICE;
|
||||
}
|
||||
|
||||
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
|
||||
inany_equals4(&ini->eaddr, &in4addr_loopback)) {
|
||||
/* Specifically 127.0.0.1, not 127.0.0.0/8 */
|
||||
tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
|
||||
inany_equals6(&ini->eaddr, &in6addr_loopback)) {
|
||||
tgt->oaddr.a6 = c->ip6.map_host_loopback;
|
||||
} else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
|
||||
inany_equals4(&ini->eaddr, &c->ip4.addr)) {
|
||||
tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
|
||||
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
|
||||
inany_equals6(&ini->eaddr, &c->ip6.addr)) {
|
||||
tgt->oaddr.a6 = c->ip6.map_guest_addr;
|
||||
} else if (!fwd_guest_accessible(c, &ini->eaddr)) {
|
||||
if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) {
|
||||
if (inany_v4(&ini->eaddr)) {
|
||||
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
|
||||
/* No source address we can use */
|
||||
|
@@ -501,8 +540,6 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
|
|||
} else {
|
||||
tgt->oaddr.a6 = c->ip6.our_tap_ll;
|
||||
}
|
||||
} else {
|
||||
tgt->oaddr = ini->eaddr;
|
||||
}
|
||||
tgt->oport = ini->eport;
|
||||
|
||||
|
|
fwd.h | 3 lines changed
|
@@ -7,6 +7,7 @@
|
|||
#ifndef FWD_H
|
||||
#define FWD_H
|
||||
|
||||
union inany_addr;
|
||||
struct flowside;
|
||||
|
||||
/* Number of ports for both TCP and UDP */
|
||||
|
@@ -47,6 +48,8 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
|
|||
const struct fwd_ports *tcp_rev);
|
||||
void fwd_scan_ports_init(struct ctx *c);
|
||||
|
||||
bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
|
||||
union inany_addr *translated);
|
||||
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
|
||||
const struct flowside *ini, struct flowside *tgt);
|
||||
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
|
||||
|
|
icmp.c | 5 lines changed
|
@@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
|
|||
|
||||
n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
|
||||
if (n < 0) {
|
||||
flow_err(pingf, "recvfrom() error: %s", strerror_(errno));
|
||||
flow_perror(pingf, "recvfrom() error");
|
||||
return;
|
||||
}
|
||||
|
||||
|
@@ -300,8 +300,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
|||
|
||||
pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
|
||||
if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
|
||||
flow_dbg(pingf, "failed to relay request to socket: %s",
|
||||
strerror_(errno));
|
||||
flow_dbg_perror(pingf, "failed to relay request to socket");
|
||||
} else {
|
||||
flow_dbg(pingf,
|
||||
"echo request to socket, ID: %"PRIu16", seq: %"PRIu16,
|
||||
|
|
inany.h | 29 lines changed
|
@@ -237,23 +237,30 @@ static inline void inany_from_af(union inany_addr *aa,
|
|||
}
|
||||
|
||||
/** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr
|
||||
* @aa: Pointer to store IPv[46] address
|
||||
* @dst: Pointer to store IPv[46] address (output)
|
||||
* @port: Pointer to store port number, host order
|
||||
* @addr: AF_INET or AF_INET6 socket address
|
||||
* @addr: Socket address
|
||||
*
|
||||
* Return: 0 on success, -1 on error (bad address family)
|
||||
*/
|
||||
static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
|
||||
const union sockaddr_inany *sa)
|
||||
static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port,
|
||||
const void *addr)
|
||||
{
|
||||
const union sockaddr_inany *sa = (const union sockaddr_inany *)addr;
|
||||
|
||||
if (sa->sa_family == AF_INET6) {
|
||||
inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr);
|
||||
inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr);
|
||||
*port = ntohs(sa->sa6.sin6_port);
|
||||
} else if (sa->sa_family == AF_INET) {
|
||||
inany_from_af(aa, AF_INET, &sa->sa4.sin_addr);
|
||||
*port = ntohs(sa->sa4.sin_port);
|
||||
} else {
|
||||
/* Not valid to call with other address families */
|
||||
ASSERT(0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (sa->sa_family == AF_INET) {
|
||||
inany_from_af(dst, AF_INET, &sa->sa4.sin_addr);
|
||||
*port = ntohs(sa->sa4.sin_port);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
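With the changed return contract, a caller can now reject unexpected address families instead of tripping the old ASSERT(). A hypothetical example, assuming the tree's debug() logger and errno.h:

/* Hypothetical caller, e.g. after recvfrom() filled @sa */
static int peer_from_sockaddr(union inany_addr *addr, in_port_t *port,
			      const union sockaddr_inany *sa)
{
	if (inany_from_sockaddr(addr, port, sa) < 0) {
		debug("Unsupported address family %i", sa->sa_family);
		return -EAFNOSUPPORT;
	}

	return 0;
}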
|
||||
|
||||
/** inany_siphash_feed- Fold IPv[46] address into an in-progress siphash
|
||||
|
|
iov.c | 16 lines changed
|
@@ -26,7 +26,8 @@
|
|||
#include "iov.h"
|
||||
|
||||
|
||||
/* iov_skip_bytes() - Skip leading bytes of an IO vector
|
||||
/**
|
||||
* iov_skip_bytes() - Skip leading bytes of an IO vector
|
||||
* @iov: IO vector
|
||||
* @n: Number of entries in @iov
|
||||
* @skip: Number of leading bytes of @iov to skip
|
||||
|
@@ -56,8 +57,8 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
|
|||
}
|
||||
|
||||
/**
|
||||
* iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec)
|
||||
* efficiently.
|
||||
* iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec)
|
||||
* efficiently.
|
||||
*
|
||||
* @iov: Pointer to the array of struct iovec describing the
|
||||
* scatter/gather I/O vector.
|
||||
|
@@ -96,8 +97,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
|
|||
}
|
||||
|
||||
/**
|
||||
* iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to
|
||||
* a buffer efficiently.
|
||||
* iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to
|
||||
* a buffer efficiently.
|
||||
*
|
||||
* @iov: Pointer to the array of struct iovec describing the scatter/gather
|
||||
* I/O vector.
|
||||
|
@@ -136,8 +137,8 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
|
|||
}
|
||||
|
||||
/**
|
||||
* iov_size - Calculate the total size of a scatter/gather I/O vector
|
||||
* (struct iovec).
|
||||
* iov_size() - Calculate the total size of a scatter/gather I/O vector
|
||||
* (struct iovec).
|
||||
*
|
||||
* @iov: Pointer to the array of struct iovec describing the
|
||||
* scatter/gather I/O vector.
|
||||
|
@@ -203,6 +204,7 @@ size_t iov_tail_size(struct iov_tail *tail)
|
|||
* overruns the IO vector, is not contiguous or doesn't have the
|
||||
* requested alignment.
|
||||
*/
|
||||
/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
|
||||
void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
|
||||
{
|
||||
char *p;
|
||||
|
|
ip.h | 36 lines changed
|
@@ -36,13 +36,14 @@
|
|||
.tos = 0, \
|
||||
.tot_len = 0, \
|
||||
.id = 0, \
|
||||
.frag_off = 0, \
|
||||
.frag_off = htons(IP_DF), \
|
||||
.ttl = 0xff, \
|
||||
.protocol = (proto), \
|
||||
.saddr = 0, \
|
||||
.daddr = 0, \
|
||||
}
|
||||
#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \
|
||||
(uint32_t)htons_constant(IP_DF) + \
|
||||
(uint32_t)htons(0xff00 | (proto)))
|
||||
|
||||
|
||||
|
@@ -90,10 +91,34 @@ struct ipv6_opt_hdr {
|
|||
*/
|
||||
} __attribute__((packed)); /* required for some archs */
|
||||
|
||||
/**
|
||||
* ip6_set_flow_lbl() - Set flow label in an IPv6 header
|
||||
* @ip6h: Pointer to IPv6 header, updated
|
||||
* @flow: Set @ip6h flow label to the low 20 bits of this integer
|
||||
*/
|
||||
static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow)
|
||||
{
|
||||
ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
|
||||
ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
|
||||
ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
|
||||
}
|
||||
|
||||
/** ip6_get_flow_lbl() - Get flow label from an IPv6 header
|
||||
* @ip6h: Pointer to IPv6 header
|
||||
*
|
||||
* Return: flow label from @ip6h as an integer (<= 20 bits)
|
||||
*/
|
||||
static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
|
||||
{
|
||||
return (ip6h->flow_lbl[0] & 0xf) << 16 |
|
||||
ip6h->flow_lbl[1] << 8 |
|
||||
ip6h->flow_lbl[2];
|
||||
}
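A standalone round-trip check of the 20-bit packing used by the two helpers, with a bare uint8_t[3] standing in for the flow_lbl field of struct ipv6hdr:

#include <assert.h>
#include <stdint.h>

struct hdr { uint8_t flow_lbl[3]; };	/* stand-in for struct ipv6hdr */

static void set_lbl(struct hdr *h, uint32_t flow)
{
	h->flow_lbl[0] = (flow >> 16) & 0xf;
	h->flow_lbl[1] = (flow >> 8) & 0xff;
	h->flow_lbl[2] = flow & 0xff;
}

static uint32_t get_lbl(const struct hdr *h)
{
	return (h->flow_lbl[0] & 0xf) << 16 | h->flow_lbl[1] << 8 |
	       h->flow_lbl[2];
}

int main(void)
{
	struct hdr h;

	set_lbl(&h, 0xabcde);		/* any 20-bit value survives */
	assert(get_lbl(&h) == 0xabcde);
	set_lbl(&h, 0xfabcde);		/* bits above 20 are dropped */
	assert(get_lbl(&h) == 0xabcde);
	return 0;
}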
|
||||
|
||||
char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
|
||||
size_t *dlen);
|
||||
|
||||
/* IPv6 link-local all-nodes multicast adddress, ff02::1 */
|
||||
/* IPv6 link-local all-nodes multicast address, ff02::1 */
|
||||
static const struct in6_addr in6addr_ll_all_nodes = {
|
||||
.s6_addr = {
|
||||
0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
|
@@ -104,4 +129,11 @@ static const struct in6_addr in6addr_ll_all_nodes = {
|
|||
/* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */
|
||||
static const struct in_addr in4addr_broadcast = { 0xffffffff };
|
||||
|
||||
#ifndef IPV4_MIN_MTU
|
||||
#define IPV4_MIN_MTU 68
|
||||
#endif
|
||||
#ifndef IPV6_MIN_MTU
|
||||
#define IPV6_MIN_MTU 1280
|
||||
#endif
|
||||
|
||||
#endif /* IP_H */
|
||||
isolation.c
@@ -129,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep)
|
|||
* additional layer of protection. Executing this requires
|
||||
* CAP_SETPCAP, which we will have within our userns.
|
||||
*
|
||||
* Note that dropping capabilites from the bounding set limits
|
||||
* Note that dropping capabilities from the bounding set limits
|
||||
* exec()ed processes, but does not remove them from the effective or
|
||||
* permitted sets, so it doesn't reduce our own capabilities.
|
||||
*/
|
||||
|
@@ -174,8 +174,8 @@ static void clamp_caps(void)
|
|||
* Should:
|
||||
* - drop unneeded capabilities
|
||||
* - close all open files except for standard streams and the one from --fd
|
||||
* Musn't:
|
||||
* - remove filesytem access (we need to access files during setup)
|
||||
* Mustn't:
|
||||
* - remove filesystem access (we need to access files during setup)
|
||||
*/
|
||||
void isolate_initial(int argc, char **argv)
|
||||
{
|
||||
|
@@ -194,7 +194,7 @@ void isolate_initial(int argc, char **argv)
|
|||
*
|
||||
* It's debatable whether it's useful to drop caps when we
|
||||
* retain SETUID and SYS_ADMIN, but we might as well. We drop
|
||||
* further capabilites in isolate_user() and
|
||||
* further capabilities in isolate_user() and
|
||||
* isolate_prefork().
|
||||
*/
|
||||
keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
|
||||
|
|
log.c | 53 lines changed
|
@@ -56,7 +56,7 @@ bool log_stderr = true; /* Not daemonised, no shell spawned */
|
|||
*
|
||||
* Return: pointer to @now, or NULL if there was an error retrieving the time
|
||||
*/
|
||||
const struct timespec *logtime(struct timespec *ts)
|
||||
static const struct timespec *logtime(struct timespec *ts)
|
||||
{
|
||||
if (clock_gettime(CLOCK_MONOTONIC, ts))
|
||||
return NULL;
|
||||
|
@@ -249,6 +249,30 @@ static void logfile_write(bool newline, bool cont, int pri,
|
|||
log_written += n;
|
||||
}
|
||||
|
||||
/**
|
||||
* passt_vsyslog() - vsyslog() implementation not using heap memory
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @format: Same as vsyslog() format
|
||||
* @ap: Same as vsyslog() ap
|
||||
*/
|
||||
static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
|
||||
{
|
||||
char buf[BUFSIZ];
|
||||
int n;
|
||||
|
||||
/* Send without timestamp, the system logger should add it */
|
||||
n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
|
||||
|
||||
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
||||
|
||||
if (newline && format[strlen(format)] != '\n')
|
||||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||
|
||||
if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
|
||||
FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
|
||||
}
|
||||
|
||||
/**
|
||||
* vlogmsg() - Print or send messages to log or output files as configured
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
|
@@ -257,6 +281,7 @@ static void logfile_write(bool newline, bool cont, int pri,
|
|||
* @format: Message
|
||||
* @ap: Variable argument list
|
||||
*/
|
||||
/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
|
||||
void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
|
||||
{
|
||||
bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
|
||||
|
@@ -373,35 +398,11 @@ void __setlogmask(int mask)
|
|||
setlogmask(mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* passt_vsyslog() - vsyslog() implementation not using heap memory
|
||||
* @newline: Append newline at the end of the message, if missing
|
||||
* @pri: Facility and level map, same as priority for vsyslog()
|
||||
* @format: Same as vsyslog() format
|
||||
* @ap: Same as vsyslog() ap
|
||||
*/
|
||||
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
|
||||
{
|
||||
char buf[BUFSIZ];
|
||||
int n;
|
||||
|
||||
/* Send without timestamp, the system logger should add it */
|
||||
n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident);
|
||||
|
||||
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
|
||||
|
||||
if (newline && format[strlen(format)] != '\n')
|
||||
n += snprintf(buf + n, BUFSIZ - n, "\n");
|
||||
|
||||
if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr)
|
||||
FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n);
|
||||
}
|
||||
|
||||
/**
|
||||
* logfile_init() - Open log file and write header with PID, version, path
|
||||
* @name: Identifier for header: passt or pasta
|
||||
* @path: Path to log file
|
||||
* @size: Maximum size of log file: log_cut_size is calculatd here
|
||||
* @size: Maximum size of log file: log_cut_size is calculated here
|
||||
*/
|
||||
void logfile_init(const char *name, const char *path, size_t size)
|
||||
{
|
||||
|
|
log.h | 1 line changed
|
@@ -55,7 +55,6 @@ void trace_init(int enable);
|
|||
|
||||
void __openlog(const char *ident, int option, int facility);
|
||||
void logfile_init(const char *name, const char *path, size_t size);
|
||||
void passt_vsyslog(bool newline, int pri, const char *format, va_list ap);
|
||||
void __setlogmask(int mask);
|
||||
|
||||
#endif /* LOG_H */
|
||||
|
|
migrate.c | 10 lines changed
|
@@ -96,8 +96,8 @@ static int seen_addrs_target_v1(struct ctx *c,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Stages for version 1 */
|
||||
static const struct migrate_stage stages_v1[] = {
|
||||
/* Stages for version 2 */
|
||||
static const struct migrate_stage stages_v2[] = {
|
||||
{
|
||||
.name = "observed addresses",
|
||||
.source = seen_addrs_source_v1,
|
||||
|
@@ -118,7 +118,11 @@ static const struct migrate_stage stages_v1[] = {
|
|||
|
||||
/* Supported encoding versions, from latest (most preferred) to oldest */
|
||||
static const struct migrate_version versions[] = {
|
||||
{ 1, stages_v1, },
|
||||
{ 2, stages_v2, },
|
||||
/* v1 was released, but not widely used. It had bad endianness for the
|
||||
* MSS and omitted timestamps, which meant it usually wouldn't work.
|
||||
* Therefore we don't attempt to support compatibility with it.
|
||||
*/
|
||||
{ 0 },
|
||||
};
|
||||
|
||||
|
|
ndp.c | 3 lines changed
|
@@ -256,7 +256,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
|
|||
|
||||
ptr = &ra.var[0];
|
||||
|
||||
if (c->mtu != -1) {
|
||||
if (c->mtu) {
|
||||
struct opt_mtu *mtu = (struct opt_mtu *)ptr;
|
||||
*mtu = (struct opt_mtu) {
|
||||
.header = {
|
||||
|
@@ -328,6 +328,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
|
|||
|
||||
memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
|
||||
|
||||
/* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
|
||||
ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);
|
||||
}
|
||||
|
||||
|
|
|
@@ -355,7 +355,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
|
|||
*
|
||||
* Return: true if a gateway was found, false otherwise
|
||||
*/
|
||||
bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
||||
static bool nl_route_get_def_multipath(struct rtattr *rta, void *gw)
|
||||
{
|
||||
int nh_len = RTA_PAYLOAD(rta);
|
||||
struct rtnexthop *rtnh;
|
||||
|
|
packet.c | 152 lines changed
|
@@ -23,51 +23,73 @@
|
|||
#include "log.h"
|
||||
|
||||
/**
|
||||
* packet_check_range() - Check if a packet memory range is valid
|
||||
* packet_check_range() - Check if a memory range is valid for a pool
|
||||
* @p: Packet pool
|
||||
* @offset: Offset of data range in packet descriptor
|
||||
* @ptr: Start of desired data range
|
||||
* @len: Length of desired data range
|
||||
* @start: Start of the packet descriptor
|
||||
* @func: For tracing: name of calling function
|
||||
* @line: For tracing: caller line of function call
|
||||
*
|
||||
* Return: 0 if the range is valid, -1 otherwise
|
||||
*/
|
||||
static int packet_check_range(const struct pool *p, size_t offset, size_t len,
|
||||
const char *start, const char *func, int line)
|
||||
static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
|
||||
const char *func, int line)
|
||||
{
|
||||
if (len > PACKET_MAX_LEN) {
|
||||
debug("packet range length %zu (max %zu), %s:%i",
|
||||
len, PACKET_MAX_LEN, func, line);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (p->buf_size == 0) {
|
||||
int ret;
|
||||
|
||||
ret = vu_packet_check_range((void *)p->buf, offset, len, start);
|
||||
ret = vu_packet_check_range((void *)p->buf, ptr, len);
|
||||
|
||||
if (ret == -1)
|
||||
trace("cannot find region, %s:%i", func, line);
|
||||
debug("cannot find region, %s:%i", func, line);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (start < p->buf) {
|
||||
trace("packet start %p before buffer start %p, "
|
||||
"%s:%i", (void *)start, (void *)p->buf, func, line);
|
||||
if (ptr < p->buf) {
|
||||
debug("packet range start %p before buffer start %p, %s:%i",
|
||||
(void *)ptr, (void *)p->buf, func, line);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (start + len + offset > p->buf + p->buf_size) {
|
||||
trace("packet offset plus length %zu from size %zu, "
|
||||
"%s:%i", start - p->buf + len + offset,
|
||||
p->buf_size, func, line);
|
||||
if (len > p->buf_size) {
|
||||
debug("packet range length %zu larger than buffer %zu, %s:%i",
|
||||
len, p->buf_size, func, line);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ((size_t)(ptr - p->buf) > p->buf_size - len) {
|
||||
debug("packet range %p, len %zu after buffer end %p, %s:%i",
|
||||
(void *)ptr, len, (void *)(p->buf + p->buf_size),
|
||||
func, line);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
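The rewritten checks are ordered so the arithmetic can't wrap: len is bounded by buf_size first, so buf_size - len can't underflow, and the start is then compared as an unsigned offset into the buffer. The same pattern as a standalone sketch:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Same shape as the checks above: true if [ptr, ptr + len) lies in buf */
static bool range_ok(const char *buf, size_t buf_size,
		     const char *ptr, size_t len)
{
	if (ptr < buf)
		return false;
	if (len > buf_size)
		return false;
	/* Safe: no pointer is ever formed past buf + buf_size */
	return (size_t)(ptr - buf) <= buf_size - len;
}

int main(void)
{
	char buf[64];

	assert(range_ok(buf, sizeof(buf), buf, 64));
	assert(!range_ok(buf, sizeof(buf), buf + 1, 64));	/* one past end */
	assert(!range_ok(buf, sizeof(buf), buf, 65));		/* too long */
	return 0;
}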
|
||||
/**
|
||||
* pool_full() - Is a packet pool full?
|
||||
* @p: Pointer to packet pool
|
||||
*
|
||||
* Return: true if the pool is full, false if more packets can be added
|
||||
*/
|
||||
bool pool_full(const struct pool *p)
|
||||
{
|
||||
return p->count >= p->size;
|
||||
}
|
||||
|
||||
/**
|
||||
* packet_add_do() - Add data as packet descriptor to given pool
|
||||
* @p: Existing pool
|
||||
* @len: Length of new descriptor
|
||||
* @start: Start of data
|
||||
* @func: For tracing: name of calling function, NULL means no trace()
|
||||
* @func: For tracing: name of calling function
|
||||
* @line: For tracing: caller line of function call
|
||||
*/
|
||||
void packet_add_do(struct pool *p, size_t len, const char *start,
|
||||
|
@@ -75,26 +97,63 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
|
|||
{
|
||||
size_t idx = p->count;
|
||||
|
||||
if (idx >= p->size) {
|
||||
trace("add packet index %zu to pool with size %zu, %s:%i",
|
||||
if (pool_full(p)) {
|
||||
debug("add packet index %zu to pool with size %zu, %s:%i",
|
||||
idx, p->size, func, line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (packet_check_range(p, 0, len, start, func, line))
|
||||
if (packet_check_range(p, start, len, func, line))
|
||||
return;
|
||||
|
||||
if (len > UINT16_MAX) {
|
||||
trace("add packet length %zu, %s:%i", len, func, line);
|
||||
return;
|
||||
}
|
||||
|
||||
p->pkt[idx].iov_base = (void *)start;
|
||||
p->pkt[idx].iov_len = len;
|
||||
|
||||
p->count++;
|
||||
}
|
||||
|
||||
/**
|
||||
* packet_get_try_do() - Get data range from packet descriptor from given pool
|
||||
* @p: Packet pool
|
||||
* @idx: Index of packet descriptor in pool
|
||||
* @offset: Offset of data range in packet descriptor
|
||||
* @len: Length of desired data range
|
||||
* @left: Length of available data after range, set on return, can be NULL
|
||||
* @func: For tracing: name of calling function
|
||||
* @line: For tracing: caller line of function call
|
||||
*
|
||||
* Return: pointer to start of data range, NULL on invalid range or descriptor
|
||||
*/
|
||||
void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
|
||||
size_t len, size_t *left, const char *func, int line)
|
||||
{
|
||||
char *ptr;
|
||||
|
||||
ASSERT_WITH_MSG(p->count <= p->size,
|
||||
"Corrupt pool count: %zu, size: %zu, %s:%i",
|
||||
p->count, p->size, func, line);
|
||||
|
||||
if (idx >= p->count) {
|
||||
debug("packet %zu from pool count: %zu, %s:%i",
|
||||
idx, p->count, func, line);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (offset > p->pkt[idx].iov_len ||
|
||||
len > (p->pkt[idx].iov_len - offset))
|
||||
return NULL;
|
||||
|
||||
ptr = (char *)p->pkt[idx].iov_base + offset;
|
||||
|
||||
ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line),
|
||||
"Corrupt packet pool, %s:%i", func, line);
|
||||
|
||||
if (left)
|
||||
*left = p->pkt[idx].iov_len - offset - len;
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* packet_get_do() - Get data range from packet descriptor from given pool
|
||||
* @p: Packet pool
|
||||
|
@@ -102,47 +161,24 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
|
|||
* @offset: Offset of data range in packet descriptor
|
||||
* @len: Length of desired data range
|
||||
* @left: Length of available data after range, set on return, can be NULL
|
||||
* @func: For tracing: name of calling function, NULL means no trace()
|
||||
* @func: For tracing: name of calling function
|
||||
* @line: For tracing: caller line of function call
|
||||
*
|
||||
* Return: pointer to start of data range, NULL on invalid range or descriptor
|
||||
* Return: as packet_get_try_do() but log a trace message when returning NULL
|
||||
*/
|
||||
void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
|
||||
size_t len, size_t *left, const char *func, int line)
|
||||
void *packet_get_do(const struct pool *p, const size_t idx,
|
||||
size_t offset, size_t len, size_t *left,
|
||||
const char *func, int line)
|
||||
{
|
||||
if (idx >= p->size || idx >= p->count) {
|
||||
if (func) {
|
||||
trace("packet %zu from pool size: %zu, count: %zu, "
|
||||
"%s:%i", idx, p->size, p->count, func, line);
|
||||
}
|
||||
return NULL;
|
||||
void *r = packet_get_try_do(p, idx, offset, len, left, func, line);
|
||||
|
||||
if (!r) {
|
||||
trace("missing packet data length %zu, offset %zu from "
|
||||
"length %zu, %s:%i",
|
||||
len, offset, p->pkt[idx].iov_len, func, line);
|
||||
}
|
||||
|
||||
if (len > UINT16_MAX) {
|
||||
if (func) {
|
||||
trace("packet data length %zu, %s:%i",
|
||||
len, func, line);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (len + offset > p->pkt[idx].iov_len) {
|
||||
if (func) {
|
||||
trace("data length %zu, offset %zu from length %zu, "
|
||||
"%s:%i", len, offset, p->pkt[idx].iov_len,
|
||||
func, line);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (packet_check_range(p, offset, len, p->pkt[idx].iov_base,
|
||||
func, line))
|
||||
return NULL;
|
||||
|
||||
if (left)
|
||||
*left = p->pkt[idx].iov_len - offset - len;
|
||||
|
||||
return (char *)p->pkt[idx].iov_base + offset;
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
packet.h | 19 lines changed
|
@@ -6,6 +6,11 @@
|
|||
#ifndef PACKET_H
|
||||
#define PACKET_H
|
||||
|
||||
#include <stdbool.h>
|
||||
|
||||
/* Maximum size of a single packet stored in pool, including headers */
|
||||
#define PACKET_MAX_LEN ((size_t)UINT16_MAX)
|
||||
|
||||
/**
|
||||
* struct pool - Generic pool of packets stored in a buffer
|
||||
* @buf: Buffer storing packet descriptors,
|
||||
|
@@ -21,27 +26,29 @@ struct pool {
|
|||
size_t buf_size;
|
||||
size_t size;
|
||||
size_t count;
|
||||
struct iovec pkt[1];
|
||||
struct iovec pkt[];
|
||||
};
|
||||
|
||||
int vu_packet_check_range(void *buf, size_t offset, size_t len,
|
||||
const char *start);
|
||||
int vu_packet_check_range(void *buf, const char *ptr, size_t len);
|
||||
void packet_add_do(struct pool *p, size_t len, const char *start,
|
||||
const char *func, int line);
|
||||
void *packet_get_try_do(const struct pool *p, const size_t idx,
|
||||
size_t offset, size_t len, size_t *left,
|
||||
const char *func, int line);
|
||||
void *packet_get_do(const struct pool *p, const size_t idx,
|
||||
size_t offset, size_t len, size_t *left,
|
||||
const char *func, int line);
|
||||
bool pool_full(const struct pool *p);
|
||||
void pool_flush(struct pool *p);
|
||||
|
||||
#define packet_add(p, len, start) \
|
||||
packet_add_do(p, len, start, __func__, __LINE__)
|
||||
|
||||
#define packet_get_try(p, idx, offset, len, left) \
|
||||
packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__)
|
||||
#define packet_get(p, idx, offset, len, left) \
|
||||
packet_get_do(p, idx, offset, len, left, __func__, __LINE__)
|
||||
|
||||
#define packet_get_try(p, idx, offset, len, left) \
|
||||
packet_get_do(p, idx, offset, len, left, NULL, 0)
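A usage sketch for the two accessors, assuming packet.h and a pool already filled via packet_add(); the parsed structures and the function itself are hypothetical:

struct demo_hdr { uint32_t type; uint32_t len; };	/* hypothetical */
struct demo_opt { uint32_t value; };			/* hypothetical */

static void demo_parse(const struct pool *p, size_t idx)
{
	struct demo_hdr *h;
	struct demo_opt *o;

	/* Mandatory header: a missing or short packet is worth a log entry */
	h = packet_get(p, idx, 0, sizeof(*h), NULL);
	if (!h)
		return;

	/* Optional trailer: absence is normal, so use the quiet variant */
	o = packet_get_try(p, idx, sizeof(*h), sizeof(*o), NULL);
	if (!o)
		return;

	/* use h->type, h->len and o->value here */
}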
|
||||
|
||||
#define PACKET_POOL_DECL(_name, _size, _buf) \
|
||||
struct _name ## _t { \
|
||||
char *buf; \
|
||||
passt-repair.1
@@ -16,13 +16,17 @@
|
|||
.B passt-repair
|
||||
is a privileged helper setting and clearing repair mode on TCP sockets on behalf
|
||||
of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain
|
||||
socket, specified by \fIPATH\fR.
|
||||
socket.
|
||||
|
||||
It can be used to migrate TCP connections between guests without granting
|
||||
additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections,
|
||||
\fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR
|
||||
capability (see \fBcapabilities\fR(7)) to be set or cleared.
|
||||
|
||||
If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to
|
||||
connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file
|
||||
ending with \fI.repair\fR appears in it, and then attempts to connect to it.
|
||||
|
||||
.SH PROTOCOL
|
||||
|
||||
\fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via
|
||||
|
|
passt-repair.c | 103 lines changed
|
@@ -16,11 +16,14 @@
|
|||
* off. Reply by echoing the command. Exit on EOF.
|
||||
*/
|
||||
|
||||
#include <sys/inotify.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/un.h>
|
||||
#include <errno.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
@@ -39,6 +42,8 @@
|
|||
#include "seccomp_repair.h"
|
||||
|
||||
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
|
||||
#define REPAIR_EXT ".repair"
|
||||
#define REPAIR_EXT_LEN strlen(REPAIR_EXT)
|
||||
|
||||
/**
|
||||
* main() - Entry point and whole program with loop
|
||||
|
@@ -51,6 +56,9 @@
|
|||
* #syscalls:repair socket s390x:socketcall i686:socketcall
|
||||
* #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
|
||||
* #syscalls:repair sendto sendmsg arm:send ppc64le:send
|
||||
* #syscalls:repair stat|statx stat64|statx statx
|
||||
* #syscalls:repair fstat|fstat64 newfstatat|fstatat64
|
||||
* #syscalls:repair inotify_init1 inotify_add_watch
|
||||
*/
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
|
@@ -58,12 +66,14 @@ int main(int argc, char **argv)
|
|||
__attribute__ ((aligned(__alignof__(struct cmsghdr))));
|
||||
struct sockaddr_un a = { AF_UNIX, "" };
|
||||
int fds[SCM_MAX_FD], s, ret, i, n = 0;
|
||||
bool inotify_dir = false;
|
||||
struct sock_fprog prog;
|
||||
int8_t cmd = INT8_MAX;
|
||||
struct cmsghdr *cmsg;
|
||||
struct msghdr msg;
|
||||
struct iovec iov;
|
||||
size_t cmsg_len;
|
||||
struct stat sb;
|
||||
int op;
|
||||
|
||||
prctl(PR_SET_DUMPABLE, 0);
|
||||
|
@@ -90,19 +100,96 @@ int main(int argc, char **argv)
|
|||
_exit(2);
|
||||
}
|
||||
|
||||
ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
|
||||
if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
|
||||
fprintf(stderr, "Invalid socket path: %s\n", argv[1]);
|
||||
_exit(2);
|
||||
}
|
||||
|
||||
if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
|
||||
fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
if (connect(s, (struct sockaddr *)&a, sizeof(a))) {
|
||||
fprintf(stderr, "Failed to connect to %s: %s\n", argv[1],
|
||||
if ((stat(argv[1], &sb))) {
|
||||
fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
if ((sb.st_mode & S_IFMT) == S_IFDIR) {
|
||||
char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
|
||||
__attribute__ ((aligned(__alignof__(struct inotify_event))));
|
||||
const struct inotify_event *ev = NULL;
|
||||
char path[PATH_MAX + 1];
|
||||
bool found = false;
|
||||
ssize_t n;
|
||||
int fd;
|
||||
|
||||
if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
|
||||
fprintf(stderr, "inotify_init1: %i\n", errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) {
|
||||
fprintf(stderr, "inotify_add_watch: %i\n", errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
do {
|
||||
char *p;
|
||||
|
||||
n = read(fd, buf, sizeof(buf));
|
||||
if (n < 0) {
|
||||
fprintf(stderr, "inotify read: %i", errno);
|
||||
_exit(1);
|
||||
}
|
||||
buf[n - 1] = '\0';
|
||||
|
||||
if (n < (ssize_t)sizeof(*ev)) {
|
||||
fprintf(stderr, "Short inotify read: %zi", n);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
|
||||
ev = (const struct inotify_event *)p;
|
||||
|
||||
if (ev->len >= REPAIR_EXT_LEN &&
|
||||
!memcmp(ev->name +
|
||||
strnlen(ev->name, ev->len) -
|
||||
REPAIR_EXT_LEN,
|
||||
REPAIR_EXT, REPAIR_EXT_LEN)) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (!found);
|
||||
|
||||
if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
|
||||
fprintf(stderr, "Invalid filename from inotify\n");
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);
|
||||
if ((stat(path, &sb))) {
|
||||
fprintf(stderr, "Can't stat() %s: %i\n", path, errno);
|
||||
_exit(1);
|
||||
}
|
||||
|
||||
ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path);
|
||||
inotify_dir = true;
|
||||
} else {
|
||||
ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
|
||||
}
|
||||
|
||||
if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
|
||||
fprintf(stderr, "Invalid socket path");
|
||||
_exit(2);
|
||||
}
|
||||
|
||||
if ((sb.st_mode & S_IFMT) != S_IFSOCK) {
|
||||
fprintf(stderr, "%s is not a socket\n", a.sun_path);
|
||||
_exit(2);
|
||||
}
|
||||
|
||||
while (connect(s, (struct sockaddr *)&a, sizeof(a))) {
|
||||
if (inotify_dir && errno == ECONNREFUSED)
|
||||
continue;
|
||||
|
||||
fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path,
|
||||
strerror(errno));
|
||||
_exit(1);
|
||||
}
|
||||
|
|
passt.c | 24 lines changed
|
@@ -68,7 +68,7 @@ char *epoll_type_str[] = {
|
|||
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
|
||||
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
|
||||
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
|
||||
[EPOLL_TYPE_UDP_REPLY] = "UDP reply socket",
|
||||
[EPOLL_TYPE_UDP] = "UDP flow socket",
|
||||
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
|
||||
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
|
||||
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
|
||||
|
@@ -166,7 +166,7 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
|
|||
*
|
||||
* #syscalls exit_group
|
||||
*/
|
||||
void exit_handler(int signal)
|
||||
static void exit_handler(int signal)
|
||||
{
|
||||
(void)signal;
|
||||
|
||||
|
@@ -191,7 +191,6 @@ int main(int argc, char **argv)
|
|||
{
|
||||
struct epoll_event events[EPOLL_EVENTS];
|
||||
int nfds, i, devnull_fd = -1;
|
||||
char argv0[PATH_MAX], *name;
|
||||
struct ctx c = { 0 };
|
||||
struct rlimit limit;
|
||||
struct timespec now;
|
||||
|
@@ -213,27 +212,18 @@ int main(int argc, char **argv)
|
|||
sigaction(SIGTERM, &sa, NULL);
|
||||
sigaction(SIGQUIT, &sa, NULL);
|
||||
|
||||
if (argc < 1)
|
||||
_exit(EXIT_FAILURE);
|
||||
c.mode = conf_mode(argc, argv);
|
||||
|
||||
strncpy(argv0, argv[0], PATH_MAX - 1);
|
||||
name = basename(argv0);
|
||||
if (strstr(name, "pasta")) {
|
||||
if (c.mode == MODE_PASTA) {
|
||||
sa.sa_handler = pasta_child_handler;
|
||||
if (sigaction(SIGCHLD, &sa, NULL))
|
||||
die_perror("Couldn't install signal handlers");
|
||||
|
||||
c.mode = MODE_PASTA;
|
||||
} else if (strstr(name, "passt")) {
|
||||
c.mode = MODE_PASST;
|
||||
} else {
|
||||
_exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
|
||||
die_perror("Couldn't set disposition for SIGPIPE");
|
||||
|
||||
madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
|
||||
madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
|
||||
|
||||
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
|
||||
if (c.epollfd == -1)
|
||||
|
@@ -349,8 +339,8 @@ loop:
|
|||
case EPOLL_TYPE_UDP_LISTEN:
|
||||
udp_listen_sock_handler(&c, ref, eventmask, &now);
|
||||
break;
|
||||
case EPOLL_TYPE_UDP_REPLY:
|
||||
udp_reply_sock_handler(&c, ref, eventmask, &now);
|
||||
case EPOLL_TYPE_UDP:
|
||||
udp_sock_handler(&c, ref, eventmask, &now);
|
||||
break;
|
||||
case EPOLL_TYPE_PING:
|
||||
icmp_sock_handler(&c, ref);
|
||||
|
|
passt.h | 10 lines changed
|
@@ -69,12 +69,9 @@ union epoll_ref {
|
|||
static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
|
||||
"epoll_ref must have same size as epoll_data");
|
||||
|
||||
#define TAP_BUF_BYTES \
|
||||
ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
|
||||
#define TAP_MSGS \
|
||||
DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
|
||||
/* Large enough for ~128 maximum size frames */
|
||||
#define PKT_BUF_BYTES (8UL << 20)
|
||||
|
||||
#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0)
|
||||
extern char pkt_buf [PKT_BUF_BYTES];
|
||||
|
||||
extern char *epoll_type_str[];
|
||||
|
@@ -274,6 +271,8 @@ struct ctx {
|
|||
int fd_repair;
|
||||
unsigned char our_tap_mac[ETH_ALEN];
|
||||
unsigned char guest_mac[ETH_ALEN];
|
||||
uint16_t mtu;
|
||||
|
||||
uint64_t hash_secret[2];
|
||||
|
||||
int ifi4;
|
||||
|
@@ -298,7 +297,6 @@ struct ctx {
|
|||
int no_icmp;
|
||||
struct icmp_ctx icmp;
|
||||
|
||||
int mtu;
|
||||
int no_dns;
|
||||
int no_dns_search;
|
||||
int no_dhcp_dns;
|
||||
|
|
pasta.c | 22 lines changed
|
@@ -319,7 +319,7 @@ void pasta_ns_conf(struct ctx *c)
|
|||
if (c->pasta_conf_ns) {
|
||||
unsigned int flags = IFF_UP;
|
||||
|
||||
if (c->mtu != -1)
|
||||
if (c->mtu)
|
||||
nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);
|
||||
|
||||
if (c->ifi6) /* Avoid duplicate address detection on link up */
|
||||
|
@@ -498,17 +498,23 @@ void pasta_netns_quit_init(const struct ctx *c)
|
|||
*/
|
||||
void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
|
||||
{
|
||||
char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
|
||||
const struct inotify_event *in_ev = (struct inotify_event *)buf;
|
||||
char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
|
||||
__attribute__ ((aligned(__alignof__(struct inotify_event))));
|
||||
const struct inotify_event *ev;
|
||||
ssize_t n;
|
||||
char *p;
|
||||
|
||||
if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev))
|
||||
if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev))
|
||||
return;
|
||||
|
||||
if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base)))
|
||||
return;
|
||||
for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
|
||||
ev = (const struct inotify_event *)p;
|
||||
|
||||
info("Namespace %s is gone, exiting", c->netns_base);
|
||||
_exit(EXIT_SUCCESS);
|
||||
if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) {
|
||||
info("Namespace %s is gone, exiting", c->netns_base);
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
pcap.c | 46 lines changed
|
@@ -33,33 +33,12 @@
|
|||
#include "log.h"
|
||||
#include "pcap.h"
|
||||
#include "iov.h"
|
||||
#include "tap.h"
|
||||
|
||||
#define PCAP_VERSION_MINOR 4
|
||||
|
||||
static int pcap_fd = -1;
|
||||
|
||||
/* See pcap.h from libpcap, or pcap-savefile(5) */
|
||||
static const struct {
|
||||
uint32_t magic;
|
||||
#define PCAP_MAGIC 0xa1b2c3d4
|
||||
|
||||
uint16_t major;
|
||||
#define PCAP_VERSION_MAJOR 2
|
||||
|
||||
uint16_t minor;
|
||||
#define PCAP_VERSION_MINOR 4
|
||||
|
||||
int32_t thiszone;
|
||||
uint32_t sigfigs;
|
||||
uint32_t snaplen;
|
||||
|
||||
uint32_t linktype;
|
||||
#define PCAP_LINKTYPE_ETHERNET 1
|
||||
} pcap_hdr = {
|
||||
PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU,
|
||||
PCAP_LINKTYPE_ETHERNET
|
||||
};
|
||||
|
||||
struct pcap_pkthdr {
|
||||
uint32_t tv_sec;
|
||||
uint32_t tv_usec;
|
||||
|
@@ -162,6 +141,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
|
|||
*/
|
||||
void pcap_init(struct ctx *c)
|
||||
{
|
||||
/* See pcap.h from libpcap, or pcap-savefile(5) */
|
||||
#define PCAP_MAGIC 0xa1b2c3d4
|
||||
#define PCAP_VERSION_MAJOR 2
|
||||
#define PCAP_VERSION_MINOR 4
|
||||
#define PCAP_LINKTYPE_ETHERNET 1
|
||||
const struct {
|
||||
uint32_t magic;
|
||||
uint16_t major;
|
||||
uint16_t minor;
|
||||
|
||||
int32_t thiszone;
|
||||
uint32_t sigfigs;
|
||||
uint32_t snaplen;
|
||||
|
||||
uint32_t linktype;
|
||||
} pcap_hdr = {
|
||||
.magic = PCAP_MAGIC,
|
||||
.major = PCAP_VERSION_MAJOR,
|
||||
.minor = PCAP_VERSION_MINOR,
|
||||
.snaplen = tap_l2_max_len(c),
|
||||
.linktype = PCAP_LINKTYPE_ETHERNET
|
||||
};
|
||||
|
||||
if (pcap_fd != -1)
|
||||
return;
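For reference, the same pcap-savefile(5) layout as initialised above, written by a throwaway standalone program (native byte order, which readers detect from the magic; the output file name and the all-zero frame are arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const struct {
		uint32_t magic;
		uint16_t major, minor;
		int32_t thiszone;
		uint32_t sigfigs, snaplen, linktype;
	} hdr = { 0xa1b2c3d4, 2, 4, 0, 0, 65535, 1 /* Ethernet */ };
	const struct {
		uint32_t tv_sec, tv_usec, caplen, len;
	} rec = { 0, 0, 60, 60 };
	const uint8_t frame[60] = { 0 };	/* one all-zero Ethernet frame */
	FILE *f = fopen("demo.pcap", "wb");

	if (!f)
		return 1;

	fwrite(&hdr, sizeof(hdr), 1, f);	/* global header */
	fwrite(&rec, sizeof(rec), 1, f);	/* per-packet record header */
	fwrite(frame, sizeof(frame), 1, f);
	fclose(f);

	return 0;
}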
|
||||
|
||||
|
|
repair.c | 32 lines changed
|
@@ -27,6 +27,10 @@
|
|||
|
||||
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
|
||||
|
||||
/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */
|
||||
#define REPAIR_ACCEPT_TIMEOUT_MS 10
|
||||
#define REPAIR_ACCEPT_TIMEOUT_US (REPAIR_ACCEPT_TIMEOUT_MS * 1000)
|
||||
|
||||
/* Pending file descriptors for next repair_flush() call, or command change */
|
||||
static int repair_fds[SCM_MAX_FD];
|
||||
|
||||
|
@@ -138,6 +142,34 @@ void repair_handler(struct ctx *c, uint32_t events)
|
|||
repair_close(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
|
||||
* @c: Execution context
|
||||
*/
|
||||
void repair_wait(struct ctx *c)
|
||||
{
|
||||
struct timeval tv = { .tv_sec = 0,
|
||||
.tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) };
|
||||
static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000,
|
||||
".tv_usec is greater than 1000 * 1000");
|
||||
|
||||
if (c->fd_repair >= 0 || c->fd_repair_listen == -1)
|
||||
return;
|
||||
|
||||
if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
|
||||
&tv, sizeof(tv))) {
|
||||
err_perror("Set timeout on TCP_REPAIR listening socket");
|
||||
return;
|
||||
}
|
||||
|
||||
repair_listen_handler(c, EPOLLIN);
|
||||
|
||||
tv.tv_usec = 0;
|
||||
if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
|
||||
&tv, sizeof(tv)))
|
||||
err_perror("Clear timeout on TCP_REPAIR listening socket");
|
||||
}
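The waiting trick above leans on Linux accept() honouring SO_RCVTIMEO set on the listening socket; the same pattern as a generic sketch (the timeout must stay below one second for this simplified tv handling):

#include <sys/socket.h>
#include <sys/time.h>

/* Accept one connection on @lfd, waiting at most @timeout_us (< 1000000) */
static int accept_bounded(int lfd, long timeout_us)
{
	struct timeval tv = { .tv_sec = 0, .tv_usec = timeout_us };
	int fd;

	if (setsockopt(lfd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
		return -1;

	fd = accept(lfd, NULL, NULL);	/* fails with EAGAIN on timeout */

	tv.tv_usec = 0;			/* restore blocking behaviour */
	setsockopt(lfd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));

	return fd;
}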
|
||||
|
||||
/**
|
||||
* repair_flush() - Flush current set of sockets to helper, with current command
|
||||
* @c: Execution context
|
||||
|
|
repair.h | 1 line changed
|
@@ -10,6 +10,7 @@ void repair_sock_init(const struct ctx *c);
|
|||
void repair_listen_handler(struct ctx *c, uint32_t events);
|
||||
void repair_handler(struct ctx *c, uint32_t events);
|
||||
void repair_close(struct ctx *c);
|
||||
void repair_wait(struct ctx *c);
|
||||
int repair_flush(struct ctx *c);
|
||||
int repair_set(struct ctx *c, int s, int cmd);
|
||||
|
||||
|
|
|
@@ -255,7 +255,7 @@ for __p in ${__profiles}; do
|
|||
__calls="${__calls} ${EXTRA_SYSCALLS:-}"
|
||||
__calls="$(filter ${__calls})"
|
||||
|
||||
cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
|
||||
cols="$(stty -a 2>/dev/null | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
|
||||
case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
|
||||
echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}
|
||||
|
||||
|
|
tap.c | 215 lines changed
|
@@ -62,13 +62,64 @@
|
|||
#include "vhost_user.h"
|
||||
#include "vu_common.h"
|
||||
|
||||
/* Maximum allowed frame lengths (including L2 header) */
|
||||
|
||||
/* Verify that an L2 frame length limit is large enough to contain the header,
|
||||
* but small enough to fit in the packet pool
|
||||
*/
|
||||
#define CHECK_FRAME_LEN(len) \
|
||||
static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \
|
||||
#len " has bad value")
|
||||
|
||||
CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
|
||||
CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
|
||||
CHECK_FRAME_LEN(L2_MAX_LEN_VU);
|
||||
|
||||
/* We try size the packet pools so that we can use a single batch for the entire
|
||||
* packet buffer. This might be exceeded for vhost-user, though, which uses its
|
||||
* own buffers rather than pkt_buf.
|
||||
*
|
||||
* This is just a tuning parameter, the code will work with slightly more
|
||||
* overhead if it's incorrect. So, we estimate based on the minimum practical
|
||||
* frame size - an empty UDP datagram - rather than the minimum theoretical
|
||||
* frame size.
|
||||
*
|
||||
* FIXME: Profile to work out how big this actually needs to be to amortise
|
||||
* per-batch syscall overheads
|
||||
*/
|
||||
#define TAP_MSGS_IP4 \
|
||||
DIV_ROUND_UP(sizeof(pkt_buf), \
|
||||
ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))
|
||||
#define TAP_MSGS_IP6 \
|
||||
DIV_ROUND_UP(sizeof(pkt_buf), \
|
||||
ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr))
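Rough numbers, assuming the 8 MiB pkt_buf from the passt.h hunk above and the usual 20-byte struct iphdr / 40-byte struct ipv6hdr; a throwaway check:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long buf = 8UL << 20;		/* PKT_BUF_BYTES */
	unsigned long min4 = 14 + 20 + 8;	/* Ethernet + IPv4 + UDP = 42 */
	unsigned long min6 = 14 + 40 + 8;	/* Ethernet + IPv6 + UDP = 62 */

	printf("TAP_MSGS_IP4 ~ %lu\n", DIV_ROUND_UP(buf, min4));	/* 199729 */
	printf("TAP_MSGS_IP6 ~ %lu\n", DIV_ROUND_UP(buf, min6));	/* 135301 */

	return 0;
}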
|
||||
|
||||
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
|
||||
static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
|
||||
static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
|
||||
static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4, pkt_buf);
|
||||
static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
|
||||
|
||||
#define TAP_SEQS 128 /* Different L4 tuples in one batch */
|
||||
#define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */
|
||||
|
||||
/**
|
||||
* tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
|
||||
* @c: Execution context
|
||||
*/
|
||||
unsigned long tap_l2_max_len(const struct ctx *c)
|
||||
{
|
||||
/* NOLINTBEGIN(bugprone-branch-clone): values can be the same */
|
||||
switch (c->mode) {
|
||||
case MODE_PASST:
|
||||
return L2_MAX_LEN_PASST;
|
||||
case MODE_PASTA:
|
||||
return L2_MAX_LEN_PASTA;
|
||||
case MODE_VU:
|
||||
return L2_MAX_LEN_VU;
|
||||
}
|
||||
/* NOLINTEND(bugprone-branch-clone) */
|
||||
ASSERT(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_send_single() - Send a single frame
|
||||
* @c: Execution context
|
||||
|
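The TAP_MSGS_IP4/IP6 macros above size the per-batch packet pools from the smallest datagram likely in practice (an empty UDP datagram). A rough worked example of that arithmetic; the 8 MiB figure is a made-up stand-in for sizeof(pkt_buf), whose real value depends on the build:

#include <stddef.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	size_t buf = 8UL << 20;		/* hypothetical pkt_buf size: 8 MiB */
	size_t min4 = 14 + 20 + 8;	/* Ethernet + IPv4 + UDP headers = 42 */
	size_t min6 = 14 + 40 + 8;	/* Ethernet + IPv6 + UDP headers = 62 */

	/* Worst case for pool occupancy: every frame is an empty datagram */
	printf("IPv4 pool slots: %zu\n", DIV_ROUND_UP(buf, min4));
	printf("IPv6 pool slots: %zu\n", DIV_ROUND_UP(buf, min6));
	return 0;
}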
@ -122,7 +173,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
|
|||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
|
||||
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
|
||||
{
|
||||
struct ethhdr *eh = (struct ethhdr *)buf;
|
||||
|
||||
|
@ -143,8 +194,8 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
|
|||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
|
||||
struct in_addr dst, size_t l4len, uint8_t proto)
|
||||
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
|
||||
struct in_addr dst, size_t l4len, uint8_t proto)
|
||||
{
|
||||
uint16_t l3len = l4len + sizeof(*ip4h);
|
||||
|
||||
|
@ -153,17 +204,17 @@ static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
|
|||
ip4h->tos = 0;
|
||||
ip4h->tot_len = htons(l3len);
|
||||
ip4h->id = 0;
|
||||
ip4h->frag_off = 0;
|
||||
ip4h->frag_off = htons(IP_DF);
|
||||
ip4h->ttl = 255;
|
||||
ip4h->protocol = proto;
|
||||
ip4h->saddr = src.s_addr;
|
||||
ip4h->daddr = dst.s_addr;
|
||||
ip4h->check = csum_ip4_header(l3len, proto, src, dst);
|
||||
return ip4h + 1;
|
||||
return (char *)ip4h + sizeof(*ip4h);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_udp4_send() - Send UDP over IPv4 packet
|
||||
* tap_push_uh4() - Build UDPv4 header with checksum
|
||||
* @c: Execution context
|
||||
* @src: IPv4 source address
|
||||
* @sport: UDP source port
|
||||
|
@ -171,16 +222,14 @@ static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
|
|||
* @dport: UDP destination port
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @dlen: UDP payload length (not including UDP header)
|
||||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
||||
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
|
||||
struct in_addr dst, in_port_t dport,
|
||||
const void *in, size_t dlen)
|
||||
{
|
||||
size_t l4len = dlen + sizeof(struct udphdr);
|
||||
char buf[USHRT_MAX];
|
||||
struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
|
||||
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
|
||||
char *data = (char *)(uh + 1);
|
||||
const struct iovec iov = {
|
||||
.iov_base = (void *)in,
|
||||
.iov_len = dlen
|
||||
|
@ -191,8 +240,30 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
|||
uh->dest = htons(dport);
|
||||
uh->len = htons(l4len);
|
||||
csum_udp4(uh, src, dst, &payload);
|
||||
memcpy(data, in, dlen);
|
||||
return (char *)uh + sizeof(*uh);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_udp4_send() - Send UDP over IPv4 packet
|
||||
* @c: Execution context
|
||||
* @src: IPv4 source address
|
||||
* @sport: UDP source port
|
||||
* @dst: IPv4 destination address
|
||||
* @dport: UDP destination port
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @dlen: UDP payload length (not including UDP header)
|
||||
*/
|
||||
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
|
||||
struct in_addr dst, in_port_t dport,
|
||||
const void *in, size_t dlen)
|
||||
{
|
||||
size_t l4len = dlen + sizeof(struct udphdr);
|
||||
char buf[USHRT_MAX];
|
||||
struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
|
||||
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
|
||||
char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
|
||||
|
||||
memcpy(data, in, dlen);
|
||||
tap_send_single(c, buf, dlen + (data - buf));
|
||||
}
|
||||
|
||||
|
@ -229,10 +300,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
|
|||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
static void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
||||
const struct in6_addr *src,
|
||||
const struct in6_addr *dst,
|
||||
size_t l4len, uint8_t proto, uint32_t flow)
|
||||
void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
||||
const struct in6_addr *src, const struct in6_addr *dst,
|
||||
size_t l4len, uint8_t proto, uint32_t flow)
|
||||
{
|
||||
ip6h->payload_len = htons(l4len);
|
||||
ip6h->priority = 0;
|
||||
|
@ -241,10 +311,40 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
|||
ip6h->hop_limit = 255;
|
||||
ip6h->saddr = *src;
|
||||
ip6h->daddr = *dst;
|
||||
ip6h->flow_lbl[0] = (flow >> 16) & 0xf;
|
||||
ip6h->flow_lbl[1] = (flow >> 8) & 0xff;
|
||||
ip6h->flow_lbl[2] = (flow >> 0) & 0xff;
|
||||
return ip6h + 1;
|
||||
ip6_set_flow_lbl(ip6h, flow);
|
||||
return (char *)ip6h + sizeof(*ip6h);
|
||||
}
|
||||
|
||||
/**
|
||||
* tap_push_uh6() - Build UDPv6 header with checksum
|
||||
* @c: Execution context
|
||||
* @src: IPv6 source address
|
||||
* @sport: UDP source port
|
||||
* @dst: IPv6 destination address
|
||||
* @dport: UDP destination port
|
||||
* @flow: Flow label
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @dlen: UDP payload length (not including UDP header)
|
||||
*
|
||||
* Return: pointer at which to write the packet's payload
|
||||
*/
|
||||
void *tap_push_uh6(struct udphdr *uh,
|
||||
const struct in6_addr *src, in_port_t sport,
|
||||
const struct in6_addr *dst, in_port_t dport,
|
||||
void *in, size_t dlen)
|
||||
{
|
||||
size_t l4len = dlen + sizeof(struct udphdr);
|
||||
const struct iovec iov = {
|
||||
.iov_base = in,
|
||||
.iov_len = dlen
|
||||
};
|
||||
struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
|
||||
|
||||
uh->source = htons(sport);
|
||||
uh->dest = htons(dport);
|
||||
uh->len = htons(l4len);
|
||||
csum_udp6(uh, src, dst, &payload);
|
||||
return (char *)uh + sizeof(*uh);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -255,7 +355,7 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h,
|
|||
* @dst: IPv6 destination address
|
||||
* @dport: UDP destination port
|
||||
* @flow: Flow label
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @in: UDP payload contents (not including UDP header)
|
||||
* @dlen: UDP payload length (not including UDP header)
|
||||
*/
|
||||
void tap_udp6_send(const struct ctx *c,
|
||||
|
@ -268,19 +368,9 @@ void tap_udp6_send(const struct ctx *c,
|
|||
struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
|
||||
struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
|
||||
l4len, IPPROTO_UDP, flow);
|
||||
char *data = (char *)(uh + 1);
|
||||
const struct iovec iov = {
|
||||
.iov_base = in,
|
||||
.iov_len = dlen
|
||||
};
|
||||
struct iov_tail payload = IOV_TAIL(&iov, 1, 0);
|
||||
char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
|
||||
|
||||
uh->source = htons(sport);
|
||||
uh->dest = htons(dport);
|
||||
uh->len = htons(l4len);
|
||||
csum_udp6(uh, src, dst, &payload);
|
||||
memcpy(data, in, dlen);
|
||||
|
||||
tap_send_single(c, buf, dlen + (data - buf));
|
||||
}
|
||||
|
||||
|
@ -469,6 +559,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
|
|||
* struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
|
||||
* @msgs: Count of messages in sequence
|
||||
* @protocol: Protocol number
|
||||
* @ttl: Time to live
|
||||
* @source: Source port
|
||||
* @dest: Destination port
|
||||
* @saddr: Source address
|
||||
|
@ -477,6 +568,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
|
|||
*/
|
||||
static struct tap4_l4_t {
|
||||
uint8_t protocol;
|
||||
uint8_t ttl;
|
||||
|
||||
uint16_t source;
|
||||
uint16_t dest;
|
||||
|
@ -491,14 +583,17 @@ static struct tap4_l4_t {
|
|||
* struct l4_seq6_t - Message sequence for one protocol handler call, IPv6
|
||||
* @msgs: Count of messages in sequence
|
||||
* @protocol: Protocol number
|
||||
* @flow_lbl: IPv6 flow label
|
||||
* @source: Source port
|
||||
* @dest: Destination port
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* @hop_limit: Hop limit
|
||||
* @msg: Array of messages that can be handled in a single call
|
||||
*/
|
||||
static struct tap6_l4_t {
|
||||
uint8_t protocol;
|
||||
uint32_t flow_lbl :20;
|
||||
|
||||
uint16_t source;
|
||||
uint16_t dest;
|
||||
|
@ -506,6 +601,8 @@ static struct tap6_l4_t {
|
|||
struct in6_addr saddr;
|
||||
struct in6_addr daddr;
|
||||
|
||||
uint8_t hop_limit;
|
||||
|
||||
struct pool_l4_t p;
|
||||
} tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
|
||||
|
||||
|
@ -694,7 +791,8 @@ resume:
|
|||
#define L4_MATCH(iph, uh, seq) \
|
||||
((seq)->protocol == (iph)->protocol && \
|
||||
(seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \
|
||||
(seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
|
||||
(seq)->saddr.s_addr == (iph)->saddr && \
|
||||
(seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
|
||||
|
||||
#define L4_SET(iph, uh, seq) \
|
||||
do { \
|
||||
|
@ -703,6 +801,7 @@ resume:
|
|||
(seq)->dest = (uh)->dest; \
|
||||
(seq)->saddr.s_addr = (iph)->saddr; \
|
||||
(seq)->daddr.s_addr = (iph)->daddr; \
|
||||
(seq)->ttl = (iph)->ttl; \
|
||||
} while (0)
|
||||
|
||||
if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
|
||||
|
@ -744,14 +843,14 @@ append:
|
|||
for (k = 0; k < p->count; )
|
||||
k += tcp_tap_handler(c, PIF_TAP, AF_INET,
|
||||
&seq->saddr, &seq->daddr,
|
||||
p, k, now);
|
||||
0, p, k, now);
|
||||
} else if (seq->protocol == IPPROTO_UDP) {
|
||||
if (c->no_udp)
|
||||
continue;
|
||||
for (k = 0; k < p->count; )
|
||||
k += udp_tap_handler(c, PIF_TAP, AF_INET,
|
||||
&seq->saddr, &seq->daddr,
|
||||
p, k, now);
|
||||
seq->ttl, p, k, now);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -872,16 +971,20 @@ resume:
|
|||
((seq)->protocol == (proto) && \
|
||||
(seq)->source == (uh)->source && \
|
||||
(seq)->dest == (uh)->dest && \
|
||||
(seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \
|
||||
IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \
|
||||
IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
|
||||
IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \
|
||||
(seq)->hop_limit == (ip6h)->hop_limit)
|
||||
|
||||
#define L4_SET(ip6h, proto, uh, seq) \
|
||||
do { \
|
||||
(seq)->protocol = (proto); \
|
||||
(seq)->source = (uh)->source; \
|
||||
(seq)->dest = (uh)->dest; \
|
||||
(seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \
|
||||
(seq)->saddr = *saddr; \
|
||||
(seq)->daddr = *daddr; \
|
||||
(seq)->hop_limit = (ip6h)->hop_limit; \
|
||||
} while (0)
|
||||
|
||||
if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
|
||||
|
@ -925,14 +1028,14 @@ append:
|
|||
for (k = 0; k < p->count; )
|
||||
k += tcp_tap_handler(c, PIF_TAP, AF_INET6,
|
||||
&seq->saddr, &seq->daddr,
|
||||
p, k, now);
|
||||
seq->flow_lbl, p, k, now);
|
||||
} else if (seq->protocol == IPPROTO_UDP) {
|
||||
if (c->no_udp)
|
||||
continue;
|
||||
for (k = 0; k < p->count; )
|
||||
k += udp_tap_handler(c, PIF_TAP, AF_INET6,
|
||||
&seq->saddr, &seq->daddr,
|
||||
p, k, now);
|
||||
seq->hop_limit, p, k, now);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -967,8 +1070,10 @@ void tap_handler(struct ctx *c, const struct timespec *now)
|
|||
* @c: Execution context
|
||||
* @l2len: Total L2 packet length
|
||||
* @p: Packet buffer
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
|
||||
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
|
||||
const struct timespec *now)
|
||||
{
|
||||
const struct ethhdr *eh;
|
||||
|
||||
|
@ -984,9 +1089,17 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
|
|||
switch (ntohs(eh->h_proto)) {
|
||||
case ETH_P_ARP:
|
||||
case ETH_P_IP:
|
||||
if (pool_full(pool_tap4)) {
|
||||
tap4_handler(c, pool_tap4, now);
|
||||
pool_flush(pool_tap4);
|
||||
}
|
||||
packet_add(pool_tap4, l2len, p);
|
||||
break;
|
||||
case ETH_P_IPV6:
|
||||
if (pool_full(pool_tap6)) {
|
||||
tap6_handler(c, pool_tap6, now);
|
||||
pool_flush(pool_tap6);
|
||||
}
|
||||
packet_add(pool_tap6, l2len, p);
|
||||
break;
|
||||
default:
|
||||
|
@ -1037,7 +1150,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
|
|||
|
||||
do {
|
||||
n = recv(c->fd_tap, pkt_buf + partial_len,
|
||||
TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
|
||||
sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
|
||||
} while ((n < 0) && errno == EINTR);
|
||||
|
||||
if (n < 0) {
|
||||
|
@ -1054,7 +1167,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
|
|||
while (n >= (ssize_t)sizeof(uint32_t)) {
|
||||
uint32_t l2len = ntohl_unaligned(p);
|
||||
|
||||
if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
|
||||
if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) {
|
||||
err("Bad frame size from guest, resetting connection");
|
||||
tap_sock_reset(c);
|
||||
return;
|
||||
|
@ -1067,7 +1180,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
|
|||
p += sizeof(uint32_t);
|
||||
n -= sizeof(uint32_t);
|
||||
|
||||
tap_add_packet(c, l2len, p);
|
||||
tap_add_packet(c, l2len, p, now);
|
||||
|
||||
p += l2len;
|
||||
n -= l2len;
|
||||
|
@ -1108,8 +1221,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
|
|||
|
||||
tap_flush_pools();
|
||||
|
||||
for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
|
||||
len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
|
||||
for (n = 0;
|
||||
n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA);
|
||||
n += len) {
|
||||
len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);
|
||||
|
||||
if (len == 0) {
|
||||
die("EOF on tap device, exiting");
|
||||
|
@ -1127,10 +1242,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
|
|||
|
||||
/* Ignore frames of bad length */
|
||||
if (len < (ssize_t)sizeof(struct ethhdr) ||
|
||||
len > (ssize_t)ETH_MAX_MTU)
|
||||
len > (ssize_t)L2_MAX_LEN_PASTA)
|
||||
continue;
|
||||
|
||||
tap_add_packet(c, len, pkt_buf + n);
|
||||
tap_add_packet(c, len, pkt_buf + n, now);
|
||||
}
|
||||
|
||||
tap_handler(c, now);
|
||||
|
@ -1328,8 +1443,8 @@ void tap_sock_update_pool(void *base, size_t size)
|
|||
{
|
||||
int i;
|
||||
|
||||
pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size);
|
||||
pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size);
|
||||
pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size);
|
||||
pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size);
|
||||
|
||||
for (i = 0; i < TAP_SEQS; i++) {
|
||||
tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
|
||||
|
|
tap.h (50 changes)

@@ -6,7 +6,32 @@
#ifndef TAP_H
#define TAP_H

#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) }
/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
 *
 * The kernel tuntap device imposes a maximum frame size of 65535 including
 * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
 */
#define L2_MAX_LEN_PASTA	USHRT_MAX

/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
 *
 * The only structural limit the QEMU socket protocol imposes on frames is
 * (2^32-1) bytes, but that would be ludicrously long in practice. For now,
 * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate
 * limit with more precision.
 */
#define L2_MAX_LEN_PASST	USHRT_MAX

/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
 *
 * vhost-user allows multiple buffers per frame, each of which can be quite
 * large, so the inherent frame size limit is rather large. Much larger than is
 * actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME:
 * Work out an appropriate limit with more precision.
 */
#define L2_MAX_LEN_VU		USHRT_MAX

struct udphdr;

/**
 * struct tap_hdr - tap backend specific headers

@@ -44,6 +69,23 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
	thdr->vnet_len = htonl(l2len);
}

unsigned long tap_l2_max_len(const struct ctx *c);
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
		    struct in_addr dst, size_t l4len, uint8_t proto);
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
		   struct in_addr dst, in_port_t dport,
		   const void *in, size_t dlen);
void *tap_push_uh6(struct udphdr *uh,
		   const struct in6_addr *src, in_port_t sport,
		   const struct in6_addr *dst, in_port_t dport,
		   void *in, size_t dlen);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
		    struct in_addr dst, size_t l4len, uint8_t proto);
void *tap_push_ip6h(struct ipv6hdr *ip6h,
		    const struct in6_addr *src,
		    const struct in6_addr *dst,
		    size_t l4len, uint8_t proto, uint32_t flow);
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
		   struct in_addr dst, in_port_t dport,
		   const void *in, size_t dlen);

@@ -51,6 +93,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
		    const void *in, size_t l4len);
const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
				     const struct in6_addr *src);
void *tap_push_ip6h(struct ipv6hdr *ip6h,
		    const struct in6_addr *src, const struct in6_addr *dst,
		    size_t l4len, uint8_t proto, uint32_t flow);
void tap_udp6_send(const struct ctx *c,
		   const struct in6_addr *src, in_port_t sport,
		   const struct in6_addr *dst, in_port_t dport,

@@ -74,6 +119,7 @@ void tap_sock_update_pool(void *base, size_t size);
void tap_backend_init(struct ctx *c);
void tap_flush_pools(void);
void tap_handler(struct ctx *c, const struct timespec *now);
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
		    const struct timespec *now);

#endif /* TAP_H */
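tap_l2_max_len(), declared above and defined in tap.c earlier in this compare, is the per-mode counterpart to the L2_MAX_LEN_* limits. A hedged sketch of how an input path might use it to reject oversized frames, in the spirit of the bounds checks tap_passt_input() and tap_pasta_input() apply; everything here except tap_l2_max_len() and struct ctx is an illustrative name, not part of the tree:

#include <stdbool.h>
#include <stddef.h>

#define ETH_HLEN	14		/* L2 Ethernet header length */

struct ctx;				/* execution context, from passt.h */
unsigned long tap_l2_max_len(const struct ctx *c);

/* Hypothetical helper: accept a frame only if it can hold an Ethernet
 * header and does not exceed the current mode's limit.
 */
static bool frame_len_ok(const struct ctx *c, size_t l2len)
{
	return l2len >= ETH_HLEN && l2len <= tap_l2_max_len(c);
}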
|
|
tcp.h (3 changes)

@@ -16,7 +16,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
		      uint32_t events);
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
		    const void *saddr, const void *daddr,
		    const void *saddr, const void *daddr, uint32_t flow_lbl,
		    const struct pool *p, int idx, const struct timespec *now);
int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
		  const char *ifname, in_port_t port);

@@ -25,7 +25,6 @@ void tcp_timer(struct ctx *c, const struct timespec *now);
void tcp_defer_handler(struct ctx *c);

void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
int tcp_set_peek_offset(int s, int offset);

extern bool peek_offset_cap;
|
||||
|
|
|
@ -125,7 +125,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
|
|||
|
||||
conn->seq_to_tap = seq;
|
||||
peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||
if (tcp_set_peek_offset(conn->sock, peek_offset))
|
||||
if (tcp_set_peek_offset(conn, peek_offset))
|
||||
tcp_rst(c, conn);
|
||||
}
|
||||
}
|
||||
|
@ -304,7 +304,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
conn->seq_ack_from_tap, conn->seq_to_tap);
|
||||
conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||
already_sent = 0;
|
||||
if (tcp_set_peek_offset(s, 0)) {
|
||||
if (tcp_set_peek_offset(conn, 0)) {
|
||||
tcp_rst(c, conn);
|
||||
return -1;
|
||||
}
|
||||
|
|
|
@ -152,6 +152,7 @@ struct tcp_tap_transfer {
|
|||
* @notsent: Part of pending send queue that wasn't sent out yet
|
||||
* @rcvq: Length of pending receive queue
|
||||
* @mss: Socket-side MSS clamp
|
||||
* @timestamp: RFC 7323 timestamp
|
||||
* @snd_wl1: Next sequence used in window probe (next sequence - 1)
|
||||
* @snd_wnd: Socket-side sending window
|
||||
* @max_window: Window clamp
|
||||
|
@ -171,6 +172,7 @@ struct tcp_tap_transfer_ext {
|
|||
uint32_t rcvq;
|
||||
|
||||
uint32_t mss;
|
||||
uint32_t timestamp;
|
||||
|
||||
/* We can't just use struct tcp_repair_window: we need network order */
|
||||
uint32_t snd_wl1;
|
||||
|
@ -233,13 +235,11 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn);
|
|||
int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
|
||||
int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
|
||||
|
||||
int tcp_flow_migrate_shrink_window(int fidx, const struct tcp_tap_conn *conn);
|
||||
int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
|
||||
int tcp_flow_migrate_source_ext(int fd, int fidx,
|
||||
const struct tcp_tap_conn *conn);
|
||||
int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
|
||||
|
||||
int tcp_flow_migrate_target(struct ctx *c, int fd);
|
||||
int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
|
||||
int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
|
||||
|
||||
bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
|
||||
|
||||
|
|
|
@ -166,8 +166,6 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
|
|||
|
||||
struct tcp_info_linux;
|
||||
|
||||
void tcp_update_csum(uint32_t psum, struct tcphdr *th,
|
||||
struct iov_tail *payload);
|
||||
void tcp_fill_headers(const struct tcp_tap_conn *conn,
|
||||
struct tap_hdr *taph,
|
||||
struct iphdr *ip4h, struct ipv6hdr *ip6h,
|
||||
|
@ -179,5 +177,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
|
||||
size_t *optlen);
|
||||
int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);
|
||||
|
||||
#endif /* TCP_INTERNAL_H */
|
||||
|
|
tcp_splice.c (54 changes)
|
@ -164,7 +164,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
|
|||
if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
|
||||
epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
|
||||
int ret = -errno;
|
||||
flow_err(conn, "ERROR on epoll_ctl(): %s", strerror_(errno));
|
||||
flow_perror(conn, "ERROR on epoll_ctl()");
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -317,8 +317,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
|
|||
|
||||
if (conn->pipe[sidei][0] < 0) {
|
||||
if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
|
||||
flow_err(conn, "cannot create %d->%d pipe: %s",
|
||||
sidei, !sidei, strerror_(errno));
|
||||
flow_perror(conn, "cannot create %d->%d pipe",
|
||||
sidei, !sidei);
|
||||
conn_flag(c, conn, CLOSING);
|
||||
return -EIO;
|
||||
}
|
||||
|
@ -482,8 +482,7 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
|||
|
||||
rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl);
|
||||
if (rc)
|
||||
flow_err(conn, "Error retrieving SO_ERROR: %s",
|
||||
strerror_(errno));
|
||||
flow_perror(conn, "Error retrieving SO_ERROR");
|
||||
else
|
||||
flow_trace(conn, "Error event on socket: %s",
|
||||
strerror_(err));
|
||||
|
@ -521,20 +520,21 @@ swap:
|
|||
int more = 0;
|
||||
|
||||
retry:
|
||||
readlen = splice(conn->s[fromsidei], NULL,
|
||||
conn->pipe[fromsidei][1], NULL,
|
||||
c->tcp.pipe_size,
|
||||
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
|
||||
flow_trace(conn, "%zi from read-side call", readlen);
|
||||
if (readlen < 0) {
|
||||
if (errno == EINTR)
|
||||
goto retry;
|
||||
do
|
||||
readlen = splice(conn->s[fromsidei], NULL,
|
||||
conn->pipe[fromsidei][1], NULL,
|
||||
c->tcp.pipe_size,
|
||||
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
|
||||
while (readlen < 0 && errno == EINTR);
|
||||
|
||||
if (errno != EAGAIN)
|
||||
goto close;
|
||||
} else if (!readlen) {
|
||||
if (readlen < 0 && errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
flow_trace(conn, "%zi from read-side call", readlen);
|
||||
|
||||
if (!readlen) {
|
||||
eof = 1;
|
||||
} else {
|
||||
} else if (readlen > 0) {
|
||||
never_read = 0;
|
||||
|
||||
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
|
||||
|
@ -544,10 +544,16 @@ retry:
|
|||
conn_flag(c, conn, lowat_act_flag);
|
||||
}
|
||||
|
||||
eintr:
|
||||
written = splice(conn->pipe[fromsidei][0], NULL,
|
||||
conn->s[!fromsidei], NULL, c->tcp.pipe_size,
|
||||
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
|
||||
do
|
||||
written = splice(conn->pipe[fromsidei][0], NULL,
|
||||
conn->s[!fromsidei], NULL,
|
||||
c->tcp.pipe_size,
|
||||
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
|
||||
while (written < 0 && errno == EINTR);
|
||||
|
||||
if (written < 0 && errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
flow_trace(conn, "%zi from write-side call (passed %zi)",
|
||||
written, c->tcp.pipe_size);
|
||||
|
||||
|
@ -579,12 +585,6 @@ eintr:
|
|||
conn->written[fromsidei] += written > 0 ? written : 0;
|
||||
|
||||
if (written < 0) {
|
||||
if (errno == EINTR)
|
||||
goto eintr;
|
||||
|
||||
if (errno != EAGAIN)
|
||||
goto close;
|
||||
|
||||
if (conn->read[fromsidei] == conn->written[fromsidei])
|
||||
break;
|
||||
|
||||
|
|
tcp_vu.c (23 changes)
|
@ -38,7 +38,6 @@
|
|||
static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
|
||||
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
|
||||
static int head[VIRTQUEUE_MAX_SIZE + 1];
|
||||
static int head_cnt;
|
||||
|
||||
/**
|
||||
* tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP)
|
||||
|
@ -183,7 +182,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
|||
static ssize_t tcp_vu_sock_recv(const struct ctx *c,
|
||||
const struct tcp_tap_conn *conn, bool v6,
|
||||
uint32_t already_sent, size_t fillsize,
|
||||
int *iov_cnt)
|
||||
int *iov_cnt, int *head_cnt)
|
||||
{
|
||||
struct vu_dev *vdev = c->vdev;
|
||||
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
|
@ -202,7 +201,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
|
|||
vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
|
||||
|
||||
elem_cnt = 0;
|
||||
head_cnt = 0;
|
||||
*head_cnt = 0;
|
||||
while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) {
|
||||
struct iovec *iov;
|
||||
size_t frame_size, dlen;
|
||||
|
@ -221,7 +220,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
|
|||
ASSERT(iov->iov_len >= hdrlen);
|
||||
iov->iov_base = (char *)iov->iov_base + hdrlen;
|
||||
iov->iov_len -= hdrlen;
|
||||
head[head_cnt++] = elem_cnt;
|
||||
head[(*head_cnt)++] = elem_cnt;
|
||||
|
||||
fillsize -= dlen;
|
||||
elem_cnt += cnt;
|
||||
|
@ -261,17 +260,18 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
|
|||
len -= iov->iov_len;
|
||||
}
|
||||
/* adjust head count */
|
||||
while (head_cnt > 0 && head[head_cnt - 1] >= i)
|
||||
head_cnt--;
|
||||
while (*head_cnt > 0 && head[*head_cnt - 1] >= i)
|
||||
(*head_cnt)--;
|
||||
|
||||
/* mark end of array */
|
||||
head[head_cnt] = i;
|
||||
head[*head_cnt] = i;
|
||||
*iov_cnt = i;
|
||||
|
||||
/* release unused buffers */
|
||||
vu_queue_rewind(vq, elem_cnt - i);
|
||||
|
||||
/* restore space for headers in iov */
|
||||
for (i = 0; i < head_cnt; i++) {
|
||||
for (i = 0; i < *head_cnt; i++) {
|
||||
struct iovec *iov = &elem[head[i]].in_sg[0];
|
||||
|
||||
iov->iov_base = (char *)iov->iov_base - hdrlen;
|
||||
|
@ -357,11 +357,11 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
struct vu_dev *vdev = c->vdev;
|
||||
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
ssize_t len, previous_dlen;
|
||||
int i, iov_cnt, head_cnt;
|
||||
size_t hdrlen, fillsize;
|
||||
int v6 = CONN_V6(conn);
|
||||
uint32_t already_sent;
|
||||
const uint16_t *check;
|
||||
int i, iov_cnt;
|
||||
|
||||
if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
|
||||
debug("Got packet, but RX virtqueue not usable yet");
|
||||
|
@ -376,7 +376,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
conn->seq_ack_from_tap, conn->seq_to_tap);
|
||||
conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||
already_sent = 0;
|
||||
if (tcp_set_peek_offset(conn->sock, 0)) {
|
||||
if (tcp_set_peek_offset(conn, 0)) {
|
||||
tcp_rst(c, conn);
|
||||
return -1;
|
||||
}
|
||||
|
@ -396,7 +396,8 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
/* collect the buffers from vhost-user and fill them with the
|
||||
* data from the socket
|
||||
*/
|
||||
len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt);
|
||||
len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize,
|
||||
&iov_cnt, &head_cnt);
|
||||
if (len < 0) {
|
||||
if (len != -EAGAIN && len != -EWOULDBLOCK) {
|
||||
tcp_rst(c, conn);
|
||||
|
|
|
@@ -19,6 +19,7 @@ STATUS_FILE_INDEX=0
STATUS_COLS=
STATUS_PASS=0
STATUS_FAIL=0
STATUS_SKIPPED=0

PR_RED='\033[1;31m'
PR_GREEN='\033[1;32m'

@@ -439,19 +440,21 @@ info_layout() {
# status_test_ok() - Update counter of passed tests, log and display message
status_test_ok() {
	STATUS_PASS=$((STATUS_PASS + 1))
	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
	info_passed
}

# status_test_fail() - Update counter of failed tests, log and display message
status_test_fail() {
	STATUS_FAIL=$((STATUS_FAIL + 1))
	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
	info_failed
}

# status_test_skip() - Update counter of skipped tests, log and display message
status_test_skip() {
	STATUS_SKIPPED=$((STATUS_SKIPPED + 1))
	tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
	info_skipped
}
||||
|
|
|
@ -20,10 +20,7 @@ test_iperf3s() {
|
|||
__sctx="${1}"
|
||||
__port="${2}"
|
||||
|
||||
pane_or_context_run_bg "${__sctx}" \
|
||||
'iperf3 -s -p'${__port}' & echo $! > s.pid' \
|
||||
|
||||
sleep 1 # Wait for server to be ready
|
||||
pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid'
|
||||
}
|
||||
|
||||
# test_iperf3k() - Kill iperf3 server
|
||||
|
@ -31,7 +28,7 @@ test_iperf3s() {
|
|||
test_iperf3k() {
|
||||
__sctx="${1}"
|
||||
|
||||
pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
|
||||
pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)'
|
||||
|
||||
sleep 1 # Wait for kernel to free up ports
|
||||
}
|
||||
|
|
test/run (6 changes)

@@ -202,7 +202,7 @@ skip_distro() {
	perf_finish
	[ ${CI} -eq 1 ] && video_stop

	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"

	pause_continue \
		"Press any key to keep test session open" \

@@ -236,7 +236,7 @@ run_selected() {
	done
	teardown "${__setup}"

	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
	log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"

	pause_continue \
		"Press any key to keep test session open" \

@@ -307,4 +307,4 @@ fi

tail -n1 ${LOGFILE}
echo "Log at ${LOGFILE}"
exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\)$/\1/p')
exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\),.*$/\1/p')
|
|
udp.c (696 changes)

@@ -39,27 +39,30 @@
 * could receive packets from multiple flows, so we use a hash table match to
 * find the specific flow for a datagram.
 *
 * When a UDP flow is initiated from a listening socket we take a duplicate of
 * the socket and store it in uflow->s[INISIDE]. This will last for the
 * Flow sockets
 * ============
 *
 * When a UDP flow targets a socket, we create a "flow" socket in
 * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
 * replies on the target side. This socket is both bound and connected and has
 * EPOLL_TYPE_UDP. The connect() means it will only receive datagrams
 * associated with this flow, so the epoll reference directly points to the flow
 * and we don't need a hash lookup.
 *
 * When a flow is initiated from a listening socket, we create a "flow" socket
 * with the same bound address as the listening socket, but also connect()ed to
 * the flow's peer. This is stored in uflow->s[INISIDE] and will last for the
 * lifetime of the flow, even if the original listening socket is closed due to
 * port auto-probing. The duplicate is used to deliver replies back to the
 * originating side.
 *
 * Reply sockets
 * =============
 *
 * When a UDP flow targets a socket, we create a "reply" socket in
 * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
 * replies on the target side. This socket is both bound and connected and has
 * EPOLL_TYPE_UDP_REPLY. The connect() means it will only receive datagrams
 * associated with this flow, so the epoll reference directly points to the flow
 * and we don't need a hash lookup.
 *
 * NOTE: it's possible that the reply socket could have a bound address
 * overlapping with an unrelated listening socket. We assume datagrams for the
 * flow will come to the reply socket in preference to a listening socket. The
 * sample program doc/platform-requirements/reuseaddr-priority.c documents and
 * tests that assumption.
 * NOTE: A flow socket can have a bound address overlapping with a listening
 * socket. That will happen naturally for flows initiated from a socket, but is
 * also possible (though unlikely) for tap initiated flows, depending on the
 * source port. We assume datagrams for the flow will come to a connect()ed
 * socket in preference to a listening socket. The sample program
 * doc/platform-requirements/reuseaddr-priority.c documents and tests that
 * assumption.
 *
 * "Spliced" flows
 * ===============

@@ -71,8 +74,7 @@
 * actually used; it doesn't make sense for datagrams and instead a pair of
 * recvmmsg() and sendmmsg() is used to forward the datagrams.
 *
 * Note that a spliced flow will have *both* a duplicated listening socket and a
 * reply socket (see above).
 * Note that a spliced flow will have two flow sockets (see above).
 */

#include <sched.h>
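The comment block above relies on one kernel behaviour: a UDP socket that is both bound and connect()ed only receives datagrams from its connected peer, and is preferred over a wildcard listening socket on the same port (the assumption tested by doc/platform-requirements/reuseaddr-priority.c). A minimal sketch of creating such a "flow" socket; the addresses, port numbers and function name are illustrative, not the udp_flow.c code:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

/* Bind to the local address/port the flow uses, then connect() to the
 * remote peer so the kernel steers only this 4-tuple to the socket.
 */
static int udp_flow_socket_sketch(void)
{
	struct sockaddr_in local = { .sin_family = AF_INET,
				     .sin_port = htons(5000) };
	struct sockaddr_in peer = { .sin_family = AF_INET,
				    .sin_port = htons(6000) };
	int one = 1, s;

	inet_pton(AF_INET, "192.0.2.1", &local.sin_addr);
	inet_pton(AF_INET, "192.0.2.2", &peer.sin_addr);

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		return -1;

	/* Allow coexistence with a listening socket on the same port */
	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	if (bind(s, (struct sockaddr *)&local, sizeof(local)) ||
	    connect(s, (struct sockaddr *)&peer, sizeof(peer))) {
		close(s);
		return -1;
	}
	return s;	/* recv() now only sees datagrams from 192.0.2.2:6000 */
}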
@ -87,6 +89,8 @@
|
|||
#include <netinet/in.h>
|
||||
#include <netinet/ip.h>
|
||||
#include <netinet/udp.h>
|
||||
#include <netinet/ip_icmp.h>
|
||||
#include <netinet/icmp6.h>
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
@ -112,6 +116,14 @@
|
|||
#include "udp_internal.h"
|
||||
#include "udp_vu.h"
|
||||
|
||||
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
|
||||
|
||||
/* Maximum UDP data to be returned in ICMP messages */
|
||||
#define ICMP4_MAX_DLEN 8
|
||||
#define ICMP6_MAX_DLEN (IPV6_MIN_MTU \
|
||||
- sizeof(struct udphdr) \
|
||||
- sizeof(struct ipv6hdr))
|
||||
|
||||
/* "Spliced" sockets indexed by bound port (host order) */
|
||||
static int udp_splice_ns [IP_VERSIONS][NUM_PORTS];
|
||||
static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
|
||||
|
@ -128,26 +140,31 @@ static struct ethhdr udp4_eth_hdr;
|
|||
static struct ethhdr udp6_eth_hdr;
|
||||
|
||||
/**
|
||||
* struct udp_meta_t - Pre-cooked headers and metadata for UDP packets
|
||||
* struct udp_meta_t - Pre-cooked headers for UDP packets
|
||||
* @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
|
||||
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
|
||||
* @taph: Tap backend specific header
|
||||
* @s_in: Source socket address, filled in by recvmmsg()
|
||||
* @tosidx: sidx for the destination side of this datagram's flow
|
||||
*/
|
||||
static struct udp_meta_t {
|
||||
struct ipv6hdr ip6h;
|
||||
struct iphdr ip4h;
|
||||
struct tap_hdr taph;
|
||||
|
||||
union sockaddr_inany s_in;
|
||||
flow_sidx_t tosidx;
|
||||
}
|
||||
#ifdef __AVX2__
|
||||
__attribute__ ((aligned(32)))
|
||||
#endif
|
||||
udp_meta[UDP_MAX_FRAMES];
|
||||
|
||||
#define PKTINFO_SPACE \
|
||||
MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \
|
||||
CMSG_SPACE(sizeof(struct in6_pktinfo)))
|
||||
|
||||
#define RECVERR_SPACE \
|
||||
MAX(CMSG_SPACE(sizeof(struct sock_extended_err) + \
|
||||
sizeof(struct sockaddr_in)), \
|
||||
CMSG_SPACE(sizeof(struct sock_extended_err) + \
|
||||
sizeof(struct sockaddr_in6)))
|
||||
|
||||
/**
|
||||
* enum udp_iov_idx - Indices for the buffers making up a single UDP frame
|
||||
* @UDP_IOV_TAP tap specific header
|
||||
|
@ -224,8 +241,6 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
|
|||
tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
|
||||
tiov[UDP_IOV_PAYLOAD].iov_base = payload;
|
||||
|
||||
mh->msg_name = &meta->s_in;
|
||||
mh->msg_namelen = sizeof(meta->s_in);
|
||||
mh->msg_iov = siov;
|
||||
mh->msg_iovlen = 1;
|
||||
}
|
||||
|
@ -245,41 +260,6 @@ static void udp_iov_init(const struct ctx *c)
|
|||
udp_iov_init_one(c, i);
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_splice_prepare() - Prepare one datagram for splicing
|
||||
* @mmh: Receiving mmsghdr array
|
||||
* @idx: Index of the datagram to prepare
|
||||
*/
|
||||
static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx)
|
||||
{
|
||||
udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_splice_send() - Send a batch of datagrams from socket to socket
|
||||
* @c: Execution context
|
||||
* @start: Index of batch's first datagram in udp[46]_l2_buf
|
||||
* @n: Number of datagrams in batch
|
||||
* @src: Source port for datagram (target side)
|
||||
* @dst: Destination port for datagrams (target side)
|
||||
* @ref: epoll reference for origin socket
|
||||
* @now: Timestamp
|
||||
*/
|
||||
static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
|
||||
flow_sidx_t tosidx)
|
||||
{
|
||||
const struct flowside *toside = flowside_at_sidx(tosidx);
|
||||
const struct udp_flow *uflow = udp_at_sidx(tosidx);
|
||||
uint8_t topif = pif_at_sidx(tosidx);
|
||||
int s = uflow->s[tosidx.sidei];
|
||||
socklen_t sl;
|
||||
|
||||
pif_sockaddr(c, &udp_splice_to, &sl, topif,
|
||||
&toside->eaddr, toside->eport);
|
||||
|
||||
sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_update_hdr4() - Update headers for one IPv4 datagram
|
||||
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
|
||||
|
@ -402,28 +382,172 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
|
|||
(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
|
||||
* @c: Execution context
|
||||
* @ee: Extended error descriptor
|
||||
* @toside: Destination side of flow
|
||||
* @saddr: Address of ICMP generating node
|
||||
* @in: First bytes (max 8) of original UDP message body
|
||||
* @dlen: Length of the read part of original UDP message body
|
||||
*/
|
||||
static void udp_send_tap_icmp4(const struct ctx *c,
|
||||
const struct sock_extended_err *ee,
|
||||
const struct flowside *toside,
|
||||
struct in_addr saddr,
|
||||
const void *in, size_t dlen)
|
||||
{
|
||||
struct in_addr oaddr = toside->oaddr.v4mapped.a4;
|
||||
struct in_addr eaddr = toside->eaddr.v4mapped.a4;
|
||||
in_port_t eport = toside->eport;
|
||||
in_port_t oport = toside->oport;
|
||||
struct {
|
||||
struct icmphdr icmp4h;
|
||||
struct iphdr ip4h;
|
||||
struct udphdr uh;
|
||||
char data[ICMP4_MAX_DLEN];
|
||||
} __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
|
||||
size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
|
||||
size_t l4len = dlen + sizeof(struct udphdr);
|
||||
|
||||
ASSERT(dlen <= ICMP4_MAX_DLEN);
|
||||
memset(&msg, 0, sizeof(msg));
|
||||
msg.icmp4h.type = ee->ee_type;
|
||||
msg.icmp4h.code = ee->ee_code;
|
||||
if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED)
|
||||
msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info);
|
||||
|
||||
/* Reconstruct the original headers as returned in the ICMP message */
|
||||
tap_push_ip4h(&msg.ip4h, eaddr, oaddr, l4len, IPPROTO_UDP);
|
||||
tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
|
||||
memcpy(&msg.data, in, dlen);
|
||||
|
||||
tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
|
||||
* @c: Execution context
|
||||
* @ee: Extended error descriptor
|
||||
* @toside: Destination side of flow
|
||||
* @saddr: Address of ICMP generating node
|
||||
* @in: First bytes (max 1232) of original UDP message body
|
||||
* @dlen: Length of the read part of original UDP message body
|
||||
* @flow: IPv6 flow identifier
|
||||
*/
|
||||
static void udp_send_tap_icmp6(const struct ctx *c,
|
||||
const struct sock_extended_err *ee,
|
||||
const struct flowside *toside,
|
||||
const struct in6_addr *saddr,
|
||||
void *in, size_t dlen, uint32_t flow)
|
||||
{
|
||||
const struct in6_addr *oaddr = &toside->oaddr.a6;
|
||||
const struct in6_addr *eaddr = &toside->eaddr.a6;
|
||||
in_port_t eport = toside->eport;
|
||||
in_port_t oport = toside->oport;
|
||||
struct {
|
||||
struct icmp6_hdr icmp6h;
|
||||
struct ipv6hdr ip6h;
|
||||
struct udphdr uh;
|
||||
char data[ICMP6_MAX_DLEN];
|
||||
} __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
|
||||
size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
|
||||
size_t l4len = dlen + sizeof(struct udphdr);
|
||||
|
||||
ASSERT(dlen <= ICMP6_MAX_DLEN);
|
||||
memset(&msg, 0, sizeof(msg));
|
||||
msg.icmp6h.icmp6_type = ee->ee_type;
|
||||
msg.icmp6h.icmp6_code = ee->ee_code;
|
||||
if (ee->ee_type == ICMP6_PACKET_TOO_BIG)
|
||||
msg.icmp6h.icmp6_dataun.icmp6_un_data32[0] = htonl(ee->ee_info);
|
||||
|
||||
/* Reconstruct the original headers as returned in the ICMP message */
|
||||
tap_push_ip6h(&msg.ip6h, eaddr, oaddr, l4len, IPPROTO_UDP, flow);
|
||||
tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
|
||||
memcpy(&msg.data, in, dlen);
|
||||
|
||||
tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_pktinfo() - Retrieve packet destination address from cmsg
|
||||
* @msg: msghdr into which message has been received
|
||||
* @dst: (Local) destination address of message in @mh (output)
|
||||
*
|
||||
* Return: 0 on success, -1 if the information was missing (@dst is set to
|
||||
* inany_any6).
|
||||
*/
|
||||
static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
|
||||
{
|
||||
struct cmsghdr *hdr;
|
||||
|
||||
for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) {
|
||||
if (hdr->cmsg_level == IPPROTO_IP &&
|
||||
hdr->cmsg_type == IP_PKTINFO) {
|
||||
const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr);
|
||||
|
||||
*dst = inany_from_v4(i4->ipi_addr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (hdr->cmsg_level == IPPROTO_IPV6 &&
|
||||
hdr->cmsg_type == IPV6_PKTINFO) {
|
||||
const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr);
|
||||
|
||||
dst->a6 = i6->ipi6_addr;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
debug("Missing PKTINFO cmsg on datagram");
|
||||
*dst = inany_any6;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_sock_recverr() - Receive and clear an error from a socket
|
||||
* @s: Socket to receive from
|
||||
* @c: Execution context
|
||||
* @s: Socket to receive errors from
|
||||
* @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown
|
||||
* @pif: Interface on which the error occurred
|
||||
* (only used if @sidx == FLOW_SIDX_NONE)
|
||||
* @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
|
||||
*
|
||||
* Return: 1 if error received and processed, 0 if no more errors in queue, < 0
|
||||
* if there was an error reading the queue
|
||||
*
|
||||
* #syscalls recvmsg
|
||||
*/
|
||||
static int udp_sock_recverr(int s)
|
||||
static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
|
||||
uint8_t pif, in_port_t port)
|
||||
{
|
||||
char buf[PKTINFO_SPACE + RECVERR_SPACE];
|
||||
const struct sock_extended_err *ee;
|
||||
const struct cmsghdr *hdr;
|
||||
char buf[CMSG_SPACE(sizeof(*ee))];
|
||||
char data[ICMP6_MAX_DLEN];
|
||||
struct cmsghdr *hdr;
|
||||
struct iovec iov = {
|
||||
.iov_base = data,
|
||||
.iov_len = sizeof(data)
|
||||
};
|
||||
union sockaddr_inany src;
|
||||
struct msghdr mh = {
|
||||
.msg_name = NULL,
|
||||
.msg_namelen = 0,
|
||||
.msg_iov = NULL,
|
||||
.msg_iovlen = 0,
|
||||
.msg_name = &src,
|
||||
.msg_namelen = sizeof(src),
|
||||
.msg_iov = &iov,
|
||||
.msg_iovlen = 1,
|
||||
.msg_control = buf,
|
||||
.msg_controllen = sizeof(buf),
|
||||
};
|
||||
const struct flowside *fromside, *toside;
|
||||
union inany_addr offender, otap;
|
||||
char astr[INANY_ADDRSTRLEN];
|
||||
char sastr[SOCKADDR_STRLEN];
|
||||
const struct in_addr *o4;
|
||||
in_port_t offender_port;
|
||||
struct udp_flow *uflow;
|
||||
uint8_t topif;
|
||||
size_t dlen;
|
||||
ssize_t rc;
|
||||
|
||||
rc = recvmsg(s, &mh, MSG_ERRQUEUE);
|
||||
|
@ -440,33 +564,102 @@ static int udp_sock_recverr(int s)
|
|||
return -1;
|
||||
}
|
||||
|
||||
hdr = CMSG_FIRSTHDR(&mh);
|
||||
if (!((hdr->cmsg_level == IPPROTO_IP &&
|
||||
hdr->cmsg_type == IP_RECVERR) ||
|
||||
(hdr->cmsg_level == IPPROTO_IPV6 &&
|
||||
hdr->cmsg_type == IPV6_RECVERR))) {
|
||||
err("Unexpected cmsg reading error queue");
|
||||
for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
|
||||
if ((hdr->cmsg_level == IPPROTO_IP &&
|
||||
hdr->cmsg_type == IP_RECVERR) ||
|
||||
(hdr->cmsg_level == IPPROTO_IPV6 &&
|
||||
hdr->cmsg_type == IPV6_RECVERR))
|
||||
break;
|
||||
}
|
||||
|
||||
if (!hdr) {
|
||||
err("Missing RECVERR cmsg in error queue");
|
||||
return -1;
|
||||
}
|
||||
|
||||
ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
|
||||
|
||||
/* TODO: When possible propagate and otherwise handle errors */
|
||||
debug("%s error on UDP socket %i: %s",
|
||||
str_ee_origin(ee), s, strerror_(ee->ee_errno));
|
||||
|
||||
if (!flow_sidx_valid(sidx)) {
|
||||
/* No hint from the socket, determine flow from addresses */
|
||||
union inany_addr dst;
|
||||
|
||||
if (udp_pktinfo(&mh, &dst) < 0) {
|
||||
debug("Missing PKTINFO on UDP error");
|
||||
return 1;
|
||||
}
|
||||
|
||||
sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port);
|
||||
if (!flow_sidx_valid(sidx)) {
|
||||
debug("Ignoring UDP error without flow");
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
pif = pif_at_sidx(sidx);
|
||||
}
|
||||
|
||||
uflow = udp_at_sidx(sidx);
|
||||
ASSERT(uflow);
|
||||
fromside = &uflow->f.side[sidx.sidei];
|
||||
toside = &uflow->f.side[!sidx.sidei];
|
||||
topif = uflow->f.pif[!sidx.sidei];
|
||||
dlen = rc;
|
||||
|
||||
if (inany_from_sockaddr(&offender, &offender_port,
|
||||
SO_EE_OFFENDER(ee)) < 0)
|
||||
goto fail;
|
||||
|
||||
if (pif != PIF_HOST || topif != PIF_TAP)
|
||||
/* XXX Can we support any other cases? */
|
||||
goto fail;
|
||||
|
||||
/* If the offender *is* the endpoint, make sure our translation is
|
||||
* consistent with the flow's translation. This matters if the flow
|
||||
* endpoint has a port specific translation (like --dns-match).
|
||||
*/
|
||||
if (inany_equals(&offender, &fromside->eaddr))
|
||||
otap = toside->oaddr;
|
||||
else if (!nat_inbound(c, &offender, &otap))
|
||||
goto fail;
|
||||
|
||||
if (hdr->cmsg_level == IPPROTO_IP &&
|
||||
(o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) {
|
||||
dlen = MIN(dlen, ICMP4_MAX_DLEN);
|
||||
udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
|
||||
udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen,
|
||||
FLOW_IDX(uflow));
|
||||
return 1;
|
||||
}
|
||||
|
||||
fail:
|
||||
flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s",
|
||||
str_ee_origin(ee),
|
||||
pif_name(pif),
|
||||
sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)),
|
||||
pif_name(topif),
|
||||
inany_ntop(&toside->eaddr, astr, sizeof(astr)));
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_sock_errs() - Process errors on a socket
|
||||
* @c: Execution context
|
||||
* @s: Socket to receive from
|
||||
* @events: epoll events bitmap
|
||||
* @s: Socket to receive errors from
|
||||
* @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown
|
||||
* @pif: Interface on which the error occurred
|
||||
* (only used if @sidx == FLOW_SIDX_NONE)
|
||||
* @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
|
||||
*
|
||||
* Return: Number of errors handled, or < 0 if we have an unrecoverable error
|
||||
*/
|
||||
int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
|
||||
static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx,
|
||||
uint8_t pif, in_port_t port)
|
||||
{
|
||||
unsigned n_err = 0;
|
||||
socklen_t errlen;
|
||||
|
@ -474,11 +667,8 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
|
|||
|
||||
ASSERT(!c->no_udp);
|
||||
|
||||
if (!(events & EPOLLERR))
|
||||
return 0; /* Nothing to do */
|
||||
|
||||
/* Empty the error queue */
|
||||
while ((rc = udp_sock_recverr(s)) > 0)
|
||||
while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0)
|
||||
n_err += rc;
|
||||
|
||||
if (rc < 0)
|
||||
|
@ -505,37 +695,62 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
|
|||
return n_err;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_peek_addr() - Get source address for next packet
|
||||
* @s: Socket to get information from
|
||||
* @src: Socket address (output)
|
||||
* @dst: (Local) destination address (output)
|
||||
*
|
||||
* Return: 0 if no more packets, 1 on success, -ve error code on error
|
||||
*/
|
||||
static int udp_peek_addr(int s, union sockaddr_inany *src,
|
||||
union inany_addr *dst)
|
||||
{
|
||||
char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN];
|
||||
char cmsg[PKTINFO_SPACE];
|
||||
struct msghdr msg = {
|
||||
.msg_name = src,
|
||||
.msg_namelen = sizeof(*src),
|
||||
.msg_control = cmsg,
|
||||
.msg_controllen = sizeof(cmsg),
|
||||
};
|
||||
int rc;
|
||||
|
||||
rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
|
||||
if (rc < 0) {
|
||||
if (errno == EAGAIN || errno == EWOULDBLOCK)
|
||||
return 0;
|
||||
return -errno;
|
||||
}
|
||||
|
||||
udp_pktinfo(&msg, dst);
|
||||
|
||||
trace("Peeked UDP datagram: %s -> %s",
|
||||
sockaddr_ntop(src, sastr, sizeof(sastr)),
|
||||
inany_ntop(dst, dstr, sizeof(dstr)));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_sock_recv() - Receive datagrams from a socket
|
||||
* @c: Execution context
|
||||
* @s: Socket to receive from
|
||||
* @events: epoll events bitmap
|
||||
* @mmh mmsghdr array to receive into
|
||||
* @n: Maximum number of datagrams to receive
|
||||
*
|
||||
* Return: Number of datagrams received
|
||||
*
|
||||
* #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
|
||||
*/
|
||||
static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
|
||||
struct mmsghdr *mmh)
|
||||
static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
|
||||
{
|
||||
/* For not entirely clear reasons (data locality?) pasta gets better
|
||||
* throughput if we receive tap datagrams one at a atime. For small
|
||||
* splice datagrams throughput is slightly better if we do batch, but
|
||||
* it's slightly worse for large splice datagrams. Since we don't know
|
||||
* before we receive whether we'll use tap or splice, always go one at a
|
||||
* time for pasta mode.
|
||||
*/
|
||||
int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
|
||||
|
||||
ASSERT(!c->no_udp);
|
||||
|
||||
if (!(events & EPOLLIN))
|
||||
return 0;
|
||||
|
||||
n = recvmmsg(s, mmh, n, 0, NULL);
|
||||
if (n < 0) {
|
||||
err_perror("Error receiving datagrams");
|
||||
trace("Error receiving datagrams: %s", strerror_(errno));
|
||||
/* Bail out and let the EPOLLERR handler deal with it */
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -543,78 +758,121 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
|
|||
}
|
||||
|
||||
/**
|
||||
* udp_buf_listen_sock_handler() - Handle new data from socket
|
||||
* udp_sock_to_sock() - Forward datagrams from socket to socket
|
||||
* @c: Execution context
|
||||
* @ref: epoll reference
|
||||
* @events: epoll events bitmap
|
||||
* @now: Current timestamp
|
||||
* @from_s: Socket to receive datagrams from
|
||||
* @n: Maximum number of datagrams to forward
|
||||
* @tosidx: Flow & side to forward datagrams to
|
||||
*
|
||||
* #syscalls recvmmsg
|
||||
* #syscalls sendmmsg
|
||||
*/
|
||||
static void udp_buf_listen_sock_handler(const struct ctx *c,
|
||||
union epoll_ref ref, uint32_t events,
|
||||
const struct timespec *now)
|
||||
static void udp_sock_to_sock(const struct ctx *c, int from_s, int n,
|
||||
flow_sidx_t tosidx)
|
||||
{
|
||||
const socklen_t sasize = sizeof(udp_meta[0].s_in);
|
||||
int n, i;
|
||||
const struct flowside *toside = flowside_at_sidx(tosidx);
|
||||
const struct udp_flow *uflow = udp_at_sidx(tosidx);
|
||||
uint8_t topif = pif_at_sidx(tosidx);
|
||||
int to_s = uflow->s[tosidx.sidei];
|
||||
socklen_t sl;
|
||||
int i;
|
||||
|
||||
if (udp_sock_errs(c, ref.fd, events) < 0) {
|
||||
err("UDP: Unrecoverable error on listening socket:"
|
||||
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
|
||||
/* FIXME: what now? close/re-open socket? */
|
||||
if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0)
|
||||
return;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
udp_mh_splice[i].msg_hdr.msg_iov->iov_len
|
||||
= udp_mh_recv[i].msg_len;
|
||||
}
|
||||
|
||||
if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
|
||||
pif_sockaddr(c, &udp_splice_to, &sl, topif,
|
||||
&toside->eaddr, toside->eport);
|
||||
|
||||
sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_buf_sock_to_tap() - Forward datagrams from socket to tap
|
||||
* @c: Execution context
|
||||
* @s: Socket to read data from
|
||||
* @n: Maximum number of datagrams to forward
|
||||
* @tosidx: Flow & side to forward data from @s to
|
||||
*/
|
||||
static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
|
||||
flow_sidx_t tosidx)
|
||||
{
|
||||
const struct flowside *toside = flowside_at_sidx(tosidx);
|
||||
int i;
|
||||
|
||||
if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
|
||||
return;
|
||||
|
||||
/* We divide datagrams into batches based on how we need to send them,
|
||||
* determined by udp_meta[i].tosidx. To avoid either two passes through
|
||||
* the array, or recalculating tosidx for a single entry, we have to
|
||||
* populate it one entry *ahead* of the loop counter.
|
||||
*/
|
||||
udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
|
||||
udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
|
||||
for (i = 0; i < n; ) {
|
||||
flow_sidx_t batchsidx = udp_meta[i].tosidx;
|
||||
uint8_t batchpif = pif_at_sidx(batchsidx);
|
||||
int batchstart = i;
|
||||
for (i = 0; i < n; i++)
|
||||
udp_tap_prepare(udp_mh_recv, i, toside, false);
|
||||
|
||||
do {
|
||||
if (pif_is_socket(batchpif)) {
|
||||
udp_splice_prepare(udp_mh_recv, i);
|
||||
} else if (batchpif == PIF_TAP) {
|
||||
udp_tap_prepare(udp_mh_recv, i,
|
||||
flowside_at_sidx(batchsidx),
|
||||
false);
|
||||
tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
|
||||
}
|
||||
|
||||
/**
* udp_sock_fwd() - Forward datagrams from a possibly unconnected socket
* @c: Execution context
* @s: Socket to forward from
* @frompif: Interface to which @s belongs
* @port: Our (local) port number of @s
* @now: Current timestamp
*/
void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
in_port_t port, const struct timespec *now)
{
union sockaddr_inany src;
union inany_addr dst;
int rc;
|
||||
|
||||
while ((rc = udp_peek_addr(s, &src, &dst)) != 0) {
|
||||
bool discard = false;
|
||||
flow_sidx_t tosidx;
|
||||
uint8_t topif;
|
||||
|
||||
if (rc < 0) {
|
||||
trace("Error peeking at socket address: %s",
|
||||
strerror_(-rc));
|
||||
/* Clear errors & carry on */
|
||||
if (udp_sock_errs(c, s, FLOW_SIDX_NONE,
|
||||
frompif, port) < 0) {
|
||||
err(
|
||||
"UDP: Unrecoverable error on listening socket: (%s port %hu)",
|
||||
pif_name(frompif), port);
|
||||
/* FIXME: what now? close/re-open socket? */
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (++i >= n)
|
||||
break;
|
||||
tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now);
|
||||
topif = pif_at_sidx(tosidx);
|
||||
|
||||
udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
|
||||
&udp_meta[i].s_in,
|
||||
now);
|
||||
udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
|
||||
} while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));
|
||||
|
||||
if (pif_is_socket(batchpif)) {
|
||||
udp_splice_send(c, batchstart, i - batchstart,
|
||||
batchsidx);
|
||||
} else if (batchpif == PIF_TAP) {
|
||||
tap_send_frames(c, &udp_l2_iov[batchstart][0],
|
||||
UDP_NUM_IOVS, i - batchstart);
|
||||
} else if (flow_sidx_valid(batchsidx)) {
|
||||
flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx);
|
||||
struct udp_flow *uflow = udp_at_sidx(batchsidx);
|
||||
if (pif_is_socket(topif)) {
|
||||
udp_sock_to_sock(c, s, 1, tosidx);
|
||||
} else if (topif == PIF_TAP) {
|
||||
if (c->mode == MODE_VU)
|
||||
udp_vu_sock_to_tap(c, s, 1, tosidx);
|
||||
else
|
||||
udp_buf_sock_to_tap(c, s, 1, tosidx);
|
||||
} else if (flow_sidx_valid(tosidx)) {
|
||||
struct udp_flow *uflow = udp_at_sidx(tosidx);
|
||||
|
||||
flow_err(uflow,
|
||||
"No support for forwarding UDP from %s to %s",
|
||||
pif_name(pif_at_sidx(fromsidx)),
|
||||
pif_name(batchpif));
|
||||
pif_name(frompif), pif_name(topif));
|
||||
discard = true;
|
||||
} else {
|
||||
debug("Discarding %d datagrams without flow",
|
||||
i - batchstart);
|
||||
debug("Discarding datagram without flow");
|
||||
discard = true;
|
||||
}
|
||||
|
||||
if (discard) {
|
||||
struct msghdr msg = { 0 };
|
||||
|
||||
if (recvmsg(s, &msg, MSG_DONTWAIT) < 0)
|
||||
debug_perror("Failed to discard datagram");
|
||||
}
|
||||
}
|
||||
}
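udp_sock_fwd() relies on udp_peek_addr() to learn, without dequeuing anything, where the next queued datagram comes from and which local address it was sent to. A hedged sketch of how such a peek can be implemented on Linux, using MSG_PEEK with an empty iovec plus the IP_PKTINFO control message (the option is enabled at socket creation, see the util.c hunk further down; IPv6 uses IPV6_RECVPKTINFO and struct in6_pktinfo analogously). This is a simplified stand-in, not the passt function itself:

/* Sketch only: peek at the next datagram's source address and (IPv4) local
 * destination address without dequeuing it. Assumes IP_PKTINFO was enabled
 * on the socket; error handling is reduced to the bare minimum.
 */
#include <errno.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int peek_addrs(int s, struct sockaddr_storage *src, struct in_addr *dst)
{
	char cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
	struct msghdr mh = {
		.msg_name = src,
		.msg_namelen = sizeof(*src),
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	if (recvmsg(s, &mh, MSG_PEEK | MSG_DONTWAIT) < 0)
		return errno == EAGAIN ? 0 : -errno;

	for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP &&
		    cmsg->cmsg_type == IP_PKTINFO) {
			const struct in_pktinfo *pi = (void *)CMSG_DATA(cmsg);

			*dst = pi->ipi_addr;	/* datagram's destination */
		}
	}
	return 1;	/* a datagram is queued, addresses filled in */
}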
|
||||
|
@ -630,87 +888,69 @@ void udp_listen_sock_handler(const struct ctx *c,
|
|||
union epoll_ref ref, uint32_t events,
|
||||
const struct timespec *now)
|
||||
{
|
||||
if (c->mode == MODE_VU) {
|
||||
udp_vu_listen_sock_handler(c, ref, events, now);
|
||||
return;
|
||||
}
|
||||
|
||||
udp_buf_listen_sock_handler(c, ref, events, now);
|
||||
if (events & (EPOLLERR | EPOLLIN))
|
||||
udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now);
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_buf_reply_sock_handler() - Handle new data from flow specific socket
|
||||
* udp_sock_handler() - Handle new data from flow specific socket
|
||||
* @c: Execution context
|
||||
* @ref: epoll reference
|
||||
* @events: epoll events bitmap
|
||||
* @now: Current timestamp
|
||||
*
|
||||
* #syscalls recvmmsg
|
||||
*/
|
||||
static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events,
|
||||
const struct timespec *now)
|
||||
void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events, const struct timespec *now)
|
||||
{
|
||||
flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
|
||||
const struct flowside *toside = flowside_at_sidx(tosidx);
|
||||
struct udp_flow *uflow = udp_at_sidx(ref.flowside);
|
||||
uint8_t topif = pif_at_sidx(tosidx);
|
||||
int n, i, from_s;
|
||||
|
||||
ASSERT(!c->no_udp && uflow);
|
||||
|
||||
from_s = uflow->s[ref.flowside.sidei];
|
||||
|
||||
if (udp_sock_errs(c, from_s, events) < 0) {
|
||||
flow_err(uflow, "Unrecoverable error on reply socket");
|
||||
flow_err_details(uflow);
|
||||
udp_flow_close(c, uflow);
|
||||
return;
|
||||
if (events & EPOLLERR) {
|
||||
if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) {
|
||||
flow_err(uflow, "Unrecoverable error on flow socket");
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
|
||||
return;
|
||||
if (events & EPOLLIN) {
|
||||
/* For not entirely clear reasons (data locality?) pasta gets
|
||||
* better throughput if we receive tap datagrams one at a
|
||||
* time. For small splice datagrams throughput is slightly
|
||||
* better if we do batch, but it's slightly worse for large
|
||||
* splice datagrams. Since we don't know the size before we
|
||||
* receive, always go one at a time for pasta mode.
|
||||
*/
|
||||
size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
|
||||
flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
|
||||
uint8_t topif = pif_at_sidx(tosidx);
|
||||
int s = ref.fd;
|
||||
|
||||
flow_trace(uflow, "Received %d datagrams on reply socket", n);
|
||||
uflow->ts = now->tv_sec;
|
||||
flow_trace(uflow, "Received data on reply socket");
|
||||
uflow->ts = now->tv_sec;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
if (pif_is_socket(topif))
|
||||
udp_splice_prepare(udp_mh_recv, i);
|
||||
else if (topif == PIF_TAP)
|
||||
udp_tap_prepare(udp_mh_recv, i, toside, false);
|
||||
/* Restore sockaddr length clobbered by recvmsg() */
|
||||
udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
|
||||
if (pif_is_socket(topif)) {
|
||||
udp_sock_to_sock(c, ref.fd, n, tosidx);
|
||||
} else if (topif == PIF_TAP) {
|
||||
if (c->mode == MODE_VU) {
|
||||
udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES,
|
||||
tosidx);
|
||||
} else {
|
||||
udp_buf_sock_to_tap(c, s, n, tosidx);
|
||||
}
|
||||
} else {
|
||||
flow_err(uflow,
|
||||
"No support for forwarding UDP from %s to %s",
|
||||
pif_name(pif_at_sidx(ref.flowside)),
|
||||
pif_name(topif));
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
return;
|
||||
|
||||
if (pif_is_socket(topif)) {
|
||||
udp_splice_send(c, 0, n, tosidx);
|
||||
} else if (topif == PIF_TAP) {
|
||||
tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
|
||||
} else {
|
||||
uint8_t frompif = pif_at_sidx(ref.flowside);
|
||||
|
||||
flow_err(uflow, "No support for forwarding UDP from %s to %s",
|
||||
pif_name(frompif), pif_name(topif));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_reply_sock_handler() - Handle new data from flow specific socket
|
||||
* @c: Execution context
|
||||
* @ref: epoll reference
|
||||
* @events: epoll events bitmap
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events, const struct timespec *now)
|
||||
{
|
||||
if (c->mode == MODE_VU) {
|
||||
udp_vu_reply_sock_handler(c, ref, events, now);
|
||||
return;
|
||||
}
|
||||
|
||||
udp_buf_reply_sock_handler(c, ref, events, now);
|
||||
fail:
|
||||
flow_err_details(uflow);
|
||||
udp_flow_close(c, uflow);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -720,6 +960,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
|||
* @af: Address family, AF_INET or AF_INET6
|
||||
* @saddr: Source address
|
||||
* @daddr: Destination address
|
||||
* @ttl: TTL or hop limit for packets to be sent in this call
|
||||
* @p: Pool of UDP packets, with UDP headers
|
||||
* @idx: Index of first packet to process
|
||||
* @now: Current timestamp
|
||||
|
@ -730,7 +971,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
|||
*/
|
||||
int udp_tap_handler(const struct ctx *c, uint8_t pif,
|
||||
sa_family_t af, const void *saddr, const void *daddr,
|
||||
const struct pool *p, int idx, const struct timespec *now)
|
||||
uint8_t ttl, const struct pool *p, int idx,
|
||||
const struct timespec *now)
|
||||
{
|
||||
const struct flowside *toside;
|
||||
struct mmsghdr mm[UIO_MAXIOV];
|
||||
|
@ -778,7 +1020,7 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
|
|||
}
|
||||
toside = flowside_at_sidx(tosidx);
|
||||
|
||||
s = udp_at_sidx(tosidx)->s[tosidx.sidei];
|
||||
s = uflow->s[tosidx.sidei];
|
||||
ASSERT(s >= 0);
|
||||
|
||||
pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport);
|
||||
|
@ -809,6 +1051,24 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
|
|||
mm[i].msg_hdr.msg_controllen = 0;
|
||||
mm[i].msg_hdr.msg_flags = 0;
|
||||
|
||||
if (ttl != uflow->ttl[tosidx.sidei]) {
|
||||
uflow->ttl[tosidx.sidei] = ttl;
|
||||
if (af == AF_INET) {
|
||||
if (setsockopt(s, IPPROTO_IP, IP_TTL,
|
||||
&ttl, sizeof(ttl)) < 0)
|
||||
flow_perror(uflow,
|
||||
"setsockopt IP_TTL");
|
||||
} else {
|
||||
/* IPv6 hop_limit cannot be only 1 byte */
|
||||
int hop_limit = ttl;
|
||||
|
||||
if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS,
|
||||
&hop_limit, sizeof(hop_limit)) < 0)
|
||||
flow_perror(uflow,
|
||||
"setsockopt IPV6_UNICAST_HOPS");
|
||||
}
|
||||
}
|
||||
|
||||
count++;
|
||||
}
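The TTL handling above caches the last value per flow side so setsockopt() is only issued when the guest actually changes it. As a standalone sketch (illustrative helper, not the passt code), setting the outgoing TTL or hop limit looks like this; IPV6_UNICAST_HOPS takes an int:

/* Sketch: set the outgoing TTL / hop limit on an existing UDP socket.
 * Passing an int works for both families and is the portable form.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int set_ttl(int s, int af, int ttl)
{
	if (af == AF_INET)
		return setsockopt(s, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl));

	return setsockopt(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
			  &ttl, sizeof(ttl));
}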
|
||||
|
||||
|
|
7
udp.h
|
@@ -11,11 +11,12 @@
void udp_portmap_clear(void);
void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now);
uint8_t ttl, const struct pool *p, int idx,
const struct timespec *now);
int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
const char *ifname, in_port_t port);
int udp_init(struct ctx *c);
|
||||
|
|
242
udp_flow.c
|
@ -9,10 +9,12 @@
|
|||
#include <fcntl.h>
|
||||
#include <sys/uio.h>
|
||||
#include <unistd.h>
|
||||
#include <netinet/udp.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "passt.h"
|
||||
#include "flow_table.h"
|
||||
#include "udp_internal.h"
|
||||
|
||||
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
|
||||
|
||||
|
@ -41,121 +43,145 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
|
|||
*/
|
||||
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
|
||||
{
|
||||
unsigned sidei;
|
||||
|
||||
if (uflow->closed)
|
||||
return; /* Nothing to do */
|
||||
|
||||
if (uflow->s[INISIDE] >= 0) {
|
||||
/* The listening socket needs to stay in epoll */
|
||||
close(uflow->s[INISIDE]);
|
||||
uflow->s[INISIDE] = -1;
|
||||
flow_foreach_sidei(sidei) {
|
||||
flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
|
||||
if (uflow->s[sidei] >= 0) {
|
||||
epoll_del(c, uflow->s[sidei]);
|
||||
close(uflow->s[sidei]);
|
||||
uflow->s[sidei] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (uflow->s[TGTSIDE] >= 0) {
|
||||
/* But the flow specific one needs to be removed */
|
||||
epoll_del(c, uflow->s[TGTSIDE]);
|
||||
close(uflow->s[TGTSIDE]);
|
||||
uflow->s[TGTSIDE] = -1;
|
||||
}
|
||||
flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
|
||||
if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
|
||||
flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
|
||||
|
||||
uflow->closed = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_flow_sock() - Create, bind and connect a flow specific UDP socket
|
||||
* @c: Execution context
|
||||
* @uflow: UDP flow to open socket for
|
||||
* @sidei: Side of @uflow to open socket for
|
||||
*
|
||||
* Return: fd of new socket on success, -ve error code on failure
|
||||
*/
|
||||
static int udp_flow_sock(const struct ctx *c,
|
||||
struct udp_flow *uflow, unsigned sidei)
|
||||
{
|
||||
const struct flowside *side = &uflow->f.side[sidei];
|
||||
uint8_t pif = uflow->f.pif[sidei];
|
||||
union {
|
||||
flow_sidx_t sidx;
|
||||
uint32_t data;
|
||||
} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
|
||||
int s;
|
||||
|
||||
s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
|
||||
if (s < 0) {
|
||||
flow_dbg_perror(uflow, "Couldn't open flow specific socket");
|
||||
return s;
|
||||
}
|
||||
|
||||
if (flowside_connect(c, s, pif, side) < 0) {
|
||||
int rc = -errno;
|
||||
|
||||
epoll_del(c, s);
|
||||
close(s);
|
||||
|
||||
flow_dbg_perror(uflow, "Couldn't connect flow socket");
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* It's possible, if unlikely, that we could receive some packets in
|
||||
* between the bind() and connect() which may or may not be for this
|
||||
* flow. Being UDP we could just discard them, but it's not ideal.
|
||||
*
|
||||
* There's also a tricky case if a bunch of datagrams for a new flow
|
||||
* arrive in rapid succession, the first going to the original listening
|
||||
* socket and later ones going to this new socket. If we forwarded the
|
||||
* datagrams from the new socket immediately here they would go before
|
||||
* the datagram which established the flow. Again, not strictly wrong
|
||||
* for UDP, but not ideal.
|
||||
*
|
||||
* So, we flag that the new socket is in a transient state where it
|
||||
* might have datagrams for a different flow queued. Before the next
|
||||
* epoll cycle, udp_flow_defer() will flush out any such datagrams, and
|
||||
* thereafter everything on the new socket should be strictly for this
|
||||
* flow.
|
||||
*/
|
||||
if (sidei)
|
||||
uflow->flush1 = true;
|
||||
else
|
||||
uflow->flush0 = true;
|
||||
|
||||
return s;
|
||||
}
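flowside_sock_l4() and flowside_connect() wrap the classic per-flow UDP socket pattern: bind() to our side's address, then connect() to the peer so that only this peer's datagrams are delivered to the socket. A minimal sketch of that pattern (IPv4 only, hypothetical helper), which also shows where the race described in the comment above comes from:

/* Sketch of a per-flow UDP socket: bound to our local address and connected
 * to the peer. Plain sockaddr_in for brevity; passt handles both families.
 */
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int flow_socket(const struct sockaddr_in *local,
		       const struct sockaddr_in *peer)
{
	int s = socket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0);

	if (s < 0)
		return -1;

	if (bind(s, (const struct sockaddr *)local, sizeof(*local)) < 0 ||
	    connect(s, (const struct sockaddr *)peer, sizeof(*peer)) < 0) {
		close(s);
		return -1;
	}
	/* Datagrams that raced in between bind() and connect() may still be
	 * queued and can belong to other flows; they have to be flushed out
	 * before the socket is used (the flush0/flush1 flags above).
	 */
	return s;
}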
|
||||
|
||||
/**
|
||||
* udp_flow_new() - Common setup for a new UDP flow
|
||||
* @c: Execution context
|
||||
* @flow: Initiated flow
|
||||
* @s_ini: Initiating socket (or -1)
|
||||
* @now: Timestamp
|
||||
*
|
||||
* Return: UDP specific flow, if successful, NULL on failure
|
||||
*
|
||||
* #syscalls getsockname
|
||||
*/
|
||||
static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
|
||||
int s_ini, const struct timespec *now)
|
||||
const struct timespec *now)
|
||||
{
|
||||
struct udp_flow *uflow = NULL;
|
||||
const struct flowside *tgt;
|
||||
uint8_t tgtpif;
|
||||
unsigned sidei;
|
||||
|
||||
if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
|
||||
goto cancel;
|
||||
tgtpif = flow->f.pif[TGTSIDE];
|
||||
|
||||
uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
|
||||
uflow->ts = now->tv_sec;
|
||||
uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
|
||||
uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
|
||||
|
||||
if (s_ini >= 0) {
|
||||
/* When using auto port-scanning the listening port could go
|
||||
* away, so we need to duplicate the socket
|
||||
*/
|
||||
uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
|
||||
if (uflow->s[INISIDE] < 0) {
|
||||
flow_err(uflow,
|
||||
"Couldn't duplicate listening socket: %s",
|
||||
strerror_(errno));
|
||||
flow_foreach_sidei(sidei) {
|
||||
if (pif_is_socket(uflow->f.pif[sidei]))
|
||||
if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0)
|
||||
goto cancel;
|
||||
}
|
||||
|
||||
if (uflow->s[TGTSIDE] >= 0 && inany_is_unspecified(&tgt->oaddr)) {
|
||||
/* When we target a socket, we connect() it, but might not
|
||||
* always bind(), leaving the kernel to pick our address. In
|
||||
* that case connect() will implicitly bind() the socket, but we
|
||||
* need to determine its local address so that we can match
|
||||
* reply packets back to the correct flow. Update the flow with
|
||||
* the information from getsockname() */
|
||||
union sockaddr_inany sa;
|
||||
socklen_t sl = sizeof(sa);
|
||||
in_port_t port;
|
||||
|
||||
if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0 ||
|
||||
inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
|
||||
&port, &sa) < 0) {
|
||||
flow_perror(uflow, "Unable to determine local address");
|
||||
goto cancel;
|
||||
}
|
||||
if (port != tgt->oport) {
|
||||
flow_err(uflow, "Unexpected local port");
|
||||
goto cancel;
|
||||
}
|
||||
}
|
||||
|
||||
if (pif_is_socket(tgtpif)) {
|
||||
struct mmsghdr discard[UIO_MAXIOV] = { 0 };
|
||||
union {
|
||||
flow_sidx_t sidx;
|
||||
uint32_t data;
|
||||
} fref = {
|
||||
.sidx = FLOW_SIDX(flow, TGTSIDE),
|
||||
};
|
||||
int rc;
|
||||
|
||||
uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY,
|
||||
tgtpif, tgt, fref.data);
|
||||
if (uflow->s[TGTSIDE] < 0) {
|
||||
flow_dbg(uflow,
|
||||
"Couldn't open socket for spliced flow: %s",
|
||||
strerror_(errno));
|
||||
goto cancel;
|
||||
}
|
||||
|
||||
if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
|
||||
flow_dbg(uflow,
|
||||
"Couldn't connect flow socket: %s",
|
||||
strerror_(errno));
|
||||
goto cancel;
|
||||
}
|
||||
|
||||
/* It's possible, if unlikely, that we could receive some
|
||||
* unrelated packets in between the bind() and connect() of this
|
||||
* socket. For now we just discard these. We could consider
|
||||
* trying to redirect these to an appropriate handler, if we
|
||||
* need to.
|
||||
*/
|
||||
rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard),
|
||||
MSG_DONTWAIT, NULL);
|
||||
if (rc >= ARRAY_SIZE(discard)) {
|
||||
flow_dbg(uflow,
|
||||
"Too many (%d) spurious reply datagrams", rc);
|
||||
goto cancel;
|
||||
} else if (rc > 0) {
|
||||
flow_trace(uflow,
|
||||
"Discarded %d spurious reply datagrams", rc);
|
||||
} else if (errno != EAGAIN) {
|
||||
flow_err(uflow,
|
||||
"Unexpected error discarding datagrams: %s",
|
||||
strerror_(errno));
|
||||
}
|
||||
}
|
||||
|
||||
flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
|
||||
|
||||
/* If the target side is a socket, it will be a reply socket that knows
|
||||
* its own flowside. But if it's tap, then we need to look it up by
|
||||
* hash.
|
||||
/* Tap sides always need to be looked up by hash. Socket sides don't
|
||||
* always, but sometimes do (receiving packets on a socket not specific
|
||||
* to one flow). Unconditionally hash both sides so all our bases are
|
||||
* covered
|
||||
*/
|
||||
if (!pif_is_socket(tgtpif))
|
||||
flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
|
||||
flow_foreach_sidei(sidei)
|
||||
flow_hash_insert(c, FLOW_SIDX(uflow, sidei));
|
||||
|
||||
FLOW_ACTIVATE(uflow);
|
||||
|
||||
return FLOW_SIDX(uflow, TGTSIDE);
|
||||
|
@ -168,9 +194,11 @@ cancel:
|
|||
}
|
||||
|
||||
/**
|
||||
* udp_flow_from_sock() - Find or create UDP flow for "listening" socket
|
||||
* udp_flow_from_sock() - Find or create UDP flow for incoming datagram
|
||||
* @c: Execution context
|
||||
* @ref: epoll reference of the receiving socket
|
||||
* @pif: Interface the datagram is arriving from
|
||||
* @dst: Our (local) address to which the datagram is arriving
|
||||
* @port: Our (local) port number to which the datagram is arriving
|
||||
* @s_in: Source socket address, filled in by recvmmsg()
|
||||
* @now: Timestamp
|
||||
*
|
||||
|
@ -179,7 +207,8 @@ cancel:
|
|||
* Return: sidx for the destination side of the flow for this packet, or
|
||||
* FLOW_SIDX_NONE if we couldn't find or create a flow.
|
||||
*/
|
||||
flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
|
||||
flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
|
||||
const union inany_addr *dst, in_port_t port,
|
||||
const union sockaddr_inany *s_in,
|
||||
const struct timespec *now)
|
||||
{
|
||||
|
@ -188,9 +217,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
|
|||
union flow *flow;
|
||||
flow_sidx_t sidx;
|
||||
|
||||
ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN);
|
||||
|
||||
sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, s_in, ref.udp.port);
|
||||
sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port);
|
||||
if ((uflow = udp_at_sidx(sidx))) {
|
||||
uflow->ts = now->tv_sec;
|
||||
return flow_sidx_opposite(sidx);
|
||||
|
@ -200,12 +227,11 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
|
|||
char sastr[SOCKADDR_STRLEN];
|
||||
|
||||
debug("Couldn't allocate flow for UDP datagram from %s %s",
|
||||
pif_name(ref.udp.pif),
|
||||
sockaddr_ntop(s_in, sastr, sizeof(sastr)));
|
||||
pif_name(pif), sockaddr_ntop(s_in, sastr, sizeof(sastr)));
|
||||
return FLOW_SIDX_NONE;
|
||||
}
|
||||
|
||||
ini = flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port);
|
||||
ini = flow_initiate_sa(flow, pif, s_in, dst, port);
|
||||
|
||||
if (!inany_is_unicast(&ini->eaddr) ||
|
||||
ini->eport == 0 || ini->oport == 0) {
|
||||
|
@ -218,7 +244,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
|
|||
return FLOW_SIDX_NONE;
|
||||
}
|
||||
|
||||
return udp_flow_new(c, flow, ref.fd, now);
|
||||
return udp_flow_new(c, flow, now);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -274,17 +300,45 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
|
|||
return FLOW_SIDX_NONE;
|
||||
}
|
||||
|
||||
return udp_flow_new(c, flow, -1, now);
|
||||
return udp_flow_new(c, flow, now);
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_flush_flow() - Flush datagrams that might not be for this flow
|
||||
* @c: Execution context
|
||||
* @uflow: Flow to handle
|
||||
* @sidei: Side of the flow to flush
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
static void udp_flush_flow(const struct ctx *c,
|
||||
const struct udp_flow *uflow, unsigned sidei,
|
||||
const struct timespec *now)
|
||||
{
|
||||
/* We don't know exactly where the datagrams will come from, but we know
|
||||
* they'll have an interface and oport matching this flow */
|
||||
udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei],
|
||||
uflow->f.side[sidei].oport, now);
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
|
||||
* @c: Execution context
|
||||
* @uflow: Flow to handle
|
||||
* @now: Current timestamp
|
||||
*
|
||||
* Return: true if the connection is ready to free, false otherwise
|
||||
*/
|
||||
bool udp_flow_defer(const struct udp_flow *uflow)
|
||||
bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
|
||||
const struct timespec *now)
|
||||
{
|
||||
if (uflow->flush0) {
|
||||
udp_flush_flow(c, uflow, INISIDE, now);
|
||||
uflow->flush0 = false;
|
||||
}
|
||||
if (uflow->flush1) {
|
||||
udp_flush_flow(c, uflow, TGTSIDE, now);
|
||||
uflow->flush1 = false;
|
||||
}
|
||||
return uflow->closed;
|
||||
}
|
||||
|
||||
|
|
18
udp_flow.h
|
@@ -8,9 +8,12 @@
#define UDP_FLOW_H

/**
* struct udp - Descriptor for a flow of UDP packets
* struct udp_flow - Descriptor for a flow of UDP packets
* @f: Generic flow information
* @ttl: TTL or hop_limit for both sides
* @closed: Flow is already closed
* @flush0: @s[0] may have datagrams queued for other flows
* @flush1: @s[1] may have datagrams queued for other flows
* @ts: Activity timestamp
* @s: Socket fd (or -1) for each side of the flow
*/

@@ -18,13 +21,19 @@ struct udp_flow {
/* Must be first element */
struct flow_common f;

bool closed :1;
uint8_t ttl[SIDES];

bool closed :1,
flush0 :1,
flush1 :1;

time_t ts;
int s[SIDES];
};

struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
const union inany_addr *dst, in_port_t port,
const union sockaddr_inany *s_in,
const struct timespec *now);
flow_sidx_t udp_flow_from_tap(const struct ctx *c,

@@ -33,7 +42,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
in_port_t srcport, in_port_t dstport,
const struct timespec *now);
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
bool udp_flow_defer(const struct udp_flow *uflow);
bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now);
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now);
|
||||
|
||||
udp_internal.h
@@ -8,8 +8,6 @@

#include "tap.h" /* needed by udp_meta_t */

#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */

/**
* struct udp_payload_t - UDP header and data for inbound messages
* @uh: UDP header

@@ -30,5 +28,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen,
bool no_udp_csum);
int udp_sock_errs(const struct ctx *c, int s, uint32_t events);
void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
in_port_t port, const struct timespec *now);

#endif /* UDP_INTERNAL_H */
|
||||
|
|
138
udp_vu.c
|
@ -57,35 +57,16 @@ static size_t udp_vu_hdrlen(bool v6)
|
|||
return hdrlen;
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_vu_sock_info() - get socket information
|
||||
* @s: Socket to get information from
|
||||
* @s_in: Socket address (output)
|
||||
*
|
||||
* Return: 0 if socket address can be read, -1 otherwise
|
||||
*/
|
||||
static int udp_vu_sock_info(int s, union sockaddr_inany *s_in)
|
||||
{
|
||||
struct msghdr msg = {
|
||||
.msg_name = s_in,
|
||||
.msg_namelen = sizeof(union sockaddr_inany),
|
||||
};
|
||||
|
||||
return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
|
||||
* @c: Execution context
|
||||
* @s: Socket to receive from
|
||||
* @events: epoll events bitmap
|
||||
* @v6: Set for IPv6 connections
|
||||
* @dlen: Size of received data (output)
|
||||
*
|
||||
* Return: Number of iov entries used to store the datagram
|
||||
*/
|
||||
static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
|
||||
bool v6, ssize_t *dlen)
|
||||
static int udp_vu_sock_recv(const struct ctx *c, int s, bool v6, ssize_t *dlen)
|
||||
{
|
||||
struct vu_dev *vdev = c->vdev;
|
||||
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
|
@ -95,9 +76,6 @@ static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
|
|||
|
||||
ASSERT(!c->no_udp);
|
||||
|
||||
if (!(events & EPOLLIN))
|
||||
return 0;
|
||||
|
||||
/* compute L2 header length */
|
||||
hdrlen = udp_vu_hdrlen(v6);
|
||||
|
||||
|
@ -214,125 +192,27 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used)
|
|||
}
|
||||
|
||||
/**
|
||||
* udp_vu_listen_sock_handler() - Handle new data from socket
|
||||
* udp_vu_sock_to_tap() - Forward datagrams from socket to tap
|
||||
* @c: Execution context
|
||||
* @ref: epoll reference
|
||||
* @events: epoll events bitmap
|
||||
* @now: Current timestamp
|
||||
* @s: Socket to read data from
|
||||
* @n: Maximum number of datagrams to forward
|
||||
* @tosidx: Flow & side to forward data from @s to
|
||||
*/
|
||||
void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events, const struct timespec *now)
|
||||
void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx)
|
||||
{
|
||||
struct vu_dev *vdev = c->vdev;
|
||||
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
int i;
|
||||
|
||||
if (udp_sock_errs(c, ref.fd, events) < 0) {
|
||||
err("UDP: Unrecoverable error on listening socket:"
|
||||
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < UDP_MAX_FRAMES; i++) {
|
||||
const struct flowside *toside;
|
||||
union sockaddr_inany s_in;
|
||||
flow_sidx_t sidx;
|
||||
uint8_t pif;
|
||||
ssize_t dlen;
|
||||
int iov_used;
|
||||
bool v6;
|
||||
|
||||
if (udp_vu_sock_info(ref.fd, &s_in) < 0)
|
||||
break;
|
||||
|
||||
sidx = udp_flow_from_sock(c, ref, &s_in, now);
|
||||
pif = pif_at_sidx(sidx);
|
||||
|
||||
if (pif != PIF_TAP) {
|
||||
if (flow_sidx_valid(sidx)) {
|
||||
flow_sidx_t fromsidx = flow_sidx_opposite(sidx);
|
||||
struct udp_flow *uflow = udp_at_sidx(sidx);
|
||||
|
||||
flow_err(uflow,
|
||||
"No support for forwarding UDP from %s to %s",
|
||||
pif_name(pif_at_sidx(fromsidx)),
|
||||
pif_name(pif));
|
||||
} else {
|
||||
debug("Discarding 1 datagram without flow");
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
toside = flowside_at_sidx(sidx);
|
||||
|
||||
v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
|
||||
|
||||
iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen);
|
||||
if (iov_used <= 0)
|
||||
break;
|
||||
|
||||
udp_vu_prepare(c, toside, dlen);
|
||||
if (*c->pcap) {
|
||||
udp_vu_csum(toside, iov_used);
|
||||
pcap_iov(iov_vu, iov_used,
|
||||
sizeof(struct virtio_net_hdr_mrg_rxbuf));
|
||||
}
|
||||
vu_flush(vdev, vq, elem, iov_used);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* udp_vu_reply_sock_handler() - Handle new data from flow specific socket
|
||||
* @c: Execution context
|
||||
* @ref: epoll reference
|
||||
* @events: epoll events bitmap
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
uint32_t events, const struct timespec *now)
|
||||
{
|
||||
flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
|
||||
const struct flowside *toside = flowside_at_sidx(tosidx);
|
||||
struct udp_flow *uflow = udp_at_sidx(ref.flowside);
|
||||
int from_s = uflow->s[ref.flowside.sidei];
|
||||
bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
|
||||
struct vu_dev *vdev = c->vdev;
|
||||
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
|
||||
int i;
|
||||
|
||||
ASSERT(!c->no_udp);
|
||||
|
||||
if (udp_sock_errs(c, from_s, events) < 0) {
|
||||
flow_err(uflow, "Unrecoverable error on reply socket");
|
||||
flow_err_details(uflow);
|
||||
udp_flow_close(c, uflow);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < UDP_MAX_FRAMES; i++) {
|
||||
uint8_t topif = pif_at_sidx(tosidx);
|
||||
for (i = 0; i < n; i++) {
|
||||
ssize_t dlen;
|
||||
int iov_used;
|
||||
bool v6;
|
||||
|
||||
ASSERT(uflow);
|
||||
|
||||
if (topif != PIF_TAP) {
|
||||
uint8_t frompif = pif_at_sidx(ref.flowside);
|
||||
|
||||
flow_err(uflow,
|
||||
"No support for forwarding UDP from %s to %s",
|
||||
pif_name(frompif), pif_name(topif));
|
||||
continue;
|
||||
}
|
||||
|
||||
v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
|
||||
|
||||
iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen);
|
||||
iov_used = udp_vu_sock_recv(c, s, v6, &dlen);
|
||||
if (iov_used <= 0)
|
||||
break;
|
||||
flow_trace(uflow, "Received 1 datagram on reply socket");
|
||||
uflow->ts = now->tv_sec;
|
||||
|
||||
udp_vu_prepare(c, toside, dlen);
|
||||
if (*c->pcap) {
|
||||
|
|
8
udp_vu.h
|
@@ -6,8 +6,8 @@
#ifndef UDP_VU_H
#define UDP_VU_H

void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
const struct timespec *now);
void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx);

#endif /* UDP_VU_H */
|
||||
|
|
33
util.c
|
@ -71,7 +71,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
|
|||
case EPOLL_TYPE_UDP_LISTEN:
|
||||
freebind = c->freebind;
|
||||
/* fallthrough */
|
||||
case EPOLL_TYPE_UDP_REPLY:
|
||||
case EPOLL_TYPE_UDP:
|
||||
proto = IPPROTO_UDP;
|
||||
socktype = SOCK_DGRAM | SOCK_NONBLOCK;
|
||||
break;
|
||||
|
@ -109,11 +109,15 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
|
|||
debug("Failed to set SO_REUSEADDR on socket %i", fd);
|
||||
|
||||
if (proto == IPPROTO_UDP) {
|
||||
int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
|
||||
int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
|
||||
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
|
||||
int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
|
||||
|
||||
if (setsockopt(fd, level, opt, &y, sizeof(y)))
|
||||
if (setsockopt(fd, level, recverr, &y, sizeof(y)))
|
||||
die_perror("Failed to set RECVERR on socket %i", fd);
|
||||
|
||||
if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
|
||||
die_perror("Failed to set PKTINFO on socket %i", fd);
|
||||
}
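With RECVERR enabled, ICMP errors for these sockets are queued on the per-socket error queue instead of being dropped, and have to be drained explicitly; this is roughly the job of the udp_sock_errs() path. A hedged sketch of reading one such error with standard Linux interfaces:

/* Sketch: drain one extended error from a UDP socket's error queue.
 * Assumes IP_RECVERR / IPV6_RECVERR was enabled; simplified error handling.
 */
#include <errno.h>
#include <linux/errqueue.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int drain_one_err(int s)
{
	char data[1024], cbuf[1024];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr mh = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	if (recvmsg(s, &mh, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
		return errno == EAGAIN ? 0 : -errno;

	for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
		if ((cmsg->cmsg_level == IPPROTO_IP &&
		     cmsg->cmsg_type == IP_RECVERR) ||
		    (cmsg->cmsg_level == IPPROTO_IPV6 &&
		     cmsg->cmsg_type == IPV6_RECVERR)) {
			const struct sock_extended_err *ee;

			ee = (const struct sock_extended_err *)CMSG_DATA(cmsg);
			return ee->ee_errno; /* e.g. ECONNREFUSED, EHOSTUNREACH */
		}
	}
	return 0;
}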
|
||||
|
||||
if (ifname && *ifname) {
|
||||
|
@ -871,7 +875,9 @@ void close_open_files(int argc, char **argv)
|
|||
errno = 0;
|
||||
fd = strtol(optarg, NULL, 0);
|
||||
|
||||
if (errno || fd <= STDERR_FILENO || fd > INT_MAX)
|
||||
if (errno ||
|
||||
(fd != STDIN_FILENO && fd <= STDERR_FILENO) ||
|
||||
fd > INT_MAX)
|
||||
die("Invalid --fd: %s", optarg);
|
||||
}
|
||||
} while (name != -1);
|
||||
|
@ -1017,3 +1023,22 @@ void encode_domain_name(char *buf, const char *domain_name)
|
|||
}
|
||||
p[i] = 0L;
|
||||
}
|
||||
|
||||
/**
|
||||
* abort_with_msg() - Print error message and abort
|
||||
* @fmt: Format string
|
||||
* @...: Format parameters
|
||||
*/
|
||||
void abort_with_msg(const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
va_start(ap, fmt);
|
||||
vlogmsg(true, false, LOG_CRIT, fmt, ap);
|
||||
va_end(ap);
|
||||
|
||||
/* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp,
|
||||
* but that will still get the job done.
|
||||
*/
|
||||
abort();
|
||||
}
|
||||
|
|
44
util.h
|
@ -31,18 +31,9 @@
|
|||
#ifndef SECCOMP_RET_KILL_PROCESS
|
||||
#define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL
|
||||
#endif
|
||||
#ifndef ETH_MAX_MTU
|
||||
#define ETH_MAX_MTU USHRT_MAX
|
||||
#endif
|
||||
#ifndef ETH_MIN_MTU
|
||||
#define ETH_MIN_MTU 68
|
||||
#endif
|
||||
#ifndef IP_MAX_MTU
|
||||
#define IP_MAX_MTU USHRT_MAX
|
||||
#endif
|
||||
#ifndef IPV6_MIN_MTU
|
||||
#define IPV6_MIN_MTU 1280
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||
|
@ -70,27 +61,22 @@
|
|||
#define STRINGIFY(x) #x
|
||||
#define STR(x) STRINGIFY(x)
|
||||
|
||||
#ifdef CPPCHECK_6936
|
||||
void abort_with_msg(const char *fmt, ...)
|
||||
__attribute__((format(printf, 1, 2), noreturn));
|
||||
|
||||
/* Some cppcheck versions get confused by aborts inside a loop, causing
|
||||
* it to give false positive uninitialised variable warnings later in
|
||||
* the function, because it doesn't realise the non-initialising path
|
||||
* already exited. See https://trac.cppcheck.net/ticket/13227
|
||||
*
|
||||
* Therefore, avoid using the usual do while wrapper we use to force the macro
|
||||
* to act like a single statement requiring a ';'.
|
||||
*/
|
||||
#define ASSERT(expr) \
|
||||
((expr) ? (void)0 : abort())
|
||||
#else
|
||||
#define ASSERT_WITH_MSG(expr, ...) \
|
||||
((expr) ? (void)0 : abort_with_msg(__VA_ARGS__))
|
||||
#define ASSERT(expr) \
|
||||
do { \
|
||||
if (!(expr)) { \
|
||||
err("ASSERTION FAILED in %s (%s:%d): %s", \
|
||||
__func__, __FILE__, __LINE__, STRINGIFY(expr)); \
|
||||
/* This may actually SIGSYS, due to seccomp, \
|
||||
* but that will still get the job done \
|
||||
*/ \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
ASSERT_WITH_MSG((expr), "ASSERTION FAILED in %s (%s:%d): %s", \
|
||||
__func__, __FILE__, __LINE__, STRINGIFY(expr))
|
||||
|
||||
#ifdef P_tmpdir
|
||||
#define TMPDIR P_tmpdir
|
||||
|
@ -385,6 +371,16 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr,
|
|||
#define accept4(s, addr, addrlen, flags) \
|
||||
wrap_accept4((s), (addr), (addrlen), (flags))
|
||||
|
||||
static inline int wrap_getsockname(int sockfd, struct sockaddr *addr,
|
||||
/* cppcheck-suppress constParameterPointer */
|
||||
socklen_t *addrlen)
|
||||
{
|
||||
sa_init(addr, addrlen);
|
||||
return getsockname(sockfd, addr, addrlen);
|
||||
}
|
||||
#define getsockname(s, addr, addrlen) \
|
||||
wrap_getsockname((s), (addr), (addrlen))
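The wrapper added above matters for UDP flows where passt connect()s a socket without an explicit bind(): the kernel picks the local address, and the flow table has to learn it to match replies (see the getsockname() call and comment in udp_flow_new()). A minimal standalone sketch of that pattern, with a placeholder destination address:

/* Sketch: connect an unbound UDP socket and ask the kernel which local
 * address/port it implicitly bound to. Addresses here are placeholders.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET,
				   .sin_port = htons(53) };
	struct sockaddr_in local;
	socklen_t sl = sizeof(local);
	char buf[INET_ADDRSTRLEN];
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	if (s < 0 || connect(s, (struct sockaddr *)&dst, sizeof(dst)) < 0 ||
	    getsockname(s, (struct sockaddr *)&local, &sl) < 0) {
		perror("udp connect/getsockname");
		return 1;
	}

	printf("local %s:%hu\n",
	       inet_ntop(AF_INET, &local.sin_addr, buf, sizeof(buf)),
	       ntohs(local.sin_port));
	close(s);
	return 0;
}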
|
||||
|
||||
#define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */
|
||||
void encode_domain_name(char *buf, const char *domain_name);
|
||||
|
||||
|
|
225
vhost_user.c
|
@ -302,13 +302,13 @@ static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg)
|
|||
* @conn_fd: vhost-user command socket
|
||||
* @vmsg: vhost-user message
|
||||
*/
|
||||
static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
|
||||
static void vu_send_reply(int conn_fd, struct vhost_user_msg *vmsg)
|
||||
{
|
||||
msg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
|
||||
msg->hdr.flags |= VHOST_USER_VERSION;
|
||||
msg->hdr.flags |= VHOST_USER_REPLY_MASK;
|
||||
vmsg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
|
||||
vmsg->hdr.flags |= VHOST_USER_VERSION;
|
||||
vmsg->hdr.flags |= VHOST_USER_REPLY_MASK;
|
||||
|
||||
vu_message_write(conn_fd, msg);
|
||||
vu_message_write(conn_fd, vmsg);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -319,7 +319,7 @@ static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
|
|||
* Return: True as a reply is requested
|
||||
*/
|
||||
static bool vu_get_features_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
uint64_t features =
|
||||
1ULL << VIRTIO_F_VERSION_1 |
|
||||
|
@ -329,9 +329,9 @@ static bool vu_get_features_exec(struct vu_dev *vdev,
|
|||
|
||||
(void)vdev;
|
||||
|
||||
vmsg_set_reply_u64(msg, features);
|
||||
vmsg_set_reply_u64(vmsg, features);
|
||||
|
||||
debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64);
|
||||
debug("Sending back to guest u64: 0x%016"PRIx64, vmsg->payload.u64);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -357,11 +357,11 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable)
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_features_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
debug("u64: 0x%016"PRIx64, msg->payload.u64);
|
||||
debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
|
||||
|
||||
vdev->features = msg->payload.u64;
|
||||
vdev->features = vmsg->payload.u64;
|
||||
/* We only support devices conforming to VIRTIO 1.0 or
|
||||
* later
|
||||
*/
|
||||
|
@ -382,10 +382,10 @@ static bool vu_set_features_exec(struct vu_dev *vdev,
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_owner_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
(void)vdev;
|
||||
(void)msg;
|
||||
(void)vmsg;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -423,9 +423,9 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
|
|||
* #syscalls:vu mmap|mmap2 munmap
|
||||
*/
|
||||
static bool vu_set_mem_table_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
struct vhost_user_memory m = msg->payload.memory, *memory = &m;
|
||||
struct vhost_user_memory m = vmsg->payload.memory, *memory = &m;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < vdev->nregions; i++) {
|
||||
|
@ -465,7 +465,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
|
|||
*/
|
||||
mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
|
||||
PROT_READ | PROT_WRITE, MAP_SHARED |
|
||||
MAP_NORESERVE, msg->fds[i], 0);
|
||||
MAP_NORESERVE, vmsg->fds[i], 0);
|
||||
|
||||
if (mmap_addr == MAP_FAILED)
|
||||
die_perror("vhost-user region mmap error");
|
||||
|
@ -474,7 +474,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
|
|||
debug(" mmap_addr: 0x%016"PRIx64,
|
||||
dev_region->mmap_addr);
|
||||
|
||||
close(msg->fds[i]);
|
||||
close(vmsg->fds[i]);
|
||||
}
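Each memory region announced by the front-end arrives as a file descriptor plus a guest physical address, a size and an mmap offset. A simplified sketch of the mapping and of translating a guest address into a pointer inside it (illustrative struct, not the actual vu_dev_region bookkeeping, and without the validation the real code needs):

/* Sketch: map a region fd shared over vhost-user and translate a guest
 * physical address into a pointer within that mapping.
 */
#include <stdint.h>
#include <sys/mman.h>

struct region_map {
	uint64_t guest_phys_addr;
	uint64_t size;
	uint64_t mmap_offset;
	uint8_t *mmap_addr;
};

static int region_map_fd(struct region_map *r, int fd)
{
	void *p = mmap(NULL, r->size + r->mmap_offset, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_NORESERVE, fd, 0);

	if (p == MAP_FAILED)
		return -1;

	r->mmap_addr = p;
	return 0;
}

static void *guest_to_ptr(const struct region_map *r, uint64_t gpa)
{
	if (gpa < r->guest_phys_addr || gpa - r->guest_phys_addr >= r->size)
		return NULL;

	return r->mmap_addr + r->mmap_offset + (gpa - r->guest_phys_addr);
}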
|
||||
|
||||
for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
|
||||
|
@ -517,7 +517,7 @@ static void vu_close_log(struct vu_dev *vdev)
|
|||
* vu_log_kick() - Inform the front-end that the log has been modified
|
||||
* @vdev: vhost-user device
|
||||
*/
|
||||
void vu_log_kick(const struct vu_dev *vdev)
|
||||
static void vu_log_kick(const struct vu_dev *vdev)
|
||||
{
|
||||
if (vdev->log_call_fd != -1) {
|
||||
int rc;
|
||||
|
@ -541,7 +541,7 @@ static void vu_log_page(uint8_t *log_table, uint64_t page)
|
|||
|
||||
/**
|
||||
* vu_log_write() - Log memory write
|
||||
* @dev: vhost-user device
|
||||
* @vdev: vhost-user device
|
||||
* @address: Memory address
|
||||
* @length: Memory size
|
||||
*/
|
||||
|
@ -566,23 +566,23 @@ void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length)
|
|||
* @vdev: vhost-user device
|
||||
* @vmsg: vhost-user message
|
||||
*
|
||||
* Return: False as no reply is requested
|
||||
* Return: True as a reply is requested
|
||||
*
|
||||
* #syscalls:vu mmap|mmap2 munmap
|
||||
*/
|
||||
static bool vu_set_log_base_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
uint64_t log_mmap_size, log_mmap_offset;
|
||||
void *base;
|
||||
int fd;
|
||||
|
||||
if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log))
|
||||
if (vmsg->fd_num != 1 || vmsg->hdr.size != sizeof(vmsg->payload.log))
|
||||
die("vhost-user: Invalid log_base message");
|
||||
|
||||
fd = msg->fds[0];
|
||||
log_mmap_offset = msg->payload.log.mmap_offset;
|
||||
log_mmap_size = msg->payload.log.mmap_size;
|
||||
fd = vmsg->fds[0];
|
||||
log_mmap_offset = vmsg->payload.log.mmap_offset;
|
||||
log_mmap_size = vmsg->payload.log.mmap_size;
|
||||
|
||||
debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset);
|
||||
debug("vhost-user log mmap_size: %"PRId64, log_mmap_size);
|
||||
|
@ -599,8 +599,8 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
|
|||
vdev->log_table = base;
|
||||
vdev->log_size = log_mmap_size;
|
||||
|
||||
msg->hdr.size = sizeof(msg->payload.u64);
|
||||
msg->fd_num = 0;
|
||||
vmsg->hdr.size = sizeof(vmsg->payload.u64);
|
||||
vmsg->fd_num = 0;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -613,15 +613,15 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_log_fd_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
if (msg->fd_num != 1)
|
||||
if (vmsg->fd_num != 1)
|
||||
die("Invalid log_fd message");
|
||||
|
||||
if (vdev->log_call_fd != -1)
|
||||
close(vdev->log_call_fd);
|
||||
|
||||
vdev->log_call_fd = msg->fds[0];
|
||||
vdev->log_call_fd = vmsg->fds[0];
|
||||
|
||||
debug("Got log_call_fd: %d", vdev->log_call_fd);
|
||||
|
||||
|
@ -636,10 +636,10 @@ static bool vu_set_log_fd_exec(struct vu_dev *vdev,
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_vring_num_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
unsigned int idx = msg->payload.state.index;
|
||||
unsigned int num = msg->payload.state.num;
|
||||
unsigned int idx = vmsg->payload.state.index;
|
||||
unsigned int num = vmsg->payload.state.num;
|
||||
|
||||
trace("State.index: %u", idx);
|
||||
trace("State.num: %u", num);
|
||||
|
@ -656,13 +656,13 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev,
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
/* We need to copy the payload to vhost_vring_addr structure
|
||||
* to access index because address of msg->payload.addr
|
||||
* to access index because address of vmsg->payload.addr
|
||||
* can be unaligned as it is packed.
|
||||
*/
|
||||
struct vhost_vring_addr addr = msg->payload.addr;
|
||||
struct vhost_vring_addr addr = vmsg->payload.addr;
|
||||
struct vu_virtq *vq = &vdev->vq[addr.index];
|
||||
|
||||
debug("vhost_vring_addr:");
|
||||
|
@ -677,7 +677,7 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
|
|||
debug(" log_guest_addr: 0x%016" PRIx64,
|
||||
(uint64_t)addr.log_guest_addr);
|
||||
|
||||
vq->vra = msg->payload.addr;
|
||||
vq->vra = vmsg->payload.addr;
|
||||
vq->vring.flags = addr.flags;
|
||||
vq->vring.log_guest_addr = addr.log_guest_addr;
|
||||
|
||||
|
@ -702,10 +702,10 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_vring_base_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
unsigned int idx = msg->payload.state.index;
|
||||
unsigned int num = msg->payload.state.num;
|
||||
unsigned int idx = vmsg->payload.state.index;
|
||||
unsigned int num = vmsg->payload.state.num;
|
||||
|
||||
debug("State.index: %u", idx);
|
||||
debug("State.num: %u", num);
|
||||
|
@ -723,13 +723,13 @@ static bool vu_set_vring_base_exec(struct vu_dev *vdev,
|
|||
* Return: True as a reply is requested
|
||||
*/
|
||||
static bool vu_get_vring_base_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
unsigned int idx = msg->payload.state.index;
|
||||
unsigned int idx = vmsg->payload.state.index;
|
||||
|
||||
debug("State.index: %u", idx);
|
||||
msg->payload.state.num = vdev->vq[idx].last_avail_idx;
|
||||
msg->hdr.size = sizeof(msg->payload.state);
|
||||
vmsg->payload.state.num = vdev->vq[idx].last_avail_idx;
|
||||
vmsg->hdr.size = sizeof(vmsg->payload.state);
|
||||
|
||||
vdev->vq[idx].started = false;
|
||||
vdev->vq[idx].vring.avail = 0;
|
||||
|
@ -771,21 +771,21 @@ static void vu_set_watch(const struct vu_dev *vdev, int idx)
|
|||
* close fds if NOFD bit is set
|
||||
* @vmsg: vhost-user message
|
||||
*/
|
||||
static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
|
||||
static void vu_check_queue_msg_file(struct vhost_user_msg *vmsg)
|
||||
{
|
||||
bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
|
||||
int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
|
||||
bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
|
||||
int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
|
||||
|
||||
if (idx >= VHOST_USER_MAX_QUEUES)
|
||||
die("Invalid vhost-user queue index: %u", idx);
|
||||
|
||||
if (nofd) {
|
||||
vmsg_close_fds(msg);
|
||||
vmsg_close_fds(vmsg);
|
||||
return;
|
||||
}
|
||||
|
||||
if (msg->fd_num != 1)
|
||||
die("Invalid fds in vhost-user request: %d", msg->hdr.request);
|
||||
if (vmsg->fd_num != 1)
|
||||
die("Invalid fds in vhost-user request: %d", vmsg->hdr.request);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -797,14 +797,14 @@ static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
|
||||
int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
|
||||
bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
|
||||
int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
|
||||
|
||||
debug("u64: 0x%016"PRIx64, msg->payload.u64);
|
||||
debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
|
||||
|
||||
vu_check_queue_msg_file(msg);
|
||||
vu_check_queue_msg_file(vmsg);
|
||||
|
||||
if (vdev->vq[idx].kick_fd != -1) {
|
||||
epoll_del(vdev->context, vdev->vq[idx].kick_fd);
|
||||
|
@ -813,7 +813,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
|
|||
}
|
||||
|
||||
if (!nofd)
|
||||
vdev->vq[idx].kick_fd = msg->fds[0];
|
||||
vdev->vq[idx].kick_fd = vmsg->fds[0];
|
||||
|
||||
debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx);
|
||||
|
||||
|
@ -837,14 +837,14 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_vring_call_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
|
||||
int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
|
||||
bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
|
||||
int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
|
||||
|
||||
debug("u64: 0x%016"PRIx64, msg->payload.u64);
|
||||
debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
|
||||
|
||||
vu_check_queue_msg_file(msg);
|
||||
vu_check_queue_msg_file(vmsg);
|
||||
|
||||
if (vdev->vq[idx].call_fd != -1) {
|
||||
close(vdev->vq[idx].call_fd);
|
||||
|
@ -852,11 +852,11 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
|
|||
}
|
||||
|
||||
if (!nofd)
|
||||
vdev->vq[idx].call_fd = msg->fds[0];
|
||||
vdev->vq[idx].call_fd = vmsg->fds[0];
|
||||
|
||||
/* in case of I/O hang after reconnecting */
|
||||
if (vdev->vq[idx].call_fd != -1)
|
||||
eventfd_write(msg->fds[0], 1);
|
||||
eventfd_write(vmsg->fds[0], 1);
|
||||
|
||||
debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx);
|
||||
|
||||
|
@ -872,14 +872,14 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_vring_err_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
|
||||
int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
|
||||
bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
|
||||
int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
|
||||
|
||||
debug("u64: 0x%016"PRIx64, msg->payload.u64);
|
||||
debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
|
||||
|
||||
vu_check_queue_msg_file(msg);
|
||||
vu_check_queue_msg_file(vmsg);
|
||||
|
||||
if (vdev->vq[idx].err_fd != -1) {
|
||||
close(vdev->vq[idx].err_fd);
|
||||
|
@ -887,7 +887,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
|
|||
}
|
||||
|
||||
if (!nofd)
|
||||
vdev->vq[idx].err_fd = msg->fds[0];
|
||||
vdev->vq[idx].err_fd = vmsg->fds[0];
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -901,7 +901,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
|
|||
* Return: True as a reply is requested
|
||||
*/
|
||||
static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
|
||||
1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
|
||||
|
@ -909,7 +909,7 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
|
|||
1ULL << VHOST_USER_PROTOCOL_F_RARP;
|
||||
|
||||
(void)vdev;
|
||||
vmsg_set_reply_u64(msg, features);
|
||||
vmsg_set_reply_u64(vmsg, features);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -922,13 +922,13 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
uint64_t features = msg->payload.u64;
|
||||
uint64_t features = vmsg->payload.u64;
|
||||
|
||||
debug("u64: 0x%016"PRIx64, features);
|
||||
|
||||
vdev->protocol_features = msg->payload.u64;
|
||||
vdev->protocol_features = vmsg->payload.u64;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -941,11 +941,11 @@ static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
|
|||
* Return: True as a reply is requested
|
||||
*/
|
||||
static bool vu_get_queue_num_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
(void)vdev;
|
||||
|
||||
vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES);
|
||||
vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_QUEUES);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -958,10 +958,10 @@ static bool vu_get_queue_num_exec(struct vu_dev *vdev,
|
|||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
struct vhost_user_msg *vmsg)
|
||||
{
|
||||
unsigned int enable = msg->payload.state.num;
|
||||
unsigned int idx = msg->payload.state.index;
|
||||
unsigned int enable = vmsg->payload.state.num;
|
||||
unsigned int idx = vmsg->payload.state.index;
|
||||
|
||||
debug("State.index: %u", idx);
|
||||
debug("State.enable: %u", enable);
|
||||
|
@ -974,17 +974,17 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
|
|||
}
|
||||
|
||||
/**
|
||||
* vu_set_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
|
||||
* RARP to notify the migration is terminated",
|
||||
* but passt doesn't need to update any ARP table,
|
||||
* so do nothing to silence QEMU bogus error message
|
||||
* vu_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
|
||||
* RARP to notify the migration is terminated",
|
||||
* but passt doesn't need to update any ARP table,
|
||||
* so do nothing to silence QEMU bogus error message
|
||||
* @vdev: vhost-user device
|
||||
+ * @vmsg: vhost-user message
  *
  * Return: False as no reply is requested
  */
 static bool vu_send_rarp_exec(struct vu_dev *vdev,
-                              struct vhost_user_msg *msg)
+                              struct vhost_user_msg *vmsg)
 {
         char macstr[ETH_ADDRSTRLEN];

@@ -993,7 +993,7 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
         /* ignore the command */

         debug("Ignore command VHOST_USER_SEND_RARP for %s",
-               eth_ntop((unsigned char *)&msg->payload.u64, macstr,
+               eth_ntop((unsigned char *)&vmsg->payload.u64, macstr,
                         sizeof(macstr)));

         return false;

@@ -1008,12 +1008,12 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
  * and set bit 8 as we don't provide our own fd.
  */
 static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
-                                        struct vhost_user_msg *msg)
+                                        struct vhost_user_msg *vmsg)
 {
-        unsigned int direction = msg->payload.transfer_state.direction;
-        unsigned int phase = msg->payload.transfer_state.phase;
+        unsigned int direction = vmsg->payload.transfer_state.direction;
+        unsigned int phase = vmsg->payload.transfer_state.phase;

-        if (msg->fd_num != 1)
+        if (vmsg->fd_num != 1)
                 die("Invalid device_state_fd message");

         if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED)

@@ -1021,13 +1021,13 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,

         if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE &&
             direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
-                die("Invalide device_state_fd direction: %d", direction);
+                die("Invalid device_state_fd direction: %d", direction);

-        migrate_request(vdev->context, msg->fds[0],
+        migrate_request(vdev->context, vmsg->fds[0],
                         direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);

         /* We don't provide a new fd for the data transfer */
-        vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
+        vmsg_set_reply_u64(vmsg, VHOST_USER_VRING_NOFD_MASK);

         return true;
 }

@@ -1041,9 +1041,9 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
  */
 /* cppcheck-suppress constParameterCallback */
 static bool vu_check_device_state_exec(struct vu_dev *vdev,
-                                       struct vhost_user_msg *msg)
+                                       struct vhost_user_msg *vmsg)
 {
-        vmsg_set_reply_u64(msg, vdev->context->device_state_result);
+        vmsg_set_reply_u64(vmsg, vdev->context->device_state_result);

         return true;
 }

@@ -1051,7 +1051,6 @@ static bool vu_check_device_state_exec(struct vu_dev *vdev,
 /**
  * vu_init() - Initialize vhost-user device structure
  * @c: execution context
- * @vdev: vhost-user device
  */
 void vu_init(struct ctx *c)
 {

@@ -1134,7 +1133,7 @@ static void vu_sock_reset(struct vu_dev *vdev)
 }

 static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
-                                         struct vhost_user_msg *msg) = {
+                                         struct vhost_user_msg *vmsg) = {
         [VHOST_USER_GET_FEATURES] = vu_get_features_exec,
         [VHOST_USER_SET_FEATURES] = vu_set_features_exec,
         [VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec,

@@ -1165,7 +1164,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
  */
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
 {
-        struct vhost_user_msg msg = { 0 };
+        struct vhost_user_msg vmsg = { 0 };
         bool need_reply, reply_requested;
         int ret;

@@ -1174,38 +1173,38 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
                 return;
         }

-        ret = vu_message_read_default(fd, &msg);
+        ret = vu_message_read_default(fd, &vmsg);
         if (ret == 0) {
                 vu_sock_reset(vdev);
                 return;
         }
         debug("================ Vhost user message ================");
-        debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
-              msg.hdr.request);
-        debug("Flags: 0x%x", msg.hdr.flags);
-        debug("Size: %u", msg.hdr.size);
+        debug("Request: %s (%d)", vu_request_to_string(vmsg.hdr.request),
+              vmsg.hdr.request);
+        debug("Flags: 0x%x", vmsg.hdr.flags);
+        debug("Size: %u", vmsg.hdr.size);

-        need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
+        need_reply = vmsg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;

-        if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX &&
-            vu_handle[msg.hdr.request])
-                reply_requested = vu_handle[msg.hdr.request](vdev, &msg);
+        if (vmsg.hdr.request >= 0 && vmsg.hdr.request < VHOST_USER_MAX &&
+            vu_handle[vmsg.hdr.request])
+                reply_requested = vu_handle[vmsg.hdr.request](vdev, &vmsg);
         else
-                die("Unhandled request: %d", msg.hdr.request);
+                die("Unhandled request: %d", vmsg.hdr.request);

         /* cppcheck-suppress legacyUninitvar */
         if (!reply_requested && need_reply) {
-                msg.payload.u64 = 0;
-                msg.hdr.flags = 0;
-                msg.hdr.size = sizeof(msg.payload.u64);
-                msg.fd_num = 0;
+                vmsg.payload.u64 = 0;
+                vmsg.hdr.flags = 0;
+                vmsg.hdr.size = sizeof(vmsg.payload.u64);
+                vmsg.fd_num = 0;
                 reply_requested = true;
         }

         if (reply_requested)
-                vu_send_reply(fd, &msg);
+                vu_send_reply(fd, &vmsg);

-        if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
+        if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
             vdev->context->device_state_result == 0 &&
             !vdev->context->migrate_target) {
                 info("Migration complete, exiting");
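Aside (not part of the patch): the hunks above show the complete request path, a function-pointer table indexed by the request code plus a default acknowledgement when the front-end sets the need-reply flag. Below is a minimal, self-contained C sketch of that pattern; the struct, the mask value and the handler are simplified stand-ins, not passt's actual definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define REQ_MAX         4u              /* stand-in for VHOST_USER_MAX */
#define NEED_REPLY_MASK (1u << 3)       /* stand-in for VHOST_USER_NEED_REPLY_MASK */

/* Simplified stand-in for struct vhost_user_msg: just the header fields the
 * dispatch path looks at, plus a u64 payload for the default ack. */
struct miniature_msg {
        uint32_t request;
        uint32_t flags;
        uint32_t size;
        uint64_t u64;
};

/* Handlers return true if they already built a reply themselves */
static bool handle_get_features(struct miniature_msg *msg)
{
        msg->u64 = 0;                   /* feature bits would go here */
        msg->size = sizeof(msg->u64);
        return true;
}

static bool (*handlers[REQ_MAX])(struct miniature_msg *) = {
        [1] = handle_get_features,
};

static void dispatch(struct miniature_msg *msg)
{
        bool need_reply = msg->flags & NEED_REPLY_MASK;
        bool replied = false;

        /* Bounds-check the request code before indexing the table */
        if (msg->request < REQ_MAX && handlers[msg->request])
                replied = handlers[msg->request](msg);

        /* Default ack: if the front-end asked for a reply but the handler
         * didn't build one, answer with a zero u64, as in the diff above */
        if (!replied && need_reply) {
                msg->u64 = 0;
                msg->size = sizeof(msg->u64);
                replied = true;
        }

        if (replied)
                printf("would send reply for request %u\n", msg->request);
}

int main(void)
{
        struct miniature_msg msg = { .request = 1, .flags = NEED_REPLY_MASK };

        dispatch(&msg);
        return 0;
}

The bounds check before indexing the table is what keeps an unknown request code from reading past the array, which is exactly the role of the vmsg.hdr.request < VHOST_USER_MAX test above.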
vhost_user.h

@@ -184,7 +184,7 @@ union vhost_user_payload {
 };

 /**
- * struct vhost_user_msg - vhost-use message
+ * struct vhost_user_msg - vhost-user message
  * @hdr: Message header
  * @payload: Message payload
  * @fds: File descriptors associated with the message
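Aside (not part of the patch): the fields referenced throughout these handlers (hdr.request, hdr.flags, hdr.size, payload.u64, fds[], fd_num) follow the vhost-user wire format of a fixed 12-byte header followed by a payload, with any file descriptors carried out of band via SCM_RIGHTS. The sketch below is for orientation only; the array size and the truncated payload union are assumptions, not the project's definition.

#include <stdint.h>

/* Rough sketch of a vhost-user message as the protocol describes it */
struct vhost_user_hdr_sketch {
        uint32_t request;       /* VHOST_USER_* request code */
        uint32_t flags;         /* version bits, NEED_REPLY, REPLY */
        uint32_t size;          /* number of payload bytes that follow */
};

struct vhost_user_msg_sketch {
        struct vhost_user_hdr_sketch hdr;
        union {
                uint64_t u64;   /* e.g. feature bits or a reply value */
        } payload;
        int fds[8];             /* fds received via SCM_RIGHTS (count is an assumption) */
        int fd_num;
};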
@@ -241,7 +241,6 @@ static inline bool vu_queue_started(const struct vu_virtq *vq)
 void vu_print_capabilities(void);
 void vu_init(struct ctx *c);
 void vu_cleanup(struct vu_dev *vdev);
 void vu_log_kick(const struct vu_dev *vdev);
 void vu_log_write(const struct vu_dev *vdev, uint64_t address,
                   uint64_t length);
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);

39  virtio.c

@@ -156,9 +156,9 @@ static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i)
 }

 /**
- * virtq_used_event - Get location of used event indices
+ * virtq_used_event() - Get location of used event indices
  *                     (only with VIRTIO_F_EVENT_IDX)
- * @vq Virtqueue
+ * @vq: Virtqueue
  *
  * Return: return the location of the used event index
  */

@@ -170,7 +170,7 @@ static inline uint16_t *virtq_used_event(const struct vu_virtq *vq)

 /**
  * vring_get_used_event() - Get the used event from the available ring
- * @vq Virtqueue
+ * @vq: Virtqueue
  *
  * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set)
  *         used_event is a performant alternative where the driver
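Aside (not part of the patch): the used_event/avail_event helpers being documented here implement the VIRTIO_F_EVENT_IDX convention, where each side publishes the ring index at which it actually wants to be notified, in the spare uint16_t slot after the other ring. A generic sketch using the standard vring layout from <linux/virtio_ring.h>; this mirrors the virtio spec convention, not this file's exact code.

#include <linux/virtio_ring.h>
#include <stdint.h>

/* With VIRTIO_F_EVENT_IDX, the driver stores the used index it wants to be
 * notified at in the spare uint16_t after the available ring, and the device
 * stores the avail index it wants to see in the spare uint16_t after the
 * used ring ("used_event"/"avail_event" in the spec). */
static inline uint16_t *used_event_slot(struct vring *vr)
{
        return &vr->avail->ring[vr->num];
}

static inline uint16_t *avail_event_slot(struct vring *vr)
{
        return (uint16_t *)&vr->used->ring[vr->num];
}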
@@ -235,6 +235,7 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
                 memcpy(desc, orig_desc, read_len);
                 len -= read_len;
                 addr += read_len;
+                /* NOLINTNEXTLINE(bugprone-sizeof-expression,cert-arr39-c) */
                 desc += read_len / sizeof(struct vring_desc);
         }

@@ -243,9 +244,9 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,

 /**
  * enum virtqueue_read_desc_state - State in the descriptor chain
- * @VIRTQUEUE_READ_DESC_ERROR Found an invalid descriptor
- * @VIRTQUEUE_READ_DESC_DONE No more descriptors in the chain
- * @VIRTQUEUE_READ_DESC_MORE there are more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_ERROR: Found an invalid descriptor
+ * @VIRTQUEUE_READ_DESC_DONE: No more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_MORE: there are more descriptors in the chain
  */
 enum virtqueue_read_desc_state {
         VIRTQUEUE_READ_DESC_ERROR = -1,
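Aside (not part of the patch): this enum is the return value of the descriptor-chain walker; callers typically loop while it reports MORE and bail out on ERROR. A generic sketch of that consumer pattern with a placeholder reader function (the real virtqueue_read_next_desc() also follows each descriptor's next field and flags):

#include <stdint.h>

enum read_desc_state { READ_DESC_ERROR = -1, READ_DESC_DONE, READ_DESC_MORE };

/* Placeholder for a virtqueue_read_next_desc()-style helper: advances *idx
 * to the next descriptor in the chain and reports whether more follow. */
static enum read_desc_state next_desc_stub(uint16_t *idx, uint16_t max)
{
        if (*idx + 1u >= max)
                return READ_DESC_DONE;
        (*idx)++;
        return READ_DESC_MORE;
}

/* Typical consumer: walk the chain until DONE, bail out on ERROR */
static int walk_chain(uint16_t idx, uint16_t max)
{
        enum read_desc_state rc;

        do {
                /* ...map or copy the descriptor at idx here... */
                rc = next_desc_stub(&idx, max);
        } while (rc == READ_DESC_MORE);

        return rc == READ_DESC_ERROR ? -1 : 0;
}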
@@ -286,7 +287,7 @@ static int virtqueue_read_next_desc(const struct vring_desc *desc,
  *
  * Return: true if the virtqueue is empty, false otherwise
  */
-bool vu_queue_empty(struct vu_virtq *vq)
+static bool vu_queue_empty(struct vu_virtq *vq)
 {
         if (!vq->vring.avail)
                 return true;

@@ -346,8 +347,9 @@ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
                 die_perror("Error writing vhost-user queue eventfd");
 }

-/* virtq_avail_event() - Get location of available event indices
- *                       (only with VIRTIO_F_EVENT_IDX)
+/**
+ * virtq_avail_event() - Get location of available event indices
+ *                       (only with VIRTIO_F_EVENT_IDX)
  * @vq: Virtqueue
  *
  * Return: return the location of the available event index

@@ -420,8 +422,8 @@ static bool virtqueue_map_desc(const struct vu_dev *dev,
 }

 /**
- * vu_queue_map_desc - Map the virtqueue descriptor ring into our virtual
- *                     address space
+ * vu_queue_map_desc() - Map the virtqueue descriptor ring into our virtual
+ *                       address space
  * @dev: Vhost-user device
  * @vq: Virtqueue
  * @idx: First descriptor ring entry to map

@@ -504,7 +506,7 @@ static int vu_queue_map_desc(const struct vu_dev *dev,
  * vu_queue_pop() - Pop an entry from the virtqueue
  * @dev: Vhost-user device
  * @vq: Virtqueue
- * @elem: Virtqueue element to file with the entry information
+ * @elem: Virtqueue element to fill with the entry information
  *
  * Return: -1 if there is an error, 0 otherwise
  */

@@ -544,7 +546,7 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
 }

 /**
- * vu_queue_detach_element() - Detach an element from the virqueue
+ * vu_queue_detach_element() - Detach an element from the virtqueue
  * @vq: Virtqueue
  */
 void vu_queue_detach_element(struct vu_virtq *vq)

@@ -554,7 +556,7 @@ void vu_queue_detach_element(struct vu_virtq *vq)
 }

 /**
- * vu_queue_unpop() - Push back the previously popped element from the virqueue
+ * vu_queue_unpop() - Push back the previously popped element from the virtqueue
  * @vq: Virtqueue
  */
 /* cppcheck-suppress unusedFunction */

@@ -568,6 +570,8 @@ void vu_queue_unpop(struct vu_virtq *vq)
  * vu_queue_rewind() - Push back a given number of popped elements
  * @vq: Virtqueue
  * @num: Number of element to unpop
+ *
+ * Return: True on success, false if not
  */
 bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
 {

@@ -671,9 +675,10 @@ static void vu_log_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
  * @len: Size of the element
  * @idx: Used ring entry index
  */
-void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
-                            unsigned int index, unsigned int len,
-                            unsigned int idx)
+static void vu_queue_fill_by_index(const struct vu_dev *vdev,
+                                   struct vu_virtq *vq,
+                                   unsigned int index, unsigned int len,
+                                   unsigned int idx)
 {
         struct vring_used_elem uelem;

8  virtio.h

@@ -150,7 +150,7 @@ static inline bool has_feature(uint64_t features, unsigned int fbit)
 /**
  * vu_has_feature() - Check if a virtio-net feature is available
  * @vdev: Vhost-user device
- * @bit: Feature to check
+ * @fbit: Feature to check
  *
  * Return: True if the feature is available
  */

@@ -163,7 +163,7 @@ static inline bool vu_has_feature(const struct vu_dev *vdev,
 /**
  * vu_has_protocol_feature() - Check if a vhost-user feature is available
  * @vdev: Vhost-user device
- * @bit: Feature to check
+ * @fbit: Feature to check
  *
  * Return: True if the feature is available
  */
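Aside (not part of the patch): both helpers documented above reduce to a bit test on a 64-bit feature mask, the usual virtio idiom. A generic sketch, not the header's exact code:

#include <stdbool.h>
#include <stdint.h>

/* Generic feature-bit test in the style of has_feature(): fbit is a bit
 * position (e.g. a VIRTIO_NET_F_* constant), features is the negotiated
 * 64-bit mask. */
static inline bool feature_bit_set(uint64_t features, unsigned int fbit)
{
        return features & (1ULL << fbit);
}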
@@ -174,16 +174,12 @@ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev,
         return has_feature(vdev->protocol_features, fbit);
 }

-bool vu_queue_empty(struct vu_virtq *vq);
 void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq);
 int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
                  struct vu_virtq_element *elem);
 void vu_queue_detach_element(struct vu_virtq *vq);
 void vu_queue_unpop(struct vu_virtq *vq);
 bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num);
-void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
-                            unsigned int index, unsigned int len,
-                            unsigned int idx);
 void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
                    const struct vu_virtq_element *elem, unsigned int len,
                    unsigned int idx);

22  vu_common.c

@@ -26,24 +26,25 @@
  * vu_packet_check_range() - Check if a given memory zone is contained in
  *                           a mapped guest memory region
  * @buf: Array of the available memory regions
- * @offset: Offset of data range in packet descriptor
+ * @ptr: Start of desired data range
  * @size: Length of desired data range
- * @start: Start of the packet descriptor
  *
  * Return: 0 if the zone is in a mapped memory region, -1 otherwise
  */
-int vu_packet_check_range(void *buf, size_t offset, size_t len,
-                          const char *start)
+int vu_packet_check_range(void *buf, const char *ptr, size_t len)
 {
         struct vu_dev_region *dev_region;

         for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
+                uintptr_t base_addr = dev_region->mmap_addr +
+                                      dev_region->mmap_offset;
                 /* NOLINTNEXTLINE(performance-no-int-to-ptr) */
-                char *m = (char *)(uintptr_t)dev_region->mmap_addr;
+                const char *base = (const char *)base_addr;

-                if (m <= start &&
-                    start + offset + len <= m + dev_region->mmap_offset +
-                                            dev_region->size)
+                ASSERT(base_addr >= dev_region->mmap_addr);
+
+                if (len <= dev_region->size && base <= ptr &&
+                    (size_t)(ptr - base) <= dev_region->size - len)
                         return 0;
         }

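Aside (not part of the patch): the rewritten check above deliberately avoids computing ptr + len, which could wrap past the end of the address space; it bounds len by the region size first and then compares ptr's offset within the region against size - len. The same idea in isolation, with illustrative names:

#include <stdbool.h>
#include <stddef.h>

/* Overflow-safe "is [ptr, ptr + len) inside [base, base + size)?": check len
 * against size first, then compare the offset of ptr within the region
 * against size - len, so no pointer arithmetic can wrap. */
static bool range_within(const char *base, size_t size,
                         const char *ptr, size_t len)
{
        return len <= size && base <= ptr &&
               (size_t)(ptr - base) <= size - len;
}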
@@ -194,7 +195,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
                         tap_add_packet(vdev->context,
                                        elem[count].out_sg[0].iov_len - hdrlen,
                                        (char *)elem[count].out_sg[0].iov_base +
-                                       hdrlen);
+                                       hdrlen, now);
                 } else {
                         /* vnet header can be in a separate iovec */
                         if (elem[count].out_num != 2) {

@@ -206,7 +207,8 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
                         } else {
                                 tap_add_packet(vdev->context,
                                                elem[count].out_sg[1].iov_len,
-                                               (char *)elem[count].out_sg[1].iov_base);
+                                               (char *)elem[count].out_sg[1].iov_base,
+                                               now);
                         }
                 }

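Aside (not part of the patch): these last two hunks cover the two layouts a guest TX buffer can use, the virtio-net header either sitting in front of the frame within a single iovec, or in its own iovec with the frame in the next one; the added now argument presumably forwards the caller's current-time reference to tap_add_packet(). A small sketch of selecting the frame start and length for each case (the helper and its signature are illustrative only):

#include <stddef.h>
#include <sys/uio.h>

/* Given the guest's output scatter-gather list and the virtio-net header
 * length, return where the Ethernet frame starts and how long it is, for the
 * two layouts handled in vu_handle_tx() above. */
static const char *frame_from_sg(const struct iovec *sg, unsigned int out_num,
                                 size_t hdrlen, size_t *len)
{
        if (out_num == 1) {
                /* header and frame share sg[0]: skip over the header */
                *len = sg[0].iov_len - hdrlen;
                return (const char *)sg[0].iov_base + hdrlen;
        }

        /* header in sg[0], frame in sg[1] (out_num == 2 in the diff) */
        *len = sg[1].iov_len;
        return (const char *)sg[1].iov_base;
}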