conf, tcp: Periodic detection of bound ports for pasta port forwarding

Detecting bound ports at start-up time isn't terribly useful: do this
periodically instead, if configured.

This is only implemented for TCP at the moment, UDP is somewhat more
complicated: leave a TODO there.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2021-09-27 05:24:30 +02:00
parent e69e13671d
commit 9657b6ed05
8 changed files with 316 additions and 97 deletions

85
conf.c
View file

@ -39,6 +39,42 @@
#include "udp.h"
#include "tcp.h"
/**
 * get_bound_ports() - Get maps of ports with bound sockets
 * @c:		Execution context
 * @ns:		If set, set bitmaps for ports to tap/ns -- to init otherwise
 * @proto:	Protocol number (IPPROTO_TCP or IPPROTO_UDP)
 *
 * The bitmap selected by @ns and @proto is cleared and then rebuilt from the
 * files under /proc/net/ as seen by the caller. The bitmap for the opposite
 * direction is passed as exclusion map to procfs_scan_listen(). Any other
 * value of @proto leaves all bitmaps untouched.
 */
void get_bound_ports(struct ctx *c, int ns, uint8_t proto)
{
	uint8_t *udp_map, *udp_exclude, *tcp_map, *tcp_exclude;

	if (ns) {
		udp_map = c->udp.port_to_tap;
		udp_exclude = c->udp.port_to_init;
		tcp_map = c->tcp.port_to_tap;
		tcp_exclude = c->tcp.port_to_init;
	} else {
		udp_map = c->udp.port_to_init;
		udp_exclude = c->udp.port_to_tap;
		tcp_map = c->tcp.port_to_init;
		tcp_exclude = c->tcp.port_to_tap;
	}

	if (proto == IPPROTO_UDP) {
		/* port_to_tap and port_to_init are declared with the same
		 * size: take it from the declaration instead of repeating it,
		 * so that the whole bitmap is always cleared
		 */
		memset(udp_map, 0, sizeof(c->udp.port_to_tap));

		procfs_scan_listen("udp", udp_map, udp_exclude);
		procfs_scan_listen("udp6", udp_map, udp_exclude);

		/* UDP ports matching bound TCP ports are forwarded as well */
		procfs_scan_listen("tcp", udp_map, udp_exclude);
		procfs_scan_listen("tcp6", udp_map, udp_exclude);
	} else if (proto == IPPROTO_TCP) {
		memset(tcp_map, 0, sizeof(c->tcp.port_to_tap));

		procfs_scan_listen("tcp", tcp_map, tcp_exclude);
		procfs_scan_listen("tcp6", tcp_map, tcp_exclude);
	}
}
/**
* struct get_bound_ports_ns_arg - Arguments for get_bound_ports_ns()
* @c: Execution context
@ -50,7 +86,7 @@ struct get_bound_ports_ns_arg {
};
/**
* get_bound_ports_ns() - Get maps of ports namespace with bound sockets
* get_bound_ports_ns() - Get maps of ports in namespace with bound sockets
* @arg: See struct get_bound_ports_ns_arg
*
* Return: 0
@ -63,39 +99,11 @@ static int get_bound_ports_ns(void *arg)
if (!c->pasta_pid || ns_enter(c->pasta_pid))
return 0;
if (a->proto == IPPROTO_UDP) {
procfs_scan_listen("udp", c->udp.port_to_tap);
procfs_scan_listen("udp6", c->udp.port_to_tap);
procfs_scan_listen("tcp", c->udp.port_to_tap);
procfs_scan_listen("tcp6", c->udp.port_to_tap);
} else if (a->proto == IPPROTO_TCP) {
procfs_scan_listen("tcp", c->tcp.port_to_tap);
procfs_scan_listen("tcp6", c->tcp.port_to_tap);
}
get_bound_ports(c, 1, a->proto);
return 0;
}
/**
* get_bound_ports() - Get maps of ports in init namespace with bound sockets
* @c: Execution context
* @proto: Protocol number (IPPROTO_TCP or IPPROTO_UDP)
*/
static void get_bound_ports(struct ctx *c, uint8_t proto)
{
if (proto == IPPROTO_UDP) {
procfs_scan_listen("udp", c->udp.port_to_init);
procfs_scan_listen("udp6", c->udp.port_to_init);
procfs_scan_listen("tcp", c->udp.port_to_init);
procfs_scan_listen("tcp6", c->udp.port_to_init);
} else if (proto == IPPROTO_TCP) {
procfs_scan_listen("tcp", c->tcp.port_to_init);
procfs_scan_listen("tcp6", c->tcp.port_to_init);
}
}
enum conf_port_type {
PORT_SPEC = 1,
PORT_NONE,
@ -1172,19 +1180,28 @@ void conf(struct ctx *c, int argc, char **argv)
}
#endif
c->tcp.ns_detect_ports = c->udp.ns_detect_ports = 0;
c->tcp.init_detect_ports = c->udp.init_detect_ports = 0;
if (c->mode == MODE_PASTA) {
if (!tcp_tap || tcp_tap == PORT_AUTO) {
c->tcp.ns_detect_ports = 1;
ns_ports_arg.proto = IPPROTO_TCP;
NS_CALL(get_bound_ports_ns, &ns_ports_arg);
}
if (!udp_tap || udp_tap == PORT_AUTO) {
c->udp.ns_detect_ports = 1;
ns_ports_arg.proto = IPPROTO_UDP;
NS_CALL(get_bound_ports_ns, &ns_ports_arg);
}
if (!tcp_init || tcp_init == PORT_AUTO)
get_bound_ports(c, IPPROTO_TCP);
if (!udp_init || udp_init == PORT_AUTO)
get_bound_ports(c, IPPROTO_UDP);
if (!tcp_init || tcp_init == PORT_AUTO) {
c->tcp.init_detect_ports = 1;
get_bound_ports(c, 0, IPPROTO_TCP);
}
if (!udp_init || udp_init == PORT_AUTO) {
c->udp.init_detect_ports = 1;
get_bound_ports(c, 0, IPPROTO_UDP);
}
}
conf_print(c);

1
conf.h
View file

@ -1 +1,2 @@
void conf(struct ctx *c, int argc, char **argv);
void get_bound_ports(struct ctx *c, int ns, uint8_t proto);

15
passt.1
View file

@ -297,9 +297,9 @@ Don't forward any ports
.TP
.BR auto
Forward all ports currently bound in the namespace. The list of ports is derived
from listening sockets reported by \fI/proc/net/tcp\fR and \fI/proc/net/tcp6\fR,
see \fBproc\fR(5).
Dynamically forward ports bound in the namespace. The list of ports is
periodically derived (every second) from listening sockets reported by
\fI/proc/net/tcp\fR and \fI/proc/net/tcp6\fR, see \fBproc\fR(5).
.TP
.BR ports
@ -331,9 +331,10 @@ Default is \fBauto\fR.
.TP
.BR \-u ", " \-\-udp-ports " " \fIspec
Configure UDP port forwarding to guest. \fIspec\fR is as described for TCP
Configure UDP port forwarding to namespace. \fIspec\fR is as described for TCP
above, and the list of ports is derived from listening sockets reported by
\fI/proc/net/udp\fR and \fI/proc/net/udp6\fR, see \fBproc\fR(5).
\fI/proc/net/udp\fR and \fI/proc/net/udp6\fR, see \fBproc\fR(5),
when \fBpasta\fR starts (not periodically).
Note: unless overridden, UDP ports with numbers corresponding to forwarded TCP
port numbers are forwarded too, without, however, any port translation.
@ -345,14 +346,14 @@ Default is \fBauto\fR.
.TP
.BR \-T ", " \-\-tcp-ns " " \fIspec
Configure TCP port forwarding from target namespace to init namespace.
\fIspec\fR is as described above.
\fIspec\fR is as described above for TCP.
Default is \fBauto\fR.
.TP
.BR \-U ", " \-\-udp-ns " " \fIspec
Configure UDP port forwarding from target namespace to init namespace.
\fIspec\fR is as described above.
\fIspec\fR is as described above for UDP.
Default is \fBauto\fR.

292
tcp.c
View file

@ -334,6 +334,7 @@
#include "tap.h"
#include "siphash.h"
#include "pcap.h"
#include "conf.h"
#define MAX_TAP_CONNS (128 * 1024)
#define MAX_SPLICE_CONNS (128 * 1024)
@ -363,6 +364,8 @@
#define TCP_SPLICE_PIPE_POOL_SIZE 256
#define REFILL_INTERVAL 1000
#define PORT_DETECT_INTERVAL 1000
/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
* <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
*/
@ -525,6 +528,11 @@ struct tcp_splice_conn {
static in_port_t tcp_port_delta_to_tap [USHRT_MAX];
static in_port_t tcp_port_delta_to_init [USHRT_MAX];
/* Listening sockets, used for automatic port forwarding in pasta mode only */
static int tcp_sock_init_lo [USHRT_MAX][IP_VERSIONS];
static int tcp_sock_init_ext [USHRT_MAX][IP_VERSIONS];
static int tcp_sock_ns [USHRT_MAX][IP_VERSIONS];
/**
* tcp_remap_to_tap() - Set delta for port translation toward guest/tap
* @port: Original destination port, host order
@ -3001,6 +3009,93 @@ smaller:
goto smaller;
}
/**
 * tcp_sock_init_one() - Initialise listening sockets for a given port
 * @c:		Execution context
 * @ns:		In pasta mode, if set, bind with loopback address in namespace
 * @port:	Port, host order
 *
 * For each enabled IP version, open the listening sockets for @port: one
 * "external" socket (only if @ns is not set) and, in pasta mode, one socket
 * bound to loopback, flagged for splicing in its epoll reference.
 *
 * If periodic port detection is enabled for the direction a socket belongs
 * to, the socket is recorded (-1 on failure) so that tcp_port_rebind() can
 * close it once the port is not bound anymore:
 *
 * - sockets opened in init (@ns not set) exist for ports found in the
 *   namespace, and are rebound if c->tcp.ns_detect_ports is set
 * - sockets opened in the namespace (@ns set) exist for ports found in init,
 *   and are rebound if c->tcp.init_detect_ports is set
 */
static void tcp_sock_init_one(struct ctx *c, int ns, in_port_t port)
{
	union tcp_epoll_ref tref = { .listen = 1 };
	int s;

	if (ns)
		tref.index = (in_port_t)(port + tcp_port_delta_to_init[port]);
	else
		tref.index = (in_port_t)(port + tcp_port_delta_to_tap[port]);

	if (c->v4) {
		tref.v6 = 0;

		tref.splice = 0;
		if (!ns) {
			s = sock_l4(c, AF_INET, IPPROTO_TCP, port,
				    c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY,
				    tref.u32);
			if (s > 0)
				tcp_sock_set_bufsize(s);
			else
				s = -1;

			/* Init-side socket: tracked for namespace detection */
			if (c->tcp.ns_detect_ports)
				tcp_sock_init_ext[port][V4] = s;
		}

		if (c->mode == MODE_PASTA) {
			tref.splice = 1;
			s = sock_l4(c, AF_INET, IPPROTO_TCP, port,
				    BIND_LOOPBACK, tref.u32);
			if (s > 0)
				tcp_sock_set_bufsize(s);
			else
				s = -1;

			if (ns) {
				/* Namespace-side socket: init detection */
				if (c->tcp.init_detect_ports)
					tcp_sock_ns[port][V4] = s;
			} else if (c->tcp.ns_detect_ports) {
				tcp_sock_init_lo[port][V4] = s;
			}
		}
	}

	if (c->v6) {
		tref.v6 = 1;

		tref.splice = 0;
		if (!ns) {
			s = sock_l4(c, AF_INET6, IPPROTO_TCP, port,
				    c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY,
				    tref.u32);
			if (s > 0)
				tcp_sock_set_bufsize(s);
			else
				s = -1;

			/* Init-side socket: tracked for namespace detection */
			if (c->tcp.ns_detect_ports)
				tcp_sock_init_ext[port][V6] = s;
		}

		if (c->mode == MODE_PASTA) {
			tref.splice = 1;
			s = sock_l4(c, AF_INET6, IPPROTO_TCP, port,
				    BIND_LOOPBACK, tref.u32);
			if (s > 0)
				tcp_sock_set_bufsize(s);
			else
				s = -1;

			if (ns) {
				/* Namespace-side socket: init detection */
				if (c->tcp.init_detect_ports)
					tcp_sock_ns[port][V6] = s;
			} else if (c->tcp.ns_detect_ports) {
				tcp_sock_init_lo[port][V6] = s;
			}
		}
	}
}
/**
* tcp_sock_init_ns() - Bind sockets in namespace for inbound connections
* @arg: Execution context
@ -3009,10 +3104,8 @@ smaller:
*/
static int tcp_sock_init_ns(void *arg)
{
union tcp_epoll_ref tref = { .listen = 1, .splice = 1 };
struct ctx *c = (struct ctx *)arg;
in_port_t port;
int s;
ns_enter(c->pasta_pid);
@ -3020,21 +3113,7 @@ static int tcp_sock_init_ns(void *arg)
if (!bitmap_isset(c->tcp.port_to_init, port))
continue;
tref.index = (in_port_t)(port + tcp_port_delta_to_init[port]);
if (c->v4) {
tref.v6 = 0;
s = sock_l4(c, AF_INET, IPPROTO_TCP, port,
BIND_LOOPBACK, tref.u32);
tcp_sock_set_bufsize(s);
}
if (c->v6) {
tref.v6 = 1;
s = sock_l4(c, AF_INET6, IPPROTO_TCP, port,
BIND_LOOPBACK, tref.u32);
tcp_sock_set_bufsize(s);
}
tcp_sock_init_one(c, 1, port);
}
return 0;
@ -3128,9 +3207,7 @@ static int tcp_sock_refill(void *arg)
int tcp_sock_init(struct ctx *c, struct timespec *now)
{
struct tcp_sock_refill_arg refill_arg = { c, 0 };
union tcp_epoll_ref tref = { .listen = 1 };
in_port_t port;
int s;
getrandom(&c->tcp.hash_secret, sizeof(c->tcp.hash_secret), GRND_RANDOM);
@ -3138,40 +3215,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
if (!bitmap_isset(c->tcp.port_to_tap, port))
continue;
tref.index = (in_port_t)(port + tcp_port_delta_to_tap[port]);
if (c->v4) {
tref.v6 = 0;
tref.splice = 0;
s = sock_l4(c, AF_INET, IPPROTO_TCP, port,
c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY,
tref.u32);
tcp_sock_set_bufsize(s);
if (c->mode == MODE_PASTA) {
tref.splice = 1;
s = sock_l4(c, AF_INET, IPPROTO_TCP, port,
BIND_LOOPBACK, tref.u32);
tcp_sock_set_bufsize(s);
}
}
if (c->v6) {
tref.v6 = 1;
tref.splice = 0;
s = sock_l4(c, AF_INET6, IPPROTO_TCP, port,
c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY,
tref.u32);
tcp_sock_set_bufsize(s);
if (c->mode == MODE_PASTA) {
tref.splice = 1;
s = sock_l4(c, AF_INET6, IPPROTO_TCP, port,
BIND_LOOPBACK, tref.u32);
tcp_sock_set_bufsize(s);
}
}
tcp_sock_init_one(c, 0, port);
}
if (c->v4)
@ -3190,6 +3234,8 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
refill_arg.ns = 1;
NS_CALL(tcp_sock_refill, &refill_arg);
tcp_splice_pipe_refill(c);
c->tcp.port_detect_ts = *now;
}
return 0;
@ -3283,6 +3329,122 @@ static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn,
}
}
/**
 * struct tcp_port_detect_arg - Arguments for tcp_port_detect()
 * @c:			Execution context
 * @detect_in_ns:	Detect ports bound in namespace, not in init
 */
struct tcp_port_detect_arg {
	struct ctx *c;
	int detect_in_ns;
};

/**
 * tcp_port_detect() - Detect ports bound in namespace or init
 * @arg:	See struct tcp_port_detect_arg
 *
 * Refresh the TCP port bitmap for one direction by calling
 * get_bound_ports(). If @detect_in_ns is set, the namespace of the pasta
 * process is entered first: if that fails, the bitmap is left untouched
 * rather than being filled with ports seen from the wrong namespace.
 *
 * Return: 0
 */
static int tcp_port_detect(void *arg)
{
	struct tcp_port_detect_arg *a = (struct tcp_port_detect_arg *)arg;

	if (a->detect_in_ns) {
		/* Non-zero from ns_enter() means we didn't switch namespace */
		if (ns_enter(a->c->pasta_pid))
			return 0;

		get_bound_ports(a->c, 1, IPPROTO_TCP);
	} else {
		get_bound_ports(a->c, 0, IPPROTO_TCP);
	}

	return 0;
}
/**
 * struct tcp_port_rebind_arg - Arguments for tcp_port_rebind()
 * @c:			Execution context
 * @bind_in_ns:		Rebind ports in namespace, not in init
 */
struct tcp_port_rebind_arg {
	struct ctx *c;
	int bind_in_ns;
};

/**
 * tcp_port_rebind_close() - Close a tracked listening socket, mark slot free
 * @s:		Slot in one of the listening socket tables
 *
 * Slots holding 0 (no socket) or -1 (earlier failure) are left as they are.
 */
static void tcp_port_rebind_close(int *s)
{
	if (*s > 0) {
		close(*s);
		*s = 0;
	}
}

/**
 * tcp_port_rebind() - Rebind ports in namespace or init
 * @arg:	See struct tcp_port_rebind_arg
 *
 * Align listening sockets with the current port bitmaps: sockets for ports
 * that are not set anymore are closed, and sockets are opened for ports that
 * are set but have no tracked socket yet (slot is 0). Slots holding -1 are
 * not retried here.
 *
 * If @bind_in_ns is set, the namespace of the pasta process is entered first:
 * if that fails, nothing is done, as sockets would otherwise be bound in the
 * wrong namespace.
 *
 * Return: 0
 */
static int tcp_port_rebind(void *arg)
{
	struct tcp_port_rebind_arg *a = (struct tcp_port_rebind_arg *)arg;
	in_port_t port;

	if (a->bind_in_ns) {
		/* Non-zero from ns_enter() means we didn't switch namespace */
		if (ns_enter(a->c->pasta_pid))
			return 0;

		for (port = 0; port < USHRT_MAX; port++) {
			if (!bitmap_isset(a->c->tcp.port_to_init, port)) {
				tcp_port_rebind_close(&tcp_sock_ns[port][V4]);
				tcp_port_rebind_close(&tcp_sock_ns[port][V6]);
				continue;
			}

			/* Don't loop back our own ports */
			if (bitmap_isset(a->c->tcp.port_to_tap, port))
				continue;

			if ((a->c->v4 && !tcp_sock_ns[port][V4]) ||
			    (a->c->v6 && !tcp_sock_ns[port][V6]))
				tcp_sock_init_one(a->c, 1, port);
		}
	} else {
		for (port = 0; port < USHRT_MAX; port++) {
			if (!bitmap_isset(a->c->tcp.port_to_tap, port)) {
				tcp_port_rebind_close(&tcp_sock_init_ext[port][V4]);
				tcp_port_rebind_close(&tcp_sock_init_ext[port][V6]);
				tcp_port_rebind_close(&tcp_sock_init_lo[port][V4]);
				tcp_port_rebind_close(&tcp_sock_init_lo[port][V6]);
				continue;
			}

			/* Don't loop back our own ports */
			if (bitmap_isset(a->c->tcp.port_to_init, port))
				continue;

			if ((a->c->v4 && !tcp_sock_init_ext[port][V4]) ||
			    (a->c->v6 && !tcp_sock_init_ext[port][V6]))
				tcp_sock_init_one(a->c, 0, port);
		}
	}

	return 0;
}
/**
* tcp_timer() - Scan activity bitmap for sockets waiting for timed events
* @c: Execution context
@ -3293,6 +3455,30 @@ void tcp_timer(struct ctx *c, struct timespec *now)
struct tcp_sock_refill_arg refill_arg = { c, 0 };
int i;
if (c->mode == MODE_PASTA) {
if (timespec_diff_ms(now, &c->tcp.port_detect_ts) >
PORT_DETECT_INTERVAL) {
struct tcp_port_detect_arg detect_arg = { c, 0 };
struct tcp_port_rebind_arg rebind_arg = { c, 0 };
if (c->tcp.init_detect_ports) {
detect_arg.detect_in_ns = 0;
tcp_port_detect(&detect_arg);
rebind_arg.bind_in_ns = 1;
NS_CALL(tcp_port_rebind, &rebind_arg);
}
if (c->tcp.ns_detect_ports) {
detect_arg.detect_in_ns = 1;
NS_CALL(tcp_port_detect, &detect_arg);
rebind_arg.bind_in_ns = 0;
tcp_port_rebind(&rebind_arg);
}
c->tcp.port_detect_ts = *now;
}
}
if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) {
tcp_sock_refill(&refill_arg);
if (c->mode == MODE_PASTA) {

6
tcp.h
View file

@ -43,22 +43,28 @@ union tcp_epoll_ref {
* @tap_conn_count: Count of tap connections in connection table
* @splice_conn_count: Count of spliced connections in connection table
* @port_to_tap: Ports bound host-side, packets to tap or spliced
* @init_detect_ports: If set, periodically detect ports bound in init
* @port_to_init: Ports bound namespace-side, spliced to init
* @ns_detect_ports: If set, periodically detect ports bound in namespace
* @timer_run: Timestamp of most recent timer run
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
* @pipe_size: Size of pipes for spliced connections
* @refill_ts: Time of last refill operation for pools of sockets/pipes
* @port_detect_ts: Time of last TCP port detection/rebind, if enabled
*/
struct tcp_ctx {
	uint64_t hash_secret[2];
	int tap_conn_count;
	int splice_conn_count;
	/* Port bitmaps hold one bit per port number: USHRT_MAX / 8 would
	 * truncate and leave the highest ports without a backing byte, so
	 * size them for USHRT_MAX + 1 bits
	 */
	uint8_t port_to_tap	[(USHRT_MAX + 1) / 8];
	int init_detect_ports;
	uint8_t port_to_init	[(USHRT_MAX + 1) / 8];
	int ns_detect_ports;
	struct timespec timer_run;
	int kernel_snd_wnd;
	size_t pipe_size;
	struct timespec refill_ts;
	struct timespec port_detect_ts;
};
#endif /* TCP_H */

4
udp.h
View file

@ -40,12 +40,16 @@ union udp_epoll_ref {
/**
 * struct udp_ctx - Execution context for UDP
 * @port_to_tap:	Ports bound host-side, data to tap or ns L4 socket
 * @init_detect_ports:	If set, periodically detect ports bound in init (TODO)
 * @port_to_init:	Ports bound namespace-side, data to init L4 socket
 * @ns_detect_ports:	If set, periodically detect ports bound in namespace
 * @timer_run:		Timestamp of most recent timer run
 *
 * Port bitmaps hold one bit per port number: USHRT_MAX / 8 would truncate and
 * leave the highest ports without a backing byte, so they are sized for
 * USHRT_MAX + 1 bits.
 */
struct udp_ctx {
	uint8_t port_to_tap	[(USHRT_MAX + 1) / 8];
	int init_detect_ports;
	uint8_t port_to_init	[(USHRT_MAX + 1) / 8];
	int ns_detect_ports;
	struct timespec timer_run;
};

8
util.c
View file

@ -266,8 +266,9 @@ int bitmap_isset(uint8_t *map, int bit)
* procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs
* @name: Corresponding name of file under /proc/net/
* @map: Bitmap where numbers of ports in listening state will be set
* @exclude: Bitmap of ports to exclude from setting (and clear)
*/
void procfs_scan_listen(char *name, uint8_t *map)
void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude)
{
char line[200], path[PATH_MAX];
unsigned long port;
@ -288,7 +289,10 @@ void procfs_scan_listen(char *name, uint8_t *map)
(strstr(name, "udp") && state != 0x07))
continue;
bitmap_set(map, port);
if (bitmap_isset(exclude, port))
bitmap_clear(map, port);
else
bitmap_set(map, port);
}
fclose(fp);

2
util.h
View file

@ -137,5 +137,5 @@ int timespec_diff_ms(struct timespec *a, struct timespec *b);
void bitmap_set(uint8_t *map, int bit);
void bitmap_clear(uint8_t *map, int bit);
int bitmap_isset(uint8_t *map, int bit);
void procfs_scan_listen(char *name, uint8_t *map);
void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude);
int ns_enter(int target_pid);