pif: Record originating pif in listening socket refs

For certain socket types, we record in the epoll ref whether they're
sockets in the namespace, or on the host.  We now have the notion of "pif"
to indicate what "place" a socket is associated with, so generalise the
simple one-bit 'ns' to a pif id.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
David Gibson 2023-11-07 12:40:15 +11:00 committed by Stefano Brivio
parent 125c5e52a5
commit 732e249376
6 changed files with 28 additions and 23 deletions

View file

@ -35,6 +35,7 @@ union epoll_ref;
#include <assert.h> #include <assert.h>
#include <sys/epoll.h> #include <sys/epoll.h>
#include "pif.h"
#include "packet.h" #include "packet.h"
#include "icmp.h" #include "icmp.h"
#include "port_fwd.h" #include "port_fwd.h"

5
tcp.c
View file

@ -2964,6 +2964,7 @@ static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port,
{ {
union tcp_listen_epoll_ref tref = { union tcp_listen_epoll_ref tref = {
.port = port + c->tcp.fwd_in.delta[port], .port = port + c->tcp.fwd_in.delta[port],
.pif = PIF_HOST,
}; };
int s; int s;
@ -3025,7 +3026,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
{ {
union tcp_listen_epoll_ref tref = { union tcp_listen_epoll_ref tref = {
.port = port + c->tcp.fwd_out.delta[port], .port = port + c->tcp.fwd_out.delta[port],
.ns = true, .pif = PIF_SPLICE,
}; };
struct in_addr loopback = { htonl(INADDR_LOOPBACK) }; struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
int s; int s;
@ -3051,7 +3052,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
{ {
union tcp_listen_epoll_ref tref = { union tcp_listen_epoll_ref tref = {
.port = port + c->tcp.fwd_out.delta[port], .port = port + c->tcp.fwd_out.delta[port],
.ns = true, .pif = PIF_SPLICE,
}; };
int s; int s;

4
tcp.h
View file

@ -41,13 +41,13 @@ union tcp_epoll_ref {
/** /**
* union tcp_listen_epoll_ref - epoll reference portion for TCP listening * union tcp_listen_epoll_ref - epoll reference portion for TCP listening
* @port: Port number we're forwarding *to* (listening port plus delta) * @port: Port number we're forwarding *to* (listening port plus delta)
* @ns: True if listening within the pasta namespace * @pif: pif in which the socket is listening
* @u32: Opaque u32 value of reference * @u32: Opaque u32 value of reference
*/ */
union tcp_listen_epoll_ref { union tcp_listen_epoll_ref {
struct { struct {
in_port_t port; in_port_t port;
bool ns; uint8_t pif;
}; };
uint32_t u32; uint32_t u32;
}; };

View file

@ -411,12 +411,12 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
* @port: Destination port, host order * @port: Destination port, host order
* @outbound: Connection request coming from namespace * @pif: Originating pif of the splice
* *
* Return: return code from connect() * Return: return code from connect()
*/ */
static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
in_port_t port, int outbound) in_port_t port, uint8_t pif)
{ {
int s = -1; int s = -1;
@ -427,7 +427,7 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
* entering the ns anyway, so we might as well refill the * entering the ns anyway, so we might as well refill the
* pool. * pool.
*/ */
if (outbound) { if (pif == PIF_SPLICE) {
int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
int af = CONN_V6(conn) ? AF_INET6 : AF_INET; int af = CONN_V6(conn) ? AF_INET6 : AF_INET;
@ -437,6 +437,8 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
} else { } else {
int *p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4; int *p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
ASSERT(pif == PIF_HOST);
/* If pool is empty, refill it first */ /* If pool is empty, refill it first */
if (p[TCP_SOCK_POOL_SIZE-1] < 0) if (p[TCP_SOCK_POOL_SIZE-1] < 0)
NS_CALL(tcp_sock_refill_ns, c); NS_CALL(tcp_sock_refill_ns, c);
@ -516,7 +518,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
conn->c.spliced = true; conn->c.spliced = true;
conn->a = s; conn->a = s;
if (tcp_splice_new(c, conn, ref.port, ref.ns)) if (tcp_splice_new(c, conn, ref.port, ref.pif))
conn_flag(c, conn, CLOSING); conn_flag(c, conn, CLOSING);
return true; return true;

23
udp.c
View file

@ -365,7 +365,7 @@ static void udp_sock6_iov_init(const struct ctx *c)
* @c: Execution context * @c: Execution context
* @v6: Set for IPv6 sockets * @v6: Set for IPv6 sockets
* @src: Source port of original connection, host order * @src: Source port of original connection, host order
* @splice: UDP_BACK_TO_INIT from init, UDP_BACK_TO_NS from namespace * @ns: Does the splice originate in the ns or not
* *
* Return: prepared socket, negative error code on failure * Return: prepared socket, negative error code on failure
* *
@ -375,16 +375,17 @@ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
{ {
struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP }; struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP };
union epoll_ref ref = { .type = EPOLL_TYPE_UDP, union epoll_ref ref = { .type = EPOLL_TYPE_UDP,
.udp = { .splice = true, .ns = ns, .udp = { .splice = true, .v6 = v6, .port = src }
.v6 = v6, .port = src }
}; };
struct udp_splice_port *sp; struct udp_splice_port *sp;
int act, s; int act, s;
if (ns) { if (ns) {
ref.udp.pif = PIF_SPLICE;
sp = &udp_splice_ns[v6 ? V6 : V4][src]; sp = &udp_splice_ns[v6 ? V6 : V4][src];
act = UDP_ACT_SPLICE_NS; act = UDP_ACT_SPLICE_NS;
} else { } else {
ref.udp.pif = PIF_HOST;
sp = &udp_splice_init[v6 ? V6 : V4][src]; sp = &udp_splice_init[v6 ? V6 : V4][src];
act = UDP_ACT_SPLICE_INIT; act = UDP_ACT_SPLICE_INIT;
} }
@ -495,15 +496,15 @@ static int udp_mmh_splice_port(bool v6, const struct mmsghdr *mmh)
* @n: Number of datagrams to send * @n: Number of datagrams to send
* @src: Datagrams will be sent from this port (on origin side) * @src: Datagrams will be sent from this port (on origin side)
* @dst: Datagrams will be sent to this port (on destination side) * @dst: Datagrams will be sent to this port (on destination side)
* @from_pif: pif from which the packet originated
* @v6: Send as IPv6? * @v6: Send as IPv6?
* @from_ns: If true send from pasta ns to init, otherwise reverse
* @allow_new: If true create sending socket if needed, if false discard * @allow_new: If true create sending socket if needed, if false discard
* if no sending socket is available * if no sending socket is available
* @now: Timestamp * @now: Timestamp
*/ */
static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
in_port_t src, in_port_t dst, in_port_t src, in_port_t dst, uint8_t from_pif,
bool v6, bool from_ns, bool allow_new, bool v6, bool allow_new,
const struct timespec *now) const struct timespec *now)
{ {
struct mmsghdr *mmh_recv, *mmh_send; struct mmsghdr *mmh_recv, *mmh_send;
@ -518,7 +519,7 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
mmh_send = udp4_mh_splice; mmh_send = udp4_mh_splice;
} }
if (from_ns) { if (from_pif == PIF_SPLICE) {
src += c->udp.fwd_in.rdelta[src]; src += c->udp.fwd_in.rdelta[src];
s = udp_splice_init[v6][src].sock; s = udp_splice_init[v6][src].sock;
if (!s && allow_new) if (!s && allow_new)
@ -530,6 +531,7 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
udp_splice_ns[v6][dst].ts = now->tv_sec; udp_splice_ns[v6][dst].ts = now->tv_sec;
udp_splice_init[v6][src].ts = now->tv_sec; udp_splice_init[v6][src].ts = now->tv_sec;
} else { } else {
ASSERT(from_pif == PIF_HOST);
src += c->udp.fwd_out.rdelta[src]; src += c->udp.fwd_out.rdelta[src];
s = udp_splice_ns[v6][src].sock; s = udp_splice_ns[v6][src].sock;
if (!s && allow_new) { if (!s && allow_new) {
@ -776,7 +778,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
if (splicefrom >= 0) if (splicefrom >= 0)
udp_splice_sendfrom(c, i, m, splicefrom, dstport, udp_splice_sendfrom(c, i, m, splicefrom, dstport,
v6, ref.udp.ns, ref.udp.orig, now); ref.udp.pif, v6, ref.udp.orig, now);
else else
udp_tap_send(c, i, m, dstport, v6, now); udp_tap_send(c, i, m, dstport, v6, now);
} }
@ -974,8 +976,10 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
if (ns) { if (ns) {
uref.pif = PIF_SPLICE;
uref.port = (in_port_t)(port + c->udp.fwd_out.f.delta[port]); uref.port = (in_port_t)(port + c->udp.fwd_out.f.delta[port]);
} else { } else {
uref.pif = PIF_HOST;
uref.port = (in_port_t)(port + c->udp.fwd_in.f.delta[port]); uref.port = (in_port_t)(port + c->udp.fwd_in.f.delta[port]);
} }
@ -990,7 +994,6 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
udp_splice_init[V4][port].sock = s < 0 ? -1 : s; udp_splice_init[V4][port].sock = s < 0 ? -1 : s;
} else { } else {
struct in_addr loopback = { htonl(INADDR_LOOPBACK) }; struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
uref.ns = true;
r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, &loopback, r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, &loopback,
ifname, port, uref.u32); ifname, port, uref.u32);
@ -1008,8 +1011,6 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
udp_tap_map[V6][uref.port].sock = s < 0 ? -1 : s; udp_tap_map[V6][uref.port].sock = s < 0 ? -1 : s;
udp_splice_init[V6][port].sock = s < 0 ? -1 : s; udp_splice_init[V6][port].sock = s < 0 ? -1 : s;
} else { } else {
uref.ns = true;
r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP,
&in6addr_loopback, &in6addr_loopback,
ifname, port, uref.u32); ifname, port, uref.u32);

8
udp.h
View file

@ -20,21 +20,21 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
/** /**
* union udp_epoll_ref - epoll reference portion for UDP sockets * union udp_epoll_ref - epoll reference portion for UDP sockets
* @port: Source port for connected sockets, bound port otherwise
* @pif: pif for this socket
* @bound: Set if this file descriptor is a bound socket * @bound: Set if this file descriptor is a bound socket
* @splice: Set if packets on this descriptor are to be "spliced" * @splice: Set if packets on this descriptor are to be "spliced"
* @orig: Set if a spliced socket which can originate "connections" * @orig: Set if a spliced socket which can originate "connections"
* @ns: Set if this is a socket in the pasta network namespace
* @v6: Set for IPv6 sockets or connections * @v6: Set for IPv6 sockets or connections
* @port: Source port for connected sockets, bound port otherwise
* @u32: Opaque u32 value of reference * @u32: Opaque u32 value of reference
*/ */
union udp_epoll_ref { union udp_epoll_ref {
struct { struct {
in_port_t port;
uint8_t pif;
bool splice:1, bool splice:1,
orig:1, orig:1,
ns:1,
v6:1; v6:1;
uint32_t port:16;
}; };
uint32_t u32; uint32_t u32;
}; };