epoll: Split handling of listening TCP sockets into their own handler

tcp_sock_handler() handles both listening TCP sockets, and connected TCP
sockets, but what it needs to do in those cases has essentially nothing in
common.  Therefore, give listening sockets their own epoll_type value and
dispatch directly to their own handler from the top level.  Furthermore,
the two handlers need almost entirely different information from the
reference: we re-(ab)used the index field in the tcp_epoll_ref to indicate
the port for the listening socket, even though a port number is not a
connection table index.  So,
switch listening sockets to their own reference type which we can lay out
as we please.  That lets us remove the listen and outbound fields from the
normal (connected) tcp_epoll_ref, reducing it to just the connection table
index.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
David Gibson 2023-08-11 15:12:27 +10:00 committed by Stefano Brivio
parent e6f81e5578
commit 485b5fb8f9
7 changed files with 55 additions and 45 deletions

View file

@ -56,7 +56,8 @@
char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE))); char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE)));
char *epoll_type_str[EPOLL_TYPE_MAX + 1] = { char *epoll_type_str[EPOLL_TYPE_MAX + 1] = {
[EPOLL_TYPE_TCP] = "TCP socket", [EPOLL_TYPE_TCP] = "connected TCP socket",
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
[EPOLL_TYPE_TCP_TIMER] = "TCP timer", [EPOLL_TYPE_TCP_TIMER] = "TCP timer",
[EPOLL_TYPE_UDP] = "UDP socket", [EPOLL_TYPE_UDP] = "UDP socket",
[EPOLL_TYPE_ICMP] = "ICMP socket", [EPOLL_TYPE_ICMP] = "ICMP socket",
@ -323,7 +324,10 @@ loop:
break; break;
case EPOLL_TYPE_TCP: case EPOLL_TYPE_TCP:
if (!c.no_tcp) if (!c.no_tcp)
tcp_sock_handler(&c, ref, eventmask, &now); tcp_sock_handler(&c, ref, eventmask);
break;
case EPOLL_TYPE_TCP_LISTEN:
tcp_listen_handler(&c, ref, &now);
break; break;
case EPOLL_TYPE_TCP_TIMER: case EPOLL_TYPE_TCP_TIMER:
tcp_timer_handler(&c, ref); tcp_timer_handler(&c, ref);

View file

@ -47,8 +47,10 @@ union epoll_ref;
enum epoll_type { enum epoll_type {
/* Special value to indicate an invalid type */ /* Special value to indicate an invalid type */
EPOLL_TYPE_NONE = 0, EPOLL_TYPE_NONE = 0,
/* TCP sockets */ /* Connected TCP sockets */
EPOLL_TYPE_TCP, EPOLL_TYPE_TCP,
/* Listening TCP sockets */
EPOLL_TYPE_TCP_LISTEN,
/* timerfds used for TCP timers */ /* timerfds used for TCP timers */
EPOLL_TYPE_TCP_TIMER, EPOLL_TYPE_TCP_TIMER,
/* UDP sockets */ /* UDP sockets */
@ -69,7 +71,8 @@ enum epoll_type {
* union epoll_ref - Breakdown of reference for epoll fd bookkeeping * union epoll_ref - Breakdown of reference for epoll fd bookkeeping
* @type: Type of fd (tells us what to do with events) * @type: Type of fd (tells us what to do with events)
* @fd: File descriptor number (implies < 2^24 total descriptors) * @fd: File descriptor number (implies < 2^24 total descriptors)
* @tcp: TCP-specific reference part * @tcp: TCP-specific reference part (connected sockets)
* @tcp_listen: TCP-specific reference part (listening sockets)
* @udp: UDP-specific reference part * @udp: UDP-specific reference part
* @icmp: ICMP-specific reference part * @icmp: ICMP-specific reference part
* @data: Data handled by protocol handlers * @data: Data handled by protocol handlers
@ -83,6 +86,7 @@ union epoll_ref {
int32_t fd:FD_REF_BITS; int32_t fd:FD_REF_BITS;
union { union {
union tcp_epoll_ref tcp; union tcp_epoll_ref tcp;
union tcp_listen_epoll_ref tcp_listen;
union udp_epoll_ref udp; union udp_epoll_ref udp;
union icmp_epoll_ref icmp; union icmp_epoll_ref icmp;
uint32_t data; uint32_t data;

51
tcp.c
View file

@ -2735,7 +2735,8 @@ static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr)
* @sa: Peer socket address (from accept()) * @sa: Peer socket address (from accept())
* @now: Current timestamp * @now: Current timestamp
*/ */
static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref, static void tcp_tap_conn_from_sock(struct ctx *c,
union tcp_listen_epoll_ref ref,
struct tcp_tap_conn *conn, int s, struct tcp_tap_conn *conn, int s,
struct sockaddr *sa, struct sockaddr *sa,
const struct timespec *now) const struct timespec *now)
@ -2747,7 +2748,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref,
conn_event(c, conn, SOCK_ACCEPTED); conn_event(c, conn, SOCK_ACCEPTED);
inany_from_sockaddr(&conn->addr, &conn->sock_port, sa); inany_from_sockaddr(&conn->addr, &conn->sock_port, sa);
conn->tap_port = ref.tcp.index; conn->tap_port = ref.port;
tcp_snat_inbound(c, &conn->addr); tcp_snat_inbound(c, &conn->addr);
@ -2765,22 +2766,20 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref,
} }
/** /**
* tcp_conn_from_sock() - Handle new connection request from listening socket * tcp_listen_handler() - Handle new connection request from listening socket
* @c: Execution context * @c: Execution context
* @ref: epoll reference of listening socket * @ref: epoll reference of listening socket
* @now: Current timestamp * @now: Current timestamp
*/ */
static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
const struct timespec *now) const struct timespec *now)
{ {
struct sockaddr_storage sa; struct sockaddr_storage sa;
union tcp_conn *conn; union tcp_conn *conn;
socklen_t sl; socklen_t sl;
int s; int s;
ASSERT(ref.tcp.listen); if (c->no_tcp || c->tcp.conn_count >= TCP_MAX_CONNS)
if (c->tcp.conn_count >= TCP_MAX_CONNS)
return; return;
sl = sizeof(sa); sl = sizeof(sa);
@ -2796,11 +2795,11 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
conn = tc + c->tcp.conn_count++; conn = tc + c->tcp.conn_count++;
if (c->mode == MODE_PASTA && if (c->mode == MODE_PASTA &&
tcp_splice_conn_from_sock(c, ref, &conn->splice, tcp_splice_conn_from_sock(c, ref.tcp_listen, &conn->splice,
s, (struct sockaddr *)&sa)) s, (struct sockaddr *)&sa))
return; return;
tcp_tap_conn_from_sock(c, ref, &conn->tap, s, tcp_tap_conn_from_sock(c, ref.tcp_listen, &conn->tap, s,
(struct sockaddr *)&sa, now); (struct sockaddr *)&sa, now);
} }
@ -2926,19 +2925,10 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn,
* @c: Execution context * @c: Execution context
* @ref: epoll reference * @ref: epoll reference
* @events: epoll events bitmap * @events: epoll events bitmap
* @now: Current timestamp
*/ */
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
const struct timespec *now)
{ {
union tcp_conn *conn; union tcp_conn *conn = tc + ref.tcp.index;
if (ref.tcp.listen) {
tcp_conn_from_sock(c, ref, now);
return;
}
conn = tc + ref.tcp.index;
if (conn->c.spliced) if (conn->c.spliced)
tcp_splice_sock_handler(c, &conn->splice, ref.fd, events); tcp_splice_sock_handler(c, &conn->splice, ref.fd, events);
@ -2959,8 +2949,9 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port, static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port,
const struct in_addr *addr, const char *ifname) const struct in_addr *addr, const char *ifname)
{ {
in_port_t idx = port + c->tcp.fwd_in.delta[port]; union tcp_listen_epoll_ref tref = {
union tcp_epoll_ref tref = { .listen = 1, .index = idx }; .port = port + c->tcp.fwd_in.delta[port],
};
int s; int s;
s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32); s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32);
@ -3019,9 +3010,10 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
*/ */
static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
{ {
in_port_t idx = port + c->tcp.fwd_out.delta[port]; union tcp_listen_epoll_ref tref = {
union tcp_epoll_ref tref = { .listen = 1, .outbound = 1, .port = port + c->tcp.fwd_out.delta[port],
.index = idx }; .ns = true,
};
struct in_addr loopback = { htonl(INADDR_LOOPBACK) }; struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
int s; int s;
@ -3044,9 +3036,10 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
*/ */
static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
{ {
in_port_t idx = port + c->tcp.fwd_out.delta[port]; union tcp_listen_epoll_ref tref = {
union tcp_epoll_ref tref = { .listen = 1, .outbound = 1, .port = port + c->tcp.fwd_out.delta[port],
.index = idx }; .ns = true,
};
int s; int s;
ASSERT(c->mode == MODE_PASTA); ASSERT(c->mode == MODE_PASTA);

25
tcp.h
View file

@ -14,8 +14,9 @@
struct ctx; struct ctx;
void tcp_timer_handler(struct ctx *c, union epoll_ref ref); void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
const struct timespec *now); const struct timespec *now);
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
int tcp_tap_handler(struct ctx *c, int af, const void *addr, int tcp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now); const struct pool *p, const struct timespec *now);
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
@ -30,16 +31,24 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
/** /**
* union tcp_epoll_ref - epoll reference portion for TCP connections * union tcp_epoll_ref - epoll reference portion for TCP connections
* @listen: Set if this file descriptor is a listening socket * @index: Index of connection in table
* @outbound: Listening socket maps to outbound, spliced connection
* @index: Index of connection in table, or port for bound sockets
* @u32: Opaque u32 value of reference * @u32: Opaque u32 value of reference
*/ */
union tcp_epoll_ref { union tcp_epoll_ref {
uint32_t index:20;
uint32_t u32;
};
/**
* union tcp_listen_epoll_ref - epoll reference portion for TCP listening
* @port: Port number we're forwarding *to* (listening port plus delta)
* @ns: True if listening within the pasta namespace
* @u32: Opaque u32 value of reference
*/
union tcp_listen_epoll_ref {
struct { struct {
uint32_t listen:1, in_port_t port;
outbound:1, bool ns;
index:20;
}; };
uint32_t u32; uint32_t u32;
}; };

View file

@ -486,7 +486,7 @@ static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock,
* Return: true if able to create a spliced connection, false otherwise * Return: true if able to create a spliced connection, false otherwise
* #syscalls:pasta setsockopt * #syscalls:pasta setsockopt
*/ */
bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, bool tcp_splice_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref,
struct tcp_splice_conn *conn, int s, struct tcp_splice_conn *conn, int s,
const struct sockaddr *sa) const struct sockaddr *sa)
{ {
@ -516,7 +516,7 @@ bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
c->tcp.splice_conn_count++; c->tcp.splice_conn_count++;
conn->a = s; conn->a = s;
if (tcp_splice_new(c, conn, ref.tcp.index, ref.tcp.outbound)) if (tcp_splice_new(c, conn, ref.port, ref.ns))
conn_flag(c, conn, CLOSING); conn_flag(c, conn, CLOSING);
return true; return true;

View file

@ -10,7 +10,7 @@ struct tcp_splice_conn;
void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn,
int s, uint32_t events); int s, uint32_t events);
bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, bool tcp_splice_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref,
struct tcp_splice_conn *conn, int s, struct tcp_splice_conn *conn, int s,
const struct sockaddr *sa); const struct sockaddr *sa);
void tcp_splice_init(struct ctx *c); void tcp_splice_init(struct ctx *c);

2
util.c
View file

@ -120,7 +120,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
switch (proto) { switch (proto) {
case IPPROTO_TCP: case IPPROTO_TCP:
ref.type = EPOLL_TYPE_TCP; ref.type = EPOLL_TYPE_TCP_LISTEN;
break; break;
case IPPROTO_UDP: case IPPROTO_UDP:
ref.type = EPOLL_TYPE_UDP; ref.type = EPOLL_TYPE_UDP;