From 485b5fb8f97c203a9842639fc708f32458be0c18 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 11 Aug 2023 15:12:27 +1000 Subject: [PATCH] epoll: Split handling of listening TCP sockets into their own handler tcp_sock_handler() handles both listening TCP sockets, and connected TCP sockets, but what it needs to do in those cases has essentially nothing in common. Therefore, give listening sockets their own epoll_type value and dispatch directly to their own handler from the top level. Furthermore, the two handlers need essentially entirely different information from the reference: we re-(ab)used the index field in the tcp_epoll_ref to indicate the port for the listening socket, but that's not the same meaning. So, switch listening sockets to their own reference type which we can lay out as we please. That lets us remove the listen and outbound fields from the normal (connected) tcp_epoll_ref, reducing it to just the connection table index. Signed-off-by: David Gibson Signed-off-by: Stefano Brivio --- passt.c | 8 ++++++-- passt.h | 8 ++++++-- tcp.c | 51 ++++++++++++++++++++++----------------------------- tcp.h | 25 +++++++++++++++++-------- tcp_splice.c | 4 ++-- tcp_splice.h | 2 +- util.c | 2 +- 7 files changed, 55 insertions(+), 45 deletions(-) diff --git a/passt.c b/passt.c index a700e41..5867d0e 100644 --- a/passt.c +++ b/passt.c @@ -56,7 +56,8 @@ char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE))); char *epoll_type_str[EPOLL_TYPE_MAX + 1] = { - [EPOLL_TYPE_TCP] = "TCP socket", + [EPOLL_TYPE_TCP] = "connected TCP socket", + [EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket", [EPOLL_TYPE_TCP_TIMER] = "TCP timer", [EPOLL_TYPE_UDP] = "UDP socket", [EPOLL_TYPE_ICMP] = "ICMP socket", @@ -323,7 +324,10 @@ loop: break; case EPOLL_TYPE_TCP: if (!c.no_tcp) - tcp_sock_handler(&c, ref, eventmask, &now); + tcp_sock_handler(&c, ref, eventmask); + break; + case EPOLL_TYPE_TCP_LISTEN: + tcp_listen_handler(&c, ref, &now); break; case EPOLL_TYPE_TCP_TIMER: tcp_timer_handler(&c, ref); diff --git a/passt.h b/passt.h index fc1efdb..a625d48 100644 --- a/passt.h +++ b/passt.h @@ -47,8 +47,10 @@ union epoll_ref; enum epoll_type { /* Special value to indicate an invalid type */ EPOLL_TYPE_NONE = 0, - /* TCP sockets */ + /* Connected TCP sockets */ EPOLL_TYPE_TCP, + /* Listening TCP sockets */ + EPOLL_TYPE_TCP_LISTEN, /* timerfds used for TCP timers */ EPOLL_TYPE_TCP_TIMER, /* UDP sockets */ @@ -69,7 +71,8 @@ enum epoll_type { * union epoll_ref - Breakdown of reference for epoll fd bookkeeping * @type: Type of fd (tells us what to do with events) * @fd: File descriptor number (implies < 2^24 total descriptors) - * @tcp: TCP-specific reference part + * @tcp: TCP-specific reference part (connected sockets) + * @tcp_listen: TCP-specific reference part (listening sockets) * @udp: UDP-specific reference part * @icmp: ICMP-specific reference part * @data: Data handled by protocol handlers @@ -83,6 +86,7 @@ union epoll_ref { int32_t fd:FD_REF_BITS; union { union tcp_epoll_ref tcp; + union tcp_listen_epoll_ref tcp_listen; union udp_epoll_ref udp; union icmp_epoll_ref icmp; uint32_t data; diff --git a/tcp.c b/tcp.c index 98761a2..0322842 100644 --- a/tcp.c +++ b/tcp.c @@ -2735,7 +2735,8 @@ static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr) * @sa: Peer socket address (from accept()) * @now: Current timestamp */ -static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref, +static void tcp_tap_conn_from_sock(struct ctx *c, + union tcp_listen_epoll_ref ref, struct tcp_tap_conn *conn, int s, struct sockaddr *sa, const struct timespec *now) @@ -2747,7 +2748,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref, conn_event(c, conn, SOCK_ACCEPTED); inany_from_sockaddr(&conn->addr, &conn->sock_port, sa); - conn->tap_port = ref.tcp.index; + conn->tap_port = ref.port; tcp_snat_inbound(c, &conn->addr); @@ -2765,22 +2766,20 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref, } /** - * tcp_conn_from_sock() - Handle new connection request from listening socket + * tcp_listen_handler() - Handle new connection request from listening socket * @c: Execution context * @ref: epoll reference of listening socket * @now: Current timestamp */ -static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, - const struct timespec *now) +void tcp_listen_handler(struct ctx *c, union epoll_ref ref, + const struct timespec *now) { struct sockaddr_storage sa; union tcp_conn *conn; socklen_t sl; int s; - ASSERT(ref.tcp.listen); - - if (c->tcp.conn_count >= TCP_MAX_CONNS) + if (c->no_tcp || c->tcp.conn_count >= TCP_MAX_CONNS) return; sl = sizeof(sa); @@ -2796,11 +2795,11 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn = tc + c->tcp.conn_count++; if (c->mode == MODE_PASTA && - tcp_splice_conn_from_sock(c, ref, &conn->splice, + tcp_splice_conn_from_sock(c, ref.tcp_listen, &conn->splice, s, (struct sockaddr *)&sa)) return; - tcp_tap_conn_from_sock(c, ref, &conn->tap, s, + tcp_tap_conn_from_sock(c, ref.tcp_listen, &conn->tap, s, (struct sockaddr *)&sa, now); } @@ -2926,19 +2925,10 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn, * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap - * @now: Current timestamp */ -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, - const struct timespec *now) +void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) { - union tcp_conn *conn; - - if (ref.tcp.listen) { - tcp_conn_from_sock(c, ref, now); - return; - } - - conn = tc + ref.tcp.index; + union tcp_conn *conn = tc + ref.tcp.index; if (conn->c.spliced) tcp_splice_sock_handler(c, &conn->splice, ref.fd, events); @@ -2959,8 +2949,9 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port, const struct in_addr *addr, const char *ifname) { - in_port_t idx = port + c->tcp.fwd_in.delta[port]; - union tcp_epoll_ref tref = { .listen = 1, .index = idx }; + union tcp_listen_epoll_ref tref = { + .port = port + c->tcp.fwd_in.delta[port], + }; int s; s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32); @@ -3019,9 +3010,10 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, */ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) { - in_port_t idx = port + c->tcp.fwd_out.delta[port]; - union tcp_epoll_ref tref = { .listen = 1, .outbound = 1, - .index = idx }; + union tcp_listen_epoll_ref tref = { + .port = port + c->tcp.fwd_out.delta[port], + .ns = true, + }; struct in_addr loopback = { htonl(INADDR_LOOPBACK) }; int s; @@ -3044,9 +3036,10 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) */ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) { - in_port_t idx = port + c->tcp.fwd_out.delta[port]; - union tcp_epoll_ref tref = { .listen = 1, .outbound = 1, - .index = idx }; + union tcp_listen_epoll_ref tref = { + .port = port + c->tcp.fwd_out.delta[port], + .ns = true, + }; int s; ASSERT(c->mode == MODE_PASTA); diff --git a/tcp.h b/tcp.h index 8eb7782..be296ec 100644 --- a/tcp.h +++ b/tcp.h @@ -14,8 +14,9 @@ struct ctx; void tcp_timer_handler(struct ctx *c, union epoll_ref ref); -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, - const struct timespec *now); +void tcp_listen_handler(struct ctx *c, union epoll_ref ref, + const struct timespec *now); +void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); int tcp_tap_handler(struct ctx *c, int af, const void *addr, const struct pool *p, const struct timespec *now); int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, @@ -30,16 +31,24 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, /** * union tcp_epoll_ref - epoll reference portion for TCP connections - * @listen: Set if this file descriptor is a listening socket - * @outbound: Listening socket maps to outbound, spliced connection - * @index: Index of connection in table, or port for bound sockets + * @index: Index of connection in table * @u32: Opaque u32 value of reference */ union tcp_epoll_ref { + uint32_t index:20; + uint32_t u32; +}; + +/** + * union tcp_listen_epoll_ref - epoll reference portion for TCP listening + * @port: Port number we're forwarding *to* (listening port plus delta) + * @ns: True if listening within the pasta namespace + * @u32: Opaque u32 value of reference + */ +union tcp_listen_epoll_ref { struct { - uint32_t listen:1, - outbound:1, - index:20; + in_port_t port; + bool ns; }; uint32_t u32; }; diff --git a/tcp_splice.c b/tcp_splice.c index 24995e2..64c1263 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -486,7 +486,7 @@ static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock, * Return: true if able to create a spliced connection, false otherwise * #syscalls:pasta setsockopt */ -bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, +bool tcp_splice_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref, struct tcp_splice_conn *conn, int s, const struct sockaddr *sa) { @@ -516,7 +516,7 @@ bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, c->tcp.splice_conn_count++; conn->a = s; - if (tcp_splice_new(c, conn, ref.tcp.index, ref.tcp.outbound)) + if (tcp_splice_new(c, conn, ref.port, ref.ns)) conn_flag(c, conn, CLOSING); return true; diff --git a/tcp_splice.h b/tcp_splice.h index 99dbac8..e7a583a 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -10,7 +10,7 @@ struct tcp_splice_conn; void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, int s, uint32_t events); -bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, +bool tcp_splice_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref, struct tcp_splice_conn *conn, int s, const struct sockaddr *sa); void tcp_splice_init(struct ctx *c); diff --git a/util.c b/util.c index 2cac7ba..d965f48 100644 --- a/util.c +++ b/util.c @@ -120,7 +120,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto, switch (proto) { case IPPROTO_TCP: - ref.type = EPOLL_TYPE_TCP; + ref.type = EPOLL_TYPE_TCP_LISTEN; break; case IPPROTO_UDP: ref.type = EPOLL_TYPE_UDP;