tcp: Use the same sockets to listen for spliced and non-spliced connections

In pasta mode, tcp_sock_init[46]() create separate sockets to listen for
spliced connections (these are bound to localhost) and non-spliced
connections (these are bound to the host address).  This introduces a
subtle behavioural difference between pasta and passt: by default, pasta
will listen only on a single host address, whereas passt will listen on
all addresses (0.0.0.0 or ::).  This also prevents us from using some additional
optimizations that only work with the unspecified (0.0.0.0 or ::) address.

However, it turns out we don't need to do this.  We can splice a connection
if and only if it originates from the loopback address.  Currently we
ensure this by having the "spliced" listening sockets listen only on
loopback.  Instead, defer the decision about whether to splice a connection
until after accept(), by checking if the connection was made from the
loopback address.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
David Gibson 2022-11-17 16:58:52 +11:00 committed by Stefano Brivio
parent 356c6e0677
commit d909fda1e8
3 changed files with 56 additions and 102 deletions

99
tcp.c
View file

@ -434,7 +434,6 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
}; };
/* Listening sockets, used for automatic port forwarding in pasta mode only */ /* Listening sockets, used for automatic port forwarding in pasta mode only */
static int tcp_sock_init_lo [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
@ -2851,19 +2850,29 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
socklen_t sl; socklen_t sl;
int s; int s;
assert(ref.r.p.tcp.tcp.listen);
assert(!ref.r.p.tcp.tcp.splice);
if (c->tcp.conn_count >= TCP_MAX_CONNS) if (c->tcp.conn_count >= TCP_MAX_CONNS)
return; return;
sl = sizeof(sa); sl = sizeof(sa);
/* FIXME: Workaround clang-tidy not realizing that accept4()
* writes the socket address. See
* https://github.com/llvm/llvm-project/issues/58992
*/
memset(&sa, 0, sizeof(struct sockaddr_in6));
s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
if (s < 0) if (s < 0)
return; return;
conn = tc + c->tcp.conn_count++; conn = tc + c->tcp.conn_count++;
if (ref.r.p.tcp.tcp.splice) if (c->mode == MODE_PASTA &&
tcp_splice_conn_from_sock(c, ref, &conn->splice, s); tcp_splice_conn_from_sock(c, ref, &conn->splice,
else s, (struct sockaddr *)&sa))
return;
tcp_tap_conn_from_sock(c, ref, &conn->tap, s, tcp_tap_conn_from_sock(c, ref, &conn->tap, s,
(struct sockaddr *)&sa, now); (struct sockaddr *)&sa, now);
} }
@ -3018,22 +3027,9 @@ static void tcp_sock_init4(const struct ctx *c, const struct in_addr *addr,
{ {
in_port_t idx = port + c->tcp.fwd_in.delta[port]; in_port_t idx = port + c->tcp.fwd_in.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.index = idx }; union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.index = idx };
bool spliced = false, tap = true;
int s; int s;
if (c->mode == MODE_PASTA) { s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port, tref.u32);
spliced = !addr || IN4_IS_ADDR_UNSPECIFIED(addr) ||
IN4_IS_ADDR_LOOPBACK(addr);
if (!addr)
addr = &c->ip4.addr;
tap = !IN4_IS_ADDR_LOOPBACK(addr);
}
if (tap) {
s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port,
tref.u32);
if (s >= 0) if (s >= 0)
tcp_sock_set_bufsize(c, s); tcp_sock_set_bufsize(c, s);
else else
@ -3043,24 +3039,6 @@ static void tcp_sock_init4(const struct ctx *c, const struct in_addr *addr,
tcp_sock_init_ext[port][V4] = s; tcp_sock_init_ext[port][V4] = s;
} }
if (spliced) {
struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
tref.tcp.splice = 1;
addr = &loopback;
s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port,
tref.u32);
if (s >= 0)
tcp_sock_set_bufsize(c, s);
else
s = -1;
if (c->tcp.fwd_out.mode == FWD_AUTO)
tcp_sock_init_lo[port][V4] = s;
}
}
/** /**
* tcp_sock_init6() - Initialise listening sockets for a given IPv6 port * tcp_sock_init6() - Initialise listening sockets for a given IPv6 port
* @c: Execution context * @c: Execution context
@ -3075,23 +3053,9 @@ static void tcp_sock_init6(const struct ctx *c,
in_port_t idx = port + c->tcp.fwd_in.delta[port]; in_port_t idx = port + c->tcp.fwd_in.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.v6 = 1, union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.v6 = 1,
.tcp.index = idx }; .tcp.index = idx };
bool spliced = false, tap = true;
int s; int s;
if (c->mode == MODE_PASTA) { s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port, tref.u32);
spliced = !addr ||
IN6_IS_ADDR_UNSPECIFIED(addr) ||
IN6_IS_ADDR_LOOPBACK(addr);
if (!addr)
addr = &c->ip6.addr;
tap = !IN6_IS_ADDR_LOOPBACK(addr);
}
if (tap) {
s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port,
tref.u32);
if (s >= 0) if (s >= 0)
tcp_sock_set_bufsize(c, s); tcp_sock_set_bufsize(c, s);
else else
@ -3101,23 +3065,6 @@ static void tcp_sock_init6(const struct ctx *c,
tcp_sock_init_ext[port][V6] = s; tcp_sock_init_ext[port][V6] = s;
} }
if (spliced) {
tref.tcp.splice = 1;
addr = &in6addr_loopback;
s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port,
tref.u32);
if (s >= 0)
tcp_sock_set_bufsize(c, s);
else
s = -1;
if (c->tcp.fwd_out.mode == FWD_AUTO)
tcp_sock_init_lo[port][V6] = s;
}
}
/** /**
* tcp_sock_init() - Create listening sockets for a given host ("inbound") port * tcp_sock_init() - Create listening sockets for a given host ("inbound") port
* @c: Execution context * @c: Execution context
@ -3144,7 +3091,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
{ {
in_port_t idx = port + c->tcp.fwd_out.delta[port]; in_port_t idx = port + c->tcp.fwd_out.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1, union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
.tcp.splice = 1, .tcp.index = idx }; .tcp.index = idx };
struct in_addr loopback = { htonl(INADDR_LOOPBACK) }; struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
int s; int s;
@ -3169,8 +3116,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
{ {
in_port_t idx = port + c->tcp.fwd_out.delta[port]; in_port_t idx = port + c->tcp.fwd_out.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1, union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
.tcp.splice = 1, .tcp.v6 = 1, .tcp.v6 = 1, .tcp.index = idx };
.tcp.index = idx };
int s; int s;
assert(c->mode == MODE_PASTA); assert(c->mode == MODE_PASTA);
@ -3337,7 +3283,6 @@ int tcp_init(struct ctx *c)
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4)); memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4));
memset(ns_sock_pool6, 0xff, sizeof(ns_sock_pool6)); memset(ns_sock_pool6, 0xff, sizeof(ns_sock_pool6));
memset(tcp_sock_init_lo, 0xff, sizeof(tcp_sock_init_lo));
memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext)); memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns)); memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
@ -3445,16 +3390,6 @@ static int tcp_port_rebind(void *arg)
close(tcp_sock_init_ext[port][V6]); close(tcp_sock_init_ext[port][V6]);
tcp_sock_init_ext[port][V6] = -1; tcp_sock_init_ext[port][V6] = -1;
} }
if (tcp_sock_init_lo[port][V4] >= 0) {
close(tcp_sock_init_lo[port][V4]);
tcp_sock_init_lo[port][V4] = -1;
}
if (tcp_sock_init_lo[port][V6] >= 0) {
close(tcp_sock_init_lo[port][V6]);
tcp_sock_init_lo[port][V6] = -1;
}
continue; continue;
} }

View file

@ -502,30 +502,48 @@ static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock,
} }
/** /**
* tcp_splice_conn_from_sock() - Initialize state for spliced connection * tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection
* @c: Execution context * @c: Execution context
* @ref: epoll reference of listening socket * @ref: epoll reference of listening socket
* @conn: connection structure to initialize * @conn: connection structure to initialize
* @s: Accepted socket * @s: Accepted socket
* @sa: Peer address of connection
* *
* Return: true if able to create a spliced connection, false otherwise
* #syscalls:pasta setsockopt * #syscalls:pasta setsockopt
*/ */
void tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
struct tcp_splice_conn *conn, int s) struct tcp_splice_conn *conn, int s,
const struct sockaddr *sa)
{ {
assert(c->mode == MODE_PASTA); assert(c->mode == MODE_PASTA);
if (ref.r.p.tcp.tcp.v6) {
const struct sockaddr_in6 *sa6;
sa6 = (const struct sockaddr_in6 *)sa;
if (!IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
return false;
conn->flags = SPLICE_V6;
} else {
const struct sockaddr_in *sa4 = (const struct sockaddr_in *)sa;
if (!IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr))
return false;
conn->flags = 0;
}
if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int)))
trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s); trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s);
conn->c.spliced = true; conn->c.spliced = true;
c->tcp.splice_conn_count++; c->tcp.splice_conn_count++;
conn->a = s; conn->a = s;
conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0;
if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index, if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index,
ref.r.p.tcp.tcp.outbound)) ref.r.p.tcp.tcp.outbound))
conn_flag(c, conn, CLOSING); conn_flag(c, conn, CLOSING);
return true;
} }
/** /**

View file

@ -10,8 +10,9 @@ struct tcp_splice_conn;
void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
uint32_t events); uint32_t events);
void tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
struct tcp_splice_conn *conn, int s); struct tcp_splice_conn *conn, int s,
const struct sockaddr *sa);
void tcp_splice_init(struct ctx *c); void tcp_splice_init(struct ctx *c);
#endif /* TCP_SPLICE_H */ #endif /* TCP_SPLICE_H */