From 54a9d3801b9549e68bd169e2c938c265ef46e973 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Tue, 18 Jun 2024 12:32:17 +0200 Subject: [PATCH] tcp: Don't rely on bind() to fail to decide that connection target is valid Commit e1a2e2780c91 ("tcp: Check if connection is local or low RTT was seen before using large MSS") added a call to bind() before we issue a connect() to the target for an outbound connection. If bind() fails, but neither with EADDRNOTAVAIL, nor with EACCESS, we can conclude that the target address is a local (host) address, and we can use an unlimited MSS. While at it, according to the reasoning of that commit, if bind() succeeds, we would know right away that nobody is listening at that (local) address and port, and we don't even need to call connect(): we can just fail early and reset the connection attempt. But if non-local binds are enabled via net.ipv4.ip_nonlocal_bind or net.ipv6.ip_nonlocal_bind sysctl, binding to a non-local address will actually succeed, so we can't rely on it to fail in general. The visible issue with the existing behaviour is that we would reset any outbound connection to non-local addresses, if non-local binds are enabled. Keep the significant optimisation for local addresses along with the bind() call, but if it succeeds, don't draw any conclusion: close the socket, grab another one, and proceed normally. This will incur a small latency penalty if non-local binds are enabled (we'll likely fetch an existing socket from the pool but additionally call close()), or if the target is local but not bound: we'll need to call connect() and get a failure before relaying that failure back. Link: https://github.com/containers/podman/issues/23003 Signed-off-by: Stefano Brivio Reviewed-by: David Gibson --- tcp.c | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/tcp.c b/tcp.c index 6852423..a2e81d5 100644 --- a/tcp.c +++ b/tcp.c @@ -1631,6 +1631,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, flow_initiate(flow, PIF_TAP); + flow_target(flow, PIF_HOST); + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + if (af == AF_INET) { if (IN4_IS_ADDR_UNSPECIFIED(saddr) || IN4_IS_ADDR_BROADCAST(saddr) || @@ -1647,6 +1650,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, dstport); goto cancel; } + + sa = (struct sockaddr *)&addr4; + sl = sizeof(addr4); } else if (af == AF_INET6) { if (IN6_IS_ADDR_UNSPECIFIED(saddr) || IN6_IS_ADDR_MULTICAST(saddr) || srcport == 0 || @@ -1661,6 +1667,11 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, dstport); goto cancel; } + + sa = (struct sockaddr *)&addr6; + sl = sizeof(addr6); + } else { + ASSERT(0); } if ((s = tcp_conn_sock(c, af)) < 0) @@ -1673,6 +1684,26 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, addr6.sin6_addr = in6addr_loopback; } + /* Use bind() to check if the target address is local (EADDRINUSE or + * similar) and already bound, and set the LOCAL flag in that case. + * + * If bind() succeeds, in general, we could infer that nobody (else) is + * listening on that address and port and reset the connection attempt + * early, but we can't rely on that if non-local binds are enabled, + * because bind() would succeed for any non-local address we can reach. + * + * So, if bind() succeeds, close the socket, get a new one, and proceed. + */ + if (bind(s, sa, sl)) { + if (errno != EADDRNOTAVAIL && errno != EACCES) + conn_flag(c, conn, LOCAL); + } else { + /* Not a local, bound destination, inconclusive test */ + close(s); + if ((s = tcp_conn_sock(c, af)) < 0) + goto cancel; + } + if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) { struct sockaddr_in6 addr6_ll = { .sin6_family = AF_INET6, @@ -1683,8 +1714,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, goto cancel; } - flow_target(flow, PIF_HOST); - conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); @@ -1706,14 +1735,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, inany_from_af(&conn->faddr, af, daddr); - if (af == AF_INET) { - sa = (struct sockaddr *)&addr4; - sl = sizeof(addr4); - } else { - sa = (struct sockaddr *)&addr6; - sl = sizeof(addr6); - } - conn->fport = dstport; conn->eport = srcport; @@ -1726,13 +1747,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, tcp_hash_insert(c, conn); - if (!bind(s, sa, sl)) { - tcp_rst(c, conn); /* Nobody is listening then */ - goto cancel; - } - if (errno != EADDRNOTAVAIL && errno != EACCES) - conn_flag(c, conn, LOCAL); - if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) || (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) && !IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)))