From a8f4fc481ce3afbf48522a0af44d222d665b515e Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 16 Jan 2025 20:47:00 +0100
Subject: [PATCH] tcp: Mask EPOLLIN altogether if we're blocked waiting on an
 ACK from the guest

There are pretty much two cases of the (misnomer) STALLED: in one
case, we could send more data to the guest if it becomes available,
and in another case, we can't, because we filled the window.

If, in this second case, we keep EPOLLIN enabled, but never read from
the socket, we get short but CPU-annoying storms of EPOLLIN events,
upon which we reschedule the ACK timeout handler, never read from the
socket, go back to epoll_wait(), and so on:

  timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0
  epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1
  timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0
  epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1
  timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0
  epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1

also known as:

  29.1517: Flow 2 (TCP connection): timer expires in 2.000s
  29.1517: Flow 2 (TCP connection): timer expires in 2.000s
  29.1517: Flow 2 (TCP connection): timer expires in 2.000s

which, for some reason, becomes very visible with muvm and aria2c
downloading from a server nearby in parallel chunks.

That's because EPOLLIN isn't cleared if we don't read from the socket,
and even with EPOLLET, epoll_wait() will repeatedly wake us up until
we actually read something.

In this case, we don't want to subscribe to EPOLLIN at all: all we're
waiting for is an ACK segment from the guest. Differentiate this case
with a new connection flag, ACK_FROM_TAP_BLOCKS, which doesn't just
indicate that we're waiting for an ACK from the guest
(ACK_FROM_TAP_DUE), but also that we're blocked waiting for it.

If this flag is set before we set STALLED, EPOLLIN will be masked
while we set EPOLLET because of STALLED. Whenever we clear STALLED,
we also clear this flag.

This is definitely not elegant, but it's a minimal fix.

We can probably simplify this at a later point by having a category
of connection flags directly corresponding to epoll flags, and
dropping STALLED altogether, or, perhaps, always using EPOLLET (but
we need a mechanism to re-check sockets for pending data if we can't
temporarily write to the guest).

I suspect that this might also be implied in
https://github.com/containers/podman/issues/23686, hence the Link:
tag. It doesn't necessarily mean I'm fixing it (I can't reproduce
that).

Link: https://github.com/containers/podman/issues/23686
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c      | 8 ++++++--
 tcp_buf.c  | 2 ++
 tcp_conn.h | 1 +
 tcp_vu.c   | 2 ++
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index ef33388..3b3193a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -345,7 +345,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
 
 static const char *tcp_flag_str[] __attribute((__unused__)) = {
 	"STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE",
-	"ACK_FROM_TAP_DUE",
+	"ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS",
 };
 
 /* Listening sockets, used for automatic port forwarding in pasta mode only */
@@ -436,8 +436,12 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
 		if (events & TAP_FIN_SENT)
 			return EPOLLET;
 
-		if (conn_flags & STALLED)
+		if (conn_flags & STALLED) {
+			if (conn_flags & ACK_FROM_TAP_BLOCKS)
+				return EPOLLRDHUP | EPOLLET;
+
 			return EPOLLIN | EPOLLRDHUP | EPOLLET;
+		}
 
 		return EPOLLIN | EPOLLRDHUP;
 	}
diff --git a/tcp_buf.c b/tcp_buf.c
index 8c15101..cbefa42 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -309,6 +309,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	}
 
 	if (!wnd_scaled || already_sent >= wnd_scaled) {
+		conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
 		conn_flag(c, conn, STALLED);
 		conn_flag(c, conn, ACK_FROM_TAP_DUE);
 		return 0;
@@ -387,6 +388,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
+	conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
 	conn_flag(c, conn, ~STALLED);
 
 	send_bufs = DIV_ROUND_UP(len, mss);
diff --git a/tcp_conn.h b/tcp_conn.h
index 6ae0511..d342680 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -77,6 +77,7 @@ struct tcp_tap_conn {
 #define ACTIVE_CLOSE		BIT(2)
 #define ACK_TO_TAP_DUE		BIT(3)
 #define ACK_FROM_TAP_DUE	BIT(4)
+#define ACK_FROM_TAP_BLOCKS	BIT(5)
 
 #define SNDBUF_BITS		24
 	unsigned int	sndbuf		:SNDBUF_BITS;
diff --git a/tcp_vu.c b/tcp_vu.c
index 8256f53..a216bb1 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -381,6 +381,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	}
 
 	if (!wnd_scaled || already_sent >= wnd_scaled) {
+		conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
 		conn_flag(c, conn, STALLED);
 		conn_flag(c, conn, ACK_FROM_TAP_DUE);
 		return 0;
@@ -423,6 +424,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
+	conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
 	conn_flag(c, conn, ~STALLED);
 
 	/* Likely, some new data was acked too. */