From 11be695f5c0a6a7d74e9628e9863e665f59d511f Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Wed, 30 Apr 2025 18:05:25 +0200
Subject: [PATCH] flow: fix podman issue #25959

While running piHole using podman, traffic can trigger the following
assert:

ASSSERTION FAILED in flow_alloc (flow.c:521): flow->f.state == FLOW_STATE_FREE

Backtrace shows that this happens in flow_defer_handler():

    #4  0x00005610d6f5b481 flow_alloc (passt + 0xb481)
    #5  0x00005610d6f74f86 udp_flow_from_sock (passt + 0x24f86)
    #6  0x00005610d6f737c3 udp_sock_fwd (passt + 0x237c3)
    #7  0x00005610d6f74c07 udp_flush_flow (passt + 0x24c07)
    #8  0x00005610d6f752c2 udp_flow_defer (passt + 0x252c2)
    #9  0x00005610d6f5bce1 flow_defer_handler (passt + 0xbce1)

We are trying to allocate a new flow inside the loop freeing them.

Inside the loop free_head points to the first free flow entry in the
current cluster. But if we allocate a new entry during the loop,
free_head is not updated and can point now to the entry we have just
allocated.

We can fix the problem by spliting the loop in two parts:
- first part where we can close some of them and allocate some new
  flow entries,
- second part where we free the entries closed in the previous loop
  and we aggregate the free entries to merge consecutive the clusters.

Reported-by: Martin Rijntjes <bugs@air-global.nl>
Link: https://github.com/containers/podman/issues/25959
Fixes: 9725e7988837 ("udp_flow: Don't discard packets that arrive between bind() and connect()")
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c | 109 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 58 insertions(+), 51 deletions(-)

diff --git a/flow.c b/flow.c
index 447c021..c5718e3 100644
--- a/flow.c
+++ b/flow.c
@@ -800,6 +800,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 {
 	struct flow_free_cluster *free_head = NULL;
 	unsigned *last_next = &flow_first_free;
+	bool to_free[FLOW_MAX] = { 0 };
 	bool timer = false;
 	union flow *flow;
 
@@ -810,9 +811,44 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 
 	ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
 
-	flow_foreach_slot(flow) {
+	/* Check which flows we might need to close first, but don't free them
+	 * yet as it's not safe to do that in the middle of flow_foreach().
+	 */
+	flow_foreach(flow) {
 		bool closed = false;
 
+		switch (flow->f.type) {
+		case FLOW_TYPE_NONE:
+			ASSERT(false);
+			break;
+		case FLOW_TCP:
+			closed = tcp_flow_defer(&flow->tcp);
+			break;
+		case FLOW_TCP_SPLICE:
+			closed = tcp_splice_flow_defer(&flow->tcp_splice);
+			if (!closed && timer)
+				tcp_splice_timer(c, &flow->tcp_splice);
+			break;
+		case FLOW_PING4:
+		case FLOW_PING6:
+			if (timer)
+				closed = icmp_ping_timer(c, &flow->ping, now);
+			break;
+		case FLOW_UDP:
+			closed = udp_flow_defer(c, &flow->udp, now);
+			if (!closed && timer)
+				closed = udp_flow_timer(c, &flow->udp, now);
+			break;
+		default:
+			/* Assume other flow types don't need any handling */
+			;
+		}
+
+		to_free[FLOW_IDX(flow)] = closed;
+	}
+
+	/* Second step: actually free the flows */
+	flow_foreach_slot(flow) {
 		switch (flow->f.state) {
 		case FLOW_STATE_FREE: {
 			unsigned skip = flow->free.n;
@@ -845,59 +881,30 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 			break;
 
 		case FLOW_STATE_ACTIVE:
-			/* Nothing to do */
-			break;
+			if (to_free[FLOW_IDX(flow)]) {
+				flow_set_state(&flow->f, FLOW_STATE_FREE);
+				memset(flow, 0, sizeof(*flow));
 
-		default:
-			ASSERT(false);
-		}
-
-		switch (flow->f.type) {
-		case FLOW_TYPE_NONE:
-			ASSERT(false);
-			break;
-		case FLOW_TCP:
-			closed = tcp_flow_defer(&flow->tcp);
-			break;
-		case FLOW_TCP_SPLICE:
-			closed = tcp_splice_flow_defer(&flow->tcp_splice);
-			if (!closed && timer)
-				tcp_splice_timer(c, &flow->tcp_splice);
-			break;
-		case FLOW_PING4:
-		case FLOW_PING6:
-			if (timer)
-				closed = icmp_ping_timer(c, &flow->ping, now);
-			break;
-		case FLOW_UDP:
-			closed = udp_flow_defer(c, &flow->udp, now);
-			if (!closed && timer)
-				closed = udp_flow_timer(c, &flow->udp, now);
-			break;
-		default:
-			/* Assume other flow types don't need any handling */
-			;
-		}
-
-		if (closed) {
-			flow_set_state(&flow->f, FLOW_STATE_FREE);
-			memset(flow, 0, sizeof(*flow));
-
-			if (free_head) {
-				/* Add slot to current free cluster */
-				ASSERT(FLOW_IDX(flow) ==
-				       FLOW_IDX(free_head) + free_head->n);
-				free_head->n++;
-				flow->free.n = flow->free.next = 0;
+				if (free_head) {
+					/* Add slot to current free cluster */
+					ASSERT(FLOW_IDX(flow) ==
+					    FLOW_IDX(free_head) + free_head->n);
+					free_head->n++;
+					flow->free.n = flow->free.next = 0;
+				} else {
+					/* Create new free cluster */
+					free_head = &flow->free;
+					free_head->n = 1;
+					*last_next = FLOW_IDX(flow);
+					last_next = &free_head->next;
+				}
 			} else {
-				/* Create new free cluster */
-				free_head = &flow->free;
-				free_head->n = 1;
-				*last_next = FLOW_IDX(flow);
-				last_next = &free_head->next;
+				free_head = NULL;
 			}
-		} else {
-			free_head = NULL;
+			break;
+
+		default:
+			ASSERT(false);
 		}
 	}