From f6bff339a95ea19852a6a5d841d141e6155f662e Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 4 Oct 2021 21:50:05 +0200
Subject: [PATCH] tcp: Adjust usage of sending buffer depending on its size

If we start with a very small sending buffer, we can make the kernel
expand it if we cause the congestion window to get bigger, but this
won't reliably happen if we use just half (other half is accounted
as overhead).

Scale usage depending on its own size, we might eventually get some
retransmissions because we can't queue messages the sender sends us
in-window, but it's better than keeping that small buffer forever.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 63 insertions(+), 12 deletions(-)

diff --git a/tcp.c b/tcp.c
index d5e47c9..5d034cb 100644
--- a/tcp.c
+++ b/tcp.c
@@ -341,6 +341,9 @@
 
 #define TCP_TAP_FRAMES			8
 
+#define RCVBUF_BIG			(2 * 1024 * 1024)
+#define SNDBUF_BIG			(2 * 1024 * 1024)
+#define SNDBUF_SMALL			(128 * 1024)
 #define MAX_PIPE_SIZE			(2 * 1024 * 1024)
 
 #define TCP_HASH_TABLE_LOAD		70		/* % */
@@ -701,6 +704,56 @@ static void tcp_splice_state(struct tcp_splice_conn *conn, enum tcp_state state)
 	conn->state = state;
 }
 
+/**
+ * tcp_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
+ * @c:		Execution context
+ */
+static void tcp_probe_mem(struct ctx *c)
+{
+	int v = INT_MAX / 2, s;
+	socklen_t sl;
+
+	if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
+		c->tcp.low_wmem = c->tcp.low_rmem = 1;
+		return;
+	}
+
+	sl = sizeof(v);
+	if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v))	||
+	    getsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, &sl) || v < SNDBUF_BIG)
+		c->tcp.low_wmem = 1;
+
+	v = INT_MAX / 2;
+	if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v))	||
+	    getsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, &sl) || v < RCVBUF_BIG)
+		c->tcp.low_rmem = 1;
+
+	close(s);
+}
+
+/**
+ * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
+ * @conn:	Connection pointer
+ */
+static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
+{
+	int s = conn->sock, v;
+	socklen_t sl;
+
+	sl = sizeof(v);
+	if (getsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, &sl)) {
+		conn->snd_buf = WINDOW_DEFAULT;
+		return;
+	}
+
+	if (v >= SNDBUF_BIG)
+		v /= 2;
+	else if (v > SNDBUF_SMALL)
+		v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
+
+	conn->snd_buf = v;
+}
+
 /**
  * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values
  * @s:		Socket, can be -1 to avoid check in the caller
@@ -1170,6 +1223,7 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags,
 	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
 	struct tcp_info info = { 0 };
 	socklen_t sl = sizeof(info);
+	int s = conn->sock;
 	struct tcphdr *th;
 	char *data;
 
@@ -1177,7 +1231,10 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags,
 	    !flags && conn->wnd_to_tap)
 		return 0;
 
-	if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info, &sl)) {
+	if (conn->snd_buf < SNDBUF_SMALL)
+		tcp_get_sndbuf(c, conn);
+
+	if (getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) {
 		tcp_rst(c, conn);
 		return -ECONNRESET;
 	}
@@ -1540,21 +1597,19 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
 		}
 
 		ev.events = EPOLLOUT | EPOLLRDHUP;
+
+		tcp_get_sndbuf(conn);
 	} else {
 		tcp_tap_state(conn, TAP_SYN_RCVD);
 
+		tcp_get_sndbuf(conn);
+
 		if (tcp_send_to_tap(c, conn, SYN | ACK, now))
 			return;
 
 		ev.events = EPOLLIN | EPOLLRDHUP;
 	}
 
-	sl = sizeof(conn->snd_buf);
-	if (getsockopt(s, SOL_SOCKET, SO_SNDBUF, &conn->snd_buf, &sl))
-		conn->snd_buf = WINDOW_DEFAULT;
-	else
-		conn->snd_buf /= 2;
-
 	conn->events = ev.events;
 	ref.tcp.index = conn - tt;
 	ev.data.u64 = ref.u64;
@@ -2642,11 +2697,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 
 	tcp_tap_state(conn, SOCK_SYN_SENT);
 
-	sl = sizeof(conn->snd_buf);
-	if (getsockopt(s, SOL_SOCKET, SO_SNDBUF, &conn->snd_buf, &sl))
-		conn->snd_buf = WINDOW_DEFAULT;
-	else
-		conn->snd_buf /= 2;
+	tcp_get_sndbuf(conn);
 }
 
 /**