tcp: Fast re-transmit, more fixes for closing states and no_snd_wnd

...and while at it, fix an issue in the calculation of the last IOV
buffer size: if we can't receive enough data to fill up the window,
the last buffer can be filled completely.

Also streamline the code setting iovec lengths if cached values are
not matching.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2021-08-04 01:35:45 +02:00
parent 0017bc3c3e
commit 539dcf5add

94
tcp.c
View file

@ -1545,12 +1545,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
if (!conn->tap_window || already_sent >= conn->tap_window)
return 1;
return 0;
fill_bufs = DIV_ROUND_UP(conn->tap_window - already_sent,
conn->mss_guest);
if (fill_bufs > TCP_TAP_FRAMES)
if (fill_bufs > TCP_TAP_FRAMES) {
fill_bufs = TCP_TAP_FRAMES;
iov_rem = 0;
} else {
iov_rem = (conn->tap_window - already_sent) % conn->mss_guest;
}
/* Adjust iovec length for recvmsg() based on what was set last time. */
if (v4) {
@ -1562,21 +1566,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
buf_mss = &tcp6_l2_buf_mss;
buf_mss_nr_set = &tcp6_l2_buf_mss_nr_set;
}
if (*buf_mss != conn->mss_guest) {
for (i = 0; i < fill_bufs; i++)
if (*buf_mss != conn->mss_guest)
*buf_mss_nr_set = 0;
for (i = *buf_mss_nr_set; i < fill_bufs; i++)
iov[i].iov_len = conn->mss_guest;
*buf_mss = conn->mss_guest;
*buf_mss_nr_set = fill_bufs - 1;
} else if (*buf_mss_nr_set < fill_bufs) {
for (i = *buf_mss_nr_set; i < fill_bufs; i++)
iov[i].iov_len = conn->mss_guest;
*buf_mss_nr_set = fill_bufs - 1;
}
/* First buffer is to discard data, last one may be partially filled. */
iov[-1].iov_len = already_sent;
iov_rem = (conn->tap_window - already_sent) % conn->mss_guest;
if (iov_rem && fill_bufs < TCP_TAP_FRAMES)
if (iov_rem)
iov[fill_bufs - 1].iov_len = iov_rem;
if (v4)
tcp4_l2_mh_sock.msg_iovlen = fill_bufs + 1;
@ -1616,27 +1615,28 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
buf_mss_tap = &tcp6_l2_buf_mss_tap;
buf_mss_tap_nr_set = &tcp6_l2_buf_mss_tap_nr_set;
}
if (*buf_mss_tap != mss_tap) {
for (i = 0; i < send_bufs; i++)
if (*buf_mss_tap != mss_tap)
*buf_mss_tap_nr_set = 0;
for (i = *buf_mss_tap_nr_set; i < send_bufs; i++)
iov_tap[i].iov_len = mss_tap;
*buf_mss_tap = mss_tap;
*buf_mss_tap_nr_set = send_bufs;
} else if (*buf_mss_tap_nr_set < send_bufs) {
for (i = *buf_mss_tap_nr_set; i < send_bufs; i++)
iov_tap[i].iov_len = mss_tap;
*buf_mss_tap_nr_set = send_bufs;
}
iov_tap[send_bufs - 1].iov_len = mss_tap - conn->mss_guest + last_len;
/* Likely, some new data was acked too. */
if (conn->seq_from_tap != conn->seq_ack_to_tap) {
if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info, &sl))
if (conn->no_snd_wnd) {
conn->seq_ack_to_tap = conn->seq_from_tap;
} else {
if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info,
&sl))
goto err;
conn->tcpi_acked_last = info.tcpi_bytes_acked;
conn->seq_ack_to_tap = info.tcpi_bytes_acked +
conn->seq_init_from_tap;
}
} else {
info.tcpi_snd_wscale = conn->ws;
info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
@ -1776,11 +1776,14 @@ err:
goto out_restore_iov;
zero_len:
if (conn->state >= ESTABLISHED_SOCK_FIN)
goto out_restore_iov;
if (conn->state == FIN_WAIT_1) {
tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN);
} else if (conn->state < ESTABLISHED_SOCK_FIN) {
tcp_tap_state(conn, ESTABLISHED_SOCK_FIN);
shutdown(conn->sock, SHUT_RD);
tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
}
goto out_restore_iov;
out:
@ -1807,10 +1810,11 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
struct tap_msg *msg, int count,
struct timespec *now)
{
int i, iov_i, keep = -1, ack = 0, fin = 0, retr = 0;
struct msghdr mh = { .msg_iov = tcp_tap_iov };
uint32_t max_ack_seq = conn->seq_ack_from_tap;
uint32_t seq_from_tap = conn->seq_from_tap;
int i, iov_i, keep = -1, ack = 0, fin = 0;
uint16_t max_ack_seq_wnd;
ssize_t len;
for (i = 0, iov_i = 0; i < count; i++) {
@ -1840,17 +1844,33 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
seq = ntohl(th->seq);
ack_seq = ntohl(th->ack_seq);
if (!i) {
if (count == 1)
max_ack_seq_wnd = ntohs(th->window);
else
max_ack_seq_wnd = ntohs(th->window) - 1;
}
if (th->ack) {
ack = 1;
if (ack_seq - conn->seq_ack_from_tap < MAX_WINDOW &&
ack_seq - max_ack_seq < MAX_WINDOW)
ack_seq - max_ack_seq < MAX_WINDOW) {
/* Fast re-transmit */
retr = !len && ack_seq == max_ack_seq &&
max_ack_seq_wnd == ntohs(th->window);
max_ack_seq_wnd = ntohs(th->window);
max_ack_seq = ack_seq;
}
}
if (th->fin)
fin = 1;
if (!len)
continue;
seq_offset = seq_from_tap - seq;
/* Use data from this buffer only in these two cases:
*
@ -1858,7 +1878,6 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
* |--------| <-- len |--------| <-- len
* '----' <-- offset ' <-- offset
* ^ seq ^ seq
*
* (offset >= 0, seq + len > seq_from_tap)
*
* discard in these two cases:
@ -1900,6 +1919,11 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
tcp_sock_consume(conn, max_ack_seq);
}
if (retr) {
conn->seq_to_tap = max_ack_seq;
tcp_data_from_sock(c, conn, now);
}
if (!iov_i) {
if (keep != -1) {
tcp_send_to_tap(c, conn, ACK, NULL, 0);
@ -1926,6 +1950,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
}
conn->seq_from_tap += len;
if (!fin)
tcp_send_to_tap(c, conn, 0, NULL, 0);
fin:
@ -1935,11 +1960,12 @@ fin:
if (fin) {
shutdown(conn->sock, SHUT_WR);
if (conn->state == ESTABLISHED)
if (conn->state == ESTABLISHED) {
tcp_tap_state(conn, FIN_WAIT_1);
else
tcp_data_from_sock(c, conn, now);
} else {
tcp_tap_state(conn, LAST_ACK);
return;
}
}
}
@ -2023,12 +2049,12 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
conn->seq_ack_to_tap = conn->seq_from_tap;
tcp_tap_state(conn, ESTABLISHED);
tcp_send_to_tap(c, conn, ACK, NULL, 0);
/* The client might have sent data already, which we didn't
* dequeue waiting for SYN,ACK from tap -- check now.
*/
tcp_data_from_sock(c, conn, now);
tcp_send_to_tap(c, conn, 0, NULL, 0);
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
ref.s = conn->sock;
@ -2055,11 +2081,13 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
case CLOSE_WAIT:
tcp_data_from_tap(c, conn, msg, count, now);
return count;
case FIN_WAIT_1:
tcp_send_to_tap(c, conn, ACK, NULL, 0);
break;
case FIN_WAIT_1_SOCK_FIN:
if (th->ack)
tcp_tap_destroy(c, conn);
break;
case FIN_WAIT_1:
case TAP_SYN_SENT:
case LAST_ACK:
case SPLICE_ACCEPTED:
@ -2570,9 +2598,9 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN);
shutdown(conn->sock, SHUT_RD);
tcp_data_from_sock(c, conn, now);
tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
tcp_sock_consume(conn, conn->seq_ack_from_tap);
} else {
tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
} else if (conn->state != ESTABLISHED_SOCK_FIN) {
tcp_tap_destroy(c, conn);
}
}
@ -2731,7 +2759,7 @@ static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn,
if (sock_ms > ACK_INTERVAL) {
if (conn->seq_from_tap > conn->seq_ack_to_tap)
tcp_send_to_tap(c, conn, ACK, NULL, 0);
tcp_send_to_tap(c, conn, 0, NULL, 0);
}
if (ack_tap_ms > ACK_TIMEOUT) {