tcp: Fast re-transmit, more fixes for closing states and no_snd_wnd
...and while at it, fix an issue in the calculation of the last IOV buffer size: if we can't receive enough data to fill up the window, the last buffer can be filled completely. Also streamline the code setting iovec lengths when the cached values don't match.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
parent
0017bc3c3e
commit
539dcf5add
1 changed file with 73 additions and 45 deletions
94
tcp.c
94
tcp.c
|
@ -1545,12 +1545,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
|
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||||
|
|
||||||
if (!conn->tap_window || already_sent >= conn->tap_window)
|
if (!conn->tap_window || already_sent >= conn->tap_window)
|
||||||
return 1;
|
return 0;
|
||||||
|
|
||||||
fill_bufs = DIV_ROUND_UP(conn->tap_window - already_sent,
|
fill_bufs = DIV_ROUND_UP(conn->tap_window - already_sent,
|
||||||
conn->mss_guest);
|
conn->mss_guest);
|
||||||
if (fill_bufs > TCP_TAP_FRAMES)
|
if (fill_bufs > TCP_TAP_FRAMES) {
|
||||||
fill_bufs = TCP_TAP_FRAMES;
|
fill_bufs = TCP_TAP_FRAMES;
|
||||||
|
iov_rem = 0;
|
||||||
|
} else {
|
||||||
|
iov_rem = (conn->tap_window - already_sent) % conn->mss_guest;
|
||||||
|
}
|
||||||
|
|
||||||
/* Adjust iovec length for recvmsg() based on what was set last time. */
|
/* Adjust iovec length for recvmsg() based on what was set last time. */
|
||||||
if (v4) {
|
if (v4) {
|
||||||
|
@ -1562,21 +1566,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
buf_mss = &tcp6_l2_buf_mss;
|
buf_mss = &tcp6_l2_buf_mss;
|
||||||
buf_mss_nr_set = &tcp6_l2_buf_mss_nr_set;
|
buf_mss_nr_set = &tcp6_l2_buf_mss_nr_set;
|
||||||
}
|
}
|
||||||
if (*buf_mss != conn->mss_guest) {
|
if (*buf_mss != conn->mss_guest)
|
||||||
for (i = 0; i < fill_bufs; i++)
|
*buf_mss_nr_set = 0;
|
||||||
|
for (i = *buf_mss_nr_set; i < fill_bufs; i++)
|
||||||
iov[i].iov_len = conn->mss_guest;
|
iov[i].iov_len = conn->mss_guest;
|
||||||
*buf_mss = conn->mss_guest;
|
*buf_mss = conn->mss_guest;
|
||||||
*buf_mss_nr_set = fill_bufs - 1;
|
*buf_mss_nr_set = fill_bufs - 1;
|
||||||
} else if (*buf_mss_nr_set < fill_bufs) {
|
|
||||||
for (i = *buf_mss_nr_set; i < fill_bufs; i++)
|
|
||||||
iov[i].iov_len = conn->mss_guest;
|
|
||||||
*buf_mss_nr_set = fill_bufs - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* First buffer is to discard data, last one may be partially filled. */
|
/* First buffer is to discard data, last one may be partially filled. */
|
||||||
iov[-1].iov_len = already_sent;
|
iov[-1].iov_len = already_sent;
|
||||||
iov_rem = (conn->tap_window - already_sent) % conn->mss_guest;
|
if (iov_rem)
|
||||||
if (iov_rem && fill_bufs < TCP_TAP_FRAMES)
|
|
||||||
iov[fill_bufs - 1].iov_len = iov_rem;
|
iov[fill_bufs - 1].iov_len = iov_rem;
|
||||||
if (v4)
|
if (v4)
|
||||||
tcp4_l2_mh_sock.msg_iovlen = fill_bufs + 1;
|
tcp4_l2_mh_sock.msg_iovlen = fill_bufs + 1;
|
||||||
|
@ -1616,27 +1615,28 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
buf_mss_tap = &tcp6_l2_buf_mss_tap;
|
buf_mss_tap = &tcp6_l2_buf_mss_tap;
|
||||||
buf_mss_tap_nr_set = &tcp6_l2_buf_mss_tap_nr_set;
|
buf_mss_tap_nr_set = &tcp6_l2_buf_mss_tap_nr_set;
|
||||||
}
|
}
|
||||||
if (*buf_mss_tap != mss_tap) {
|
if (*buf_mss_tap != mss_tap)
|
||||||
for (i = 0; i < send_bufs; i++)
|
*buf_mss_tap_nr_set = 0;
|
||||||
|
for (i = *buf_mss_tap_nr_set; i < send_bufs; i++)
|
||||||
iov_tap[i].iov_len = mss_tap;
|
iov_tap[i].iov_len = mss_tap;
|
||||||
*buf_mss_tap = mss_tap;
|
*buf_mss_tap = mss_tap;
|
||||||
*buf_mss_tap_nr_set = send_bufs;
|
*buf_mss_tap_nr_set = send_bufs;
|
||||||
} else if (*buf_mss_tap_nr_set < send_bufs) {
|
|
||||||
for (i = *buf_mss_tap_nr_set; i < send_bufs; i++)
|
|
||||||
iov_tap[i].iov_len = mss_tap;
|
|
||||||
*buf_mss_tap_nr_set = send_bufs;
|
|
||||||
}
|
|
||||||
|
|
||||||
iov_tap[send_bufs - 1].iov_len = mss_tap - conn->mss_guest + last_len;
|
iov_tap[send_bufs - 1].iov_len = mss_tap - conn->mss_guest + last_len;
|
||||||
|
|
||||||
/* Likely, some new data was acked too. */
|
/* Likely, some new data was acked too. */
|
||||||
if (conn->seq_from_tap != conn->seq_ack_to_tap) {
|
if (conn->seq_from_tap != conn->seq_ack_to_tap) {
|
||||||
if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info, &sl))
|
if (conn->no_snd_wnd) {
|
||||||
|
conn->seq_ack_to_tap = conn->seq_from_tap;
|
||||||
|
} else {
|
||||||
|
if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info,
|
||||||
|
&sl))
|
||||||
goto err;
|
goto err;
|
||||||
|
|
||||||
conn->tcpi_acked_last = info.tcpi_bytes_acked;
|
conn->tcpi_acked_last = info.tcpi_bytes_acked;
|
||||||
conn->seq_ack_to_tap = info.tcpi_bytes_acked +
|
conn->seq_ack_to_tap = info.tcpi_bytes_acked +
|
||||||
conn->seq_init_from_tap;
|
conn->seq_init_from_tap;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
info.tcpi_snd_wscale = conn->ws;
|
info.tcpi_snd_wscale = conn->ws;
|
||||||
info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
|
info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
|
||||||
|
@ -1776,11 +1776,14 @@ err:
|
||||||
goto out_restore_iov;
|
goto out_restore_iov;
|
||||||
|
|
||||||
zero_len:
|
zero_len:
|
||||||
if (conn->state >= ESTABLISHED_SOCK_FIN)
|
if (conn->state == FIN_WAIT_1) {
|
||||||
goto out_restore_iov;
|
tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN);
|
||||||
|
} else if (conn->state < ESTABLISHED_SOCK_FIN) {
|
||||||
tcp_tap_state(conn, ESTABLISHED_SOCK_FIN);
|
tcp_tap_state(conn, ESTABLISHED_SOCK_FIN);
|
||||||
|
shutdown(conn->sock, SHUT_RD);
|
||||||
tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
|
tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
|
||||||
|
}
|
||||||
|
|
||||||
goto out_restore_iov;
|
goto out_restore_iov;
|
||||||
|
|
||||||
out:
|
out:
|
||||||
|
@ -1807,10 +1810,11 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
struct tap_msg *msg, int count,
|
struct tap_msg *msg, int count,
|
||||||
struct timespec *now)
|
struct timespec *now)
|
||||||
{
|
{
|
||||||
|
int i, iov_i, keep = -1, ack = 0, fin = 0, retr = 0;
|
||||||
struct msghdr mh = { .msg_iov = tcp_tap_iov };
|
struct msghdr mh = { .msg_iov = tcp_tap_iov };
|
||||||
uint32_t max_ack_seq = conn->seq_ack_from_tap;
|
uint32_t max_ack_seq = conn->seq_ack_from_tap;
|
||||||
uint32_t seq_from_tap = conn->seq_from_tap;
|
uint32_t seq_from_tap = conn->seq_from_tap;
|
||||||
int i, iov_i, keep = -1, ack = 0, fin = 0;
|
uint16_t max_ack_seq_wnd;
|
||||||
ssize_t len;
|
ssize_t len;
|
||||||
|
|
||||||
for (i = 0, iov_i = 0; i < count; i++) {
|
for (i = 0, iov_i = 0; i < count; i++) {
|
||||||
|
@ -1840,17 +1844,33 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
|
|
||||||
seq = ntohl(th->seq);
|
seq = ntohl(th->seq);
|
||||||
ack_seq = ntohl(th->ack_seq);
|
ack_seq = ntohl(th->ack_seq);
|
||||||
|
if (!i) {
|
||||||
|
if (count == 1)
|
||||||
|
max_ack_seq_wnd = ntohs(th->window);
|
||||||
|
else
|
||||||
|
max_ack_seq_wnd = ntohs(th->window) - 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (th->ack) {
|
if (th->ack) {
|
||||||
ack = 1;
|
ack = 1;
|
||||||
if (ack_seq - conn->seq_ack_from_tap < MAX_WINDOW &&
|
if (ack_seq - conn->seq_ack_from_tap < MAX_WINDOW &&
|
||||||
ack_seq - max_ack_seq < MAX_WINDOW)
|
ack_seq - max_ack_seq < MAX_WINDOW) {
|
||||||
|
|
||||||
|
/* Fast re-transmit */
|
||||||
|
retr = !len && ack_seq == max_ack_seq &&
|
||||||
|
max_ack_seq_wnd == ntohs(th->window);
|
||||||
|
|
||||||
|
max_ack_seq_wnd = ntohs(th->window);
|
||||||
max_ack_seq = ack_seq;
|
max_ack_seq = ack_seq;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (th->fin)
|
if (th->fin)
|
||||||
fin = 1;
|
fin = 1;
|
||||||
|
|
||||||
|
if (!len)
|
||||||
|
continue;
|
||||||
|
|
||||||
seq_offset = seq_from_tap - seq;
|
seq_offset = seq_from_tap - seq;
|
||||||
/* Use data from this buffer only in these two cases:
|
/* Use data from this buffer only in these two cases:
|
||||||
*
|
*
|
||||||
|
@ -1858,7 +1878,6 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
* |--------| <-- len |--------| <-- len
|
* |--------| <-- len |--------| <-- len
|
||||||
* '----' <-- offset ' <-- offset
|
* '----' <-- offset ' <-- offset
|
||||||
* ^ seq ^ seq
|
* ^ seq ^ seq
|
||||||
*
|
|
||||||
* (offset >= 0, seq + len > seq_from_tap)
|
* (offset >= 0, seq + len > seq_from_tap)
|
||||||
*
|
*
|
||||||
* discard in these two cases:
|
* discard in these two cases:
|
||||||
|
@ -1900,6 +1919,11 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
tcp_sock_consume(conn, max_ack_seq);
|
tcp_sock_consume(conn, max_ack_seq);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (retr) {
|
||||||
|
conn->seq_to_tap = max_ack_seq;
|
||||||
|
tcp_data_from_sock(c, conn, now);
|
||||||
|
}
|
||||||
|
|
||||||
if (!iov_i) {
|
if (!iov_i) {
|
||||||
if (keep != -1) {
|
if (keep != -1) {
|
||||||
tcp_send_to_tap(c, conn, ACK, NULL, 0);
|
tcp_send_to_tap(c, conn, ACK, NULL, 0);
|
||||||
|
@ -1926,6 +1950,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
}
|
}
|
||||||
|
|
||||||
conn->seq_from_tap += len;
|
conn->seq_from_tap += len;
|
||||||
|
if (!fin)
|
||||||
tcp_send_to_tap(c, conn, 0, NULL, 0);
|
tcp_send_to_tap(c, conn, 0, NULL, 0);
|
||||||
|
|
||||||
fin:
|
fin:
|
||||||
|
@ -1935,11 +1960,12 @@ fin:
|
||||||
|
|
||||||
if (fin) {
|
if (fin) {
|
||||||
shutdown(conn->sock, SHUT_WR);
|
shutdown(conn->sock, SHUT_WR);
|
||||||
if (conn->state == ESTABLISHED)
|
if (conn->state == ESTABLISHED) {
|
||||||
tcp_tap_state(conn, FIN_WAIT_1);
|
tcp_tap_state(conn, FIN_WAIT_1);
|
||||||
else
|
tcp_data_from_sock(c, conn, now);
|
||||||
|
} else {
|
||||||
tcp_tap_state(conn, LAST_ACK);
|
tcp_tap_state(conn, LAST_ACK);
|
||||||
return;
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2023,12 +2049,12 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
|
||||||
conn->seq_ack_to_tap = conn->seq_from_tap;
|
conn->seq_ack_to_tap = conn->seq_from_tap;
|
||||||
|
|
||||||
tcp_tap_state(conn, ESTABLISHED);
|
tcp_tap_state(conn, ESTABLISHED);
|
||||||
tcp_send_to_tap(c, conn, ACK, NULL, 0);
|
|
||||||
|
|
||||||
/* The client might have sent data already, which we didn't
|
/* The client might have sent data already, which we didn't
|
||||||
* dequeue waiting for SYN,ACK from tap -- check now.
|
* dequeue waiting for SYN,ACK from tap -- check now.
|
||||||
*/
|
*/
|
||||||
tcp_data_from_sock(c, conn, now);
|
tcp_data_from_sock(c, conn, now);
|
||||||
|
tcp_send_to_tap(c, conn, 0, NULL, 0);
|
||||||
|
|
||||||
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
|
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
|
||||||
ref.s = conn->sock;
|
ref.s = conn->sock;
|
||||||
|
@ -2055,11 +2081,13 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
|
||||||
case CLOSE_WAIT:
|
case CLOSE_WAIT:
|
||||||
tcp_data_from_tap(c, conn, msg, count, now);
|
tcp_data_from_tap(c, conn, msg, count, now);
|
||||||
return count;
|
return count;
|
||||||
|
case FIN_WAIT_1:
|
||||||
|
tcp_send_to_tap(c, conn, ACK, NULL, 0);
|
||||||
|
break;
|
||||||
case FIN_WAIT_1_SOCK_FIN:
|
case FIN_WAIT_1_SOCK_FIN:
|
||||||
if (th->ack)
|
if (th->ack)
|
||||||
tcp_tap_destroy(c, conn);
|
tcp_tap_destroy(c, conn);
|
||||||
break;
|
break;
|
||||||
case FIN_WAIT_1:
|
|
||||||
case TAP_SYN_SENT:
|
case TAP_SYN_SENT:
|
||||||
case LAST_ACK:
|
case LAST_ACK:
|
||||||
case SPLICE_ACCEPTED:
|
case SPLICE_ACCEPTED:
|
||||||
|
@ -2570,9 +2598,9 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
|
||||||
tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN);
|
tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN);
|
||||||
shutdown(conn->sock, SHUT_RD);
|
shutdown(conn->sock, SHUT_RD);
|
||||||
tcp_data_from_sock(c, conn, now);
|
tcp_data_from_sock(c, conn, now);
|
||||||
tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
|
|
||||||
tcp_sock_consume(conn, conn->seq_ack_from_tap);
|
tcp_sock_consume(conn, conn->seq_ack_from_tap);
|
||||||
} else {
|
tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
|
||||||
|
} else if (conn->state != ESTABLISHED_SOCK_FIN) {
|
||||||
tcp_tap_destroy(c, conn);
|
tcp_tap_destroy(c, conn);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2731,7 +2759,7 @@ static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
|
|
||||||
if (sock_ms > ACK_INTERVAL) {
|
if (sock_ms > ACK_INTERVAL) {
|
||||||
if (conn->seq_from_tap > conn->seq_ack_to_tap)
|
if (conn->seq_from_tap > conn->seq_ack_to_tap)
|
||||||
tcp_send_to_tap(c, conn, ACK, NULL, 0);
|
tcp_send_to_tap(c, conn, 0, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ack_tap_ms > ACK_TIMEOUT) {
|
if (ack_tap_ms > ACK_TIMEOUT) {
|
||||||
|
|
Loading…
Reference in a new issue