tcp: Introduce scatter-gather IO path from socket to tap

...similarly to what was done for UDP. Quick performance test with
32KiB buffers, host to VM:

$ iperf3 -c 192.0.2.2 -N
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec  8.47 GBytes  7.27 Gbits/sec    0             sender
[  5]   0.00-10.00  sec  8.45 GBytes  7.26 Gbits/sec                  receiver

$ iperf3 -c 2a01:598:88ba:a056:271f:473a:c0d9:abc1
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec  8.43 GBytes  7.24 Gbits/sec    0             sender
[  5]   0.00-10.00  sec  8.41 GBytes  7.22 Gbits/sec                  receiver

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2021-07-26 14:20:36 +02:00
parent 85a820a66f
commit 39ad062100
4 changed files with 516 additions and 47 deletions

View file

@ -441,6 +441,7 @@ static void timer_handler(struct ctx *c, struct timespec *now)
void proto_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
uint32_t *ip_da)
{
tcp_update_l2_buf(eth_d, eth_s, ip_da);
udp_update_l2_buf(eth_d, eth_s, ip_da);
}

558
tcp.c
View file

@ -343,14 +343,18 @@
#include <linux/tcp.h>
#include <time.h>
#include "checksum.h"
#include "util.h"
#include "passt.h"
#include "tap.h"
#include "siphash.h"
#include "pcap.h"
#define MAX_TAP_CONNS (128 * 1024)
#define MAX_SPLICE_CONNS (128 * 1024)
#define TCP_TAP_FRAMES 64
#define PIPE_SIZE (1024 * 1024)
#define TCP_HASH_TABLE_LOAD 70 /* % */
@ -503,8 +507,101 @@ struct tcp_splice_conn {
int v6;
};
/* Socket receive buffer */
static char sock_buf[MAX_WINDOW];
/* Static buffers */
/**
* tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
* @psum: Partial IP header checksum (excluding tot_len and saddr)
* @psum: Partial TCP header checksum (excluding length and saddr)
* @vnet_len: 4-byte qemu vnet buffer length descriptor, only for passt mode
* @eh: Pre-filled Ethernet header
* @iph: Pre-filled IP header (except for tot_len and saddr)
* @uh: Headroom for TCP header
* @data: Storage for TCP payload
*/
__extension__ static struct tcp4_l2_buf_t {
uint32_t psum; /* 0 */
uint32_t tsum; /* 4 */
#ifdef __AVX2__
uint8_t pad[18]; /* 8, align th to 32 bytes */
#endif
uint32_t vnet_len; /* 26 */
struct ethhdr eh; /* 30 */
struct iphdr iph; /* 44 */
struct tcphdr th; /* 64 */
uint8_t data[USHRT_MAX - sizeof(struct tcphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_TAP_FRAMES] = {
[ 0 ... TCP_TAP_FRAMES - 1 ] = {
0, 0,
#ifdef __AVX2__
{ 0 },
#endif
0, L2_BUF_ETH_IP4_INIT, L2_BUF_IP4_INIT(IPPROTO_TCP),
{ .doff = sizeof(struct tcphdr) / 4, .ack = 1 }, { 0 },
},
};
static int tcp4_l2_buf_mss;
static int tcp4_l2_buf_mss_nr_set;
static int tcp4_l2_buf_mss_tap;
static int tcp4_l2_buf_mss_tap_nr_set;
/**
* tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
* @vnet_len: 4-byte qemu vnet buffer length descriptor, only for passt mode
* @eh: Pre-filled Ethernet header
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
* @th: Headroom for TCP header
* @data: Storage for TCP payload
*/
__extension__ struct tcp6_l2_buf_t {
#ifdef __AVX2__
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
#else
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
#endif
uint32_t vnet_len; /* 14 2 */
struct ethhdr eh; /* 18 6 */
struct ipv6hdr ip6h; /* 32 20 */
struct tcphdr th; /* 72 60 */
uint8_t data[USHRT_MAX -
(sizeof(struct ipv6hdr) + sizeof(struct tcphdr))];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_TAP_FRAMES] = {
[ 0 ... TCP_TAP_FRAMES - 1 ] = {
{ 0 },
0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_TCP),
{ .doff = sizeof(struct tcphdr) / 4, .ack = 1 }, { 0 },
},
};
static int tcp6_l2_buf_mss;
static int tcp6_l2_buf_mss_nr_set;
static int tcp6_l2_buf_mss_tap;
static int tcp6_l2_buf_mss_tap_nr_set;
/* recvmmsg()/sendmmsg() data for tap */
static struct iovec tcp4_l2_iov_sock [TCP_TAP_FRAMES + 1];
static struct iovec tcp6_l2_iov_sock [TCP_TAP_FRAMES + 1];
static char tcp_buf_discard [MAX_WINDOW];
static struct iovec tcp4_l2_iov_tap [TCP_TAP_FRAMES];
static struct iovec tcp6_l2_iov_tap [TCP_TAP_FRAMES];
static struct msghdr tcp4_l2_mh_sock;
static struct msghdr tcp6_l2_mh_sock;
static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES];
/* Bitmap, activity monitoring needed for connection via tap */
static uint8_t tcp_act[MAX_TAP_CONNS / 8] = { 0 };
@ -540,6 +637,148 @@ static void tcp_splice_state(struct tcp_splice_conn *conn, enum tcp_state state)
conn->state = state;
}
/**
* tcp_update_check_ip4() - Update IPv4 with variable parts from stored one
* @buf: L2 packet buffer with final IPv4 header
*/
static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf)
{
uint32_t sum = buf->psum;
sum += buf->iph.tot_len;
sum += (buf->iph.saddr >> 16) & 0xffff;
sum += buf->iph.saddr & 0xffff;
buf->iph.check = (uint16_t)~csum_fold(sum);
}
/**
* tcp_update_check_tcp4() - Update TCP checksum from stored one
* @buf: L2 packet buffer with final IPv4 header
*/
static void tcp_update_check_tcp4(struct tcp4_l2_buf_t *buf)
{
uint16_t tlen = ntohs(buf->iph.tot_len) - 20;
uint32_t sum = buf->tsum;
sum += (buf->iph.saddr >> 16) & 0xffff;
sum += buf->iph.saddr & 0xffff;
sum += htons(ntohs(buf->iph.tot_len) - 20);
buf->th.check = 0;
buf->th.check = csum(&buf->th, tlen, sum);
}
/**
* tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
* @buf: L2 packet buffer with final IPv6 header
*/
static void tcp_update_check_tcp6(struct tcp6_l2_buf_t *buf)
{
int len = ntohs(buf->ip6h.payload_len) + sizeof(struct ipv6hdr);
buf->ip6h.hop_limit = IPPROTO_TCP;
buf->ip6h.version = 0;
buf->ip6h.nexthdr = 0;
buf->th.check = 0;
buf->th.check = csum(&buf->ip6h, len, 0);
buf->ip6h.hop_limit = 255;
buf->ip6h.version = 6;
buf->ip6h.nexthdr = IPPROTO_TCP;
}
/**
* tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
* @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged
* @ip_da: Pointer to IPv4 destination address, NULL if unchanged
*/
void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
uint32_t *ip_da)
{
int i;
for (i = 0; i < TCP_TAP_FRAMES; i++) {
struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
if (eth_d) {
memcpy(b4->eh.h_dest, eth_d, ETH_ALEN);
memcpy(b6->eh.h_dest, eth_d, ETH_ALEN);
}
if (eth_s) {
memcpy(b4->eh.h_source, eth_s, ETH_ALEN);
memcpy(b6->eh.h_source, eth_s, ETH_ALEN);
}
if (ip_da) {
b4->iph.daddr = *ip_da;
if (!i) {
b4->iph.saddr = 0;
b4->iph.tot_len = 0;
b4->iph.check = 0;
b4->psum = sum_16b(&b4->iph, 20);
b4->tsum = ((*ip_da >> 16) & 0xffff) +
(*ip_da & 0xffff) +
htons(IPPROTO_TCP);
} else {
b4->psum = tcp4_l2_buf[0].psum;
b4->tsum = tcp4_l2_buf[0].tsum;
}
}
}
}
/**
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
*/
static void tcp_sock4_iov_init(void)
{
struct iovec *iov;
int i;
tcp4_l2_iov_sock[0].iov_base = tcp_buf_discard;
for (i = 0, iov = tcp4_l2_iov_sock + 1; i < TCP_TAP_FRAMES;
i++, iov++) {
iov->iov_base = &tcp4_l2_buf[i].data;
iov->iov_len = MSS_DEFAULT;
}
tcp4_l2_mh_sock.msg_iov = tcp4_l2_iov_sock;
for (i = 0, iov = tcp4_l2_iov_tap; i < TCP_TAP_FRAMES; i++, iov++) {
iov->iov_base = &tcp4_l2_buf[i].vnet_len;
iov->iov_len = MSS_DEFAULT;
}
}
/**
* tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
*/
static void tcp_sock6_iov_init(void)
{
struct iovec *iov;
int i;
tcp6_l2_iov_sock[0].iov_base = tcp_buf_discard;
for (i = 0, iov = tcp6_l2_iov_sock + 1; i < TCP_TAP_FRAMES;
i++, iov++) {
iov->iov_base = &tcp6_l2_buf[i].data;
iov->iov_len = MSS_DEFAULT;
}
tcp6_l2_mh_sock.msg_iov = tcp6_l2_iov_sock;
for (i = 0, iov = tcp6_l2_iov_tap; i < TCP_TAP_FRAMES; i++, iov++) {
iov->iov_base = &tcp6_l2_buf[i].vnet_len;
iov->iov_len = MSS_DEFAULT;
}
}
/**
* tcp_opt_get() - Get option, and value if any, from TCP header
* @th: Pointer to TCP header
@ -846,6 +1085,7 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
err = 0;
info.tcpi_bytes_acked = conn->tcpi_acked_last;
info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
info.tcpi_snd_wscale = conn->ws;
} else {
err = getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info, &sl);
if (err && !(flags & RST)) {
@ -979,8 +1219,19 @@ static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th,
} else {
unsigned int window = ntohs(th->window) << conn->ws;
if (conn->tap_window == window && conn->window_clamped)
return;
if (conn->window_clamped) {
if (conn->tap_window == window)
return;
/* Discard +/- 1% updates to spare some syscalls. */
if ((window > conn->tap_window &&
window * 99 / 100 < conn->tap_window) ||
(window < conn->tap_window &&
window * 101 / 100 > conn->tap_window)) {
conn->tap_window = window;
return;
}
}
conn->tap_window = window;
if (window < 256)
@ -1087,6 +1338,10 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
conn->mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
if (conn->mss_guest < 0)
conn->mss_guest = MSS_DEFAULT;
if (c->mode == MODE_PASST && c->v6 && conn->mss_guest > SHRT_MAX)
conn->mss_guest = SHRT_MAX;
sl = sizeof(conn->mss_guest);
setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl);
@ -1271,6 +1526,7 @@ static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
* needed from the buffer, and we won't rewind back to a lower ACK
* sequence.
*/
if (to_ack > MAX_WINDOW)
return;
@ -1286,69 +1542,272 @@ static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
* @conn: Connection pointer
* @now: Current timestamp
*
* Return: negative on connection reset, 1 on pending data, 0 otherwise
* Return: negative on connection reset, 0 otherwise
*/
static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
struct timespec *now)
{
uint32_t offset = conn->seq_to_tap - conn->seq_ack_from_tap;
int len, err, left, send, s = conn->sock;
int mss_tap, fill_bufs, send_bufs = 0, last_len, msg_len, iov_rem = 0;
int *buf_mss, *buf_mss_nr_set, *buf_mss_tap, *buf_mss_tap_nr_set;
int send, len, plen, v4 = IN6_IS_ADDR_V4MAPPED(&conn->a.a6);
socklen_t sl = sizeof(struct tcp_info);
struct mmsghdr *mh = tcp_l2_mh_tap;
int s = conn->sock, i, ret = 0;
struct iovec *iov, *iov_tap;
uint32_t already_sent;
struct tcp_info info;
if (!conn->tap_window || offset >= conn->tap_window)
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
if (!conn->tap_window || already_sent >= conn->tap_window)
return 1;
len = recv(s, sock_buf,
/* TODO: Drop 64KiB limit (needed for responsiveness) once
* tap-side coalescing and zero-copy are fully implemented.
*/
MIN(64 * 1024, conn->tap_window),
/* Don't dequeue until acknowledged by guest */
MSG_DONTWAIT | MSG_PEEK);
fill_bufs = DIV_ROUND_UP(conn->tap_window - already_sent,
conn->mss_guest);
if (fill_bufs > TCP_TAP_FRAMES)
fill_bufs = TCP_TAP_FRAMES;
if (len < 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK) {
tcp_rst(c, conn);
return -errno;
}
return 0;
/* Adjust iovec length for recvmsg() based on what was set last time. */
if (v4) {
iov = tcp4_l2_iov_sock + 1;
buf_mss = &tcp4_l2_buf_mss;
buf_mss_nr_set = &tcp4_l2_buf_mss_nr_set;
} else {
iov = tcp6_l2_iov_sock + 1;
buf_mss = &tcp6_l2_buf_mss;
buf_mss_nr_set = &tcp6_l2_buf_mss_nr_set;
}
if (*buf_mss != conn->mss_guest) {
for (i = 0; i < fill_bufs; i++)
iov[i].iov_len = conn->mss_guest;
*buf_mss = conn->mss_guest;
*buf_mss_nr_set = fill_bufs - 1;
} else if (*buf_mss_nr_set < fill_bufs) {
for (i = *buf_mss_nr_set; i < fill_bufs; i++)
iov[i].iov_len = conn->mss_guest;
*buf_mss_nr_set = fill_bufs - 1;
}
if (len == 0) {
if (conn->state >= ESTABLISHED_SOCK_FIN)
return 0;
/* First buffer is to discard data, last one may be partially filled. */
iov[-1].iov_len = already_sent;
iov_rem = (conn->tap_window - already_sent) % conn->mss_guest;
if (iov_rem && fill_bufs < TCP_TAP_FRAMES)
iov[fill_bufs - 1].iov_len = iov_rem;
if (v4)
tcp4_l2_mh_sock.msg_iovlen = fill_bufs + 1;
else
tcp6_l2_mh_sock.msg_iovlen = fill_bufs + 1;
tcp_tap_state(conn, ESTABLISHED_SOCK_FIN);
if ((err = tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0)))
return err;
/* Don't dequeue until acknowledged by guest. */
len = recvmsg(s, v4 ? &tcp4_l2_mh_sock : &tcp6_l2_mh_sock, MSG_PEEK);
if (len < 0)
goto err;
left = 0;
if (!len)
goto zero_len;
send = len - already_sent;
if (send <= 0)
goto out_restore_iov;
send_bufs = DIV_ROUND_UP(send, conn->mss_guest);
last_len = send - (send_bufs - 1) * conn->mss_guest;
/* Adjust iovec length for sending based on what was set last time. */
if (v4) {
mss_tap = conn->mss_guest +
offsetof(struct tcp4_l2_buf_t, data) -
offsetof(struct tcp4_l2_buf_t, vnet_len);
iov_tap = tcp4_l2_iov_tap;
buf_mss_tap = &tcp4_l2_buf_mss_tap;
buf_mss_tap_nr_set = &tcp4_l2_buf_mss_tap_nr_set;
} else {
mss_tap = conn->mss_guest +
offsetof(struct tcp6_l2_buf_t, data) -
offsetof(struct tcp6_l2_buf_t, vnet_len);
iov_tap = tcp6_l2_iov_tap;
buf_mss_tap = &tcp6_l2_buf_mss_tap;
buf_mss_tap_nr_set = &tcp6_l2_buf_mss_tap_nr_set;
}
if (*buf_mss_tap != mss_tap) {
for (i = 0; i < send_bufs; i++)
iov_tap[i].iov_len = mss_tap;
*buf_mss_tap = mss_tap;
*buf_mss_tap_nr_set = send_bufs;
} else if (*buf_mss_tap_nr_set < send_bufs) {
for (i = *buf_mss_tap_nr_set; i < send_bufs; i++)
iov_tap[i].iov_len = mss_tap;
*buf_mss_tap_nr_set = send_bufs;
}
iov_tap[send_bufs - 1].iov_len = mss_tap - conn->mss_guest + last_len;
/* Likely, some new data was acked too. */
if (conn->seq_from_tap != conn->seq_ack_to_tap) {
if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info, &sl))
goto err;
conn->tcpi_acked_last = info.tcpi_bytes_acked;
conn->seq_ack_to_tap = info.tcpi_bytes_acked +
conn->seq_init_from_tap;
} else {
info.tcpi_snd_wscale = conn->ws;
info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
}
if (v4)
mh->msg_hdr.msg_iov = tcp4_l2_iov_tap;
else
mh->msg_hdr.msg_iov = tcp6_l2_iov_tap;
mh->msg_hdr.msg_iovlen = 0;
plen = conn->mss_guest;
msg_len = 0;
for (i = 0; i < send_bufs; i++) {
int iov_len, ip_len;
if (i == send_bufs - 1)
plen = last_len;
if (v4) {
struct tcp4_l2_buf_t *b = &tcp4_l2_buf[i];
ip_len = plen + sizeof(struct iphdr) +
sizeof(struct tcphdr);
b->iph.tot_len = htons(ip_len);
b->iph.saddr = conn->a.a4.a.s_addr;
b->iph.daddr = c->addr4_seen;
if (!i || i == send_bufs - 1)
tcp_update_check_ip4(b);
else
b->iph.check = tcp4_l2_buf[0].iph.check;
b->th.source = htons(conn->sock_port);
b->th.dest = htons(conn->tap_port);
b->th.seq = htonl(conn->seq_to_tap);
b->th.ack_seq = htonl(conn->seq_ack_to_tap);
if (conn->no_snd_wnd) {
b->th.window = htons(WINDOW_DEFAULT);
} else {
b->th.window = htons(info.tcpi_snd_wnd >>
info.tcpi_snd_wscale);
conn->tcpi_snd_wnd = info.tcpi_snd_wnd;
}
tcp_update_check_tcp4(b);
if (c->mode == MODE_PASTA) {
ip_len += sizeof(struct ethhdr);
write(c->fd_tap, &b->eh, ip_len);
pcap((char *)&b->eh, ip_len);
conn->seq_to_tap += plen;
continue;
}
b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);
} else {
struct tcp6_l2_buf_t *b = &tcp6_l2_buf[i];
uint32_t flow = conn->seq_init_to_tap;
ip_len = plen + sizeof(struct ipv6hdr) +
sizeof(struct tcphdr);
b->ip6h.payload_len = htons(plen +
sizeof(struct tcphdr));
b->ip6h.saddr = conn->a.a6;
if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr))
b->ip6h.daddr = c->addr6_ll_seen;
else
b->ip6h.daddr = c->addr6_seen;
b->th.source = htons(conn->sock_port);
b->th.dest = htons(conn->tap_port);
b->th.seq = htonl(conn->seq_to_tap);
b->th.ack_seq = htonl(conn->seq_ack_to_tap);
if (conn->no_snd_wnd) {
b->th.window = htons(WINDOW_DEFAULT);
} else {
b->th.window = htons(info.tcpi_snd_wnd >>
info.tcpi_snd_wscale);
conn->tcpi_snd_wnd = info.tcpi_snd_wnd;
}
memset(b->ip6h.flow_lbl, 0, 3);
tcp_update_check_tcp6(b);
b->ip6h.flow_lbl[0] = (flow >> 16) & 0xf;
b->ip6h.flow_lbl[1] = (flow >> 8) & 0xff;
b->ip6h.flow_lbl[2] = (flow >> 0) & 0xff;
if (c->mode == MODE_PASTA) {
ip_len += sizeof(struct ethhdr);
write(c->fd_tap, &b->eh, ip_len);
pcap((char *)&b->eh, ip_len);
conn->seq_to_tap += plen;
continue;
}
b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);
}
iov_len = sizeof(uint32_t) + sizeof(struct ethhdr) + ip_len;
/* Switch to a new message if this one is too long for qemu. */
if (msg_len && msg_len + iov_len > SHRT_MAX) {
mh++;
mh->msg_hdr.msg_iovlen = 0;
msg_len = 0;
if (v4)
mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i];
else
mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i];
}
mh->msg_hdr.msg_iovlen++;
msg_len += iov_len;
conn->seq_to_tap += plen;
}
if (c->mode == MODE_PASTA)
goto out;
sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1,
MSG_NOSIGNAL | MSG_DONTWAIT);
pcapmm(tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1);
goto out;
err:
if (errno != EAGAIN && errno != EWOULDBLOCK) {
tcp_rst(c, conn);
ret = -errno;
}
goto out_restore_iov;
left = len - offset;
while (left && (offset + conn->mss_guest <= conn->tap_window)) {
if (left < conn->mss_guest)
send = left;
else
send = conn->mss_guest;
zero_len:
if (conn->state >= ESTABLISHED_SOCK_FIN)
goto out_restore_iov;
if (offset + send > MAX_WINDOW) {
tcp_rst(c, conn);
return -EIO;
}
err = tcp_send_to_tap(c, conn, 0, sock_buf + offset, send);
if (err)
return err;
offset += send;
left -= send;
}
tcp_tap_state(conn, ESTABLISHED_SOCK_FIN);
tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
goto out_restore_iov;
out:
conn->ts_sock = *now;
return !!left;
out_restore_iov:
if (iov_rem)
iov[fill_bufs - 1].iov_len = conn->mss_guest;
if (send_bufs)
iov_tap[send_bufs - 1].iov_len = mss_tap;
return ret;
}
/**
@ -1446,6 +1905,11 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->mss_guest < 0)
conn->mss_guest = MSS_DEFAULT;
/* Don't upset qemu */
if (c->mode == MODE_PASST && c->v6 &&
conn->mss_guest > SHRT_MAX)
conn->mss_guest = SHRT_MAX;
ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
if (ws > MAX_WS) {
if (tcp_send_to_tap(c, conn, RST, NULL, 0))

2
tcp.h
View file

@ -14,6 +14,8 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now);
int tcp_sock_init(struct ctx *c);
void tcp_timer(struct ctx *c, struct timespec *ts);
void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
uint32_t *ip_da);
/**
* union tcp_epoll_ref - epoll reference portion for TCP connections

2
util.h
View file

@ -29,6 +29,8 @@ void debug(const char *format, ...);
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#endif
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define SWAP(a, b) \
do { \
typeof(a) __x = (a); (a) = (b); (b) = __x; \