tcp: Replace TCP buffer structure by an iovec array

To be able to provide pointers to TCP headers and IP headers without
worrying about alignment in the structure, split the structure into
several arrays and point to each part of the frame using an iovec array.

Using iovec also allows us to simply ignore the first entry when the
vnet length header is not needed. And as the payload buffer contains
only the TCP header and the TCP data we can increase the size of the
TCP data to USHRT_MAX - sizeof(struct tcphdr).

As a side effect, these changes improve performance by a factor of
x1.5.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
This commit is contained in:
Laurent Vivier 2024-03-07 12:01:44 +01:00
parent 137ce01789
commit a66fceb280
3 changed files with 301 additions and 258 deletions

104
tap.c
View file

@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c,
return i; return i;
} }
/**
* tap_send_iov_pasta() - Send out multiple prepared frames
* @c: Execution context
* @iov: Array of frames, each frames is divided in an array of iovecs.
* The first entry of the iovec is ignored
* @n: Number of frames in @iov
*
* Return: number of frames actually sent
*/
static size_t tap_send_iov_pasta(const struct ctx *c,
struct iovec iov[][TCP_IOV_NUM], size_t n)
{
unsigned int i;
for (i = 0; i < n; i++) {
if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH],
TCP_IOV_NUM - TCP_IOV_ETH))
break;
}
return i;
}
/** /**
* tap_send_frames_passt() - Send multiple frames to the passt tap * tap_send_frames_passt() - Send multiple frames to the passt tap
* @c: Execution context * @c: Execution context
@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
return i; return i;
} }
/**
* tap_send_iov_passt() - Send out multiple prepared frames
* @c: Execution context
* @iov: Array of frames, each frames is divided in an array of iovecs.
* The first entry of the iovec is updated to point to an
* uint32_t storing the frame length.
* @n: Number of frames in @iov
*
* Return: number of frames actually sent
*/
static size_t tap_send_iov_passt(const struct ctx *c,
struct iovec iov[][TCP_IOV_NUM],
size_t n)
{
unsigned int i;
for (i = 0; i < n; i++) {
uint32_t vnet_len;
int j;
vnet_len = 0;
for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
vnet_len += iov[i][j].iov_len;
vnet_len = htonl(vnet_len);
iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))
break;
}
return i;
}
/** /**
* tap_send_frames() - Send out multiple prepared frames * tap_send_frames() - Send out multiple prepared frames
* @c: Execution context * @c: Execution context
@ -418,6 +478,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n)
return m; return m;
} }
/**
* tap_send_iov() - Send out multiple prepared frames
* @c: Execution context
* @iov: Array of frames, each frames is divided in an array of iovecs.
* iovec array is:
* TCP_IOV_VNET (0) vnet length
* TCP_IOV_ETH (1) ethernet header
* TCP_IOV_IP (2) IP (v4/v6) header
* TCP_IOV_PAYLOAD (3) IP payload (TCP header + data)
* TCP_IOV_NUM (4) is the number of entries in the iovec array
* TCP_IOV_VNET entry is updated with passt, ignored with pasta.
* @n: Number of frames in @iov
*
* Return: number of frames actually sent
*/
size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
size_t n)
{
size_t m;
unsigned int i;
if (!n)
return 0;
switch (c->mode) {
case MODE_PASST:
m = tap_send_iov_passt(c, iov, n);
break;
case MODE_PASTA:
m = tap_send_iov_pasta(c, iov, n);
break;
default:
ASSERT(0);
}
if (m < n)
debug("tap: failed to send %zu frames of %zu", n - m, n);
for (i = 0; i < m; i++)
pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH);
return m;
}
/** /**
* eth_update_mac() - Update tap L2 header with new Ethernet addresses * eth_update_mac() - Update tap L2 header with new Ethernet addresses
* @eh: Ethernet headers to update * @eh: Ethernet headers to update

16
tap.h
View file

@ -6,6 +6,20 @@
#ifndef TAP_H #ifndef TAP_H
#define TAP_H #define TAP_H
/*
* TCP frame iovec array:
* TCP_IOV_VNET vnet length
* TCP_IOV_ETH ethernet header
* TCP_IOV_IP IP (v4/v6) header
* TCP_IOV_PAYLOAD IP payload (TCP header + data)
* TCP_IOV_NUM is the number of entries in the iovec array
*/
#define TCP_IOV_VNET 0
#define TCP_IOV_ETH 1
#define TCP_IOV_IP 2
#define TCP_IOV_PAYLOAD 3
#define TCP_IOV_NUM 4
/** /**
* struct tap_hdr - L2 and tap specific headers * struct tap_hdr - L2 and tap specific headers
* @vnet_len: Frame length (for qemu socket transport) * @vnet_len: Frame length (for qemu socket transport)
@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c,
const void *in, size_t len); const void *in, size_t len);
int tap_send(const struct ctx *c, const void *data, size_t len); int tap_send(const struct ctx *c, const void *data, size_t len);
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n); size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n);
size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
size_t n);
void eth_update_mac(struct ethhdr *eh, void eth_update_mac(struct ethhdr *eh,
const unsigned char *eth_d, const unsigned char *eth_s); const unsigned char *eth_d, const unsigned char *eth_s);
void tap_listen_handler(struct ctx *c, uint32_t events); void tap_listen_handler(struct ctx *c, uint32_t events);

439
tcp.c
View file

@ -318,39 +318,7 @@
/* MSS rounding: see SET_MSS() */ /* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536 #define MSS_DEFAULT 536
#define MSS (USHRT_MAX - sizeof(struct tcphdr))
struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
#ifdef __AVX2__
uint8_t pad[26];
#else
uint8_t pad[2];
#endif
struct tap_hdr taph;
struct iphdr iph;
struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#ifdef __AVX2__
uint8_t pad[14];
#else
uint8_t pad[2];
#endif
struct tap_hdr taph;
struct ipv6hdr ip6h;
struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
#define WINDOW_DEFAULT 14600 /* RFC 6928 */ #define WINDOW_DEFAULT 14600 /* RFC 6928 */
#ifdef HAS_SND_WND #ifdef HAS_SND_WND
@ -445,133 +413,78 @@ struct tcp_buf_seq_update {
}; };
/* Static buffers */ /* Static buffers */
/** /**
* tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections * tcp_l2_flags_t - TCP header and data to send option flags
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only * @th: TCP header
* @taph: Tap-level headers (partially pre-filled) * @opts TCP option flags
* @iph: Pre-filled IP header (except for tot_len and saddr)
* @uh: Headroom for TCP header
* @data: Storage for TCP payload
*/ */
static struct tcp4_l2_buf_t { struct tcp_l2_flags_t {
struct tcphdr th;
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
};
/**
* tcp_l2_payload_t - TCP header and data to send data
* 32 bytes aligned to be able to use AVX2 checksum
* @th: TCP header
* @data: TCP data
*/
struct tcp_l2_payload_t {
struct tcphdr th; /* 20 bytes */
uint8_t data[MSS]; /* 65516 bytes */
#ifdef __AVX2__ #ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */ } __attribute__ ((packed, aligned(32)));
#else #else
uint8_t pad[2]; /* align iph to 4 bytes 0 */ } __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif #endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */ /* Ethernet header for IPv4 frames */
struct tcphdr th; /* 64 40 */ static struct ethhdr tcp4_eth_src;
uint8_t data[MSS4]; /* 84 60 */
/* 65536 65532 */ /* IPv4 headers */
#ifdef __AVX2__ static struct iphdr tcp4_l2_ip[TCP_FRAMES_MEM];
} __attribute__ ((packed, aligned(32))) /* TCP headers and data for IPv4 frames */
#else static struct tcp_l2_payload_t tcp4_l2_payload[TCP_FRAMES_MEM];
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_FRAMES_MEM];
static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp4_l2_buf_used; static unsigned int tcp4_l2_buf_used;
/** /* IPv4 headers for TCP option flags frames */
* tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections static struct iphdr tcp4_l2_flags_ip[TCP_FRAMES_MEM];
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B /* TCP headers and option flags for IPv4 frames */
* @taph: Tap-level headers (partially pre-filled) static struct tcp_l2_flags_t tcp4_l2_flags[TCP_FRAMES_MEM];
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
* @th: Headroom for TCP header static unsigned int tcp4_l2_flags_buf_used;
* @data: Storage for TCP payload
*/ /* Ethernet header for IPv6 frames */
struct tcp6_l2_buf_t { static struct ethhdr tcp6_eth_src;
#ifdef __AVX2__
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ /* IPv6 headers */
#else static struct ipv6hdr tcp6_l2_ip[TCP_FRAMES_MEM];
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ /* TCP headers and data for IPv6 frames */
#endif static struct tcp_l2_payload_t tcp6_l2_payload[TCP_FRAMES_MEM];
struct tap_hdr taph; /* 14 2 */
struct ipv6hdr ip6h; /* 32 20 */
struct tcphdr th; /* 72 60 */
uint8_t data[MSS6]; /* 92 80 */
/* 65536 65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_FRAMES_MEM];
static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_buf_used; static unsigned int tcp6_l2_buf_used;
/* IPv6 headers for TCP option flags frames */
static struct ipv6hdr tcp6_l2_flags_ip[TCP_FRAMES_MEM];
/* TCP headers and option flags for IPv6 frames */
static struct tcp_l2_flags_t tcp6_l2_flags[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_flags_buf_used;
/* recvmsg()/sendmsg() data for tap */ /* recvmsg()/sendmsg() data for tap */
static char tcp_buf_discard [MAX_WINDOW]; static char tcp_buf_discard [MAX_WINDOW];
static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
/* sendmsg() to socket */ /* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV]; static struct iovec tcp_iov [UIO_MAXIOV];
/**
* tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
* @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr)
* @th: Headroom for TCP header
* @opts: Headroom for TCP options
*/
static struct tcp4_l2_flags_buf_t {
#ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */
#else
uint8_t pad[2]; /* align iph to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */
struct tcphdr th; /* 64 40 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_flags_buf[TCP_FRAMES_MEM];
static unsigned int tcp4_l2_flags_buf_used;
/**
* tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
* @taph: Tap-level headers (partially pre-filled)
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
* @th: Headroom for TCP header
* @opts: Headroom for TCP options
*/
static struct tcp6_l2_flags_buf_t {
#ifdef __AVX2__
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
#else
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 14 2 */
struct ipv6hdr ip6h; /* 32 20 */
struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_flags_buf[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_flags_buf_used;
#define CONN(idx) (&(FLOW(idx)->tcp)) #define CONN(idx) (&(FLOW(idx)->tcp))
/* Table for lookup from remote address, local port, remote port */ /* Table for lookup from remote address, local port, remote port */
@ -973,19 +886,8 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
*/ */
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
{ {
int i; eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
for (i = 0; i < TCP_FRAMES_MEM; i++) {
struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
eth_update_mac(&b4->taph.eh, eth_d, eth_s);
eth_update_mac(&b6->taph.eh, eth_d, eth_s);
eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
}
} }
/** /**
@ -995,29 +897,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
static void tcp_sock4_iov_init(const struct ctx *c) static void tcp_sock4_iov_init(const struct ctx *c)
{ {
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
struct iovec *iov;
int i; int i;
for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { (void)c;
tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IP), tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
.iph = iph, for (i = 0; i < TCP_FRAMES_MEM; i++) {
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } struct iovec *iov;
};
/* headers */
tcp4_l2_ip[i] = iph;
tcp4_l2_payload[i].th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
tcp4_l2_flags_ip[i] = iph;
tcp4_l2_flags[i].th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
/* iovecs */
iov = tcp4_l2_iov[i];
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i];
iov = tcp4_l2_flags_iov[i];
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i];
} }
for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IP),
.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
};
}
for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph);
for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph);
} }
/** /**
@ -1026,29 +941,43 @@ static void tcp_sock4_iov_init(const struct ctx *c)
*/ */
static void tcp_sock6_iov_init(const struct ctx *c) static void tcp_sock6_iov_init(const struct ctx *c)
{ {
struct iovec *iov; struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
int i; int i;
for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { (void)c;
tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IPV6), tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), for (i = 0; i < TCP_FRAMES_MEM; i++) {
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } struct iovec *iov;
};
/* headers */
tcp6_l2_ip[i] = ip6;
tcp6_l2_payload[i].th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
tcp6_l2_flags_ip[i] = ip6;
tcp6_l2_flags[i].th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
/* iovecs */
iov = tcp6_l2_iov[i];
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i];
iov = tcp6_l2_flags_iov[i];
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i];
} }
for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IPV6),
.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
};
}
for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph);
for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph);
} }
/** /**
@ -1289,10 +1218,10 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
*/ */
static void tcp_l2_flags_buf_flush(const struct ctx *c) static void tcp_l2_flags_buf_flush(const struct ctx *c)
{ {
tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
tcp6_l2_flags_buf_used = 0; tcp6_l2_flags_buf_used = 0;
tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
tcp4_l2_flags_buf_used = 0; tcp4_l2_flags_buf_used = 0;
} }
@ -1305,12 +1234,12 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
unsigned i; unsigned i;
size_t m; size_t m;
m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used);
for (i = 0; i < m; i++) for (i = 0; i < m; i++)
*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
tcp6_l2_buf_used = 0; tcp6_l2_buf_used = 0;
m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used);
for (i = 0; i < m; i++) for (i = 0; i < m; i++)
*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
tcp4_l2_buf_used = 0; tcp4_l2_buf_used = 0;
@ -1433,7 +1362,7 @@ static size_t tcp_fill_headers6(const struct ctx *c,
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
* @p: Pointer to any type of TCP pre-cooked buffer * @iov: Pointer to an array of iovec of TCP pre-cooked buffer
* @plen: Payload length (including TCP header options) * @plen: Payload length (including TCP header options)
* @check: Checksum, if already known * @check: Checksum, if already known
* @seq: Sequence number for this segment * @seq: Sequence number for this segment
@ -1442,26 +1371,22 @@ static size_t tcp_fill_headers6(const struct ctx *c,
*/ */
static size_t tcp_l2_buf_fill_headers(const struct ctx *c, static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
const struct tcp_tap_conn *conn, const struct tcp_tap_conn *conn,
void *p, size_t plen, struct iovec *iov, size_t plen,
const uint16_t *check, uint32_t seq) const uint16_t *check, uint32_t seq)
{ {
const struct in_addr *a4 = inany_v4(&conn->faddr); const struct in_addr *a4 = inany_v4(&conn->faddr);
size_t ip_len, tlen; size_t ip_len, tlen;
if (a4) { if (a4) {
struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, plen,
ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
check, seq); check, seq);
tlen = ip_len - sizeof(struct iphdr);
tlen = tap_iov_len(c, &b->taph, ip_len);
} else { } else {
struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, plen,
ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
seq); seq);
tlen = ip_len - sizeof(struct ipv6hdr);
tlen = tap_iov_len(c, &b->taph, ip_len);
} }
return tlen; return tlen;
@ -1595,16 +1520,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{ {
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
uint32_t prev_wnd_to_tap = conn->wnd_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
struct tcp4_l2_flags_buf_t *b4 = NULL; struct tcp_l2_flags_t *payload;
struct tcp6_l2_flags_buf_t *b6 = NULL;
struct tcp_info tinfo = { 0 }; struct tcp_info tinfo = { 0 };
socklen_t sl = sizeof(tinfo); socklen_t sl = sizeof(tinfo);
int s = conn->sock; int s = conn->sock;
size_t optlen = 0; size_t optlen = 0;
struct iovec *iov; struct iovec *iov;
struct iovec *dup_iov;
struct tcphdr *th; struct tcphdr *th;
char *data; char *data;
void *p;
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
!flags && conn->wnd_to_tap) !flags && conn->wnd_to_tap)
@ -1627,18 +1551,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
return 0; return 0;
if (CONN_V4(conn)) { if (CONN_V4(conn)) {
iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used];
th = &b4->th;
/* gcc 11.2 would complain on data = (char *)(th + 1); */
data = b4->opts;
} else { } else {
iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];
p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used];
th = &b6->th;
data = b6->opts;
} }
payload = iov[TCP_IOV_PAYLOAD].iov_base;
th = &payload->th;
data = payload->opts;
if (flags & SYN) { if (flags & SYN) {
int mss; int mss;
@ -1653,10 +1574,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
mss = tinfo.tcpi_snd_mss; mss = tinfo.tcpi_snd_mss;
} else { } else {
mss = c->mtu - sizeof(struct tcphdr); mss = c->mtu - sizeof(struct tcphdr);
if (CONN_V4(conn))
mss -= sizeof(struct iphdr);
else
mss -= sizeof(struct ipv6hdr);
if (c->low_wmem && if (c->low_wmem &&
!(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn))
@ -1688,8 +1605,9 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
th->syn = !!(flags & SYN); th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN); th->fin = !!(flags & FIN);
iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, iov,
NULL, conn->seq_to_tap); optlen, NULL,
conn->seq_to_tap);
if (th->ack) { if (th->ack) {
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
@ -1705,23 +1623,26 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (th->fin || th->syn) if (th->fin || th->syn)
conn->seq_to_tap++; conn->seq_to_tap++;
if (CONN_V4(conn)) { if (flags & DUP_ACK) {
if (flags & DUP_ACK) { int i;
memcpy(b4 + 1, b4, sizeof(*b4)); for (i = 0; i < TCP_IOV_NUM; i++) {
(iov + 1)->iov_len = iov->iov_len; memcpy(dup_iov[i].iov_base, iov[i].iov_base,
tcp4_l2_flags_buf_used++; iov[i].iov_len);
dup_iov[i].iov_len = iov[i].iov_len;
} }
}
if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) if (CONN_V4(conn)) {
if (flags & DUP_ACK)
tcp4_l2_flags_buf_used++;
if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
tcp_l2_flags_buf_flush(c); tcp_l2_flags_buf_flush(c);
} else { } else {
if (flags & DUP_ACK) { if (flags & DUP_ACK)
memcpy(b6 + 1, b6, sizeof(*b6));
(iov + 1)->iov_len = iov->iov_len;
tcp6_l2_flags_buf_used++; tcp6_l2_flags_buf_used++;
}
if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
tcp_l2_flags_buf_flush(c); tcp_l2_flags_buf_flush(c);
} }
@ -1887,15 +1808,14 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
unsigned int mss; unsigned int mss;
int ret; int ret;
(void)conn; /* unused */
if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0) if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
mss = MSS_DEFAULT; mss = MSS_DEFAULT;
else else
mss = ret; mss = ret;
if (CONN_V4(conn)) mss = MIN(MSS, mss);
mss = MIN(MSS4, mss);
else
mss = MIN(MSS6, mss);
return MIN(mss, USHRT_MAX); return MIN(mss, USHRT_MAX);
} }
@ -2171,27 +2091,30 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
struct iovec *iov; struct iovec *iov;
if (CONN_V4(conn)) { if (CONN_V4(conn)) {
struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1];
const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; const uint16_t *check = NULL;
if (no_csum) {
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
check = &iph->check;
}
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
iov = tcp4_l2_iov + tcp4_l2_buf_used++; iov = tcp4_l2_iov[tcp4_l2_buf_used++];
iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
check, seq); iov, plen, check, seq);
if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1)
tcp_l2_data_buf_flush(c); tcp_l2_data_buf_flush(c);
} else if (CONN_V6(conn)) { } else if (CONN_V6(conn)) {
struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
iov = tcp6_l2_iov + tcp6_l2_buf_used++; iov = tcp6_l2_iov[tcp6_l2_buf_used++];
iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
NULL, seq); iov, plen, NULL, seq);
if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1)
tcp_l2_data_buf_flush(c); tcp_l2_data_buf_flush(c);
} }
} }
@ -2247,8 +2170,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
iov_sock[0].iov_base = tcp_buf_discard; iov_sock[0].iov_base = tcp_buf_discard;
iov_sock[0].iov_len = already_sent; iov_sock[0].iov_len = already_sent;
if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) ||
(!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { (!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) {
tcp_l2_data_buf_flush(c); tcp_l2_data_buf_flush(c);
/* Silence Coverity CWE-125 false positive */ /* Silence Coverity CWE-125 false positive */
@ -2257,9 +2180,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
if (v4) if (v4)
iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data;
else else
iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data;
iov->iov_len = mss; iov->iov_len = mss;
} }
if (iov_rem) if (iov_rem)