tcp, flow: Remove redundant information, repack connection structures
Some information we explicitly store in the TCP connection is now duplicated in the common flow structure. Access it from there instead, and remove it from the TCP-specific structure. With that done, we can reorder both the "tap" and "splice" TCP structures a bit to get better packing for the new combined flow table entries. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
parent
4e2d36e83f
commit
f9fe212b1f
3 changed files with 47 additions and 51 deletions
50
tcp.c
50
tcp.c
|
@ -333,8 +333,6 @@
|
||||||
|
|
||||||
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
|
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
|
||||||
|
|
||||||
#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
|
|
||||||
|
|
||||||
#define CONN_IS_CLOSING(conn) \
|
#define CONN_IS_CLOSING(conn) \
|
||||||
(((conn)->events & ESTABLISHED) && \
|
(((conn)->events & ESTABLISHED) && \
|
||||||
((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
|
((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
|
||||||
|
@ -673,10 +671,11 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
*/
|
*/
|
||||||
static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
|
static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
|
||||||
{
|
{
|
||||||
|
const struct flowside *tapside = TAPFLOW(conn);
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
|
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
|
||||||
if (inany_equals(&conn->faddr, low_rtt_dst + i))
|
if (inany_equals(&tapside->faddr, low_rtt_dst + i))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -691,6 +690,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
|
||||||
const struct tcp_info *tinfo)
|
const struct tcp_info *tinfo)
|
||||||
{
|
{
|
||||||
#ifdef HAS_MIN_RTT
|
#ifdef HAS_MIN_RTT
|
||||||
|
const struct flowside *tapside = TAPFLOW(conn);
|
||||||
int i, hole = -1;
|
int i, hole = -1;
|
||||||
|
|
||||||
if (!tinfo->tcpi_min_rtt ||
|
if (!tinfo->tcpi_min_rtt ||
|
||||||
|
@ -698,7 +698,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
|
||||||
return;
|
return;
|
||||||
|
|
||||||
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
|
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
|
||||||
if (inany_equals(&conn->faddr, low_rtt_dst + i))
|
if (inany_equals(&tapside->faddr, low_rtt_dst + i))
|
||||||
return;
|
return;
|
||||||
if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
|
if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
|
||||||
hole = i;
|
hole = i;
|
||||||
|
@ -710,7 +710,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
|
||||||
if (hole == -1)
|
if (hole == -1)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
low_rtt_dst[hole++] = conn->faddr;
|
low_rtt_dst[hole++] = tapside->faddr;
|
||||||
if (hole == LOW_RTT_TABLE_SIZE)
|
if (hole == LOW_RTT_TABLE_SIZE)
|
||||||
hole = 0;
|
hole = 0;
|
||||||
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
|
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
|
||||||
|
@ -865,8 +865,10 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn,
|
||||||
const union inany_addr *faddr,
|
const union inany_addr *faddr,
|
||||||
in_port_t eport, in_port_t fport)
|
in_port_t eport, in_port_t fport)
|
||||||
{
|
{
|
||||||
if (inany_equals(&conn->faddr, faddr) &&
|
const struct flowside *tapside = TAPFLOW(conn);
|
||||||
conn->eport == eport && conn->fport == fport)
|
|
||||||
|
if (inany_equals(&tapside->faddr, faddr) &&
|
||||||
|
tapside->eport == eport && tapside->fport == fport)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -900,7 +902,10 @@ static uint64_t tcp_hash(const struct ctx *c, const union inany_addr *faddr,
|
||||||
static uint64_t tcp_conn_hash(const struct ctx *c,
|
static uint64_t tcp_conn_hash(const struct ctx *c,
|
||||||
const struct tcp_tap_conn *conn)
|
const struct tcp_tap_conn *conn)
|
||||||
{
|
{
|
||||||
return tcp_hash(c, &conn->faddr, conn->eport, conn->fport);
|
const struct flowside *tapside = TAPFLOW(conn);
|
||||||
|
|
||||||
|
return tcp_hash(c, &tapside->faddr, tapside->eport,
|
||||||
|
tapside->fport);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1037,8 +1042,10 @@ void tcp_defer_handler(struct ctx *c)
|
||||||
static void tcp_fill_header(struct tcphdr *th,
|
static void tcp_fill_header(struct tcphdr *th,
|
||||||
const struct tcp_tap_conn *conn, uint32_t seq)
|
const struct tcp_tap_conn *conn, uint32_t seq)
|
||||||
{
|
{
|
||||||
th->source = htons(conn->fport);
|
const struct flowside *tapside = TAPFLOW(conn);
|
||||||
th->dest = htons(conn->eport);
|
|
||||||
|
th->source = htons(tapside->fport);
|
||||||
|
th->dest = htons(tapside->eport);
|
||||||
th->seq = htonl(seq);
|
th->seq = htonl(seq);
|
||||||
th->ack_seq = htonl(conn->seq_ack_to_tap);
|
th->ack_seq = htonl(conn->seq_ack_to_tap);
|
||||||
if (conn->events & ESTABLISHED) {
|
if (conn->events & ESTABLISHED) {
|
||||||
|
@ -1070,7 +1077,8 @@ static size_t tcp_fill_headers4(const struct ctx *c,
|
||||||
size_t dlen, const uint16_t *check,
|
size_t dlen, const uint16_t *check,
|
||||||
uint32_t seq)
|
uint32_t seq)
|
||||||
{
|
{
|
||||||
const struct in_addr *a4 = inany_v4(&conn->faddr);
|
const struct flowside *tapside = TAPFLOW(conn);
|
||||||
|
const struct in_addr *a4 = inany_v4(&tapside->faddr);
|
||||||
size_t l4len = dlen + sizeof(*th);
|
size_t l4len = dlen + sizeof(*th);
|
||||||
size_t l3len = l4len + sizeof(*iph);
|
size_t l3len = l4len + sizeof(*iph);
|
||||||
|
|
||||||
|
@ -1112,10 +1120,11 @@ static size_t tcp_fill_headers6(const struct ctx *c,
|
||||||
struct ipv6hdr *ip6h, struct tcphdr *th,
|
struct ipv6hdr *ip6h, struct tcphdr *th,
|
||||||
size_t dlen, uint32_t seq)
|
size_t dlen, uint32_t seq)
|
||||||
{
|
{
|
||||||
|
const struct flowside *tapside = TAPFLOW(conn);
|
||||||
size_t l4len = dlen + sizeof(*th);
|
size_t l4len = dlen + sizeof(*th);
|
||||||
|
|
||||||
ip6h->payload_len = htons(l4len);
|
ip6h->payload_len = htons(l4len);
|
||||||
ip6h->saddr = conn->faddr.a6;
|
ip6h->saddr = tapside->faddr.a6;
|
||||||
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
|
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
|
||||||
ip6h->daddr = c->ip6.addr_ll_seen;
|
ip6h->daddr = c->ip6.addr_ll_seen;
|
||||||
else
|
else
|
||||||
|
@ -1154,7 +1163,8 @@ size_t tcp_l2_buf_fill_headers(const struct ctx *c,
|
||||||
struct iovec *iov, size_t dlen,
|
struct iovec *iov, size_t dlen,
|
||||||
const uint16_t *check, uint32_t seq)
|
const uint16_t *check, uint32_t seq)
|
||||||
{
|
{
|
||||||
const struct in_addr *a4 = inany_v4(&conn->faddr);
|
const struct flowside *tapside = TAPFLOW(conn);
|
||||||
|
const struct in_addr *a4 = inany_v4(&tapside->faddr);
|
||||||
|
|
||||||
if (a4) {
|
if (a4) {
|
||||||
return tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
|
return tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
|
||||||
|
@ -1465,6 +1475,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
const struct timespec *now)
|
const struct timespec *now)
|
||||||
{
|
{
|
||||||
struct siphash_state state = SIPHASH_INIT(c->hash_secret);
|
struct siphash_state state = SIPHASH_INIT(c->hash_secret);
|
||||||
|
const struct flowside *tapside = TAPFLOW(conn);
|
||||||
union inany_addr aany;
|
union inany_addr aany;
|
||||||
uint64_t hash;
|
uint64_t hash;
|
||||||
uint32_t ns;
|
uint32_t ns;
|
||||||
|
@ -1474,10 +1485,10 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||||
else
|
else
|
||||||
inany_from_af(&aany, AF_INET6, &c->ip6.addr);
|
inany_from_af(&aany, AF_INET6, &c->ip6.addr);
|
||||||
|
|
||||||
inany_siphash_feed(&state, &conn->faddr);
|
inany_siphash_feed(&state, &tapside->faddr);
|
||||||
inany_siphash_feed(&state, &aany);
|
inany_siphash_feed(&state, &aany);
|
||||||
hash = siphash_final(&state, 36,
|
hash = siphash_final(&state, 36,
|
||||||
(uint64_t)conn->fport << 16 | conn->eport);
|
(uint64_t)tapside->fport << 16 | tapside->eport);
|
||||||
|
|
||||||
/* 32ns ticks, overflows 32 bits every 137s */
|
/* 32ns ticks, overflows 32 bits every 137s */
|
||||||
ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
|
ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
|
||||||
|
@ -1766,11 +1777,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
|
||||||
if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
|
if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
|
||||||
conn->wnd_from_tap = 1;
|
conn->wnd_from_tap = 1;
|
||||||
|
|
||||||
inany_from_af(&conn->faddr, af, daddr);
|
|
||||||
|
|
||||||
conn->fport = dstport;
|
|
||||||
conn->eport = srcport;
|
|
||||||
|
|
||||||
conn->seq_init_from_tap = ntohl(th->seq);
|
conn->seq_init_from_tap = ntohl(th->seq);
|
||||||
conn->seq_from_tap = conn->seq_init_from_tap + 1;
|
conn->seq_from_tap = conn->seq_init_from_tap + 1;
|
||||||
conn->seq_ack_to_tap = conn->seq_from_tap;
|
conn->seq_ack_to_tap = conn->seq_from_tap;
|
||||||
|
@ -2314,10 +2320,6 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
|
||||||
conn->ws_to_tap = conn->ws_from_tap = 0;
|
conn->ws_to_tap = conn->ws_from_tap = 0;
|
||||||
conn_event(c, conn, SOCK_ACCEPTED);
|
conn_event(c, conn, SOCK_ACCEPTED);
|
||||||
|
|
||||||
conn->faddr = saddr;
|
|
||||||
conn->fport = srcport;
|
|
||||||
conn->eport = dstport;
|
|
||||||
|
|
||||||
tcp_seq_init(c, conn, now);
|
tcp_seq_init(c, conn, now);
|
||||||
tcp_hash_insert(c, conn);
|
tcp_hash_insert(c, conn);
|
||||||
|
|
||||||
|
|
40
tcp_conn.h
40
tcp_conn.h
|
@ -13,19 +13,16 @@
|
||||||
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
|
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
|
||||||
* @f: Generic flow information
|
* @f: Generic flow information
|
||||||
* @in_epoll: Is the connection in the epoll set?
|
* @in_epoll: Is the connection in the epoll set?
|
||||||
|
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
|
||||||
|
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
||||||
|
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
||||||
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
||||||
* @sock: Socket descriptor number
|
* @sock: Socket descriptor number
|
||||||
* @events: Connection events, implying connection states
|
* @events: Connection events, implying connection states
|
||||||
* @timer: timerfd descriptor for timeout events
|
* @timer: timerfd descriptor for timeout events
|
||||||
* @flags: Connection flags representing internal attributes
|
* @flags: Connection flags representing internal attributes
|
||||||
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
|
|
||||||
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
|
||||||
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
|
||||||
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
|
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
|
||||||
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
|
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
|
||||||
* @faddr: Guest side forwarding address (guest's remote address)
|
|
||||||
* @eport: Guest side endpoint port (guest's local port)
|
|
||||||
* @fport: Guest side forwarding port (guest's remote port)
|
|
||||||
* @wnd_from_tap: Last window size from tap, unscaled (as received)
|
* @wnd_from_tap: Last window size from tap, unscaled (as received)
|
||||||
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
|
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
|
||||||
* @seq_to_tap: Next sequence for packets to tap
|
* @seq_to_tap: Next sequence for packets to tap
|
||||||
|
@ -49,6 +46,10 @@ struct tcp_tap_conn {
|
||||||
unsigned int ws_from_tap :TCP_WS_BITS;
|
unsigned int ws_from_tap :TCP_WS_BITS;
|
||||||
unsigned int ws_to_tap :TCP_WS_BITS;
|
unsigned int ws_to_tap :TCP_WS_BITS;
|
||||||
|
|
||||||
|
#define TCP_MSS_BITS 14
|
||||||
|
unsigned int tap_mss :TCP_MSS_BITS;
|
||||||
|
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
|
||||||
|
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
|
||||||
|
|
||||||
int sock :FD_REF_BITS;
|
int sock :FD_REF_BITS;
|
||||||
|
|
||||||
|
@ -77,13 +78,6 @@ struct tcp_tap_conn {
|
||||||
#define ACK_TO_TAP_DUE BIT(3)
|
#define ACK_TO_TAP_DUE BIT(3)
|
||||||
#define ACK_FROM_TAP_DUE BIT(4)
|
#define ACK_FROM_TAP_DUE BIT(4)
|
||||||
|
|
||||||
|
|
||||||
#define TCP_MSS_BITS 14
|
|
||||||
unsigned int tap_mss :TCP_MSS_BITS;
|
|
||||||
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
|
|
||||||
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
|
|
||||||
|
|
||||||
|
|
||||||
#define SNDBUF_BITS 24
|
#define SNDBUF_BITS 24
|
||||||
unsigned int sndbuf :SNDBUF_BITS;
|
unsigned int sndbuf :SNDBUF_BITS;
|
||||||
#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
|
#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
|
||||||
|
@ -91,11 +85,6 @@ struct tcp_tap_conn {
|
||||||
|
|
||||||
uint8_t seq_dup_ack_approx;
|
uint8_t seq_dup_ack_approx;
|
||||||
|
|
||||||
|
|
||||||
union inany_addr faddr;
|
|
||||||
in_port_t eport;
|
|
||||||
in_port_t fport;
|
|
||||||
|
|
||||||
uint16_t wnd_from_tap;
|
uint16_t wnd_from_tap;
|
||||||
uint16_t wnd_to_tap;
|
uint16_t wnd_to_tap;
|
||||||
|
|
||||||
|
@ -109,22 +98,24 @@ struct tcp_tap_conn {
|
||||||
/**
|
/**
|
||||||
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
|
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
|
||||||
* @f: Generic flow information
|
* @f: Generic flow information
|
||||||
* @in_epoll: Is the connection in the epoll set?
|
|
||||||
* @s: File descriptor for sockets
|
* @s: File descriptor for sockets
|
||||||
* @pipe: File descriptors for pipes
|
* @pipe: File descriptors for pipes
|
||||||
* @events: Events observed/actions performed on connection
|
|
||||||
* @flags: Connection flags (attributes, not events)
|
|
||||||
* @read: Bytes read (not fully written to other side in one shot)
|
* @read: Bytes read (not fully written to other side in one shot)
|
||||||
* @written: Bytes written (not fully written from one other side read)
|
* @written: Bytes written (not fully written from one other side read)
|
||||||
*/
|
* @events: Events observed/actions performed on connection
|
||||||
|
* @flags: Connection flags (attributes, not events)
|
||||||
|
* @in_epoll: Is the connection in the epoll set?
|
||||||
|
*/
|
||||||
struct tcp_splice_conn {
|
struct tcp_splice_conn {
|
||||||
/* Must be first element */
|
/* Must be first element */
|
||||||
struct flow_common f;
|
struct flow_common f;
|
||||||
|
|
||||||
bool in_epoll :1;
|
|
||||||
int s[SIDES];
|
int s[SIDES];
|
||||||
int pipe[SIDES][2];
|
int pipe[SIDES][2];
|
||||||
|
|
||||||
|
uint32_t read[SIDES];
|
||||||
|
uint32_t written[SIDES];
|
||||||
|
|
||||||
uint8_t events;
|
uint8_t events;
|
||||||
#define SPLICE_CLOSED 0
|
#define SPLICE_CLOSED 0
|
||||||
#define SPLICE_CONNECT BIT(0)
|
#define SPLICE_CONNECT BIT(0)
|
||||||
|
@ -139,8 +130,7 @@ struct tcp_splice_conn {
|
||||||
#define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(4) : BIT(3))
|
#define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(4) : BIT(3))
|
||||||
#define CLOSING BIT(5)
|
#define CLOSING BIT(5)
|
||||||
|
|
||||||
uint32_t read[SIDES];
|
bool in_epoll :1;
|
||||||
uint32_t written[SIDES];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Socket pools */
|
/* Socket pools */
|
||||||
|
|
|
@ -39,7 +39,11 @@
|
||||||
#define OPT_SACKP 4
|
#define OPT_SACKP 4
|
||||||
#define OPT_SACK 5
|
#define OPT_SACK 5
|
||||||
#define OPT_TS 8
|
#define OPT_TS 8
|
||||||
#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
|
|
||||||
|
#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
|
||||||
|
#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
|
||||||
|
|
||||||
|
#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->faddr))
|
||||||
#define CONN_V6(conn) (!CONN_V4(conn))
|
#define CONN_V6(conn) (!CONN_V4(conn))
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in a new issue