5913f26415
Currently we deal with hash collisions by letting a hash bucket contain multiple entries, forming a linked list using an index in the connection structure. That's a pretty standard and simple approach, but in our case we can use an even simpler one: linear probing. Here if a hash bucket is occupied we just move on to the next one until we find a free one. This slightly simplifies lookup and more importantly saves some precious bytes in the connection structure by removing the need for a link. It does require some additional complexity for hash removal. This approach can perform poorly when hash table load is high. However, we already size our hash table of pointers larger than the connection table, which puts an upper bound on the load. It's relatively cheap to decrease that bound if we find we need to. I adapted the linear probing operations from Knuth's The Art of Computer Programming, Volume 3, 2nd Edition. Specifically Algorithm L and Algorithm R in Section 6.4. Note that there is an error in Algorithm R as printed, see errata at [0]. [0] https://www-cs-faculty.stanford.edu/~knuth/all3-prepre.ps.gz Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
168 lines
5.2 KiB
C
168 lines
5.2 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later
|
|
* Copyright Red Hat
|
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
|
*
|
|
* TCP connection tracking data structures, used by tcp.c and
|
|
* tcp_splice.c. Shouldn't be included in non-TCP code.
|
|
*/
|
|
#ifndef TCP_CONN_H
|
|
#define TCP_CONN_H
|
|
|
|
/**
|
|
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
|
|
* @f: Generic flow information
|
|
* @in_epoll: Is the connection in the epoll set?
|
|
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
|
* @sock: Socket descriptor number
|
|
* @events: Connection events, implying connection states
|
|
* @timer: timerfd descriptor for timeout events
|
|
* @flags: Connection flags representing internal attributes
|
|
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
|
|
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
|
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
|
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
|
|
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
|
|
* @faddr: Guest side forwarding address (guest's remote address)
|
|
* @eport: Guest side endpoint port (guest's local port)
|
|
* @fport: Guest side forwarding port (guest's remote port)
|
|
* @wnd_from_tap: Last window size from tap, unscaled (as received)
|
|
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
|
|
* @seq_to_tap: Next sequence for packets to tap
|
|
* @seq_ack_from_tap: Last ACK number received from tap
|
|
* @seq_from_tap: Next sequence for packets from tap (not actually sent)
|
|
* @seq_ack_to_tap: Last ACK number sent to tap
|
|
* @seq_init_from_tap: Initial sequence number from tap
|
|
*/
|
|
struct tcp_tap_conn {
|
|
/* Must be first element */
|
|
struct flow_common f;
|
|
|
|
bool in_epoll :1;
|
|
|
|
#define TCP_RETRANS_BITS 3
|
|
unsigned int retrans :TCP_RETRANS_BITS;
|
|
#define TCP_MAX_RETRANS MAX_FROM_BITS(TCP_RETRANS_BITS)
|
|
|
|
#define TCP_WS_BITS 4 /* RFC 7323 */
|
|
#define TCP_WS_MAX 14
|
|
unsigned int ws_from_tap :TCP_WS_BITS;
|
|
unsigned int ws_to_tap :TCP_WS_BITS;
|
|
|
|
|
|
int sock :FD_REF_BITS;
|
|
|
|
uint8_t events;
|
|
#define CLOSED 0
|
|
#define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */
|
|
#define TAP_SYN_RCVD BIT(1) /* implies socket connecting */
|
|
#define TAP_SYN_ACK_SENT BIT( 3) /* implies socket connected */
|
|
#define ESTABLISHED BIT(2)
|
|
#define SOCK_FIN_RCVD BIT( 3)
|
|
#define SOCK_FIN_SENT BIT( 4)
|
|
#define TAP_FIN_RCVD BIT( 5)
|
|
#define TAP_FIN_SENT BIT( 6)
|
|
#define TAP_FIN_ACKED BIT( 7)
|
|
|
|
#define CONN_STATE_BITS /* Setting these clears other flags */ \
|
|
(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
|
|
|
|
|
|
int timer :FD_REF_BITS;
|
|
|
|
uint8_t flags;
|
|
#define STALLED BIT(0)
|
|
#define LOCAL BIT(1)
|
|
#define ACTIVE_CLOSE BIT(2)
|
|
#define ACK_TO_TAP_DUE BIT(3)
|
|
#define ACK_FROM_TAP_DUE BIT(4)
|
|
|
|
|
|
#define TCP_MSS_BITS 14
|
|
unsigned int tap_mss :TCP_MSS_BITS;
|
|
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
|
|
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
|
|
|
|
|
|
#define SNDBUF_BITS 24
|
|
unsigned int sndbuf :SNDBUF_BITS;
|
|
#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
|
|
#define SNDBUF_GET(conn) (conn->sndbuf << (32 - SNDBUF_BITS))
|
|
|
|
uint8_t seq_dup_ack_approx;
|
|
|
|
|
|
union inany_addr faddr;
|
|
in_port_t eport;
|
|
in_port_t fport;
|
|
|
|
uint16_t wnd_from_tap;
|
|
uint16_t wnd_to_tap;
|
|
|
|
uint32_t seq_to_tap;
|
|
uint32_t seq_ack_from_tap;
|
|
uint32_t seq_from_tap;
|
|
uint32_t seq_ack_to_tap;
|
|
uint32_t seq_init_from_tap;
|
|
};
|
|
|
|
#define SIDES 2
|
|
/**
|
|
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
|
|
* @f: Generic flow information
|
|
* @in_epoll: Is the connection in the epoll set?
|
|
* @s: File descriptor for sockets
|
|
* @pipe: File descriptors for pipes
|
|
* @events: Events observed/actions performed on connection
|
|
* @flags: Connection flags (attributes, not events)
|
|
* @read: Bytes read (not fully written to other side in one shot)
|
|
* @written: Bytes written (not fully written from one other side read)
|
|
*/
|
|
struct tcp_splice_conn {
|
|
/* Must be first element */
|
|
struct flow_common f;
|
|
|
|
bool in_epoll :1;
|
|
int s[SIDES];
|
|
int pipe[SIDES][2];
|
|
|
|
uint8_t events;
|
|
#define SPLICE_CLOSED 0
|
|
#define SPLICE_CONNECT BIT(0)
|
|
#define SPLICE_ESTABLISHED BIT(1)
|
|
#define OUT_WAIT_0 BIT(2)
|
|
#define OUT_WAIT_1 BIT(3)
|
|
#define FIN_RCVD_0 BIT(4)
|
|
#define FIN_RCVD_1 BIT(5)
|
|
#define FIN_SENT_0 BIT(6)
|
|
#define FIN_SENT_1 BIT(7)
|
|
|
|
uint8_t flags;
|
|
#define SPLICE_V6 BIT(0)
|
|
#define RCVLOWAT_SET_0 BIT(1)
|
|
#define RCVLOWAT_SET_1 BIT(2)
|
|
#define RCVLOWAT_ACT_0 BIT(3)
|
|
#define RCVLOWAT_ACT_1 BIT(4)
|
|
#define CLOSING BIT(5)
|
|
|
|
uint32_t read[SIDES];
|
|
uint32_t written[SIDES];
|
|
};
|
|
|
|
/* Socket pools: two arrays of TCP_SOCK_POOL_SIZE ints each, only declared
 * here (the definitions live in another translation unit)
 */
#define TCP_SOCK_POOL_SIZE		32

extern int init_sock_pool4[TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6[TCP_SOCK_POOL_SIZE];
|
|
|
|
/* Connection entry updates (definitions not in this header; presumably
 * called when a connection entry changes location -- confirm in tcp.c and
 * tcp_splice.c)
 */
void tcp_tap_conn_update(const struct ctx *c,
			 struct tcp_tap_conn *old, struct tcp_tap_conn *new);
void tcp_splice_conn_update(const struct ctx *c,
			    struct tcp_splice_conn *new);

/* Handlers for spliced connections, operating on a generic flow entry */
void tcp_splice_destroy(struct ctx *c, union flow *flow);
void tcp_splice_timer(struct ctx *c, union flow *flow);

/* Socket pool helpers */
int tcp_conn_pool_sock(int pool[]);
int tcp_conn_new_sock(const struct ctx *c, sa_family_t af);
void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
void tcp_splice_refill(const struct ctx *c);
|
|
|
|
#endif /* TCP_CONN_H */
|