5ca555cf78
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
3309 lines
87 KiB
C
3309 lines
87 KiB
C
// SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
|
/* PASST - Plug A Simple Socket Transport
|
|
* for qemu/UNIX domain socket mode
|
|
*
|
|
* PASTA - Pack A Subtle Tap Abstraction
|
|
* for network namespace/tap device mode
|
|
*
|
|
* tcp.c - TCP L2-L4 translation state machine
|
|
*
|
|
* Copyright (c) 2020-2022 Red Hat GmbH
|
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
|
*/
|
|
|
|
/**
|
|
* DOC: Theory of Operation
|
|
*
|
|
*
|
|
* PASST mode
|
|
* ==========
|
|
*
|
|
* This implementation maps TCP traffic between a single L2 interface (tap) and
|
|
* native TCP (L4) sockets, mimicking and reproducing as closely as possible the
|
|
* inferred behaviour of applications running on a guest, connected via said L2
|
|
* interface. Four connection flows are supported:
|
|
* - from the local host to the guest behind the tap interface:
|
|
* - this is the main use case for proxies in service meshes
|
|
* - we bind to configured local ports, and relay traffic between L4 sockets
|
|
* with local endpoints and the L2 interface
|
|
* - from remote hosts to the guest behind the tap interface:
|
|
* - this might be needed for services that need to be addressed directly,
|
|
* and typically configured with special port forwarding rules (which are
|
|
* not needed here)
|
|
* - we also relay traffic between L4 sockets with remote endpoints and the L2
|
|
* interface
|
|
* - from the guest to the local host:
|
|
* - this is not observed in practice, but implemented for completeness and
|
|
* transparency
|
|
* - from the guest to external hosts:
|
|
* - this might be needed for applications running on the guest that need to
|
|
* directly access internet services (e.g. NTP)
|
|
*
|
|
* Relevant goals are:
|
|
* - transparency: sockets need to behave as if guest applications were running
|
|
* directly on the host. This is achieved by:
|
|
* - avoiding port and address translations whenever possible
|
|
* - mirroring TCP dynamics by observation of socket parameters (TCP_INFO
|
|
* socket option) and TCP headers of packets coming from the tap interface,
|
|
* reapplying those parameters in both flow directions (including TCP_MSS,
|
|
* TCP_WINDOW_CLAMP socket options)
|
|
* - simplicity: only a small subset of TCP logic is implemented here and
|
|
* delegated as much as possible to the TCP implementations of guest and host
|
|
* kernel. This is achieved by:
|
|
* - avoiding a complete TCP stack reimplementation, with a modified TCP state
|
|
* machine focused on the translation of observed events instead
|
|
* - mirroring TCP dynamics as described above and hence avoiding the need for
|
|
* segmentation, explicit queueing, and reassembly of segments
|
|
* - security:
|
|
* - no dynamic memory allocation is performed
|
|
* - TODO: synflood protection
|
|
*
|
|
* Portability is limited by usage of Linux-specific socket options.
|
|
*
|
|
*
|
|
* Limits
|
|
* ------
|
|
*
|
|
* To avoid the need for dynamic memory allocation, a maximum, reasonable amount
|
|
* of connections is defined by MAX_TAP_CONNS below (currently 128k).
|
|
*
|
|
* Data needs to linger on sockets as long as it's not acknowledged by the
|
|
* guest, and is read using MSG_PEEK into preallocated static buffers sized
|
|
* to the maximum supported window, 64MiB ("discard" buffer, for already-sent
|
|
* data) plus a number of maximum-MSS-sized buffers. This imposes a practical
|
|
* limitation on window scaling, that is, the maximum factor is 1024. Larger
|
|
* factors will be accepted, but resulting, larger values are never advertised
|
|
* to the other side, and not used while queueing data.
|
|
*
|
|
*
|
|
* Ports
|
|
* -----
|
|
*
|
|
* To avoid the need for ad-hoc configuration of port forwarding or allowed
|
|
* ports, listening sockets can be opened and bound to all unbound ports on the
|
|
* host, as far as process capabilities allow. This service needs to be started
|
|
* after any application proxy that needs to bind to local ports. Mapped ports
|
|
* can also be configured explicitly.
|
|
*
|
|
* No port translation is needed for connections initiated remotely or by the
|
|
* local host: source port from socket is reused while establishing connections
|
|
* to the guest.
|
|
*
|
|
* For connections initiated by the guest, it's not possible to force the same
|
|
* source port as connections are established by the host kernel: that's the
|
|
* only port translation needed.
|
|
*
|
|
*
|
|
* Connection tracking and storage
|
|
* -------------------------------
|
|
*
|
|
* Connections are tracked by the @tc array of struct tcp_conn, containing
|
|
* addresses, ports, TCP states and parameters. This is statically allocated and
|
|
* indexed by an arbitrary connection number. The array is compacted whenever a
|
|
* connection is closed, by remapping the highest connection index in use to the
|
|
* one freed up.
|
|
*
|
|
* References used for the epoll interface report the connection index used for
|
|
* the @tc array.
|
|
*
|
|
* IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for
|
|
* separate data structures depending on the protocol version.
|
|
*
|
|
* - Inbound connection requests (to the guest) are mapped using the triple
|
|
* < source IP address, source port, destination port >
|
|
* - Outbound connection requests (from the guest) are mapped using the triple
|
|
* < destination IP address, destination port, source port >
|
|
* where the source port is the one used by the guest, not the one used by the
|
|
* corresponding host socket
|
|
*
|
|
*
|
|
* Initialisation
|
|
* --------------
|
|
*
|
|
* Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for
|
|
* IPv4 and IPv6) can be opened and bound to wildcard addresses. Some will fail
|
|
* to bind (for low ports, or ports already bound, e.g. by a proxy). These are
|
|
* added to the epoll list, with no separate storage.
|
|
*
|
|
*
|
|
* Events and states
|
|
* -----------------
|
|
*
|
|
* Instead of tracking connection states using a state machine, connection
|
|
* events are used to determine state and actions for a given connection. This
|
|
* makes the implementation simpler as most of the relevant tasks deal with
|
|
* reactions to events, rather than state-associated actions. For user
|
|
* convenience, approximate states are mapped in logs from events by
|
|
* @tcp_state_str.
|
|
*
|
|
* The events are:
|
|
*
|
|
* - SOCK_ACCEPTED connection accepted from socket, SYN sent to tap/guest
|
|
*
|
|
* - TAP_SYN_RCVD tap/guest initiated connection, SYN received
|
|
*
|
|
* - TAP_SYN_ACK_SENT SYN, ACK sent to tap/guest, valid for TAP_SYN_RCVD only
|
|
*
|
|
* - ESTABLISHED connection established, the following events are valid:
|
|
*
|
|
* - SOCK_FIN_RCVD FIN (EPOLLRDHUP) received from socket
|
|
*
|
|
* - SOCK_FIN_SENT FIN (write shutdown) sent to socket
|
|
*
|
|
* - TAP_FIN_RCVD FIN received from tap/guest
|
|
*
|
|
* - TAP_FIN_SENT FIN sent to tap/guest
|
|
*
|
|
* - TAP_FIN_ACKED ACK to FIN seen from tap/guest
|
|
*
|
|
* Setting any event in CONN_STATE_BITS (SOCK_ACCEPTED, TAP_SYN_RCVD,
|
|
* ESTABLISHED) clears all the other events, as those represent the fundamental
|
|
* connection states. No events (events == CLOSED) means the connection is
|
|
* closed.
|
|
*
|
|
* Connection setup
|
|
* ----------------
|
|
*
|
|
* - inbound connection (from socket to guest): on accept() from listening
|
|
* socket, the new socket is mapped in connection tracking table, and
|
|
* three-way handshake initiated towards the guest, advertising MSS and window
|
|
* size and scaling from socket parameters
|
|
* - outbound connection (from guest to socket): on SYN segment from guest, a
|
|
* new socket is created and mapped in connection tracking table, setting
|
|
* MSS and window clamping from header and option of the observed SYN segment
|
|
*
|
|
*
|
|
* Aging and timeout
|
|
* -----------------
|
|
*
|
|
* Open connections are checked periodically against a number of timeouts. Those
|
|
* are:
|
|
*
|
|
* - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake within
|
|
* this time, reset the connection
|
|
*
|
|
* - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
|
|
* either side, the connection is reset
|
|
*
|
|
* - ACK_INTERVAL, or zero-sized window advertised to tap/guest: forcibly check
|
|
* if an ACK segment can be sent
|
|
*
|
|
* - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
|
|
* data, re-send data from the socket and reset sequence to what was
|
|
* acknowledged. If this persists for longer than LAST_ACK_TIMEOUT, reset the
|
|
* connection
|
|
*
|
|
* - FIN_TIMEOUT, on TAP_FIN_SENT: if no ACK is received for the FIN segment
|
|
* within this time, the connection is reset
|
|
*
|
|
* - FIN_TIMEOUT, on SOCK_FIN_SENT: if no activity is detected on the socket
|
|
* after sending a FIN segment (write shutdown), reset the connection
|
|
*
|
|
* - LAST_ACK_TIMEOUT on SOCK_FIN_SENT *and* SOCK_FIN_RCVD: reset the connection
|
|
* if no activity was detected on any of the two sides after sending a FIN
|
|
* segment
|
|
*
|
|
*
|
|
* Summary of data flows (with ESTABLISHED event)
|
|
* ----------------------------------------------
|
|
*
|
|
* @seq_to_tap: next sequence for packets to tap/guest
|
|
* @seq_ack_from_tap: last ACK number received from tap/guest
|
|
* @seq_from_tap: next sequence for packets from tap/guest (expected)
|
|
* @seq_ack_to_tap: last ACK number sent to tap/guest
|
|
*
|
|
* @seq_init_from_tap: initial sequence number from tap/guest
|
|
* @seq_init_to_tap: initial sequence number to tap/guest
|
|
*
|
|
* @wnd_from_tap: last window size received from tap, scaled
|
|
* @wnd_to_tap: last window size advertised to tap, scaled
|
|
*
|
|
* - from socket to tap/guest:
|
|
* - on new data from socket:
|
|
* - peek into buffer
|
|
* - send data to tap/guest:
|
|
* - starting at offset (@seq_to_tap - @seq_ack_from_tap)
|
|
* - in MSS-sized segments
|
|
* - increasing @seq_to_tap at each segment
|
|
* - up to window (until @seq_to_tap - @seq_ack_from_tap <= @wnd_from_tap)
|
|
* - on read error, send RST to tap/guest, close socket
|
|
* - on zero read, send FIN to tap/guest, set TAP_FIN_SENT
|
|
* - on ACK from tap/guest:
|
|
* - set @ts_ack_from_tap
|
|
* - check if it's the second duplicated ACK
|
|
* - consume buffer by difference between new ack_seq and @seq_ack_from_tap
|
|
* - update @seq_ack_from_tap from ack_seq in header
|
|
* - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and
|
|
* resend with steps listed above
|
|
* - set TCP_WINDOW_CLAMP from TCP header from tap
|
|
* - periodically:
|
|
* - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer
|
|
* (TODO: implement requirements from RFC 6298, currently 3s fixed) from
|
|
* @ts_ack_from_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and
|
|
* resend data with the steps listed above
|
|
*
|
|
* - from tap/guest to socket:
|
|
* - on packet from tap/guest:
|
|
* - set @ts_tap_act
|
|
* - set TCP_WINDOW_CLAMP from TCP header from tap
|
|
* - check seq from header against @seq_from_tap, if data is missing, send
|
|
* two ACKs with number @seq_ack_to_tap, discard packet
|
|
* - otherwise queue data to socket, set @seq_from_tap to seq from header
|
|
* plus payload length
|
|
* - in ESTABLISHED state, send ACK to tap as soon as we queue to the
|
|
* socket. In other states, query socket for TCP_INFO, set
|
|
* @seq_ack_to_tap to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
|
|
* send ACK to tap/guest
|
|
*
|
|
*
|
|
* PASTA mode
|
|
* ==========
|
|
*
|
|
* For traffic directed to TCP ports configured for mapping to the tuntap device
|
|
* in the namespace, and for non-local traffic coming from the tuntap device,
|
|
* the implementation is identical to the PASST mode described in the previous
|
|
* section.
|
|
*
|
|
* For local traffic directed to TCP ports configured for direct mapping between
|
|
* namespaces, see the implementation in tcp_splice.c.
|
|
*/
|
|
|
|
#include <sched.h>
|
|
#include <fcntl.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <errno.h>
|
|
#include <limits.h>
|
|
#include <net/ethernet.h>
|
|
#include <net/if.h>
|
|
#include <netinet/in.h>
|
|
#include <netinet/ip.h>
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
#include <sys/epoll.h>
|
|
#ifdef HAS_GETRANDOM
|
|
#include <sys/random.h>
|
|
#endif
|
|
#include <sys/socket.h>
|
|
#include <sys/types.h>
|
|
#include <sys/uio.h>
|
|
#include <unistd.h>
|
|
#include <time.h>
|
|
|
|
#include <linux/tcp.h> /* For struct tcp_info */
|
|
|
|
#include "checksum.h"
|
|
#include "util.h"
|
|
#include "passt.h"
|
|
#include "tap.h"
|
|
#include "siphash.h"
|
|
#include "pcap.h"
|
|
#include "conf.h"
|
|
#include "tcp_splice.h"
|
|
|
|
/* Size of the statically allocated connection table */
#define MAX_TAP_CONNS			(128 * 1024)

/* Number of pre-allocated frame buffers per buffer type */
#define TCP_FRAMES_MEM			256
/* Frames usable per batch; relies on a context pointer named 'c' in scope */
#define TCP_FRAMES							\
	(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)

#define TCP_HASH_TABLE_LOAD		70		/* % */
#define TCP_HASH_TABLE_SIZE		(MAX_TAP_CONNS * 100 /	\
					 TCP_HASH_TABLE_LOAD)

/* Largest window scaling shift used, and the window size resulting from it */
#define MAX_WS				10
#define MAX_WINDOW			(1 << (16 + (MAX_WS)))
#define MSS_DEFAULT			536
#define MSS4	(USHRT_MAX - sizeof(uint32_t) - sizeof(struct ethhdr) -	\
		 sizeof(struct iphdr) - sizeof(struct tcphdr))
#define MSS6	(USHRT_MAX - sizeof(uint32_t) - sizeof(struct ethhdr) -	\
		 sizeof(struct ipv6hdr) - sizeof(struct tcphdr))

#define WINDOW_DEFAULT			14600		/* RFC 6928 */
#ifdef HAS_SND_WND
/* Argument is parenthesised so that any pointer expression can be passed */
# define KERNEL_REPORTS_SND_WND(c)	((c)->tcp.kernel_snd_wnd)
#else
# define KERNEL_REPORTS_SND_WND(c)	(0 && (c))
#endif

#define SYN_TIMEOUT			240000		/* ms */
#define ACK_TIMEOUT			2000
#define ACK_INTERVAL			50
#define ACT_TIMEOUT			7200000
#define FIN_TIMEOUT			240000
#define LAST_ACK_TIMEOUT		240000

#define TCP_SOCK_POOL_TSH		16 /* Refill in ns if > x used */
#define REFILL_INTERVAL			1000

#define PORT_DETECT_INTERVAL		1000

#define LOW_RTT_TABLE_SIZE		8
#define LOW_RTT_THRESHOLD		10 /* us */

/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
 * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
 */
#define SOL_TCP				IPPROTO_TCP
|
|
|
|
/* Sequence number comparisons, valid for distances below MAX_WINDOW: the
 * difference is meant to be taken in unsigned arithmetic, so that it wraps.
 * The "greater" forms are the "less" forms with swapped operands.
 */
#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
#define SEQ_GE(a, b)			SEQ_LE((b), (a))
#define SEQ_GT(a, b)			SEQ_LT((b), (a))
|
|
|
|
/* TCP header flags, as bit masks */
#define FIN		0x01
#define SYN		0x02
#define RST		0x04
#define ACK		0x10
/* Flags for internal usage */
#define DUP_ACK		0x20
#define ACK_IF_NEEDED	0		/* See tcp_send_flag() */

/* TCP option kinds, and lengths for the options we build */
#define OPT_EOL		0
#define OPT_NOP		1
#define OPT_MSS		2
#define OPT_MSS_LEN	4
#define OPT_WS		3
#define OPT_WS_LEN	3
#define OPT_SACKP	4
#define OPT_SACK	5
#define OPT_TS		8
|
|
|
|
struct tcp_conn;

/**
 * struct tcp_conn - Descriptor for a TCP connection (not spliced)
 * @next:		Pointer to next item in hash chain, if any
 * @sock:		Socket descriptor number
 * @hash_bucket:	Bucket index in connection lookup hash table
 * @a.a6:		IPv6 remote address, can be IPv4-mapped
 * @a.a4.zero:		Zero prefix for IPv4-mapped, see RFC 6890, Table 20
 * @a.a4.one:		Ones prefix for IPv4-mapped
 * @a.a4.a:		IPv4 address
 * @tap_port:		Guest-facing tap port
 * @sock_port:		Remote, socket-facing port
 * @events:		Connection events, implying connection states
 * @flags:		Connection flags representing internal attributes
 * @tap_mss:		Maximum segment size advertised by guest
 * @seq_to_tap:		Next sequence for packets to tap
 * @seq_ack_from_tap:	Last ACK number received from tap
 * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
 * @seq_ack_to_tap:	Last ACK number sent to tap
 * @seq_dup_ack:	Last duplicate ACK number sent to tap
 * @seq_init_from_tap:	Initial sequence number from tap
 * @seq_init_to_tap:	Initial sequence number to tap
 * @ws_tap:		Window scaling factor from tap
 * @ws:			Window scaling factor
 * @wnd_from_tap:	Last window size received from tap, scaled
 * @wnd_to_tap:		Socket-side sending window, advertised to tap
 * @snd_buf:		Socket sending buffer reported by kernel, in bytes
 * @ts_sock_act:	Last activity timestamp from socket for timeout purposes
 * @ts_tap_act:		Last activity timestamp from tap for timeout purposes
 * @ts_ack_from_tap:	Last ACK segment timestamp from tap
 * @ts_ack_to_tap:	Last ACK segment timestamp to tap
 * @tap_data_noack:	Last unacked data to tap, set to { 0, 0 } on ACK
 */
struct tcp_conn {
	struct tcp_conn *next;
	int sock;
	int hash_bucket;

	union {
		struct in6_addr a6;
		struct {
			uint8_t zero[10];
			uint8_t one[2];
			struct in_addr a;
		} a4;
	} a;
/* The argument is parenthesised, so any pointer expression can be passed */
#define CONN_V4(conn)		IN6_IS_ADDR_V4MAPPED(&(conn)->a.a6)
#define CONN_V6(conn)		(!CONN_V4(conn))

	in_port_t tap_port;
	in_port_t sock_port;

	uint8_t events;
#define CLOSED			0
#define SOCK_ACCEPTED		BIT(0)	/* implies SYN sent to tap */
#define TAP_SYN_RCVD		BIT(1)	/* implies socket connecting */
/* TAP_SYN_ACK_SENT deliberately shares bit 3 with SOCK_FIN_RCVD: the former is
 * documented as valid for TAP_SYN_RCVD only, the latter with ESTABLISHED
 */
#define  TAP_SYN_ACK_SENT	BIT(3)	/* implies socket connected */
#define ESTABLISHED		BIT(2)
#define  SOCK_FIN_RCVD		BIT(3)
#define  SOCK_FIN_SENT		BIT(4)
#define  TAP_FIN_RCVD		BIT(5)
#define  TAP_FIN_SENT		BIT(6)
#define  TAP_FIN_ACKED		BIT(7)

#define	CONN_STATE_BITS		/* Setting these clears other flags */	\
	(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)

	uint8_t flags;
#define CONN_STALLED		BIT(0)
#define CONN_LOCAL		BIT(1)
#define CONN_WND_CLAMPED	BIT(2)
#define CONN_IN_EPOLL		BIT(3)
#define CONN_ACTIVE_CLOSE	BIT(4)

	uint16_t tap_mss;

	uint32_t seq_to_tap;
	uint32_t seq_ack_from_tap;
	uint32_t seq_from_tap;
	uint32_t seq_ack_to_tap;
	uint32_t seq_dup_ack;
	uint32_t seq_init_from_tap;
	uint32_t seq_init_to_tap;

	uint16_t ws_tap;
	uint16_t ws;

	uint32_t wnd_from_tap;
	uint32_t wnd_to_tap;

	int snd_buf;

	struct timespec ts_sock_act;
	struct timespec ts_tap_act;
	struct timespec ts_ack_from_tap;
	struct timespec ts_ack_to_tap;
	struct timespec tap_data_noack;
};
|
|
|
|
/* Connection state predicates. 'conn' is a pointer expression, parenthesised
 * on expansion so that arguments such as 'base + i' work as expected.
 */
#define CONN_IS_CLOSED(conn)	((conn)->events == CLOSED)
#define CONN_IS_CLOSING(conn)						\
	(((conn)->events & ESTABLISHED) &&				\
	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
/* True if all the events in 'set' are set for the connection */
#define CONN_HAS(conn, set)	(((conn)->events & (set)) == (set))

/* Connection pointer from index in the tc array */
#define CONN(index)		(tc + (index))
|
|
|
|
/* Event names for logging; conn_event_do() indexes this by the bit number of
 * the event, plus one for events following ESTABLISHED
 */
static const char *tcp_event_str[] __attribute((__unused__)) = {
	[0] = "SOCK_ACCEPTED",
	[1] = "TAP_SYN_RCVD",
	[2] = "ESTABLISHED",
	[3] = "TAP_SYN_ACK_SENT",
	[4] = "SOCK_FIN_RCVD",
	[5] = "SOCK_FIN_SENT",
	[6] = "TAP_FIN_RCVD",
	[7] = "TAP_FIN_SENT",
	[8] = "TAP_FIN_ACKED",
};
|
|
|
|
/* Approximate TCP state names for logging, looked up by conn_event_do() */
static const char *tcp_state_str[] __attribute((__unused__)) = {
	[0]  = "SYN_RCVD",
	[1]  = "SYN_SENT",
	[2]  = "ESTABLISHED",
	[3]  = "SYN_RCVD",	/* approximately maps to TAP_SYN_ACK_SENT */

	/* Passive close: */
	[4]  = "CLOSE_WAIT",
	[5]  = "CLOSE_WAIT",
	[6]  = "LAST_ACK",
	[7]  = "LAST_ACK",
	[8]  = "LAST_ACK",

	/* Active close (+5): */
	[9]  = "CLOSING",
	[10] = "FIN_WAIT_1",
	[11] = "FIN_WAIT_1",
	[12] = "FIN_WAIT_2",
	[13] = "TIME_WAIT",
};
|
|
|
|
/* Connection flag names for logging, indexed by bit number of CONN_* flags */
static const char *tcp_flag_str[] __attribute((__unused__)) = {
	[0] = "STALLED",
	[1] = "LOCAL",
	[2] = "WND_CLAMPED",
	[3] = "IN_EPOLL",
	[4] = "ACTIVE_CLOSE",
};
|
|
|
|
/* Port re-mappings as delta, indexed by original destination port.
 *
 * Ports are 16-bit values, so there are USHRT_MAX + 1 of them: size the tables
 * accordingly, otherwise port 65535 indexes one element past the end.
 */
static in_port_t tcp_port_delta_to_tap	[USHRT_MAX + 1];
static in_port_t tcp_port_delta_to_init	[USHRT_MAX + 1];
|
|
|
|
/* Listening sockets, used for automatic port forwarding in pasta mode only */
|
|
static int tcp_sock_init_lo [USHRT_MAX][IP_VERSIONS];
|
|
static int tcp_sock_init_ext [USHRT_MAX][IP_VERSIONS];
|
|
static int tcp_sock_ns [USHRT_MAX][IP_VERSIONS];
|
|
|
|
/* Table of destinations with very low RTT (assumed to be local), LRU */
|
|
static struct in6_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; /* zero-initialised; tcp_rtt_dst_check() treats an unspecified address as a free slot */
|
|
|
|
/* Static buffers */
|
|
|
|
/**
|
|
* tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
|
|
* @psum: Partial IP header checksum (excluding tot_len and saddr)
|
|
* @tsum: Partial TCP header checksum (excluding length and saddr)
|
|
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
|
|
* @vnet_len: 4-byte qemu vnet buffer length descriptor, only for passt mode
|
|
* @eh: Pre-filled Ethernet header
|
|
* @iph: Pre-filled IP header (except for tot_len and saddr)
|
|
* @uh: Headroom for TCP header
|
|
* @data: Storage for TCP payload
|
|
*/
|
|
static struct tcp4_l2_buf_t {
|
|
uint32_t psum; /* 0 */
|
|
uint32_t tsum; /* 4 */
|
|
#ifdef __AVX2__
|
|
uint8_t pad[18]; /* 8, align th to 32 bytes */
|
|
#else
|
|
uint8_t pad[2]; /* align iph to 4 bytes 8 */
|
|
#endif
|
|
uint32_t vnet_len; /* 26 10 */
|
|
struct ethhdr eh; /* 30 14 */
|
|
struct iphdr iph; /* 44 28 */
|
|
struct tcphdr th; /* 64 48 */
|
|
uint8_t data[MSS4]; /* 84 68 */
|
|
/* 65541 65525 */
|
|
#ifdef __AVX2__
|
|
} __attribute__ ((packed, aligned(32)))
|
|
#else
|
|
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
|
|
#endif
|
|
tcp4_l2_buf[TCP_FRAMES_MEM];
|
|
|
|
static unsigned int tcp4_l2_buf_used;
|
|
static size_t tcp4_l2_buf_bytes;
|
|
|
|
/**
|
|
* tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
|
|
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
|
|
* @vnet_len: 4-byte qemu vnet buffer length descriptor, only for passt mode
|
|
* @eh: Pre-filled Ethernet header
|
|
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
|
|
* @th: Headroom for TCP header
|
|
* @data: Storage for TCP payload
|
|
*/
|
|
struct tcp6_l2_buf_t {
|
|
#ifdef __AVX2__
|
|
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
|
|
#else
|
|
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
|
|
#endif
|
|
uint32_t vnet_len; /* 14 2 */
|
|
struct ethhdr eh; /* 18 6 */
|
|
struct ipv6hdr ip6h; /* 32 20 */
|
|
struct tcphdr th; /* 72 60 */
|
|
uint8_t data[MSS6]; /* 92 80 */
|
|
/* 65639 65627 */
|
|
#ifdef __AVX2__
|
|
} __attribute__ ((packed, aligned(32)))
|
|
#else
|
|
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
|
|
#endif
|
|
tcp6_l2_buf[TCP_FRAMES_MEM];
|
|
|
|
static unsigned int tcp6_l2_buf_used;
|
|
static size_t tcp6_l2_buf_bytes;
|
|
|
|
/* recvmsg()/sendmsg() data for tap */
|
|
static char tcp_buf_discard [MAX_WINDOW];
|
|
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
|
|
|
|
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM];
|
|
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM];
|
|
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM];
|
|
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM];
|
|
|
|
static struct mmsghdr tcp_l2_mh [TCP_FRAMES_MEM];
|
|
|
|
/* sendmsg() to socket */
|
|
static struct iovec tcp_iov [UIO_MAXIOV];
|
|
|
|
/**
|
|
* tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
|
|
* @psum: Partial IP header checksum (excluding tot_len and saddr)
|
|
* @tsum: Partial TCP header checksum (excluding length and saddr)
|
|
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
|
|
* @vnet_len: 4-byte qemu vnet buffer length descriptor, only for passt mode
|
|
* @eh: Pre-filled Ethernet header
|
|
* @iph: Pre-filled IP header (except for tot_len and saddr)
|
|
* @th: Headroom for TCP header
|
|
* @opts: Headroom for TCP options
|
|
*/
|
|
static struct tcp4_l2_flags_buf_t {
|
|
uint32_t psum; /* 0 */
|
|
uint32_t tsum; /* 4 */
|
|
#ifdef __AVX2__
|
|
uint8_t pad[18]; /* 8, align th to 32 bytes */
|
|
#else
|
|
uint8_t pad[2]; /* align iph to 4 bytes 8 */
|
|
#endif
|
|
uint32_t vnet_len; /* 26 10 */
|
|
struct ethhdr eh; /* 30 14 */
|
|
struct iphdr iph; /* 44 28 */
|
|
struct tcphdr th; /* 64 48 */
|
|
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
|
|
#ifdef __AVX2__
|
|
} __attribute__ ((packed, aligned(32)))
|
|
#else
|
|
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
|
|
#endif
|
|
tcp4_l2_flags_buf[TCP_FRAMES_MEM];
|
|
|
|
static unsigned int tcp4_l2_flags_buf_used;
|
|
static size_t tcp4_l2_flags_buf_bytes;
|
|
|
|
/**
|
|
* tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
|
|
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
|
|
* @vnet_len: 4-byte qemu vnet buffer length descriptor, only for passt mode
|
|
* @eh: Pre-filled Ethernet header
|
|
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
|
|
* @th: Headroom for TCP header
|
|
* @opts: Headroom for TCP options
|
|
*/
|
|
static struct tcp6_l2_flags_buf_t {
|
|
#ifdef __AVX2__
|
|
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
|
|
#else
|
|
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
|
|
#endif
|
|
uint32_t vnet_len; /* 14 2 */
|
|
struct ethhdr eh; /* 18 6 */
|
|
struct ipv6hdr ip6h; /* 32 20 */
|
|
struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */
|
|
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
|
|
#ifdef __AVX2__
|
|
} __attribute__ ((packed, aligned(32)))
|
|
#else
|
|
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
|
|
#endif
|
|
tcp6_l2_flags_buf[TCP_FRAMES_MEM];
|
|
|
|
static unsigned int tcp6_l2_flags_buf_used;
|
|
static size_t tcp6_l2_flags_buf_bytes;
|
|
|
|
/* TCP connections */
|
|
static struct tcp_conn tc[MAX_TAP_CONNS];
|
|
|
|
/* Table for lookup from remote address, local port, remote port */
|
|
static struct tcp_conn *tc_hash[TCP_HASH_TABLE_SIZE];
|
|
|
|
/* Pools for pre-opened sockets */
|
|
int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
|
|
int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
|
|
int ns_sock_pool4 [TCP_SOCK_POOL_SIZE];
|
|
int ns_sock_pool6 [TCP_SOCK_POOL_SIZE];
|
|
|
|
/**
|
|
* tcp_conn_epoll_events() - epoll events mask for given connection state
|
|
* @events: Current connection events
|
|
* @conn_flags Connection flags
|
|
*
|
|
* Return: epoll events mask corresponding to implied connection state
|
|
*/
|
|
static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
|
|
{
|
|
if (!events)
|
|
return 0;
|
|
|
|
if (events & ESTABLISHED) {
|
|
if (events & TAP_FIN_SENT)
|
|
return EPOLLET;
|
|
|
|
if (conn_flags & CONN_STALLED)
|
|
return EPOLLIN | EPOLLRDHUP | EPOLLET;
|
|
|
|
return EPOLLIN | EPOLLRDHUP;
|
|
}
|
|
|
|
if (events == TAP_SYN_RCVD)
|
|
return EPOLLOUT | EPOLLET | EPOLLRDHUP;
|
|
|
|
return EPOLLRDHUP;
|
|
}
|
|
|
|
/* Defined further down; declared here for the conn_flag() wrapper */
static void conn_flag_do(struct ctx *c, struct tcp_conn *conn,
			 unsigned long flag);

/* conn_flag() - Trace the calling function and line, then call conn_flag_do()
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flag:	Flag to set, or ~flag to unset
 */
#define conn_flag(c, conn, flag)					\
	do {								\
		trace("TCP: flag at %s:%i", __func__, __LINE__);	\
		conn_flag_do(c, conn, flag);				\
	} while (0)
|
|
|
|
/**
|
|
* tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
*
|
|
* Return: 0 on success, negative error code on failure (not on deletion)
|
|
*/
|
|
static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
|
|
{
|
|
int m = (conn->flags & CONN_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
|
|
union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
|
|
.r.p.tcp.tcp.index = conn - tc,
|
|
.r.p.tcp.tcp.v6 = CONN_V6(conn) };
|
|
struct epoll_event ev = { .data.u64 = ref.u64 };
|
|
|
|
if (CONN_IS_CLOSED(conn)) {
|
|
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
|
|
return 0;
|
|
}
|
|
|
|
ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
|
|
|
|
if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
|
|
return -errno;
|
|
|
|
conn->flags |= CONN_IN_EPOLL; /* No need to log this */
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* conn_flag_do() - Set/unset given flag, log, update epoll on CONN_STALLED
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @flag: Flag to set, or ~flag to unset
|
|
*/
|
|
static void conn_flag_do(struct ctx *c, struct tcp_conn *conn,
|
|
unsigned long flag)
|
|
{
|
|
if (flag & (flag - 1)) {
|
|
if (!(conn->flags & ~flag))
|
|
return;
|
|
|
|
conn->flags &= flag;
|
|
debug("TCP: index %i: %s dropped", (conn) - tc,
|
|
tcp_flag_str[fls(~flag)]);
|
|
} else {
|
|
if (conn->flags & flag)
|
|
return;
|
|
|
|
conn->flags |= flag;
|
|
debug("TCP: index %i: %s", (conn) - tc,
|
|
tcp_flag_str[fls(flag)]);
|
|
}
|
|
|
|
if (flag == CONN_STALLED || flag == ~CONN_STALLED)
|
|
tcp_epoll_ctl(c, conn);
|
|
}
|
|
|
|
/**
|
|
* conn_event_do() - Set and log connection events, update epoll state
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @event: Connection event
|
|
*/
|
|
static void conn_event_do(struct ctx *c, struct tcp_conn *conn,
|
|
unsigned long event)
|
|
{
|
|
int prev, new, num = fls(event);
|
|
|
|
if (conn->events & event)
|
|
return;
|
|
|
|
prev = fls(conn->events);
|
|
if (conn->flags & CONN_ACTIVE_CLOSE)
|
|
prev += 5;
|
|
|
|
if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED))
|
|
prev++; /* i.e. SOCK_FIN_RCVD, not TAP_SYN_ACK_SENT */
|
|
|
|
if (event == CLOSED || (event & CONN_STATE_BITS))
|
|
conn->events = event;
|
|
else
|
|
conn->events |= event;
|
|
|
|
if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
|
|
conn_flag(c, conn, CONN_ACTIVE_CLOSE);
|
|
else
|
|
tcp_epoll_ctl(c, conn);
|
|
|
|
new = fls(conn->events);
|
|
|
|
if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) {
|
|
num++;
|
|
new++;
|
|
}
|
|
if (conn->flags & CONN_ACTIVE_CLOSE)
|
|
new += 5;
|
|
|
|
if (prev != new) {
|
|
debug("TCP: index %i, %s: %s -> %s", (conn) - tc,
|
|
num == -1 ? "CLOSED" : tcp_event_str[num],
|
|
prev == -1 ? "CLOSED" : tcp_state_str[prev],
|
|
(new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]);
|
|
} else {
|
|
debug("TCP: index %i, %s", (conn) - tc,
|
|
num == -1 ? "CLOSED" : tcp_event_str[num]);
|
|
}
|
|
}
|
|
|
|
#define conn_event(c, conn, event) \
|
|
do { \
|
|
trace("TCP: event at %s:%i", __func__, __LINE__); \
|
|
conn_event_do(c, conn, event); \
|
|
} while (0)
|
|
|
|
/**
|
|
* tcp_remap_to_tap() - Set delta for port translation toward guest/tap
|
|
* @port: Original destination port, host order
|
|
* @delta: Delta to be added to original destination port
|
|
*/
|
|
void tcp_remap_to_tap(in_port_t port, in_port_t delta)
|
|
{
|
|
tcp_port_delta_to_tap[port] = delta;
|
|
}
|
|
|
|
/**
|
|
* tcp_remap_to_tap() - Set delta for port translation toward init namespace
|
|
* @port: Original destination port, host order
|
|
* @delta: Delta to be added to original destination port
|
|
*/
|
|
void tcp_remap_to_init(in_port_t port, in_port_t delta)
|
|
{
|
|
tcp_port_delta_to_init[port] = delta;
|
|
}
|
|
|
|
/**
|
|
* tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
|
|
* @conn: Connection pointer
|
|
*
|
|
* Return: 1 if destination is in low RTT table, 0 otherwise
|
|
*/
|
|
static int tcp_rtt_dst_low(struct tcp_conn *conn)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
|
|
if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i))
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
 * tcp_rtt_dst_check() - Check tcpi_min_rtt, insert endpoint in table if low
 * @conn:	Connection pointer
 * @tinfo:	Pointer to struct tcp_info for socket
 */
static void tcp_rtt_dst_check(struct tcp_conn *conn, struct tcp_info *tinfo)
{
#ifdef HAS_MIN_RTT
	int i, hole = -1;

	/* Compare as unsigned: a cast to int would turn values above INT_MAX
	 * into negative ones, which would then pass as "low".
	 * NOTE(review): the kernel appears to report ~0U if no RTT sample is
	 * available yet -- confirm.
	 */
	if (!tinfo->tcpi_min_rtt ||
	    tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
		return;

	/* Nothing to do if the address is there already, otherwise remember
	 * the first free (unspecified address) slot
	 */
	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
		if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i))
			return;
		if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
			hole = i;
	}

	/* Not expected to happen, because the second memcpy() below always
	 * leaves a free slot behind, but never write before the table start
	 */
	if (hole == -1)
		return;

	/* Fill the free slot, then free up the next one, wrapping around: the
	 * oldest entry is evicted once the table is full
	 */
	memcpy(low_rtt_dst + hole++, &conn->a.a6, sizeof(conn->a.a6));
	if (hole == LOW_RTT_TABLE_SIZE)
		hole = 0;
	memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->a.a6));
#else
	(void)conn;
	(void)tinfo;
#endif /* HAS_MIN_RTT */
}
|
|
|
|
/**
|
|
* tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
|
|
* @conn: Connection pointer
|
|
*/
|
|
static void tcp_get_sndbuf(struct tcp_conn *conn)
|
|
{
|
|
int s = conn->sock, sndbuf;
|
|
socklen_t sl;
|
|
uint64_t v;
|
|
|
|
sl = sizeof(sndbuf);
|
|
if (getsockopt(s, SOL_SOCKET, SO_SNDBUF, &sndbuf, &sl)) {
|
|
conn->snd_buf = WINDOW_DEFAULT;
|
|
return;
|
|
}
|
|
|
|
v = sndbuf;
|
|
if (v >= SNDBUF_BIG)
|
|
v /= 2;
|
|
else if (v > SNDBUF_SMALL)
|
|
v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
|
|
|
|
conn->snd_buf = MIN(INT_MAX, v);
|
|
}
|
|
|
|
/**
|
|
* tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values
|
|
* @s: Socket, can be -1 to avoid check in the caller
|
|
*/
|
|
void tcp_sock_set_bufsize(struct ctx *c, int s)
|
|
{
|
|
int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */
|
|
|
|
if (s == -1)
|
|
return;
|
|
|
|
if (!c->low_rmem)
|
|
setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v));
|
|
|
|
if (!c->low_wmem)
|
|
setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v));
|
|
}
|
|
|
|
/**
|
|
* tcp_update_check_ip4() - Update IPv4 with variable parts from stored one
|
|
* @buf: L2 packet buffer with final IPv4 header
|
|
*/
|
|
static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf)
|
|
{
|
|
uint32_t sum = buf->psum;
|
|
|
|
sum += buf->iph.tot_len;
|
|
sum += (buf->iph.saddr >> 16) & 0xffff;
|
|
sum += buf->iph.saddr & 0xffff;
|
|
|
|
buf->iph.check = (uint16_t)~csum_fold(sum);
|
|
}
|
|
|
|
/**
|
|
* tcp_update_check_tcp4() - Update TCP checksum from stored one
|
|
* @buf: L2 packet buffer with final IPv4 header
|
|
*/
|
|
static void tcp_update_check_tcp4(struct tcp4_l2_buf_t *buf)
|
|
{
|
|
uint16_t tlen = ntohs(buf->iph.tot_len) - 20;
|
|
uint32_t sum = buf->tsum;
|
|
|
|
sum += (buf->iph.saddr >> 16) & 0xffff;
|
|
sum += buf->iph.saddr & 0xffff;
|
|
sum += htons(ntohs(buf->iph.tot_len) - 20);
|
|
|
|
buf->th.check = 0;
|
|
buf->th.check = csum(&buf->th, tlen, sum);
|
|
}
|
|
|
|
/**
|
|
* tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
|
|
* @buf: L2 packet buffer with final IPv6 header
|
|
*/
|
|
static void tcp_update_check_tcp6(struct tcp6_l2_buf_t *buf)
|
|
{
|
|
int len = ntohs(buf->ip6h.payload_len) + sizeof(struct ipv6hdr);
|
|
|
|
buf->ip6h.hop_limit = IPPROTO_TCP;
|
|
buf->ip6h.version = 0;
|
|
buf->ip6h.nexthdr = 0;
|
|
|
|
buf->th.check = 0;
|
|
buf->th.check = csum(&buf->ip6h, len, 0);
|
|
|
|
buf->ip6h.hop_limit = 255;
|
|
buf->ip6h.version = 6;
|
|
buf->ip6h.nexthdr = IPPROTO_TCP;
|
|
}
|
|
|
|
/**
|
|
* tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
|
|
* @eth_d: Ethernet destination address, NULL if unchanged
|
|
* @eth_s: Ethernet source address, NULL if unchanged
|
|
* @ip_da: Pointer to IPv4 destination address, NULL if unchanged
|
|
*/
|
|
void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
|
|
const uint32_t *ip_da)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
|
struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
|
|
struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
|
|
struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
|
|
struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
|
|
|
|
if (eth_d) {
|
|
memcpy(b4->eh.h_dest, eth_d, ETH_ALEN);
|
|
memcpy(b6->eh.h_dest, eth_d, ETH_ALEN);
|
|
|
|
memcpy(b4f->eh.h_dest, eth_d, ETH_ALEN);
|
|
memcpy(b6f->eh.h_dest, eth_d, ETH_ALEN);
|
|
}
|
|
|
|
if (eth_s) {
|
|
memcpy(b4->eh.h_source, eth_s, ETH_ALEN);
|
|
memcpy(b6->eh.h_source, eth_s, ETH_ALEN);
|
|
|
|
memcpy(b4f->eh.h_source, eth_s, ETH_ALEN);
|
|
memcpy(b6f->eh.h_source, eth_s, ETH_ALEN);
|
|
}
|
|
|
|
if (ip_da) {
|
|
b4f->iph.daddr = b4->iph.daddr = *ip_da;
|
|
if (!i) {
|
|
b4f->iph.saddr = b4->iph.saddr = 0;
|
|
b4f->iph.tot_len = b4->iph.tot_len = 0;
|
|
b4f->iph.check = b4->iph.check = 0;
|
|
b4f->psum = b4->psum = sum_16b(&b4->iph, 20);
|
|
|
|
b4->tsum = ((*ip_da >> 16) & 0xffff) +
|
|
(*ip_da & 0xffff) +
|
|
htons(IPPROTO_TCP);
|
|
b4f->tsum = b4->tsum;
|
|
} else {
|
|
b4f->psum = b4->psum = tcp4_l2_buf[0].psum;
|
|
b4f->tsum = b4->tsum = tcp4_l2_buf[0].tsum;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
|
|
*/
|
|
static void tcp_sock4_iov_init(void)
|
|
{
|
|
struct iovec *iov;
|
|
int i;
|
|
|
|
for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
|
|
tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { 0, 0,
|
|
{ 0 },
|
|
0, L2_BUF_ETH_IP4_INIT, L2_BUF_IP4_INIT(IPPROTO_TCP),
|
|
{ .doff = sizeof(struct tcphdr) / 4, .ack = 1 }, { 0 },
|
|
};
|
|
}
|
|
|
|
for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
|
|
tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { 0, 0,
|
|
{ 0 },
|
|
0, L2_BUF_ETH_IP4_INIT, L2_BUF_IP4_INIT(IPPROTO_TCP),
|
|
{ 0 }, { 0 },
|
|
};
|
|
}
|
|
|
|
for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) {
|
|
iov->iov_base = &tcp4_l2_buf[i].vnet_len;
|
|
iov->iov_len = MSS_DEFAULT;
|
|
}
|
|
|
|
for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
|
|
iov->iov_base = &tcp4_l2_flags_buf[i].vnet_len;
|
|
}
|
|
|
|
/**
|
|
* tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
|
|
*/
|
|
static void tcp_sock6_iov_init(void)
|
|
{
|
|
struct iovec *iov;
|
|
int i;
|
|
|
|
for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
|
|
tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
|
|
{ 0 },
|
|
0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_TCP),
|
|
{ .doff = sizeof(struct tcphdr) / 4, .ack = 1 }, { 0 },
|
|
};
|
|
}
|
|
|
|
for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
|
|
tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
|
|
{ 0 },
|
|
0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_TCP),
|
|
{ 0 }, { 0 },
|
|
};
|
|
}
|
|
|
|
for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) {
|
|
iov->iov_base = &tcp6_l2_buf[i].vnet_len;
|
|
iov->iov_len = MSS_DEFAULT;
|
|
}
|
|
|
|
for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
|
|
iov->iov_base = &tcp6_l2_flags_buf[i].vnet_len;
|
|
}
|
|
|
|
/**
|
|
* tcp_opt_get() - Get option, and value if any, from TCP header
|
|
* @th: Pointer to TCP header
|
|
* @len: Length of buffer, including TCP header
|
|
* @type_find: Option type to look for
|
|
* @optlen_set: Optional, filled with option length if passed
|
|
* @value_set: Optional, set to start of option value if passed
|
|
*
|
|
* Return: option value, meaningful for up to 4 bytes, -1 if not found
|
|
*/
|
|
static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find,
|
|
uint8_t *optlen_set, char **value_set)
|
|
{
|
|
uint8_t type, optlen;
|
|
char *p;
|
|
|
|
if (len > (size_t)th->doff * 4)
|
|
len = (size_t)th->doff * 4;
|
|
|
|
len -= sizeof(*th);
|
|
p = (char *)(th + 1);
|
|
|
|
for (; len >= 2; p += optlen, len -= optlen) {
|
|
switch (*p) {
|
|
case OPT_EOL:
|
|
return -1;
|
|
case OPT_NOP:
|
|
optlen = 1;
|
|
break;
|
|
default:
|
|
type = *(p++);
|
|
optlen = *(p++) - 2;
|
|
len -= 2;
|
|
|
|
if (type != type_find)
|
|
break;
|
|
|
|
if (optlen_set)
|
|
*optlen_set = optlen;
|
|
if (value_set)
|
|
*value_set = p;
|
|
|
|
switch (optlen) {
|
|
case 0:
|
|
return 0;
|
|
case 1:
|
|
return *p;
|
|
case 2:
|
|
return ntohs(*(uint16_t *)p);
|
|
default:
|
|
return ntohl(*(uint32_t *)p);
|
|
}
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/**
|
|
* tcp_hash_match() - Check if a connection entry matches address and ports
|
|
* @conn: Connection entry to match against
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
* @addr: Remote address, pointer to sin_addr or sin6_addr
|
|
* @tap_port: tap-facing port
|
|
* @sock_port: Socket-facing port
|
|
*
|
|
* Return: 1 on match, 0 otherwise
|
|
*/
|
|
static int tcp_hash_match(struct tcp_conn *conn, int af, void *addr,
|
|
in_port_t tap_port, in_port_t sock_port)
|
|
{
|
|
if (af == AF_INET && CONN_V4(conn) &&
|
|
!memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)) &&
|
|
conn->tap_port == tap_port && conn->sock_port == sock_port)
|
|
return 1;
|
|
|
|
if (af == AF_INET6 &&
|
|
IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) &&
|
|
conn->tap_port == tap_port && conn->sock_port == sock_port)
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_hash() - Calculate hash value for connection given address and ports
|
|
* @c: Execution context
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
* @addr: Remote address, pointer to sin_addr or sin6_addr
|
|
* @tap_port: tap-facing port
|
|
* @sock_port: Socket-facing port
|
|
*
|
|
* Return: hash value, already modulo size of the hash table
|
|
*/
|
|
#if TCP_HASH_NOINLINE
|
|
__attribute__((__noinline__)) /* See comment in Makefile */
|
|
#endif
|
|
static unsigned int tcp_hash(struct ctx *c, int af, void *addr,
|
|
in_port_t tap_port, in_port_t sock_port)
|
|
{
|
|
uint64_t b = 0;
|
|
|
|
if (af == AF_INET) {
|
|
struct {
|
|
struct in_addr addr;
|
|
in_port_t tap_port;
|
|
in_port_t sock_port;
|
|
} __attribute__((__packed__)) in = {
|
|
*(struct in_addr *)addr, tap_port, sock_port,
|
|
};
|
|
|
|
b = siphash_8b((uint8_t *)&in, c->tcp.hash_secret);
|
|
} else if (af == AF_INET6) {
|
|
struct {
|
|
struct in6_addr addr;
|
|
in_port_t tap_port;
|
|
in_port_t sock_port;
|
|
} __attribute__((__packed__)) in = {
|
|
*(struct in6_addr *)addr, tap_port, sock_port,
|
|
};
|
|
|
|
b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret);
|
|
}
|
|
|
|
return (unsigned int)(b % TCP_HASH_TABLE_SIZE);
|
|
}
|
|
|
|
/**
|
|
* tcp_hash_insert() - Insert connection into hash table, chain link
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
* @addr: Remote address, pointer to sin_addr or sin6_addr
|
|
*/
|
|
static void tcp_hash_insert(struct ctx *c, struct tcp_conn *conn,
|
|
int af, void *addr)
|
|
{
|
|
int b;
|
|
|
|
b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port);
|
|
conn->next = tc_hash[b];
|
|
tc_hash[b] = conn;
|
|
conn->hash_bucket = b;
|
|
|
|
debug("TCP: hash table insert: index %i, sock %i, bucket: %i, next: %p",
|
|
conn - tc, conn->sock, b, conn->next);
|
|
}
|
|
|
|
/**
|
|
* tcp_hash_remove() - Drop connection from hash table, chain unlink
|
|
* @conn: Connection pointer
|
|
*/
|
|
static void tcp_hash_remove(struct tcp_conn *conn)
|
|
{
|
|
struct tcp_conn *entry, *prev = NULL;
|
|
int b = conn->hash_bucket;
|
|
|
|
for (entry = tc_hash[b]; entry; prev = entry, entry = entry->next) {
|
|
if (entry == conn) {
|
|
if (prev)
|
|
prev->next = conn->next;
|
|
else
|
|
tc_hash[b] = conn->next;
|
|
break;
|
|
}
|
|
}
|
|
|
|
debug("TCP: hash table remove: index %i, sock %i, bucket: %i, new: %p",
|
|
conn - tc, conn->sock, b, prev ? prev->next : tc_hash[b]);
|
|
}
|
|
|
|
/**
|
|
* tcp_hash_update() - Update pointer for given connection
|
|
* @old: Old connection pointer
|
|
* @new: New connection pointer
|
|
*/
|
|
static void tcp_hash_update(struct tcp_conn *old, struct tcp_conn *new)
|
|
{
|
|
struct tcp_conn *entry, *prev = NULL;
|
|
int b = old->hash_bucket;
|
|
|
|
for (entry = tc_hash[b]; entry; prev = entry, entry = entry->next) {
|
|
if (entry == old) {
|
|
if (prev)
|
|
prev->next = new;
|
|
else
|
|
tc_hash[b] = new;
|
|
break;
|
|
}
|
|
}
|
|
|
|
debug("TCP: hash table update: old index %i, new index %i, sock %i, "
|
|
"bucket: %i, old: %p, new: %p",
|
|
old - tc, new - tc, new->sock, b, old, new);
|
|
}
|
|
|
|
/**
|
|
* tcp_hash_lookup() - Look up connection given remote address and ports
|
|
* @c: Execution context
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
* @addr: Remote address, pointer to sin_addr or sin6_addr
|
|
* @tap_port: tap-facing port
|
|
* @sock_port: Socket-facing port
|
|
*
|
|
* Return: connection pointer, if found, -ENOENT otherwise
|
|
*/
|
|
static struct tcp_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr,
|
|
in_port_t tap_port, in_port_t sock_port)
|
|
{
|
|
int b = tcp_hash(c, af, addr, tap_port, sock_port);
|
|
struct tcp_conn *conn;
|
|
|
|
for (conn = tc_hash[b]; conn; conn = conn->next) {
|
|
if (tcp_hash_match(conn, af, addr, tap_port, sock_port))
|
|
return conn;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* tcp_table_compact() - Perform compaction on connection table
|
|
* @c: Execution context
|
|
* @hole: Pointer to recently closed connection
|
|
*/
|
|
static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
|
|
{
|
|
struct tcp_conn *from, *to;
|
|
|
|
if ((hole - tc) == --c->tcp.conn_count) {
|
|
debug("TCP: hash table compaction: index %i (%p) was max index",
|
|
hole - tc, hole);
|
|
return;
|
|
}
|
|
|
|
from = CONN(c->tcp.conn_count);
|
|
memcpy(hole, from, sizeof(*hole));
|
|
from->flags = from->events = 0;
|
|
|
|
to = hole;
|
|
tcp_hash_update(from, to);
|
|
|
|
tcp_epoll_ctl(c, to);
|
|
|
|
debug("TCP: hash table compaction: old index %i, new index %i, "
|
|
"sock %i, from: %p, to: %p",
|
|
from - tc, to - tc, from->sock, from, to);
|
|
}
|
|
|
|
/**
|
|
* tcp_conn_destroy() - Close connection, drop from epoll file descriptor
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
*/
|
|
static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn)
|
|
{
|
|
if (CONN_IS_CLOSED(conn))
|
|
return;
|
|
|
|
conn_event(c, conn, CLOSED);
|
|
conn->flags = 0;
|
|
close(conn->sock);
|
|
|
|
/* Removal from hash table and connection table compaction deferred to
|
|
* timer.
|
|
*/
|
|
}
|
|
|
|
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn);

/**
 * tcp_rst() - Log where a reset was requested, then reset the connection
 * @c:		Execution context
 * @conn:	Connection pointer, evaluated twice: pass expressions without
 *		side effects only
 */
#define tcp_rst(c, conn)						\
	do {								\
		debug("TCP: index %i, reset at %s:%i",			\
		      (int)((conn) - tc), __func__, __LINE__);		\
		tcp_rst_do((c), (conn));				\
	} while (0)
|
|
|
|
/**
|
|
* tcp_l2_buf_write_one() - Write a single buffer to tap file descriptor
|
|
* @c: Execution context
|
|
* @iov: struct iovec item pointing to buffer
|
|
* @ts: Current timestamp
|
|
*
|
|
* Return: 0 on success, negative error code on failure (tap reset possible)
|
|
*/
|
|
static int tcp_l2_buf_write_one(struct ctx *c, struct iovec *iov,
|
|
struct timespec *ts)
|
|
{
|
|
if (write(c->fd_tap, (char *)iov->iov_base + 4, iov->iov_len - 4) < 0) {
|
|
debug("tap write: %s", strerror(errno));
|
|
if (errno != EAGAIN && errno != EWOULDBLOCK)
|
|
tap_handler(c, c->fd_tap, EPOLLERR, ts);
|
|
return -errno;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_l2_buf_flush_part() - Ensure a complete last message on partial sendmsg()
|
|
* @c: Execution context
|
|
* @mh: Message header that was partially sent by sendmsg()
|
|
* @sent: Bytes already sent
|
|
*/
|
|
static void tcp_l2_buf_flush_part(struct ctx *c, struct msghdr *mh, size_t sent)
|
|
{
|
|
size_t end = 0, missing;
|
|
struct iovec *iov;
|
|
unsigned int i;
|
|
char *p;
|
|
|
|
for (i = 0, iov = mh->msg_iov; i < mh->msg_iovlen; i++, iov++) {
|
|
end += iov->iov_len;
|
|
if (end >= sent)
|
|
break;
|
|
}
|
|
|
|
missing = end - sent;
|
|
p = (char *)iov->iov_base + iov->iov_len - missing;
|
|
send(c->fd_tap, p, missing, MSG_NOSIGNAL);
|
|
}
|
|
|
|
/**
|
|
* tcp_l2_flags_buf_flush() - Send out buffers for segments with or without data
|
|
* @c: Execution context
|
|
* @mh: Message header pointing to buffers, msg_iovlen not set
|
|
* @buf_used: Pointer to count of used buffers, set to 0 on return
|
|
* @buf_bytes: Pointer to count of buffer bytes, set to 0 on return
|
|
* @ts: Current timestamp
|
|
*/
|
|
static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh,
|
|
unsigned int *buf_used, size_t *buf_bytes,
|
|
struct timespec *ts)
|
|
{
|
|
if (!(mh->msg_iovlen = *buf_used))
|
|
return;
|
|
|
|
if (c->mode == MODE_PASST) {
|
|
size_t n = sendmsg(c->fd_tap, mh, MSG_NOSIGNAL | MSG_DONTWAIT);
|
|
if (n > 0 && n < *buf_bytes)
|
|
tcp_l2_buf_flush_part(c, mh, n);
|
|
} else {
|
|
size_t i;
|
|
|
|
for (i = 0; i < mh->msg_iovlen; i++) {
|
|
struct iovec *iov = &mh->msg_iov[i];
|
|
|
|
if (tcp_l2_buf_write_one(c, iov, ts))
|
|
i--;
|
|
}
|
|
}
|
|
*buf_used = *buf_bytes = 0;
|
|
pcapm(mh);
|
|
}
|
|
|
|
/**
|
|
* tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags)
|
|
* @c: Execution context
|
|
* @ts: Current timestamp (not packet timestamp)
|
|
*/
|
|
static void tcp_l2_flags_buf_flush(struct ctx *c, struct timespec *ts)
|
|
{
|
|
struct msghdr mh = { 0 };
|
|
unsigned int *buf_used;
|
|
size_t *buf_bytes;
|
|
|
|
mh.msg_iov = tcp6_l2_flags_iov;
|
|
buf_used = &tcp6_l2_flags_buf_used;
|
|
buf_bytes = &tcp6_l2_flags_buf_bytes;
|
|
tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts);
|
|
|
|
mh.msg_iov = tcp4_l2_flags_iov;
|
|
buf_used = &tcp4_l2_flags_buf_used;
|
|
buf_bytes = &tcp4_l2_flags_buf_bytes;
|
|
tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts);
|
|
}
|
|
|
|
/**
|
|
* tcp_l2_data_buf_flush() - Send out buffers for segments with data
|
|
* @c: Execution context
|
|
* @ts: Current timestamp (not packet timestamp)
|
|
*/
|
|
static void tcp_l2_data_buf_flush(struct ctx *c, struct timespec *ts)
|
|
{
|
|
struct msghdr mh = { 0 };
|
|
unsigned int *buf_used;
|
|
size_t *buf_bytes;
|
|
|
|
mh.msg_iov = tcp6_l2_iov;
|
|
buf_used = &tcp6_l2_buf_used;
|
|
buf_bytes = &tcp6_l2_buf_bytes;
|
|
tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts);
|
|
|
|
mh.msg_iov = tcp4_l2_iov;
|
|
buf_used = &tcp4_l2_buf_used;
|
|
buf_bytes = &tcp4_l2_buf_bytes;
|
|
tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts);
|
|
}
|
|
|
|
/**
 * tcp_defer_handler() - Handler for TCP deferred tasks
 * @c:		Execution context
 * @now:	Current timestamp
 *
 * Flushes pending buffers to tap: segments without data (flags) go first,
 * segments with data follow.
 */
void tcp_defer_handler(struct ctx *c, struct timespec *now)
{
	/* Flags first, then data */
	tcp_l2_flags_buf_flush(c, now);

	tcp_l2_data_buf_flush(c, now);
}
|
|
|
|
/**
|
|
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @p: Pointer to any type of TCP pre-cooked buffer
|
|
* @plen: Payload length (including TCP header options)
|
|
* @check: Checksum, if already known
|
|
* @seq: Sequence number for this segment
|
|
*
|
|
* Return: 802.3 length, host order
|
|
*/
|
|
static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn,
|
|
void *p, size_t plen,
|
|
const uint16_t *check, uint32_t seq)
|
|
{
|
|
size_t ip_len, eth_len;
|
|
|
|
#define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \
|
|
do { \
|
|
b->th.source = htons(conn->sock_port); \
|
|
b->th.dest = htons(conn->tap_port); \
|
|
b->th.seq = htonl(seq); \
|
|
b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
|
|
\
|
|
/* First value sent by receiver is not scaled */ \
|
|
if (b->th.syn) { \
|
|
b->th.window = htons(MIN(conn->wnd_to_tap, \
|
|
USHRT_MAX)); \
|
|
} else { \
|
|
b->th.window = htons(MIN(conn->wnd_to_tap >> \
|
|
conn->ws, \
|
|
USHRT_MAX)); \
|
|
} \
|
|
} while (0)
|
|
|
|
if (CONN_V6(conn)) {
|
|
struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
|
|
uint32_t flow = conn->seq_init_to_tap;
|
|
|
|
ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
|
|
|
|
b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr));
|
|
b->ip6h.saddr = conn->a.a6;
|
|
if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr))
|
|
b->ip6h.daddr = c->addr6_ll_seen;
|
|
else
|
|
b->ip6h.daddr = c->addr6_seen;
|
|
|
|
memset(b->ip6h.flow_lbl, 0, 3);
|
|
|
|
SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq);
|
|
|
|
tcp_update_check_tcp6(b);
|
|
|
|
b->ip6h.flow_lbl[0] = (flow >> 16) & 0xf;
|
|
b->ip6h.flow_lbl[1] = (flow >> 8) & 0xff;
|
|
b->ip6h.flow_lbl[2] = (flow >> 0) & 0xff;
|
|
|
|
eth_len = ip_len + sizeof(struct ethhdr);
|
|
if (c->mode == MODE_PASST)
|
|
b->vnet_len = htonl(eth_len);
|
|
} else {
|
|
struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
|
|
|
|
ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr);
|
|
b->iph.tot_len = htons(ip_len);
|
|
b->iph.saddr = conn->a.a4.a.s_addr;
|
|
b->iph.daddr = c->addr4_seen;
|
|
|
|
if (check)
|
|
b->iph.check = *check;
|
|
else
|
|
tcp_update_check_ip4(b);
|
|
|
|
SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq);
|
|
|
|
tcp_update_check_tcp4(b);
|
|
|
|
eth_len = ip_len + sizeof(struct ethhdr);
|
|
if (c->mode == MODE_PASST)
|
|
b->vnet_len = htonl(eth_len);
|
|
}
|
|
|
|
#undef SET_TCP_HEADER_COMMON_V4_V6
|
|
|
|
return eth_len;
|
|
}
|
|
|
|
/**
|
|
* tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @force_seq: Force ACK sequence to latest segment, instead of checking socket
|
|
* @tinfo: tcp_info from kernel, can be NULL if not pre-fetched
|
|
*
|
|
* Return: 1 if sequence or window were updated, 0 otherwise
|
|
*/
|
|
static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
|
|
int force_seq, struct tcp_info *tinfo)
|
|
{
|
|
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
|
|
uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
|
|
socklen_t sl = sizeof(*tinfo);
|
|
struct tcp_info tinfo_new;
|
|
int s = conn->sock;
|
|
|
|
#ifndef HAS_BYTES_ACKED
|
|
(void)force_seq;
|
|
|
|
conn->seq_ack_to_tap = conn->seq_from_tap;
|
|
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
|
|
conn->seq_ack_to_tap = prev_ack_to_tap;
|
|
#else
|
|
if ((unsigned long)conn->snd_buf < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
|
|
|| CONN_IS_CLOSING(conn) || conn->flags & CONN_LOCAL || force_seq) {
|
|
conn->seq_ack_to_tap = conn->seq_from_tap;
|
|
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
|
|
if (!tinfo) {
|
|
tinfo = &tinfo_new;
|
|
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
|
|
return 0;
|
|
}
|
|
|
|
conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
|
|
conn->seq_init_from_tap;
|
|
|
|
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
|
|
conn->seq_ack_to_tap = prev_ack_to_tap;
|
|
}
|
|
#endif /* !HAS_BYTES_ACKED */
|
|
|
|
if (!KERNEL_REPORTS_SND_WND(c)) {
|
|
tcp_get_sndbuf(conn);
|
|
conn->wnd_to_tap = MIN(conn->snd_buf, MAX_WINDOW);
|
|
goto out;
|
|
}
|
|
|
|
if (!tinfo) {
|
|
if (conn->wnd_to_tap > WINDOW_DEFAULT)
|
|
goto out;
|
|
|
|
tinfo = &tinfo_new;
|
|
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
|
|
goto out;
|
|
}
|
|
|
|
#ifdef HAS_SND_WND
|
|
if ((conn->flags & CONN_LOCAL) || tcp_rtt_dst_low(conn)) {
|
|
conn->wnd_to_tap = tinfo->tcpi_snd_wnd;
|
|
} else {
|
|
tcp_get_sndbuf(conn);
|
|
conn->wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, conn->snd_buf);
|
|
}
|
|
#endif
|
|
|
|
conn->wnd_to_tap = MIN(conn->wnd_to_tap, MAX_WINDOW);
|
|
|
|
out:
|
|
return conn->wnd_to_tap != prev_wnd_to_tap ||
|
|
conn->seq_ack_to_tap != prev_ack_to_tap;
|
|
}
|
|
|
|
/**
|
|
* tcp_send_flag() - Send segment with flags to tap (no payload)
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @flags: TCP flags: if not set, send segment only if ACK is due
|
|
* @now: Current timestamp
|
|
*
|
|
* Return: negative error code on connection reset, 0 otherwise
|
|
*/
|
|
static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags,
|
|
struct timespec *now)
|
|
{
|
|
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
|
|
uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
|
|
struct tcp4_l2_flags_buf_t *b4 = NULL;
|
|
struct tcp6_l2_flags_buf_t *b6 = NULL;
|
|
struct tcp_info tinfo = { 0 };
|
|
socklen_t sl = sizeof(tinfo);
|
|
size_t optlen = 0, eth_len;
|
|
int s = conn->sock;
|
|
struct iovec *iov;
|
|
struct tcphdr *th;
|
|
char *data;
|
|
void *p;
|
|
|
|
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
|
|
!flags && conn->wnd_to_tap)
|
|
return 0;
|
|
|
|
if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
|
|
tcp_conn_destroy(c, conn);
|
|
return -ECONNRESET;
|
|
}
|
|
|
|
if (!(conn->flags & CONN_LOCAL))
|
|
tcp_rtt_dst_check(conn, &tinfo);
|
|
|
|
if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
|
|
return 0;
|
|
|
|
if (CONN_V4(conn)) {
|
|
iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used;
|
|
p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
|
|
th = &b4->th;
|
|
|
|
/* gcc 11.2 would complain on data = (char *)(th + 1); */
|
|
data = b4->opts;
|
|
} else {
|
|
iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used;
|
|
p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
|
|
th = &b6->th;
|
|
data = b6->opts;
|
|
}
|
|
|
|
if (flags & SYN) {
|
|
int mss;
|
|
|
|
/* Options: MSS, NOP and window scale (8 bytes) */
|
|
optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
|
|
|
|
*data++ = OPT_MSS;
|
|
*data++ = OPT_MSS_LEN;
|
|
|
|
if (c->mtu == -1) {
|
|
mss = tinfo.tcpi_snd_mss;
|
|
} else {
|
|
mss = c->mtu - sizeof(struct tcphdr);
|
|
if (CONN_V4(conn))
|
|
mss -= sizeof(struct iphdr);
|
|
else
|
|
mss -= sizeof(struct ipv6hdr);
|
|
|
|
if (c->low_wmem &&
|
|
!(conn->flags & CONN_LOCAL) &&
|
|
!tcp_rtt_dst_low(conn))
|
|
mss = MIN(mss, PAGE_SIZE);
|
|
else if (mss > PAGE_SIZE)
|
|
mss = ROUND_DOWN(mss, PAGE_SIZE);
|
|
}
|
|
*(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
|
|
|
|
data += OPT_MSS_LEN - 2;
|
|
th->doff += OPT_MSS_LEN / 4;
|
|
|
|
#ifdef HAS_SND_WND
|
|
if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
|
|
c->tcp.kernel_snd_wnd = 1;
|
|
#endif
|
|
|
|
conn->ws = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
|
|
|
|
*data++ = OPT_NOP;
|
|
*data++ = OPT_WS;
|
|
*data++ = OPT_WS_LEN;
|
|
*data++ = conn->ws;
|
|
|
|
th->ack = !!(flags & ACK);
|
|
|
|
conn->wnd_to_tap = WINDOW_DEFAULT;
|
|
} else {
|
|
th->ack = !!(flags & (ACK | DUP_ACK)) ||
|
|
conn->seq_ack_to_tap != prev_ack_to_tap ||
|
|
!prev_wnd_to_tap;
|
|
}
|
|
|
|
th->doff = (sizeof(*th) + optlen) / 4;
|
|
|
|
th->rst = !!(flags & RST);
|
|
th->syn = !!(flags & SYN);
|
|
th->fin = !!(flags & FIN);
|
|
|
|
eth_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
|
|
NULL, conn->seq_to_tap);
|
|
iov->iov_len = eth_len + sizeof(uint32_t);
|
|
|
|
if (CONN_V4(conn))
|
|
tcp4_l2_flags_buf_bytes += iov->iov_len;
|
|
else
|
|
tcp6_l2_flags_buf_bytes += iov->iov_len;
|
|
|
|
if (th->ack && now)
|
|
conn->ts_ack_to_tap = *now;
|
|
|
|
if (th->fin && now)
|
|
conn->tap_data_noack = *now;
|
|
|
|
/* RFC 793, 3.1: "[...] and the first data octet is ISN+1." */
|
|
if (th->fin || th->syn)
|
|
conn->seq_to_tap++;
|
|
|
|
if (CONN_V4(conn)) {
|
|
if (flags & DUP_ACK) {
|
|
memcpy(b4 + 1, b4, sizeof(*b4));
|
|
(iov + 1)->iov_len = iov->iov_len;
|
|
tcp4_l2_flags_buf_used++;
|
|
tcp4_l2_flags_buf_bytes += iov->iov_len;
|
|
}
|
|
|
|
if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
|
|
tcp_l2_flags_buf_flush(c, now);
|
|
} else {
|
|
if (flags & DUP_ACK) {
|
|
memcpy(b6 + 1, b6, sizeof(*b6));
|
|
(iov + 1)->iov_len = iov->iov_len;
|
|
tcp6_l2_flags_buf_used++;
|
|
tcp6_l2_flags_buf_bytes += iov->iov_len;
|
|
}
|
|
|
|
if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
|
|
tcp_l2_flags_buf_flush(c, now);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
*/
|
|
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn)
|
|
{
|
|
if (CONN_IS_CLOSED(conn))
|
|
return;
|
|
|
|
if (!tcp_send_flag(c, conn, RST, NULL))
|
|
tcp_conn_destroy(c, conn);
|
|
}
|
|
|
|
/**
|
|
* tcp_clamp_window() - Set window and scaling from option, clamp on socket
|
|
* @conn: Connection pointer
|
|
* @th: TCP header, from tap, can be NULL if window is passed
|
|
* @len: Buffer length, at L4, can be 0 if no header is passed
|
|
* @window: Window value, host order, unscaled, if no header is passed
|
|
* @init: Set if this is the very first segment from tap
|
|
*/
|
|
static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn,
|
|
struct tcphdr *th, int len, unsigned int window,
|
|
int init)
|
|
{
|
|
if (init && th) {
|
|
int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
|
|
|
|
conn->ws_tap = ws;
|
|
|
|
/* RFC 7323, 2.2: first value is not scaled. Also, don't clamp
|
|
* yet, to avoid getting a zero scale just because we set a
|
|
* small window now.
|
|
*/
|
|
conn->wnd_from_tap = ntohs(th->window);
|
|
} else {
|
|
if (th)
|
|
window = ntohs(th->window) << conn->ws_tap;
|
|
else
|
|
window <<= conn->ws_tap;
|
|
|
|
window = MIN(MAX_WINDOW, window);
|
|
|
|
if (conn->flags & CONN_WND_CLAMPED) {
|
|
if (conn->wnd_from_tap == window)
|
|
return;
|
|
|
|
/* Discard +/- 1% updates to spare some syscalls. */
|
|
if ((window > conn->wnd_from_tap &&
|
|
window * 99 / 100 < conn->wnd_from_tap) ||
|
|
(window < conn->wnd_from_tap &&
|
|
window * 101 / 100 > conn->wnd_from_tap)) {
|
|
conn->wnd_from_tap = window;
|
|
return;
|
|
}
|
|
}
|
|
|
|
conn->wnd_from_tap = window;
|
|
if (window < 256)
|
|
window = 256;
|
|
setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP,
|
|
&window, sizeof(window));
|
|
conn_flag(c, conn, CONN_WND_CLAMPED);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tcp_seq_init() - Calculate initial sequence number according to RFC 6528
|
|
* @c: Execution context
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
* @addr: Remote address, pointer to sin_addr or sin6_addr
|
|
* @dstport: Destination port, connection-wise, network order
|
|
* @srcport: Source port, connection-wise, network order
|
|
* @now: Current timestamp
|
|
*
|
|
* Return: initial TCP sequence
|
|
*/
|
|
static uint32_t tcp_seq_init(struct ctx *c, int af, void *addr,
|
|
in_port_t dstport, in_port_t srcport,
|
|
struct timespec *now)
|
|
{
|
|
uint32_t ns, seq = 0;
|
|
|
|
if (af == AF_INET) {
|
|
struct {
|
|
struct in_addr src;
|
|
in_port_t srcport;
|
|
struct in_addr dst;
|
|
in_port_t dstport;
|
|
} __attribute__((__packed__)) in = {
|
|
.src = *(struct in_addr *)addr,
|
|
.srcport = srcport,
|
|
.dst = { c->addr4 },
|
|
.dstport = dstport,
|
|
};
|
|
|
|
seq = siphash_12b((uint8_t *)&in, c->tcp.hash_secret);
|
|
} else if (af == AF_INET6) {
|
|
struct {
|
|
struct in6_addr src;
|
|
in_port_t srcport;
|
|
struct in6_addr dst;
|
|
in_port_t dstport;
|
|
} __attribute__((__packed__)) in = {
|
|
.src = *(struct in6_addr *)addr,
|
|
.srcport = srcport,
|
|
.dst = c->addr6,
|
|
.dstport = dstport,
|
|
};
|
|
|
|
seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
|
|
}
|
|
|
|
ns = now->tv_sec * 1E9;
|
|
ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */
|
|
|
|
return seq + ns;
|
|
}
|
|
|
|
/**
|
|
* tcp_conn_new_sock() - Get socket for new connection from pool or make new one
|
|
* @c: Execution context
|
|
* @af: Address family
|
|
*
|
|
* Return: socket number if available, negative code if socket creation failed
|
|
*/
|
|
static int tcp_conn_new_sock(struct ctx *c, sa_family_t af)
|
|
{
|
|
int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4, i, s;
|
|
|
|
for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, pool++) {
|
|
if ((s = *pool) >= 0) {
|
|
*pool = -1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (s < 0)
|
|
s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
|
|
|
|
if (s < 0)
|
|
return -errno;
|
|
|
|
tcp_sock_set_bufsize(c, s);
|
|
|
|
return s;
|
|
}
|
|
|
|
/**
|
|
* tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @th: TCP header send by tap/guest
|
|
* @len: L4 packet length, host order
|
|
*
|
|
* Return: clamped MSS value
|
|
*/
|
|
static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
|
|
struct tcphdr *th, size_t len)
|
|
{
|
|
unsigned int mss;
|
|
int ret;
|
|
|
|
if ((ret = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0)
|
|
mss = MSS_DEFAULT;
|
|
else
|
|
mss = ret;
|
|
|
|
/* Don't upset qemu */
|
|
if (c->mode == MODE_PASST) {
|
|
if (CONN_V4(conn))
|
|
mss = MIN(MSS4, mss);
|
|
else
|
|
mss = MIN(MSS6, mss);
|
|
}
|
|
|
|
return MIN(mss, USHRT_MAX);
|
|
}
|
|
|
|
/**
|
|
* tcp_conn_from_tap() - Handle connection request (SYN segment) from tap
|
|
* @c: Execution context
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
* @addr: Remote address, pointer to sin_addr or sin6_addr
|
|
* @th: TCP header from tap
|
|
* @len: Packet length at L4
|
|
* @now: Current timestamp
|
|
*/
|
|
static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
|
|
struct tcphdr *th, size_t len,
|
|
struct timespec *now)
|
|
{
|
|
struct sockaddr_in addr4 = {
|
|
.sin_family = AF_INET,
|
|
.sin_port = th->dest,
|
|
.sin_addr = *(struct in_addr *)addr,
|
|
};
|
|
struct sockaddr_in6 addr6 = {
|
|
.sin6_family = AF_INET6,
|
|
.sin6_port = th->dest,
|
|
.sin6_addr = *(struct in6_addr *)addr,
|
|
};
|
|
const struct sockaddr *sa;
|
|
struct tcp_conn *conn;
|
|
socklen_t sl;
|
|
int s;
|
|
|
|
if (c->tcp.conn_count >= TCP_MAX_CONNS)
|
|
return;
|
|
|
|
if ((s = tcp_conn_new_sock(c, af)) < 0)
|
|
return;
|
|
|
|
if (!c->no_map_gw) {
|
|
if (af == AF_INET && addr4.sin_addr.s_addr == c->gw4)
|
|
addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
|
|
if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(addr, &c->gw6))
|
|
addr6.sin6_addr = in6addr_loopback;
|
|
}
|
|
|
|
if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) {
|
|
struct sockaddr_in6 addr6_ll = {
|
|
.sin6_family = AF_INET6,
|
|
.sin6_addr = c->addr6_ll,
|
|
.sin6_scope_id = c->ifi,
|
|
};
|
|
if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) {
|
|
close(s);
|
|
return;
|
|
}
|
|
}
|
|
|
|
conn = CONN(c->tcp.conn_count++);
|
|
conn->sock = s;
|
|
conn_event(c, conn, TAP_SYN_RCVD);
|
|
|
|
conn->wnd_to_tap = WINDOW_DEFAULT;
|
|
|
|
conn->tap_mss = tcp_conn_tap_mss(c, conn, th, len);
|
|
|
|
sl = sizeof(conn->tap_mss);
|
|
setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->tap_mss, sl);
|
|
|
|
tcp_clamp_window(c, conn, th, len, 0, 1);
|
|
|
|
if (af == AF_INET) {
|
|
sa = (struct sockaddr *)&addr4;
|
|
sl = sizeof(addr4);
|
|
|
|
memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero));
|
|
memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one));
|
|
memcpy(&conn->a.a4.a, addr, sizeof(conn->a.a4.a));
|
|
} else {
|
|
sa = (struct sockaddr *)&addr6;
|
|
sl = sizeof(addr6);
|
|
|
|
memcpy(&conn->a.a6, addr, sizeof(conn->a.a6));
|
|
}
|
|
|
|
conn->sock_port = ntohs(th->dest);
|
|
conn->tap_port = ntohs(th->source);
|
|
|
|
conn->ts_sock_act = conn->ts_tap_act = *now;
|
|
conn->ts_ack_to_tap = conn->ts_ack_from_tap = *now;
|
|
|
|
conn->seq_init_from_tap = ntohl(th->seq);
|
|
conn->seq_from_tap = conn->seq_init_from_tap + 1;
|
|
conn->seq_ack_to_tap = conn->seq_from_tap;
|
|
|
|
conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now);
|
|
conn->seq_init_to_tap = conn->seq_to_tap;
|
|
conn->seq_ack_from_tap = conn->seq_to_tap + 1;
|
|
|
|
tcp_hash_insert(c, conn, af, addr);
|
|
|
|
if (!bind(s, sa, sl))
|
|
tcp_rst(c, conn); /* Nobody is listening then */
|
|
if (errno != EADDRNOTAVAIL)
|
|
conn_flag(c, conn, CONN_LOCAL);
|
|
|
|
if (connect(s, sa, sl)) {
|
|
if (errno != EINPROGRESS) {
|
|
tcp_rst(c, conn);
|
|
return;
|
|
}
|
|
|
|
tcp_get_sndbuf(conn);
|
|
} else {
|
|
tcp_get_sndbuf(conn);
|
|
|
|
if (tcp_send_flag(c, conn, SYN | ACK, now))
|
|
return;
|
|
|
|
conn_event(c, conn, TAP_SYN_ACK_SENT);
|
|
}
|
|
|
|
tcp_epoll_ctl(c, conn);
|
|
}
|
|
|
|
/**
|
|
* tcp_sock_consume() - Consume (discard) data from buffer, update ACK sequence
|
|
* @conn: Connection pointer
|
|
* @ack_seq: ACK sequence, host order
|
|
*
|
|
* Return: 0 on success, negative error code from recv() on failure
|
|
*/
|
|
static int tcp_sock_consume(struct tcp_conn *conn, uint32_t ack_seq)
|
|
{
|
|
/* Simply ignore out-of-order ACKs: we already consumed the data we
|
|
* needed from the buffer, and we won't rewind back to a lower ACK
|
|
* sequence.
|
|
*/
|
|
if (SEQ_LE(ack_seq, conn->seq_ack_from_tap))
|
|
return 0;
|
|
|
|
if (recv(conn->sock, NULL, ack_seq - conn->seq_ack_from_tap,
|
|
MSG_DONTWAIT | MSG_TRUNC) < 0)
|
|
return -errno;
|
|
|
|
conn->seq_ack_from_tap = ack_seq;
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @plen: Payload length at L4
|
|
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
|
|
* @seq: Sequence number to be sent
|
|
* @now: Current timestamp
|
|
*/
|
|
static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, ssize_t plen,
|
|
int no_csum, uint32_t seq, struct timespec *now)
|
|
{
|
|
struct iovec *iov;
|
|
size_t len;
|
|
|
|
if (CONN_V4(conn)) {
|
|
struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
|
|
uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
|
|
|
|
len = tcp_l2_buf_fill_headers(c, conn, b, plen, check, seq);
|
|
|
|
iov = tcp4_l2_iov + tcp4_l2_buf_used++;
|
|
tcp4_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len);
|
|
if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
|
|
tcp_l2_data_buf_flush(c, now);
|
|
} else if (CONN_V6(conn)) {
|
|
struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
|
|
|
|
len = tcp_l2_buf_fill_headers(c, conn, b, plen, NULL, seq);
|
|
|
|
iov = tcp6_l2_iov + tcp6_l2_buf_used++;
|
|
tcp6_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len);
|
|
if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
|
|
tcp_l2_data_buf_flush(c, now);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @now: Current timestamp
|
|
*
|
|
* Return: negative on connection reset, 0 otherwise
|
|
*
|
|
* #syscalls recvmsg
|
|
*/
|
|
static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn,
|
|
struct timespec *now)
|
|
{
|
|
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
|
|
int sendlen, len, plen, v4 = CONN_V4(conn);
|
|
int s = conn->sock, i, ret = 0;
|
|
struct msghdr mh_sock = { 0 };
|
|
uint32_t already_sent;
|
|
struct iovec *iov;
|
|
|
|
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
|
|
|
|
if (SEQ_LT(already_sent, 0)) {
|
|
/* RFC 761, section 2.1. */
|
|
trace("TCP: ACK sequence gap: ACK for %lu, sent: %lu",
|
|
conn->seq_ack_from_tap, conn->seq_to_tap);
|
|
conn->seq_to_tap = conn->seq_ack_from_tap;
|
|
already_sent = 0;
|
|
}
|
|
|
|
if (!conn->wnd_from_tap || already_sent >= conn->wnd_from_tap) {
|
|
conn_flag(c, conn, CONN_STALLED);
|
|
conn->tap_data_noack = *now;
|
|
return 0;
|
|
}
|
|
|
|
/* Set up buffer descriptors we'll fill completely and partially. */
|
|
fill_bufs = DIV_ROUND_UP(conn->wnd_from_tap - already_sent,
|
|
conn->tap_mss);
|
|
if (fill_bufs > TCP_FRAMES) {
|
|
fill_bufs = TCP_FRAMES;
|
|
iov_rem = 0;
|
|
} else {
|
|
iov_rem = (conn->wnd_from_tap - already_sent) % conn->tap_mss;
|
|
}
|
|
|
|
mh_sock.msg_iov = iov_sock;
|
|
mh_sock.msg_iovlen = fill_bufs + 1;
|
|
|
|
iov_sock[0].iov_base = tcp_buf_discard;
|
|
iov_sock[0].iov_len = already_sent;
|
|
|
|
if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
|
|
(!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf)))
|
|
tcp_l2_data_buf_flush(c, now);
|
|
|
|
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
|
|
if (v4)
|
|
iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
|
|
else
|
|
iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
|
|
iov->iov_len = conn->tap_mss;
|
|
}
|
|
if (iov_rem)
|
|
iov_sock[fill_bufs].iov_len = iov_rem;
|
|
|
|
/* Receive into buffers, don't dequeue until acknowledged by guest. */
|
|
recvmsg:
|
|
len = recvmsg(s, &mh_sock, MSG_PEEK);
|
|
if (len < 0) {
|
|
if (errno == EINTR)
|
|
goto recvmsg;
|
|
goto err;
|
|
}
|
|
|
|
if (!len)
|
|
goto zero_len;
|
|
|
|
sendlen = len - already_sent;
|
|
if (sendlen <= 0) {
|
|
conn_flag(c, conn, CONN_STALLED);
|
|
return 0;
|
|
}
|
|
|
|
conn_flag(c, conn, ~CONN_STALLED);
|
|
|
|
send_bufs = DIV_ROUND_UP(sendlen, conn->tap_mss);
|
|
last_len = sendlen - (send_bufs - 1) * conn->tap_mss;
|
|
|
|
/* Likely, some new data was acked too. */
|
|
tcp_update_seqack_wnd(c, conn, 0, NULL);
|
|
|
|
/* Finally, queue to tap */
|
|
plen = conn->tap_mss;
|
|
for (i = 0; i < send_bufs; i++) {
|
|
int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
|
|
|
|
if (i == send_bufs - 1)
|
|
plen = last_len;
|
|
|
|
tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap, now);
|
|
conn->seq_to_tap += plen;
|
|
}
|
|
|
|
conn->tap_data_noack = conn->ts_ack_to_tap = *now;
|
|
|
|
return 0;
|
|
|
|
err:
|
|
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
|
ret = -errno;
|
|
tcp_rst(c, conn);
|
|
}
|
|
|
|
return ret;
|
|
|
|
zero_len:
|
|
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
|
|
if ((ret = tcp_send_flag(c, conn, FIN | ACK, now))) {
|
|
tcp_rst(c, conn);
|
|
return ret;
|
|
}
|
|
|
|
conn_event(c, conn, TAP_FIN_SENT);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_data_from_tap() - tap data for established connection
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @msg: Array of messages from tap
|
|
* @count: Count of messages
|
|
* @now: Current timestamp
|
|
*
|
|
* #syscalls sendmsg
|
|
*/
|
|
static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
|
|
struct tap_l4_msg *msg, int count,
|
|
struct timespec *now)
|
|
{
|
|
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1;
|
|
uint32_t max_ack_seq = conn->seq_ack_from_tap;
|
|
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
|
|
uint32_t seq_from_tap = conn->seq_from_tap;
|
|
struct msghdr mh = { .msg_iov = tcp_iov };
|
|
int partial_send = 0;
|
|
uint16_t len;
|
|
ssize_t n;
|
|
|
|
for (i = 0, iov_i = 0; i < count; i++) {
|
|
uint32_t seq, seq_offset, ack_seq;
|
|
struct tcphdr *th;
|
|
char *data;
|
|
size_t off;
|
|
|
|
th = (struct tcphdr *)(pkt_buf + msg[i].pkt_buf_offset);
|
|
len = msg[i].l4_len;
|
|
|
|
if (len < sizeof(*th)) {
|
|
tcp_rst(c, conn);
|
|
return;
|
|
}
|
|
|
|
off = (size_t)th->doff * 4;
|
|
if (off < sizeof(*th) || off > len) {
|
|
tcp_rst(c, conn);
|
|
return;
|
|
}
|
|
|
|
if (th->rst) {
|
|
tcp_conn_destroy(c, conn);
|
|
return;
|
|
}
|
|
|
|
len -= off;
|
|
data = (char *)th + off;
|
|
|
|
seq = ntohl(th->seq);
|
|
ack_seq = ntohl(th->ack_seq);
|
|
|
|
if (th->ack) {
|
|
ack = 1;
|
|
|
|
if (SEQ_GE(ack_seq, conn->seq_ack_from_tap) &&
|
|
SEQ_GE(ack_seq, max_ack_seq)) {
|
|
/* Fast re-transmit */
|
|
retr = !len && !th->fin &&
|
|
ack_seq == max_ack_seq &&
|
|
ntohs(th->window) == max_ack_seq_wnd;
|
|
|
|
max_ack_seq_wnd = ntohs(th->window);
|
|
max_ack_seq = ack_seq;
|
|
}
|
|
}
|
|
|
|
if (th->fin)
|
|
fin = 1;
|
|
|
|
if (!len)
|
|
continue;
|
|
|
|
seq_offset = seq_from_tap - seq;
|
|
/* Use data from this buffer only in these two cases:
|
|
*
|
|
* , seq_from_tap , seq_from_tap
|
|
* |--------| <-- len |--------| <-- len
|
|
* '----' <-- offset ' <-- offset
|
|
* ^ seq ^ seq
|
|
* (offset >= 0, seq + len > seq_from_tap)
|
|
*
|
|
* discard in these two cases:
|
|
* , seq_from_tap , seq_from_tap
|
|
* |--------| <-- len |--------| <-- len
|
|
* '--------' <-- offset '-----| <- offset
|
|
* ^ seq ^ seq
|
|
* (offset >= 0, seq + len <= seq_from_tap)
|
|
*
|
|
* keep, look for another buffer, then go back, in this case:
|
|
* , seq_from_tap
|
|
* |--------| <-- len
|
|
* '===' <-- offset
|
|
* ^ seq
|
|
* (offset < 0)
|
|
*/
|
|
if (SEQ_GE(seq_offset, 0) && SEQ_LE(seq + len, seq_from_tap))
|
|
continue;
|
|
|
|
if (SEQ_LT(seq_offset, 0)) {
|
|
if (keep == -1)
|
|
keep = i;
|
|
continue;
|
|
}
|
|
|
|
tcp_iov[iov_i].iov_base = data + seq_offset;
|
|
tcp_iov[iov_i].iov_len = len - seq_offset;
|
|
seq_from_tap += tcp_iov[iov_i].iov_len;
|
|
iov_i++;
|
|
|
|
if (keep == i)
|
|
keep = -1;
|
|
|
|
if (keep != -1)
|
|
i = keep - 1;
|
|
}
|
|
|
|
tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0);
|
|
|
|
if (ack) {
|
|
conn->ts_ack_from_tap = *now;
|
|
if (max_ack_seq == conn->seq_to_tap)
|
|
conn->tap_data_noack = ((struct timespec) { 0, 0 });
|
|
tcp_sock_consume(conn, max_ack_seq);
|
|
}
|
|
|
|
if (retr) {
|
|
trace("TCP: fast re-transmit, ACK: %lu, previous sequence: %lu",
|
|
max_ack_seq, conn->seq_to_tap);
|
|
conn->seq_ack_from_tap = max_ack_seq;
|
|
conn->seq_to_tap = max_ack_seq;
|
|
tcp_data_from_sock(c, conn, now);
|
|
}
|
|
|
|
if (!iov_i)
|
|
goto out;
|
|
|
|
mh.msg_iovlen = iov_i;
|
|
eintr:
|
|
n = sendmsg(conn->sock, &mh, MSG_DONTWAIT | MSG_NOSIGNAL);
|
|
if (n < 0) {
|
|
if (errno == EPIPE) {
|
|
/* Here's the wrap, said the tap.
|
|
* In my pocket, said the socket.
|
|
* Then swiftly looked away and left.
|
|
*/
|
|
conn->seq_from_tap = seq_from_tap;
|
|
tcp_send_flag(c, conn, ACK, now);
|
|
}
|
|
|
|
if (errno == EINTR)
|
|
goto eintr;
|
|
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK) {
|
|
tcp_send_flag(c, conn, ACK_IF_NEEDED, now);
|
|
return;
|
|
}
|
|
tcp_rst(c, conn);
|
|
return;
|
|
}
|
|
|
|
if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
|
|
partial_send = 1;
|
|
conn->seq_from_tap += n;
|
|
tcp_send_flag(c, conn, ACK_IF_NEEDED, now);
|
|
} else {
|
|
conn->seq_from_tap += n;
|
|
}
|
|
|
|
out:
|
|
if (keep != -1) {
|
|
if (conn->seq_dup_ack != conn->seq_from_tap) {
|
|
conn->seq_dup_ack = conn->seq_from_tap;
|
|
tcp_send_flag(c, conn, DUP_ACK, now);
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (ack && conn->events & TAP_FIN_SENT &&
|
|
conn->seq_ack_from_tap == conn->seq_to_tap)
|
|
conn_event(c, conn, TAP_FIN_ACKED);
|
|
|
|
if (fin && !partial_send) {
|
|
conn->seq_from_tap++;
|
|
|
|
conn_event(c, conn, TAP_FIN_RCVD);
|
|
} else {
|
|
tcp_send_flag(c, conn, ACK_IF_NEEDED, now);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tcp_conn_from_sock_finish() - Complete connection setup after connect()
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @th: TCP header of SYN, ACK segment from tap/guest
|
|
* @len: Packet length of SYN, ACK segment at L4, host order
|
|
* @now: Current timestamp
|
|
*/
|
|
static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
|
|
struct tcphdr *th, size_t len,
|
|
struct timespec *now)
|
|
{
|
|
tcp_clamp_window(c, conn, th, len, 0, 1);
|
|
conn->tap_mss = tcp_conn_tap_mss(c, conn, th, len);
|
|
|
|
conn->seq_init_from_tap = ntohl(th->seq) + 1;
|
|
conn->seq_from_tap = conn->seq_init_from_tap;
|
|
conn->seq_ack_to_tap = conn->seq_from_tap;
|
|
|
|
conn_event(c, conn, ESTABLISHED);
|
|
|
|
/* The client might have sent data already, which we didn't
|
|
* dequeue waiting for SYN,ACK from tap -- check now.
|
|
*/
|
|
tcp_data_from_sock(c, conn, now);
|
|
tcp_send_flag(c, conn, ACK_IF_NEEDED, now);
|
|
}
|
|
|
|
/**
|
|
* tcp_tap_handler() - Handle packets from tap and state transitions
|
|
* @c: Execution context
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
* @addr: Destination address
|
|
* @msg: Input messages
|
|
* @count: Message count
|
|
* @now: Current timestamp
|
|
*
|
|
* Return: count of consumed packets
|
|
*/
|
|
int tcp_tap_handler(struct ctx *c, int af, void *addr,
|
|
struct tap_l4_msg *msg, int count, struct timespec *now)
|
|
{
|
|
struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset);
|
|
uint16_t len = msg[0].l4_len;
|
|
struct tcp_conn *conn;
|
|
|
|
conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
|
|
|
|
/* New connection from tap */
|
|
if (!conn) {
|
|
if (th->syn && !th->ack)
|
|
tcp_conn_from_tap(c, af, addr, th, len, now);
|
|
return 1;
|
|
}
|
|
|
|
if (th->rst) {
|
|
tcp_conn_destroy(c, conn);
|
|
return count;
|
|
}
|
|
|
|
conn->ts_tap_act = *now;
|
|
conn_flag(c, conn, ~CONN_STALLED);
|
|
|
|
/* Establishing connection from socket */
|
|
if (conn->events & SOCK_ACCEPTED) {
|
|
if (th->syn && th->ack && !th->fin)
|
|
tcp_conn_from_sock_finish(c, conn, th, len, now);
|
|
else
|
|
tcp_rst(c, conn);
|
|
|
|
return 1;
|
|
}
|
|
|
|
/* Establishing connection from tap */
|
|
if (conn->events & TAP_SYN_RCVD) {
|
|
if (!(conn->events & TAP_SYN_ACK_SENT)) {
|
|
tcp_rst(c, conn);
|
|
return count;
|
|
}
|
|
|
|
conn_event(c, conn, ESTABLISHED);
|
|
|
|
if (th->fin) {
|
|
conn->seq_from_tap++;
|
|
|
|
shutdown(conn->sock, SHUT_WR);
|
|
tcp_send_flag(c, conn, ACK, now);
|
|
conn_event(c, conn, SOCK_FIN_SENT);
|
|
|
|
return count;
|
|
}
|
|
|
|
if (!th->ack) {
|
|
tcp_rst(c, conn);
|
|
return count;
|
|
}
|
|
|
|
tcp_clamp_window(c, conn, th, len, 0, 0);
|
|
|
|
if (count == 1)
|
|
return 1;
|
|
}
|
|
|
|
/* Established connections not accepting data from tap */
|
|
if (conn->events & TAP_FIN_RCVD) {
|
|
if (th->ack) {
|
|
conn->tap_data_noack = ((struct timespec) { 0, 0 });
|
|
conn->ts_ack_from_tap = *now;
|
|
}
|
|
|
|
if (conn->events & SOCK_FIN_RCVD &&
|
|
conn->seq_ack_from_tap == conn->seq_to_tap)
|
|
tcp_conn_destroy(c, conn);
|
|
|
|
return 1;
|
|
}
|
|
|
|
/* Established connections accepting data from tap */
|
|
tcp_data_from_tap(c, conn, msg, count, now);
|
|
|
|
if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
|
|
shutdown(conn->sock, SHUT_WR);
|
|
conn_event(c, conn, SOCK_FIN_SENT);
|
|
tcp_send_flag(c, conn, ACK, now);
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
/**
|
|
* tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @now: Current timestamp
|
|
*/
|
|
static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn,
|
|
struct timespec *now)
|
|
{
|
|
socklen_t sl;
|
|
int so;
|
|
|
|
sl = sizeof(so);
|
|
if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR, &so, &sl) || so) {
|
|
tcp_rst(c, conn);
|
|
return;
|
|
}
|
|
|
|
if (tcp_send_flag(c, conn, SYN | ACK, now))
|
|
return;
|
|
|
|
conn_event(c, conn, TAP_SYN_ACK_SENT);
|
|
}
|
|
|
|
/**
|
|
* tcp_conn_from_sock() - Handle new connection request from listening socket
|
|
* @c: Execution context
|
|
* @ref: epoll reference of listening socket
|
|
* @now: Current timestamp
|
|
*/
|
|
static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
|
|
struct timespec *now)
|
|
{
|
|
struct sockaddr_storage sa;
|
|
struct tcp_conn *conn;
|
|
socklen_t sl;
|
|
int s;
|
|
|
|
if (c->tcp.conn_count >= TCP_MAX_CONNS)
|
|
return;
|
|
|
|
sl = sizeof(sa);
|
|
s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
|
|
if (s < 0)
|
|
return;
|
|
|
|
conn = CONN(c->tcp.conn_count++);
|
|
conn->sock = s;
|
|
|
|
conn_event(c, conn, SOCK_ACCEPTED);
|
|
|
|
if (ref.r.p.tcp.tcp.v6) {
|
|
struct sockaddr_in6 sa6;
|
|
|
|
memcpy(&sa6, &sa, sizeof(sa6));
|
|
|
|
if (IN6_IS_ADDR_LOOPBACK(&sa6.sin6_addr) ||
|
|
IN6_ARE_ADDR_EQUAL(&sa6.sin6_addr, &c->addr6_seen) ||
|
|
IN6_ARE_ADDR_EQUAL(&sa6.sin6_addr, &c->addr6)) {
|
|
struct in6_addr *src;
|
|
|
|
if (IN6_IS_ADDR_LINKLOCAL(&c->gw6))
|
|
src = &c->gw6;
|
|
else
|
|
src = &c->addr6_ll;
|
|
|
|
memcpy(&sa6.sin6_addr, src, sizeof(*src));
|
|
}
|
|
|
|
memcpy(&conn->a.a6, &sa6.sin6_addr, sizeof(conn->a.a6));
|
|
|
|
conn->sock_port = ntohs(sa6.sin6_port);
|
|
conn->tap_port = ref.r.p.tcp.tcp.index;
|
|
|
|
conn->seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6.sin6_addr,
|
|
conn->sock_port,
|
|
conn->tap_port,
|
|
now);
|
|
conn->seq_init_to_tap = conn->seq_to_tap;
|
|
|
|
tcp_hash_insert(c, conn, AF_INET6, &sa6.sin6_addr);
|
|
} else {
|
|
struct sockaddr_in sa4;
|
|
in_addr_t s_addr;
|
|
|
|
memcpy(&sa4, &sa, sizeof(sa4));
|
|
s_addr = ntohl(sa4.sin_addr.s_addr);
|
|
|
|
memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero));
|
|
memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one));
|
|
|
|
if (s_addr >> IN_CLASSA_NSHIFT == IN_LOOPBACKNET ||
|
|
s_addr == INADDR_ANY || htonl(s_addr) == c->addr4_seen)
|
|
s_addr = ntohl(c->gw4);
|
|
|
|
s_addr = htonl(s_addr);
|
|
memcpy(&conn->a.a4.a, &s_addr, sizeof(conn->a.a4.a));
|
|
|
|
conn->sock_port = ntohs(sa4.sin_port);
|
|
conn->tap_port = ref.r.p.tcp.tcp.index;
|
|
|
|
conn->seq_to_tap = tcp_seq_init(c, AF_INET, &s_addr,
|
|
conn->sock_port,
|
|
conn->tap_port,
|
|
now);
|
|
conn->seq_init_to_tap = conn->seq_to_tap;
|
|
|
|
tcp_hash_insert(c, conn, AF_INET, &s_addr);
|
|
}
|
|
|
|
conn->seq_ack_from_tap = conn->seq_to_tap + 1;
|
|
|
|
conn->wnd_from_tap = WINDOW_DEFAULT;
|
|
|
|
conn->ts_sock_act = conn->ts_tap_act = *now;
|
|
conn->ts_ack_from_tap = conn->ts_ack_to_tap = *now;
|
|
|
|
tcp_send_flag(c, conn, SYN, now);
|
|
|
|
tcp_get_sndbuf(conn);
|
|
}
|
|
|
|
/**
|
|
* tcp_sock_handler() - Handle new data from socket
|
|
* @c: Execution context
|
|
* @ref: epoll reference
|
|
* @events: epoll events bitmap
|
|
* @now: Current timestamp
|
|
*/
|
|
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
|
|
struct timespec *now)
|
|
{
|
|
struct tcp_conn *conn;
|
|
|
|
if (ref.r.p.tcp.tcp.splice) {
|
|
tcp_sock_handler_splice(c, ref, events);
|
|
return;
|
|
}
|
|
|
|
if (ref.r.p.tcp.tcp.listen) {
|
|
tcp_conn_from_sock(c, ref, now);
|
|
return;
|
|
}
|
|
|
|
if (!(conn = CONN(ref.r.p.tcp.tcp.index)))
|
|
return;
|
|
|
|
conn->ts_sock_act = *now;
|
|
|
|
if (events & EPOLLERR) {
|
|
tcp_rst(c, conn);
|
|
return;
|
|
}
|
|
|
|
if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
|
|
tcp_conn_destroy(c, conn);
|
|
return;
|
|
}
|
|
|
|
if (conn->events & ESTABLISHED) {
|
|
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
|
|
tcp_conn_destroy(c, conn);
|
|
|
|
if (events & (EPOLLRDHUP | EPOLLHUP))
|
|
conn_event(c, conn, SOCK_FIN_RCVD);
|
|
|
|
if (events & EPOLLIN)
|
|
tcp_data_from_sock(c, conn, now);
|
|
|
|
if (events & EPOLLOUT)
|
|
tcp_update_seqack_wnd(c, conn, 0, NULL);
|
|
|
|
return;
|
|
}
|
|
|
|
/* EPOLLHUP during handshake: reset */
|
|
if (events & EPOLLHUP) {
|
|
tcp_rst(c, conn);
|
|
return;
|
|
}
|
|
|
|
/* Data during handshake tap-side: check later */
|
|
if (conn->events & SOCK_ACCEPTED)
|
|
return;
|
|
|
|
if (conn->events == TAP_SYN_RCVD) {
|
|
if (events & EPOLLOUT)
|
|
tcp_connect_finish(c, conn, now);
|
|
/* Data? Check later */
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tcp_sock_init_one() - Initialise listening sockets for a given port
|
|
* @c: Execution context
|
|
* @ns: In pasta mode, if set, bind with loopback address in namespace
|
|
* @port: Port, host order
|
|
*/
|
|
static void tcp_sock_init_one(struct ctx *c, int ns, in_port_t port)
|
|
{
|
|
union tcp_epoll_ref tref = { .tcp.listen = 1 };
|
|
int s;
|
|
|
|
if (ns) {
|
|
tref.tcp.index = (in_port_t)(port +
|
|
tcp_port_delta_to_init[port]);
|
|
} else {
|
|
tref.tcp.index = (in_port_t)(port +
|
|
tcp_port_delta_to_tap[port]);
|
|
}
|
|
|
|
if (c->v4) {
|
|
tref.tcp.v6 = 0;
|
|
|
|
tref.tcp.splice = 0;
|
|
if (!ns) {
|
|
s = sock_l4(c, AF_INET, IPPROTO_TCP, port,
|
|
c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY,
|
|
tref.u32);
|
|
if (s >= 0)
|
|
tcp_sock_set_bufsize(c, s);
|
|
else
|
|
s = -1;
|
|
|
|
if (c->tcp.init_detect_ports)
|
|
tcp_sock_init_ext[port][V4] = s;
|
|
}
|
|
|
|
if (c->mode == MODE_PASTA) {
|
|
tref.tcp.splice = 1;
|
|
s = sock_l4(c, AF_INET, IPPROTO_TCP, port,
|
|
BIND_LOOPBACK, tref.u32);
|
|
if (s >= 0)
|
|
tcp_sock_set_bufsize(c, s);
|
|
else
|
|
s = -1;
|
|
|
|
if (c->tcp.ns_detect_ports) {
|
|
if (ns)
|
|
tcp_sock_ns[port][V4] = s;
|
|
else
|
|
tcp_sock_init_lo[port][V4] = s;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (c->v6) {
|
|
tref.tcp.v6 = 1;
|
|
|
|
tref.tcp.splice = 0;
|
|
if (!ns) {
|
|
s = sock_l4(c, AF_INET6, IPPROTO_TCP, port,
|
|
c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY,
|
|
tref.u32);
|
|
if (s >= 0)
|
|
tcp_sock_set_bufsize(c, s);
|
|
else
|
|
s = -1;
|
|
|
|
if (c->tcp.init_detect_ports)
|
|
tcp_sock_init_ext[port][V6] = s;
|
|
}
|
|
|
|
if (c->mode == MODE_PASTA) {
|
|
tref.tcp.splice = 1;
|
|
s = sock_l4(c, AF_INET6, IPPROTO_TCP, port,
|
|
BIND_LOOPBACK, tref.u32);
|
|
if (s >= 0)
|
|
tcp_sock_set_bufsize(c, s);
|
|
else
|
|
s = -1;
|
|
|
|
if (c->tcp.ns_detect_ports) {
|
|
if (ns)
|
|
tcp_sock_ns[port][V6] = s;
|
|
else
|
|
tcp_sock_init_lo[port][V6] = s;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tcp_sock_init_ns() - Bind sockets in namespace for inbound connections
|
|
* @arg: Execution context
|
|
*
|
|
* Return: 0
|
|
*/
|
|
static int tcp_sock_init_ns(void *arg)
|
|
{
|
|
struct ctx *c = (struct ctx *)arg;
|
|
int port;
|
|
|
|
ns_enter(c);
|
|
|
|
for (port = 0; port < USHRT_MAX; port++) {
|
|
if (!bitmap_isset(c->tcp.port_to_init, port))
|
|
continue;
|
|
|
|
tcp_sock_init_one(c, 1, port);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
 * struct tcp_sock_refill_arg - Arguments for tcp_sock_refill()
 * @c:		Execution context
 * @ns:		Non-zero: refill the pools of sockets created in the namespace;
 *		zero: refill the pools of sockets created in the init namespace
 */
struct tcp_sock_refill_arg {
	struct ctx *c;
	int ns;
};
|
|
|
|
/**
|
|
* tcp_sock_refill() - Refill pool of pre-opened sockets
|
|
* @arg: See @tcp_sock_refill_arg
|
|
*
|
|
* Return: 0
|
|
*/
|
|
static int tcp_sock_refill(void *arg)
|
|
{
|
|
struct tcp_sock_refill_arg *a = (struct tcp_sock_refill_arg *)arg;
|
|
int i, *p4, *p6;
|
|
|
|
if (a->ns) {
|
|
ns_enter(a->c);
|
|
p4 = ns_sock_pool4;
|
|
p6 = ns_sock_pool6;
|
|
} else {
|
|
p4 = init_sock_pool4;
|
|
p6 = init_sock_pool6;
|
|
}
|
|
|
|
for (i = 0; a->c->v4 && i < TCP_SOCK_POOL_SIZE; i++, p4++) {
|
|
if (*p4 >= 0) {
|
|
break;
|
|
}
|
|
*p4 = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
|
|
tcp_sock_set_bufsize(a->c, *p4);
|
|
}
|
|
|
|
for (i = 0; a->c->v6 && i < TCP_SOCK_POOL_SIZE; i++, p6++) {
|
|
if (*p6 >= 0) {
|
|
break;
|
|
}
|
|
*p6 = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK,
|
|
IPPROTO_TCP);
|
|
tcp_sock_set_bufsize(a->c, *p6);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_sock_init() - Bind sockets for inbound connections, get key for sequence
|
|
* @c: Execution context
|
|
*
|
|
* Return: 0 on success, -1 on failure
|
|
*/
|
|
int tcp_sock_init(struct ctx *c, struct timespec *now)
|
|
{
|
|
struct tcp_sock_refill_arg refill_arg = { c, 0 };
|
|
int i, port;
|
|
#ifndef HAS_GETRANDOM
|
|
int dev_random = open("/dev/random", O_RDONLY);
|
|
unsigned int random_read = 0;
|
|
|
|
while (dev_random && random_read < sizeof(c->tcp.hash_secret)) {
|
|
int ret = read(dev_random,
|
|
(uint8_t *)&c->tcp.hash_secret + random_read,
|
|
sizeof(c->tcp.hash_secret) - random_read);
|
|
|
|
if (ret == -1 && errno == EINTR)
|
|
continue;
|
|
|
|
if (ret <= 0)
|
|
break;
|
|
|
|
random_read += ret;
|
|
}
|
|
if (dev_random >= 0)
|
|
close(dev_random);
|
|
if (random_read < sizeof(c->tcp.hash_secret)) {
|
|
#else
|
|
if (getrandom(&c->tcp.hash_secret, sizeof(c->tcp.hash_secret),
|
|
GRND_RANDOM) < 0) {
|
|
#endif /* !HAS_GETRANDOM */
|
|
perror("TCP initial sequence getrandom");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
for (port = 0; port < USHRT_MAX; port++) {
|
|
if (!bitmap_isset(c->tcp.port_to_tap, port))
|
|
continue;
|
|
|
|
tcp_sock_init_one(c, 0, port);
|
|
}
|
|
|
|
for (i = 0; i < ARRAY_SIZE(tcp_l2_mh); i++)
|
|
tcp_l2_mh[i] = (struct mmsghdr) { .msg_hdr.msg_iovlen = 1 };
|
|
|
|
if (c->v4)
|
|
tcp_sock4_iov_init();
|
|
|
|
if (c->v6)
|
|
tcp_sock6_iov_init();
|
|
|
|
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
|
|
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
|
|
memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4));
|
|
memset(ns_sock_pool6, 0xff, sizeof(ns_sock_pool6));
|
|
memset(tcp_sock_init_lo, 0xff, sizeof(tcp_sock_init_lo));
|
|
memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
|
|
memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
|
|
|
|
c->tcp.refill_ts = *now;
|
|
tcp_sock_refill(&refill_arg);
|
|
|
|
if (c->mode == MODE_PASTA) {
|
|
tcp_splice_init(c);
|
|
|
|
NS_CALL(tcp_sock_init_ns, c);
|
|
|
|
refill_arg.ns = 1;
|
|
NS_CALL(tcp_sock_refill, &refill_arg);
|
|
|
|
c->tcp.port_detect_ts = *now;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_timer_one() - Handler for timed events on one socket
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @ts: Timestamp from caller
|
|
*/
|
|
static void tcp_timer_one(struct ctx *c, struct tcp_conn *conn,
|
|
struct timespec *ts)
|
|
{
|
|
int ack_from_tap = timespec_diff_ms(ts, &conn->ts_ack_from_tap);
|
|
int ack_to_tap = timespec_diff_ms(ts, &conn->ts_ack_to_tap);
|
|
int sock_act = timespec_diff_ms(ts, &conn->ts_sock_act);
|
|
int tap_act = timespec_diff_ms(ts, &conn->ts_tap_act);
|
|
int tap_data_noack;
|
|
|
|
if (!memcmp(&conn->tap_data_noack, &((struct timespec){ 0, 0 }),
|
|
sizeof(struct timespec)))
|
|
tap_data_noack = 0;
|
|
else
|
|
tap_data_noack = timespec_diff_ms(ts, &conn->tap_data_noack);
|
|
|
|
if (CONN_IS_CLOSED(conn)) {
|
|
tcp_hash_remove(conn);
|
|
tcp_table_compact(c, conn);
|
|
return;
|
|
}
|
|
|
|
if (!(conn->events & ESTABLISHED)) {
|
|
if (ack_from_tap > SYN_TIMEOUT)
|
|
tcp_rst(c, conn);
|
|
return;
|
|
}
|
|
|
|
if (tap_act > ACT_TIMEOUT && sock_act > ACT_TIMEOUT)
|
|
goto rst;
|
|
|
|
if (!conn->wnd_to_tap || ack_to_tap > ACK_INTERVAL)
|
|
tcp_send_flag(c, conn, ACK_IF_NEEDED, ts);
|
|
|
|
if (tap_data_noack > ACK_TIMEOUT) {
|
|
if (conn->seq_ack_from_tap < conn->seq_to_tap) {
|
|
if (tap_data_noack > LAST_ACK_TIMEOUT)
|
|
goto rst;
|
|
|
|
conn->seq_to_tap = conn->seq_ack_from_tap;
|
|
tcp_data_from_sock(c, conn, ts);
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (conn->events & TAP_FIN_SENT && tap_data_noack > FIN_TIMEOUT)
|
|
goto rst;
|
|
|
|
if (conn->events & SOCK_FIN_SENT && sock_act > FIN_TIMEOUT)
|
|
goto rst;
|
|
|
|
if (conn->events & SOCK_FIN_SENT && conn->events & SOCK_FIN_RCVD) {
|
|
if (sock_act > LAST_ACK_TIMEOUT || tap_act > LAST_ACK_TIMEOUT)
|
|
goto rst;
|
|
}
|
|
|
|
return;
|
|
rst:
|
|
tcp_rst(c, conn);
|
|
}
|
|
|
|
/**
 * struct tcp_port_detect_arg - Arguments for tcp_port_detect()
 * @c:			Execution context
 * @detect_in_ns:	Non-zero: look for bound ports in the namespace;
 *			zero: look for them in the init namespace
 */
struct tcp_port_detect_arg {
	struct ctx *c;
	int detect_in_ns;
};
|
|
|
|
/**
|
|
* tcp_port_detect() - Detect ports bound in namespace or init
|
|
* @arg: See struct tcp_port_detect_arg
|
|
*
|
|
* Return: 0
|
|
*/
|
|
static int tcp_port_detect(void *arg)
|
|
{
|
|
struct tcp_port_detect_arg *a = (struct tcp_port_detect_arg *)arg;
|
|
|
|
if (a->detect_in_ns) {
|
|
ns_enter(a->c);
|
|
|
|
get_bound_ports(a->c, 1, IPPROTO_TCP);
|
|
} else {
|
|
get_bound_ports(a->c, 0, IPPROTO_TCP);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
 * struct tcp_port_rebind_arg - Arguments for tcp_port_rebind()
 * @c:			Execution context
 * @bind_in_ns:		Non-zero: rebind ports in the namespace;
 *			zero: rebind them in the init namespace
 */
struct tcp_port_rebind_arg {
	struct ctx *c;
	int bind_in_ns;
};
|
|
|
|
/**
|
|
* tcp_port_rebind() - Rebind ports in namespace or init
|
|
* @arg: See struct tcp_port_rebind_arg
|
|
*
|
|
* Return: 0
|
|
*/
|
|
static int tcp_port_rebind(void *arg)
|
|
{
|
|
struct tcp_port_rebind_arg *a = (struct tcp_port_rebind_arg *)arg;
|
|
int port;
|
|
|
|
if (a->bind_in_ns) {
|
|
ns_enter(a->c);
|
|
|
|
for (port = 0; port < USHRT_MAX; port++) {
|
|
if (!bitmap_isset(a->c->tcp.port_to_init, port)) {
|
|
if (tcp_sock_ns[port][V4] >= 0) {
|
|
close(tcp_sock_ns[port][V4]);
|
|
tcp_sock_ns[port][V4] = -1;
|
|
}
|
|
|
|
if (tcp_sock_ns[port][V6] >= 0) {
|
|
close(tcp_sock_ns[port][V6]);
|
|
tcp_sock_ns[port][V6] = -1;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
/* Don't loop back our own ports */
|
|
if (bitmap_isset(a->c->tcp.port_to_tap, port))
|
|
continue;
|
|
|
|
if ((a->c->v4 && tcp_sock_ns[port][V4] == -1) ||
|
|
(a->c->v6 && tcp_sock_ns[port][V6] == -1))
|
|
tcp_sock_init_one(a->c, 1, port);
|
|
}
|
|
} else {
|
|
for (port = 0; port < USHRT_MAX; port++) {
|
|
if (!bitmap_isset(a->c->tcp.port_to_tap, port)) {
|
|
if (tcp_sock_init_ext[port][V4] >= 0) {
|
|
close(tcp_sock_init_ext[port][V4]);
|
|
tcp_sock_init_ext[port][V4] = -1;
|
|
}
|
|
|
|
if (tcp_sock_init_ext[port][V6] >= 0) {
|
|
close(tcp_sock_init_ext[port][V6]);
|
|
tcp_sock_init_ext[port][V6] = -1;
|
|
}
|
|
|
|
if (tcp_sock_init_lo[port][V4] >= 0) {
|
|
close(tcp_sock_init_lo[port][V4]);
|
|
tcp_sock_init_lo[port][V4] = -1;
|
|
}
|
|
|
|
if (tcp_sock_init_lo[port][V6] >= 0) {
|
|
close(tcp_sock_init_lo[port][V6]);
|
|
tcp_sock_init_lo[port][V6] = -1;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
/* Don't loop back our own ports */
|
|
if (bitmap_isset(a->c->tcp.port_to_init, port))
|
|
continue;
|
|
|
|
if ((a->c->v4 && tcp_sock_init_ext[port][V4] == -1) ||
|
|
(a->c->v6 && tcp_sock_init_ext[port][V6] == -1))
|
|
tcp_sock_init_one(a->c, 0, port);
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_timer() - Scan activity bitmap for sockets waiting for timed events
|
|
* @c: Execution context
|
|
* @now: Timestamp from caller
|
|
*/
|
|
void tcp_timer(struct ctx *c, struct timespec *now)
|
|
{
|
|
struct tcp_sock_refill_arg refill_arg = { c, 0 };
|
|
int i;
|
|
|
|
if (c->mode == MODE_PASTA) {
|
|
if (timespec_diff_ms(now, &c->tcp.port_detect_ts) >
|
|
PORT_DETECT_INTERVAL) {
|
|
struct tcp_port_detect_arg detect_arg = { c, 0 };
|
|
struct tcp_port_rebind_arg rebind_arg = { c, 0 };
|
|
|
|
if (c->tcp.init_detect_ports) {
|
|
detect_arg.detect_in_ns = 0;
|
|
tcp_port_detect(&detect_arg);
|
|
rebind_arg.bind_in_ns = 1;
|
|
NS_CALL(tcp_port_rebind, &rebind_arg);
|
|
}
|
|
|
|
if (c->tcp.ns_detect_ports) {
|
|
detect_arg.detect_in_ns = 1;
|
|
NS_CALL(tcp_port_detect, &detect_arg);
|
|
rebind_arg.bind_in_ns = 0;
|
|
tcp_port_rebind(&rebind_arg);
|
|
}
|
|
|
|
c->tcp.port_detect_ts = *now;
|
|
}
|
|
|
|
tcp_splice_timer(c, now);
|
|
}
|
|
|
|
if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) {
|
|
tcp_sock_refill(&refill_arg);
|
|
if (c->mode == MODE_PASTA) {
|
|
refill_arg.ns = 1;
|
|
if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
|
|
(c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
|
|
NS_CALL(tcp_sock_refill, &refill_arg);
|
|
}
|
|
}
|
|
|
|
for (i = c->tcp.conn_count - 1; i >= 0; i--)
|
|
tcp_timer_one(c, CONN(i), now);
|
|
}
|