tcp: Rework timers to use timerfd instead of periodic bitmap scan
With a lot of concurrent connections, the bitmap scan approach is not really sustainable.

Switch to per-connection timerfd timers, set based on events and on two new flags, ACK_FROM_TAP_DUE and ACK_TO_TAP_DUE. Timers are added to the common epoll list, and implement the existing timeouts.

While at it, drop the CONN_ prefix from flag names, otherwise they get quite long, and fix the logic to decide if a connection has a local, possibly unreachable endpoint: we shouldn't go through the rest of tcp_conn_from_tap() if we reset the connection due to a successful bind(2), and we'll get EACCES if the port number is low.

Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
parent
3eb19cfd8a
commit
be5bbb9b06
5 changed files with 288 additions and 241 deletions
|
@ -287,11 +287,9 @@ speeding up local connections, and usually requiring NAT. _pasta_:
|
||||||
* ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted)
|
* ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted)
|
||||||
* ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached
|
* ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached
|
||||||
* ✅ no external dependencies (other than a standard C library)
|
* ✅ no external dependencies (other than a standard C library)
|
||||||
* ✅ restrictive seccomp profiles (22 syscalls allowed for _passt_, 34 for
|
* ✅ restrictive seccomp profiles (25 syscalls allowed for _passt_, 37 for
|
||||||
_pasta_ on x86_64)
|
_pasta_ on x86_64)
|
||||||
* ✅ static checkers in continuous integration (clang-tidy, cppcheck)
|
* ✅ static checkers in continuous integration (clang-tidy, cppcheck)
|
||||||
* 🛠️ rework of TCP state machine (flags instead of states), TCP timers, and code
|
|
||||||
de-duplication
|
|
||||||
* 🛠️ clearly defined packet abstraction
|
* 🛠️ clearly defined packet abstraction
|
||||||
* 🛠️ ~5 000 LoC target
|
* 🛠️ ~5 000 LoC target
|
||||||
* ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests
|
* ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests
|
||||||
|
|
12
passt.c
12
passt.c
|
@ -119,12 +119,12 @@ static void post_handler(struct ctx *c, struct timespec *now)
|
||||||
#define CALL_PROTO_HANDLER(c, now, lc, uc) \
|
#define CALL_PROTO_HANDLER(c, now, lc, uc) \
|
||||||
do { \
|
do { \
|
||||||
extern void \
|
extern void \
|
||||||
lc ## _defer_handler (struct ctx *, struct timespec *) \
|
lc ## _defer_handler (struct ctx *c) \
|
||||||
__attribute__ ((weak)); \
|
__attribute__ ((weak)); \
|
||||||
\
|
\
|
||||||
if (!c->no_ ## lc) { \
|
if (!c->no_ ## lc) { \
|
||||||
if (lc ## _defer_handler) \
|
if (lc ## _defer_handler) \
|
||||||
lc ## _defer_handler(c, now); \
|
lc ## _defer_handler(c); \
|
||||||
\
|
\
|
||||||
if (timespec_diff_ms((now), &c->lc.timer_run) \
|
if (timespec_diff_ms((now), &c->lc.timer_run) \
|
||||||
>= uc ## _TIMER_INTERVAL) { \
|
>= uc ## _TIMER_INTERVAL) { \
|
||||||
|
@ -134,8 +134,11 @@ static void post_handler(struct ctx *c, struct timespec *now)
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||||
CALL_PROTO_HANDLER(c, now, tcp, TCP);
|
CALL_PROTO_HANDLER(c, now, tcp, TCP);
|
||||||
|
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||||
CALL_PROTO_HANDLER(c, now, udp, UDP);
|
CALL_PROTO_HANDLER(c, now, udp, UDP);
|
||||||
|
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||||
CALL_PROTO_HANDLER(c, now, icmp, ICMP);
|
CALL_PROTO_HANDLER(c, now, icmp, ICMP);
|
||||||
|
|
||||||
#undef CALL_PROTO_HANDLER
|
#undef CALL_PROTO_HANDLER
|
||||||
|
@ -380,8 +383,8 @@ int main(int argc, char **argv)
|
||||||
|
|
||||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||||
|
|
||||||
if ((!c.no_udp && udp_sock_init(&c, &now)) ||
|
if ((!c.no_udp && udp_sock_init(&c)) ||
|
||||||
(!c.no_tcp && tcp_sock_init(&c, &now)))
|
(!c.no_tcp && tcp_sock_init(&c)))
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
|
|
||||||
proto_update_l2_buf(c.mac_guest, c.mac, &c.addr4);
|
proto_update_l2_buf(c.mac_guest, c.mac, &c.addr4);
|
||||||
|
@ -425,6 +428,7 @@ int main(int argc, char **argv)
|
||||||
timer_init(&c, &now);
|
timer_init(&c, &now);
|
||||||
|
|
||||||
loop:
|
loop:
|
||||||
|
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
|
||||||
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
|
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
|
||||||
if (nfds == -1 && errno != EINTR) {
|
if (nfds == -1 && errno != EINTR) {
|
||||||
perror("epoll_wait");
|
perror("epoll_wait");
|
||||||
|
|
2
tap.c
2
tap.c
|
@ -939,7 +939,7 @@ void tap_sock_init(struct ctx *c)
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @fd: File descriptor where event occurred
|
* @fd: File descriptor where event occurred
|
||||||
* @events: epoll events
|
* @events: epoll events
|
||||||
* @now: Current timestamp
|
* @now: Current timestamp, can be NULL on EPOLLERR
|
||||||
*/
|
*/
|
||||||
void tap_handler(struct ctx *c, int fd, uint32_t events, struct timespec *now)
|
void tap_handler(struct ctx *c, int fd, uint32_t events, struct timespec *now)
|
||||||
{
|
{
|
||||||
|
|
8
tcp.h
8
tcp.h
|
@ -6,7 +6,9 @@
|
||||||
#ifndef TCP_H
|
#ifndef TCP_H
|
||||||
#define TCP_H
|
#define TCP_H
|
||||||
|
|
||||||
#define TCP_TIMER_INTERVAL 20 /* ms */
|
#define REFILL_INTERVAL 1000 /* ms */
|
||||||
|
#define PORT_DETECT_INTERVAL 1000
|
||||||
|
#define TCP_TIMER_INTERVAL MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL)
|
||||||
|
|
||||||
#define TCP_MAX_CONNS (128 * 1024)
|
#define TCP_MAX_CONNS (128 * 1024)
|
||||||
#define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2)
|
#define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2)
|
||||||
|
@ -21,7 +23,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
|
||||||
struct tap_l4_msg *msg, int count, struct timespec *now);
|
struct tap_l4_msg *msg, int count, struct timespec *now);
|
||||||
int tcp_sock_init(struct ctx *c, struct timespec *now);
|
int tcp_sock_init(struct ctx *c, struct timespec *now);
|
||||||
void tcp_timer(struct ctx *c, struct timespec *now);
|
void tcp_timer(struct ctx *c, struct timespec *now);
|
||||||
void tcp_defer_handler(struct ctx *c, struct timespec *now);
|
void tcp_defer_handler(struct ctx *c);
|
||||||
|
|
||||||
void tcp_sock_set_bufsize(struct ctx *c, int s);
|
void tcp_sock_set_bufsize(struct ctx *c, int s);
|
||||||
void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
|
void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
|
||||||
|
@ -34,6 +36,7 @@ void tcp_remap_to_init(in_port_t port, in_port_t delta);
|
||||||
* @listen: Set if this file descriptor is a listening socket
|
* @listen: Set if this file descriptor is a listening socket
|
||||||
* @splice: Set if descriptor is associated to a spliced connection
|
* @splice: Set if descriptor is associated to a spliced connection
|
||||||
* @v6: Set for IPv6 sockets or connections
|
* @v6: Set for IPv6 sockets or connections
|
||||||
|
* @timer: Reference is a timerfd descriptor for connection
|
||||||
* @index: Index of connection in table, or port for bound sockets
|
* @index: Index of connection in table, or port for bound sockets
|
||||||
* @u32: Opaque u32 value of reference
|
* @u32: Opaque u32 value of reference
|
||||||
*/
|
*/
|
||||||
|
@ -42,6 +45,7 @@ union tcp_epoll_ref {
|
||||||
uint32_t listen:1,
|
uint32_t listen:1,
|
||||||
splice:1,
|
splice:1,
|
||||||
v6:1,
|
v6:1,
|
||||||
|
timer:1,
|
||||||
index:20;
|
index:20;
|
||||||
} tcp;
|
} tcp;
|
||||||
uint32_t u32;
|
uint32_t u32;
|
||||||
|
|
Loading…
Reference in a new issue