mirror of
				https://passt.top/passt
				synced 2025-10-26 00:59:13 +02:00 
			
		
		
		
	tcp: Rework timers to use timerfd instead of periodic bitmap scan
With a lot of concurrent connections, the bitmap scan approach is not really sustainable. Switch to per-connection timerfd timers, set based on events and on two new flags, ACK_FROM_TAP_DUE and ACK_TO_TAP_DUE. Timers are added to the common epoll list, and implement the existing timeouts.

While at it, drop the CONN_ prefix from flag names, otherwise they get quite long, and fix the logic to decide if a connection has a local, possibly unreachable endpoint: we shouldn't go through the rest of tcp_conn_from_tap() if we reset the connection due to a successful bind(2), and we'll get EACCES if the port number is low.

Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
		
					parent
					
						
							
								3eb19cfd8a
							
						
					
				
			
			
				commit
				
					
						be5bbb9b06
					
				
			
		
					 5 changed files with 288 additions and 241 deletions
				
			
		|  | @ -287,11 +287,9 @@ speeding up local connections, and usually requiring NAT. _pasta_: | |||
| * ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted) | ||||
| * ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached | ||||
| * ✅ no external dependencies (other than a standard C library) | ||||
| * ✅ restrictive seccomp profiles (22 syscalls allowed for _passt_, 34 for | ||||
| * ✅ restrictive seccomp profiles (25 syscalls allowed for _passt_, 37 for | ||||
|   _pasta_ on x86_64) | ||||
| * ✅ static checkers in continuous integration (clang-tidy, cppcheck) | ||||
| * 🛠️ rework of TCP state machine (flags instead of states), TCP timers, and code | ||||
|   de-duplication | ||||
| * 🛠️ clearly defined packet abstraction | ||||
| * 🛠️ ~5 000 LoC target | ||||
| * ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests | ||||
|  |  | |||
							
								
								
									
										12
									
								
								passt.c
									
										
									
									
									
								
							
							
						
						
									
										12
									
								
								passt.c
									
										
									
									
									
								
							|  | @ -119,12 +119,12 @@ static void post_handler(struct ctx *c, struct timespec *now) | |||
| #define CALL_PROTO_HANDLER(c, now, lc, uc)				\ | ||||
| 	do {								\ | ||||
| 		extern void						\ | ||||
| 		lc ## _defer_handler (struct ctx *, struct timespec *)	\ | ||||
| 		lc ## _defer_handler (struct ctx *c)			\ | ||||
| 		__attribute__ ((weak));					\ | ||||
| 									\ | ||||
| 		if (!c->no_ ## lc) {					\ | ||||
| 			if (lc ## _defer_handler)			\ | ||||
| 				lc ## _defer_handler(c, now);		\ | ||||
| 				lc ## _defer_handler(c);		\ | ||||
| 									\ | ||||
| 			if (timespec_diff_ms((now), &c->lc.timer_run)	\ | ||||
| 			    >= uc ## _TIMER_INTERVAL) {			\ | ||||
|  | @ -134,8 +134,11 @@ static void post_handler(struct ctx *c, struct timespec *now) | |||
| 		} 							\ | ||||
| 	} while (0) | ||||
| 
 | ||||
| 	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ | ||||
| 	CALL_PROTO_HANDLER(c, now, tcp, TCP); | ||||
| 	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ | ||||
| 	CALL_PROTO_HANDLER(c, now, udp, UDP); | ||||
| 	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ | ||||
| 	CALL_PROTO_HANDLER(c, now, icmp, ICMP); | ||||
| 
 | ||||
| #undef CALL_PROTO_HANDLER | ||||
|  | @ -380,8 +383,8 @@ int main(int argc, char **argv) | |||
| 
 | ||||
| 	clock_gettime(CLOCK_MONOTONIC, &now); | ||||
| 
 | ||||
| 	if ((!c.no_udp && udp_sock_init(&c, &now)) || | ||||
| 	    (!c.no_tcp && tcp_sock_init(&c, &now))) | ||||
| 	if ((!c.no_udp && udp_sock_init(&c)) || | ||||
| 	    (!c.no_tcp && tcp_sock_init(&c))) | ||||
| 		exit(EXIT_FAILURE); | ||||
| 
 | ||||
| 	proto_update_l2_buf(c.mac_guest, c.mac, &c.addr4); | ||||
|  | @ -425,6 +428,7 @@ int main(int argc, char **argv) | |||
| 	timer_init(&c, &now); | ||||
| 
 | ||||
| loop: | ||||
| 	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ | ||||
| 	nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL); | ||||
| 	if (nfds == -1 && errno != EINTR) { | ||||
| 		perror("epoll_wait"); | ||||
|  |  | |||
							
								
								
									
										2
									
								
								tap.c
									
										
									
									
									
								
							
							
						
						
									
										2
									
								
								tap.c
									
										
									
									
									
								
							|  | @ -939,7 +939,7 @@ void tap_sock_init(struct ctx *c) | |||
|  * @c:		Execution context | ||||
|  * @fd:		File descriptor where event occurred | ||||
|  * @events:	epoll events | ||||
|  * @now:	Current timestamp | ||||
|  * @now:	Current timestamp, can be NULL on EPOLLERR | ||||
|  */ | ||||
| void tap_handler(struct ctx *c, int fd, uint32_t events, struct timespec *now) | ||||
| { | ||||
|  |  | |||
							
								
								
									
										8
									
								
								tcp.h
									
										
									
									
									
								
							
							
						
						
									
										8
									
								
								tcp.h
									
										
									
									
									
								
							|  | @ -6,7 +6,9 @@ | |||
| #ifndef TCP_H | ||||
| #define TCP_H | ||||
| 
 | ||||
| #define TCP_TIMER_INTERVAL		20 /* ms */ | ||||
| #define REFILL_INTERVAL			1000 /* ms */ | ||||
| #define PORT_DETECT_INTERVAL		1000 | ||||
| #define TCP_TIMER_INTERVAL	MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL) | ||||
| 
 | ||||
| #define TCP_MAX_CONNS			(128 * 1024) | ||||
| #define TCP_MAX_SOCKS			(TCP_MAX_CONNS + USHRT_MAX * 2) | ||||
|  | @ -21,7 +23,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, | |||
| 		    struct tap_l4_msg *msg, int count, struct timespec *now); | ||||
| int tcp_sock_init(struct ctx *c, struct timespec *now); | ||||
| void tcp_timer(struct ctx *c, struct timespec *now); | ||||
| void tcp_defer_handler(struct ctx *c, struct timespec *now); | ||||
| void tcp_defer_handler(struct ctx *c); | ||||
| 
 | ||||
| void tcp_sock_set_bufsize(struct ctx *c, int s); | ||||
| void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s, | ||||
|  | @ -34,6 +36,7 @@ void tcp_remap_to_init(in_port_t port, in_port_t delta); | |||
|  * @listen:		Set if this file descriptor is a listening socket | ||||
|  * @splice:		Set if descriptor is associated to a spliced connection | ||||
|  * @v6:			Set for IPv6 sockets or connections | ||||
|  * @timer:		Reference is a timerfd descriptor for connection | ||||
|  * @index:		Index of connection in table, or port for bound sockets | ||||
|  * @u32:		Opaque u32 value of reference | ||||
|  */ | ||||
|  | @ -42,6 +45,7 @@ union tcp_epoll_ref { | |||
| 		uint32_t	listen:1, | ||||
| 				splice:1, | ||||
| 				v6:1, | ||||
| 				timer:1, | ||||
| 				index:20; | ||||
| 	} tcp; | ||||
| 	uint32_t u32; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Stefano Brivio
				Stefano Brivio