diff --git a/epoll_type.h b/epoll_type.h index 7f2a121..12ac59b 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -22,8 +22,8 @@ enum epoll_type { EPOLL_TYPE_TCP_TIMER, /* UDP "listening" sockets */ EPOLL_TYPE_UDP_LISTEN, - /* UDP socket for replies on a specific flow */ - EPOLL_TYPE_UDP_REPLY, + /* UDP socket for a specific flow */ + EPOLL_TYPE_UDP, /* ICMP/ICMPv6 ping sockets */ EPOLL_TYPE_PING, /* inotify fd watching for end of netns (pasta) */ diff --git a/passt.c b/passt.c index cd06772..388d10f 100644 --- a/passt.c +++ b/passt.c @@ -68,7 +68,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket", [EPOLL_TYPE_TCP_TIMER] = "TCP timer", [EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket", - [EPOLL_TYPE_UDP_REPLY] = "UDP reply socket", + [EPOLL_TYPE_UDP] = "UDP flow socket", [EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket", [EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch", [EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch", @@ -339,8 +339,8 @@ loop: case EPOLL_TYPE_UDP_LISTEN: udp_listen_sock_handler(&c, ref, eventmask, &now); break; - case EPOLL_TYPE_UDP_REPLY: - udp_reply_sock_handler(&c, ref, eventmask, &now); + case EPOLL_TYPE_UDP: + udp_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); diff --git a/udp.c b/udp.c index 5a251df..1b3fffd 100644 --- a/udp.c +++ b/udp.c @@ -39,27 +39,30 @@ * could receive packets from multiple flows, so we use a hash table match to * find the specific flow for a datagram. * - * When a UDP flow is initiated from a listening socket we take a duplicate of - * the socket and store it in uflow->s[INISIDE]. This will last for the + * Flow sockets + * ============ + * + * When a UDP flow targets a socket, we create a "flow" socket in + * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive + * replies on the target side. This socket is both bound and connected and has + * EPOLL_TYPE_UDP. The connect() means it will only receive datagrams + * associated with this flow, so the epoll reference directly points to the flow + * and we don't need a hash lookup. + * + * When a flow is initiated from a listening socket, we create a "flow" socket + * with the same bound address as the listening socket, but also connect()ed to + * the flow's peer. This is stored in uflow->s[INISIDE] and will last for the * lifetime of the flow, even if the original listening socket is closed due to * port auto-probing. The duplicate is used to deliver replies back to the * originating side. * - * Reply sockets - * ============= - * - * When a UDP flow targets a socket, we create a "reply" socket in - * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive - * replies on the target side. This socket is both bound and connected and has - * EPOLL_TYPE_UDP_REPLY. The connect() means it will only receive datagrams - * associated with this flow, so the epoll reference directly points to the flow - * and we don't need a hash lookup. - * - * NOTE: it's possible that the reply socket could have a bound address - * overlapping with an unrelated listening socket. We assume datagrams for the - * flow will come to the reply socket in preference to a listening socket. The - * sample program doc/platform-requirements/reuseaddr-priority.c documents and - * tests that assumption. + * NOTE: A flow socket can have a bound address overlapping with a listening + * socket. That will happen naturally for flows initiated from a socket, but is + * also possible (though unlikely) for tap initiated flows, depending on the + * source port. We assume datagrams for the flow will come to a connect()ed + * socket in preference to a listening socket. The sample program + * doc/platform-requirements/reuseaddr-priority.c documents and tests that + * assumption. * * "Spliced" flows * =============== @@ -71,8 +74,7 @@ * actually used; it doesn't make sense for datagrams and instead a pair of * recvmmsg() and sendmmsg() is used to forward the datagrams. * - * Note that a spliced flow will have *both* a duplicated listening socket and a - * reply socket (see above). + * Note that a spliced flow will have two flow sockets (see above). */ #include <sched.h> @@ -557,7 +559,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) } eh = (const struct errhdr *)CMSG_DATA(hdr); - if (ref.type == EPOLL_TYPE_UDP_REPLY) { + if (ref.type == EPOLL_TYPE_UDP) { flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(sidx); size_t dlen = rc; @@ -792,14 +794,14 @@ static bool udp_buf_reply_sock_data(const struct ctx *c, } /** - * udp_reply_sock_handler() - Handle new data from flow specific socket + * udp_sock_handler() - Handle new data from flow specific socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap * @now: Current timestamp */ -void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) { struct udp_flow *uflow = udp_at_sidx(ref.flowside); @@ -807,7 +809,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, if (events & EPOLLERR) { if (udp_sock_errs(c, ref) < 0) { - flow_err(uflow, "Unrecoverable error on reply socket"); + flow_err(uflow, "Unrecoverable error on flow socket"); goto fail; } } diff --git a/udp.h b/udp.h index a811475..8f8531a 100644 --- a/udp.h +++ b/udp.h @@ -11,8 +11,8 @@ void udp_portmap_clear(void); void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); -void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now); +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, uint8_t ttl, const struct pool *p, int idx, diff --git a/udp_flow.c b/udp_flow.c index 99ae490..a2d417f 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -49,10 +49,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) flow_foreach_sidei(sidei) { flow_hash_remove(c, FLOW_SIDX(uflow, sidei)); if (uflow->s[sidei] >= 0) { - /* The listening socket needs to stay in epoll, but the - * flow specific one needs to be removed */ - if (sidei == TGTSIDE) - epoll_del(c, uflow->s[sidei]); + epoll_del(c, uflow->s[sidei]); close(uflow->s[sidei]); uflow->s[sidei] = -1; } @@ -81,7 +78,7 @@ static int udp_flow_sock(const struct ctx *c, } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; int rc, s; - s = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, pif, side, fref.data); + s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data); if (s < 0) { flow_dbg_perror(uflow, "Couldn't open flow specific socket"); return s; @@ -120,13 +117,12 @@ static int udp_flow_sock(const struct ctx *c, * udp_flow_new() - Common setup for a new UDP flow * @c: Execution context * @flow: Initiated flow - * @s_ini: Initiating socket (or -1) * @now: Timestamp * * Return: UDP specific flow, if successful, NULL on failure */ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, - int s_ini, const struct timespec *now) + const struct timespec *now) { struct udp_flow *uflow = NULL; unsigned sidei; @@ -139,22 +135,12 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1; uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0; - if (s_ini >= 0) { - /* When using auto port-scanning the listening port could go - * away, so we need to duplicate the socket - */ - uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0); - if (uflow->s[INISIDE] < 0) { - flow_perror(uflow, - "Couldn't duplicate listening socket"); - goto cancel; - } + flow_foreach_sidei(sidei) { + if (pif_is_socket(uflow->f.pif[sidei])) + if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0) + goto cancel; } - if (pif_is_socket(flow->f.pif[TGTSIDE])) - if ((uflow->s[TGTSIDE] = udp_flow_sock(c, uflow, TGTSIDE)) < 0) - goto cancel; - /* Tap sides always need to be looked up by hash. Socket sides don't * always, but sometimes do (receiving packets on a socket not specific * to one flow). Unconditionally hash both sides so all our bases are @@ -225,7 +211,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, return FLOW_SIDX_NONE; } - return udp_flow_new(c, flow, ref.fd, now); + return udp_flow_new(c, flow, now); } /** @@ -281,7 +267,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, return FLOW_SIDX_NONE; } - return udp_flow_new(c, flow, -1, now); + return udp_flow_new(c, flow, now); } /** diff --git a/util.c b/util.c index b9a3d43..0f68cf5 100644 --- a/util.c +++ b/util.c @@ -71,7 +71,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, case EPOLL_TYPE_UDP_LISTEN: freebind = c->freebind; /* fallthrough */ - case EPOLL_TYPE_UDP_REPLY: + case EPOLL_TYPE_UDP: proto = IPPROTO_UDP; socktype = SOCK_DGRAM | SOCK_NONBLOCK; break;