mirror of
https://passt.top/passt
synced 2025-06-20 13:55:35 +02:00
udp: Use connect()ed sockets for initiating side
Currently we have an asymmetry in how we handle UDP sockets. For flows where the target side is a socket, we create a new connect()ed socket - the "reply socket" - specifically for that flow, used for sending and receiving datagrams on that flow and only that flow. For flows where the initiating side is a socket, we continue to use the "listening" socket (or rather, a dup() of it).

This has some disadvantages:

* We need a hash lookup for every datagram on the listening socket in order to work out what flow it belongs to
* The dup() keeps the socket alive even if automatic forwarding removes the listening socket. However, the epoll data remains the same, including containing the now stale original fd. This causes bug 103.
* We can't (easily) set flow-specific options on an initiating side socket, because that could affect other flows as well

Alter the code to use a connect()ed socket on the initiating side as well as the target side. There's no way to "clone and connect" the listening socket (a loose equivalent of accept() for UDP), so we have to create a new socket. We have to bind() this socket before we connect() it, which is allowed thanks to SO_REUSEADDR, but does leave a small window where it could receive datagrams not intended for this flow. For now we handle this by simply discarding any datagrams received between bind() and connect(), but I intend to improve this in a later patch.

Link: https://bugs.passt.top/show_bug.cgi?id=103
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
parent
a7775e9550
commit
d74b5a7c10
6 changed files with 43 additions and 55 deletions
|
@ -22,8 +22,8 @@ enum epoll_type {
|
||||||
EPOLL_TYPE_TCP_TIMER,
|
EPOLL_TYPE_TCP_TIMER,
|
||||||
/* UDP "listening" sockets */
|
/* UDP "listening" sockets */
|
||||||
EPOLL_TYPE_UDP_LISTEN,
|
EPOLL_TYPE_UDP_LISTEN,
|
||||||
/* UDP socket for replies on a specific flow */
|
/* UDP socket for a specific flow */
|
||||||
EPOLL_TYPE_UDP_REPLY,
|
EPOLL_TYPE_UDP,
|
||||||
/* ICMP/ICMPv6 ping sockets */
|
/* ICMP/ICMPv6 ping sockets */
|
||||||
EPOLL_TYPE_PING,
|
EPOLL_TYPE_PING,
|
||||||
/* inotify fd watching for end of netns (pasta) */
|
/* inotify fd watching for end of netns (pasta) */
|
||||||
|
|
6
passt.c
6
passt.c
|
@ -68,7 +68,7 @@ char *epoll_type_str[] = {
|
||||||
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
|
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
|
||||||
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
|
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
|
||||||
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
|
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
|
||||||
[EPOLL_TYPE_UDP_REPLY] = "UDP reply socket",
|
[EPOLL_TYPE_UDP] = "UDP flow socket",
|
||||||
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
|
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
|
||||||
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
|
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
|
||||||
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
|
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
|
||||||
|
@ -339,8 +339,8 @@ loop:
|
||||||
case EPOLL_TYPE_UDP_LISTEN:
|
case EPOLL_TYPE_UDP_LISTEN:
|
||||||
udp_listen_sock_handler(&c, ref, eventmask, &now);
|
udp_listen_sock_handler(&c, ref, eventmask, &now);
|
||||||
break;
|
break;
|
||||||
case EPOLL_TYPE_UDP_REPLY:
|
case EPOLL_TYPE_UDP:
|
||||||
udp_reply_sock_handler(&c, ref, eventmask, &now);
|
udp_sock_handler(&c, ref, eventmask, &now);
|
||||||
break;
|
break;
|
||||||
case EPOLL_TYPE_PING:
|
case EPOLL_TYPE_PING:
|
||||||
icmp_sock_handler(&c, ref);
|
icmp_sock_handler(&c, ref);
|
||||||
|
|
50
udp.c
50
udp.c
|
@ -39,27 +39,30 @@
|
||||||
* could receive packets from multiple flows, so we use a hash table match to
|
* could receive packets from multiple flows, so we use a hash table match to
|
||||||
* find the specific flow for a datagram.
|
* find the specific flow for a datagram.
|
||||||
*
|
*
|
||||||
* When a UDP flow is initiated from a listening socket we take a duplicate of
|
* Flow sockets
|
||||||
* the socket and store it in uflow->s[INISIDE]. This will last for the
|
* ============
|
||||||
|
*
|
||||||
|
* When a UDP flow targets a socket, we create a "flow" socket in
|
||||||
|
* uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
|
||||||
|
* replies on the target side. This socket is both bound and connected and has
|
||||||
|
* EPOLL_TYPE_UDP. The connect() means it will only receive datagrams
|
||||||
|
* associated with this flow, so the epoll reference directly points to the flow
|
||||||
|
* and we don't need a hash lookup.
|
||||||
|
*
|
||||||
|
* When a flow is initiated from a listening socket, we create a "flow" socket
|
||||||
|
* with the same bound address as the listening socket, but also connect()ed to
|
||||||
|
* the flow's peer. This is stored in uflow->s[INISIDE] and will last for the
|
||||||
* lifetime of the flow, even if the original listening socket is closed due to
|
* lifetime of the flow, even if the original listening socket is closed due to
|
||||||
* port auto-probing. The duplicate is used to deliver replies back to the
|
* port auto-probing. The duplicate is used to deliver replies back to the
|
||||||
* originating side.
|
* originating side.
|
||||||
*
|
*
|
||||||
* Reply sockets
|
* NOTE: A flow socket can have a bound address overlapping with a listening
|
||||||
* =============
|
* socket. That will happen naturally for flows initiated from a socket, but is
|
||||||
*
|
* also possible (though unlikely) for tap initiated flows, depending on the
|
||||||
* When a UDP flow targets a socket, we create a "reply" socket in
|
* source port. We assume datagrams for the flow will come to a connect()ed
|
||||||
* uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
|
* socket in preference to a listening socket. The sample program
|
||||||
* replies on the target side. This socket is both bound and connected and has
|
* doc/platform-requirements/reuseaddr-priority.c documents and tests that
|
||||||
* EPOLL_TYPE_UDP_REPLY. The connect() means it will only receive datagrams
|
* assumption.
|
||||||
* associated with this flow, so the epoll reference directly points to the flow
|
|
||||||
* and we don't need a hash lookup.
|
|
||||||
*
|
|
||||||
* NOTE: it's possible that the reply socket could have a bound address
|
|
||||||
* overlapping with an unrelated listening socket. We assume datagrams for the
|
|
||||||
* flow will come to the reply socket in preference to a listening socket. The
|
|
||||||
* sample program doc/platform-requirements/reuseaddr-priority.c documents and
|
|
||||||
* tests that assumption.
|
|
||||||
*
|
*
|
||||||
* "Spliced" flows
|
* "Spliced" flows
|
||||||
* ===============
|
* ===============
|
||||||
|
@ -71,8 +74,7 @@
|
||||||
* actually used; it doesn't make sense for datagrams and instead a pair of
|
* actually used; it doesn't make sense for datagrams and instead a pair of
|
||||||
* recvmmsg() and sendmmsg() is used to forward the datagrams.
|
* recvmmsg() and sendmmsg() is used to forward the datagrams.
|
||||||
*
|
*
|
||||||
* Note that a spliced flow will have *both* a duplicated listening socket and a
|
* Note that a spliced flow will have two flow sockets (see above).
|
||||||
* reply socket (see above).
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
|
@ -557,7 +559,7 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
|
||||||
}
|
}
|
||||||
|
|
||||||
eh = (const struct errhdr *)CMSG_DATA(hdr);
|
eh = (const struct errhdr *)CMSG_DATA(hdr);
|
||||||
if (ref.type == EPOLL_TYPE_UDP_REPLY) {
|
if (ref.type == EPOLL_TYPE_UDP) {
|
||||||
flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
|
flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
|
||||||
const struct flowside *toside = flowside_at_sidx(sidx);
|
const struct flowside *toside = flowside_at_sidx(sidx);
|
||||||
size_t dlen = rc;
|
size_t dlen = rc;
|
||||||
|
@ -792,14 +794,14 @@ static bool udp_buf_reply_sock_data(const struct ctx *c,
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* udp_reply_sock_handler() - Handle new data from flow specific socket
|
* udp_sock_handler() - Handle new data from flow specific socket
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @ref: epoll reference
|
* @ref: epoll reference
|
||||||
* @events: epoll events bitmap
|
* @events: epoll events bitmap
|
||||||
* @now: Current timestamp
|
* @now: Current timestamp
|
||||||
*/
|
*/
|
||||||
void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||||
uint32_t events, const struct timespec *now)
|
uint32_t events, const struct timespec *now)
|
||||||
{
|
{
|
||||||
struct udp_flow *uflow = udp_at_sidx(ref.flowside);
|
struct udp_flow *uflow = udp_at_sidx(ref.flowside);
|
||||||
|
|
||||||
|
@ -807,7 +809,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||||
|
|
||||||
if (events & EPOLLERR) {
|
if (events & EPOLLERR) {
|
||||||
if (udp_sock_errs(c, ref) < 0) {
|
if (udp_sock_errs(c, ref) < 0) {
|
||||||
flow_err(uflow, "Unrecoverable error on reply socket");
|
flow_err(uflow, "Unrecoverable error on flow socket");
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
4
udp.h
4
udp.h
|
@ -11,8 +11,8 @@
|
||||||
void udp_portmap_clear(void);
|
void udp_portmap_clear(void);
|
||||||
void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
|
void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||||
uint32_t events, const struct timespec *now);
|
uint32_t events, const struct timespec *now);
|
||||||
void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
|
void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||||
uint32_t events, const struct timespec *now);
|
uint32_t events, const struct timespec *now);
|
||||||
int udp_tap_handler(const struct ctx *c, uint8_t pif,
|
int udp_tap_handler(const struct ctx *c, uint8_t pif,
|
||||||
sa_family_t af, const void *saddr, const void *daddr,
|
sa_family_t af, const void *saddr, const void *daddr,
|
||||||
uint8_t ttl, const struct pool *p, int idx,
|
uint8_t ttl, const struct pool *p, int idx,
|
||||||
|
|
32
udp_flow.c
32
udp_flow.c
|
@ -49,10 +49,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
|
||||||
flow_foreach_sidei(sidei) {
|
flow_foreach_sidei(sidei) {
|
||||||
flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
|
flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
|
||||||
if (uflow->s[sidei] >= 0) {
|
if (uflow->s[sidei] >= 0) {
|
||||||
/* The listening socket needs to stay in epoll, but the
|
epoll_del(c, uflow->s[sidei]);
|
||||||
* flow specific one needs to be removed */
|
|
||||||
if (sidei == TGTSIDE)
|
|
||||||
epoll_del(c, uflow->s[sidei]);
|
|
||||||
close(uflow->s[sidei]);
|
close(uflow->s[sidei]);
|
||||||
uflow->s[sidei] = -1;
|
uflow->s[sidei] = -1;
|
||||||
}
|
}
|
||||||
|
@ -81,7 +78,7 @@ static int udp_flow_sock(const struct ctx *c,
|
||||||
} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
|
} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
|
||||||
int rc, s;
|
int rc, s;
|
||||||
|
|
||||||
s = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, pif, side, fref.data);
|
s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
|
||||||
if (s < 0) {
|
if (s < 0) {
|
||||||
flow_dbg_perror(uflow, "Couldn't open flow specific socket");
|
flow_dbg_perror(uflow, "Couldn't open flow specific socket");
|
||||||
return s;
|
return s;
|
||||||
|
@ -120,13 +117,12 @@ static int udp_flow_sock(const struct ctx *c,
|
||||||
* udp_flow_new() - Common setup for a new UDP flow
|
* udp_flow_new() - Common setup for a new UDP flow
|
||||||
* @c: Execution context
|
* @c: Execution context
|
||||||
* @flow: Initiated flow
|
* @flow: Initiated flow
|
||||||
* @s_ini: Initiating socket (or -1)
|
|
||||||
* @now: Timestamp
|
* @now: Timestamp
|
||||||
*
|
*
|
||||||
* Return: UDP specific flow, if successful, NULL on failure
|
* Return: UDP specific flow, if successful, NULL on failure
|
||||||
*/
|
*/
|
||||||
static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
|
static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
|
||||||
int s_ini, const struct timespec *now)
|
const struct timespec *now)
|
||||||
{
|
{
|
||||||
struct udp_flow *uflow = NULL;
|
struct udp_flow *uflow = NULL;
|
||||||
unsigned sidei;
|
unsigned sidei;
|
||||||
|
@ -139,22 +135,12 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
|
||||||
uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
|
uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
|
||||||
uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
|
uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
|
||||||
|
|
||||||
if (s_ini >= 0) {
|
flow_foreach_sidei(sidei) {
|
||||||
/* When using auto port-scanning the listening port could go
|
if (pif_is_socket(uflow->f.pif[sidei]))
|
||||||
* away, so we need to duplicate the socket
|
if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0)
|
||||||
*/
|
goto cancel;
|
||||||
uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
|
|
||||||
if (uflow->s[INISIDE] < 0) {
|
|
||||||
flow_perror(uflow,
|
|
||||||
"Couldn't duplicate listening socket");
|
|
||||||
goto cancel;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pif_is_socket(flow->f.pif[TGTSIDE]))
|
|
||||||
if ((uflow->s[TGTSIDE] = udp_flow_sock(c, uflow, TGTSIDE)) < 0)
|
|
||||||
goto cancel;
|
|
||||||
|
|
||||||
/* Tap sides always need to be looked up by hash. Socket sides don't
|
/* Tap sides always need to be looked up by hash. Socket sides don't
|
||||||
* always, but sometimes do (receiving packets on a socket not specific
|
* always, but sometimes do (receiving packets on a socket not specific
|
||||||
* to one flow). Unconditionally hash both sides so all our bases are
|
* to one flow). Unconditionally hash both sides so all our bases are
|
||||||
|
@ -225,7 +211,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
|
||||||
return FLOW_SIDX_NONE;
|
return FLOW_SIDX_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
return udp_flow_new(c, flow, ref.fd, now);
|
return udp_flow_new(c, flow, now);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -281,7 +267,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
|
||||||
return FLOW_SIDX_NONE;
|
return FLOW_SIDX_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
return udp_flow_new(c, flow, -1, now);
|
return udp_flow_new(c, flow, now);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
2
util.c
2
util.c
|
@ -71,7 +71,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
|
||||||
case EPOLL_TYPE_UDP_LISTEN:
|
case EPOLL_TYPE_UDP_LISTEN:
|
||||||
freebind = c->freebind;
|
freebind = c->freebind;
|
||||||
/* fallthrough */
|
/* fallthrough */
|
||||||
case EPOLL_TYPE_UDP_REPLY:
|
case EPOLL_TYPE_UDP:
|
||||||
proto = IPPROTO_UDP;
|
proto = IPPROTO_UDP;
|
||||||
socktype = SOCK_DGRAM | SOCK_NONBLOCK;
|
socktype = SOCK_DGRAM | SOCK_NONBLOCK;
|
||||||
break;
|
break;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue