From 3af5e9fdba4cbff1389e507489578f81d21a5181 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 29 Feb 2024 15:15:32 +1100 Subject: [PATCH] icmp: Store ping socket information in flow table Currently icmp_id_map[][] stores information about ping sockets in a bespoke structure. Move the same information into new types of flow in the flow table. To match that change, replace the existing ICMP timer with a flow-based timer for expiring ping sockets. This has the advantage that we only need to scan the active flows, not all possible ids. We convert icmp_id_map[][] to point to the flow table entries, rather than containing its own information. We do still use that array for locating the right ping flows, rather than using a "flow native" form of lookup for the time being. Signed-off-by: David Gibson [sbrivio: Update id_sock description in comment to icmp_ping_new()] Signed-off-by: Stefano Brivio --- Makefile | 6 +-- flow.c | 9 ++++ flow.h | 4 ++ flow_table.h | 2 + icmp.c | 147 +++++++++++++++++++++++---------------------------- icmp.h | 2 +- icmp_flow.h | 31 +++++++++++ passt.c | 5 -- 8 files changed, 117 insertions(+), 89 deletions(-) create mode 100644 icmp_flow.h diff --git a/Makefile b/Makefile index 2735797..8428052 100644 --- a/Makefile +++ b/Makefile @@ -54,9 +54,9 @@ SRCS = $(PASST_SRCS) $(QRAP_SRCS) MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ - flow_table.h icmp.h inany.h iov.h ip.h isolation.h lineread.h log.h \ - ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h siphash.h tap.h \ - tcp.h tcp_conn.h tcp_splice.h udp.h util.h + flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ + lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ + siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/flow.c b/flow.c index 5bb24cc..80dd269 100644 --- a/flow.c +++ b/flow.c @@ -22,6 +22,8 @@ const char *flow_type_str[] = { [FLOW_TYPE_NONE] = "", [FLOW_TCP] = "TCP connection", [FLOW_TCP_SPLICE] = "TCP connection (spliced)", + [FLOW_PING4] = "ICMP ping sequence", + [FLOW_PING6] = "ICMPv6 ping sequence", }; static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, "flow_type_str[] doesn't match enum flow_type"); @@ -29,6 +31,8 @@ static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, const uint8_t flow_proto[] = { [FLOW_TCP] = IPPROTO_TCP, [FLOW_TCP_SPLICE] = IPPROTO_TCP, + [FLOW_PING4] = IPPROTO_ICMP, + [FLOW_PING6] = IPPROTO_ICMPV6, }; static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); @@ -295,6 +299,11 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) if (!closed && timer) tcp_splice_timer(c, flow); break; + case FLOW_PING4: + case FLOW_PING6: + if (timer) + closed = icmp_ping_timer(c, flow, now); + break; default: /* Assume other flow types don't need any handling */ ; diff --git a/flow.h b/flow.h index 8b66751..c943c44 100644 --- a/flow.h +++ b/flow.h @@ -19,6 +19,10 @@ enum flow_type { FLOW_TCP, /* A TCP connection between a host socket and ns socket */ FLOW_TCP_SPLICE, + /* ICMP echo requests from guest to host and matching replies back */ + FLOW_PING4, + /* ICMPv6 echo requests from guest to host and matching replies back */ + FLOW_PING6, FLOW_NUM_TYPES, }; diff --git a/flow_table.h b/flow_table.h index eecf884..b7e5529 100644 --- a/flow_table.h +++ b/flow_table.h @@ -8,6 +8,7 @@ #define FLOW_TABLE_H #include "tcp_conn.h" +#include "icmp_flow.h" /** * struct flow_free_cluster - Information about a cluster of free entries @@ -33,6 +34,7 @@ union flow { struct flow_free_cluster free; struct tcp_tap_conn tcp; struct tcp_splice_conn tcp_splice; + struct icmp_ping_flow ping; }; /* Global Flow Table */ diff --git a/icmp.c b/icmp.c index 49d6dd9..cc8c7ee 100644 --- a/icmp.c +++ b/icmp.c @@ -40,24 +40,17 @@ #include "siphash.h" #include "inany.h" #include "icmp.h" +#include "flow_table.h" #define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ #define ICMP_NUM_IDS (1U << 16) -/** - * struct icmp_id_sock - Tracking information for single ICMP echo identifier - * @sock: Bound socket for identifier - * @seq: Last sequence number sent to tap, host order, -1: not sent yet - * @ts: Last associated activity from tap, seconds - */ -struct icmp_id_sock { - int sock; - int seq; - time_t ts; -}; +/* Sides of a flow as we use them for ping streams */ +#define SOCKSIDE 0 +#define TAPSIDE 1 /* Indexed by ICMP echo identifier */ -static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS]; +static struct icmp_ping_flow *icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS]; /** * icmp_sock_handler() - Handle new data from ICMP or ICMPv6 socket @@ -67,8 +60,8 @@ static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS]; */ void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref) { - struct icmp_id_sock *const id_sock = af == AF_INET - ? &icmp_id_map[V4][ref.icmp.id] : &icmp_id_map[V6][ref.icmp.id]; + struct icmp_ping_flow *pingf = af == AF_INET + ? icmp_id_map[V4][ref.icmp.id] : icmp_id_map[V6][ref.icmp.id]; const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; union sockaddr_inany sr; socklen_t sl = sizeof(sr); @@ -79,6 +72,8 @@ void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref) if (c->no_icmp) return; + ASSERT(pingf); + n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl); if (n < 0) { warn("%s: recvfrom() error on ping socket: %s", @@ -113,10 +108,10 @@ void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref) /* In PASTA mode, we'll get any reply we send, discard them. */ if (c->mode == MODE_PASTA) { - if (id_sock->seq == seq) + if (pingf->seq == seq) return; - id_sock->seq = seq; + pingf->seq = seq; } debug("%s: echo reply to tap, ID: %"PRIu16", seq: %"PRIu16, pname, @@ -133,36 +128,52 @@ unexpected: } /** - * icmp_ping_close() - Close and clean up a ping socket + * icmp_ping_close() - Close and clean up a ping flow * @c: Execution context - * @id_sock: Socket number and other info + * @pingf: ping flow entry to close */ -static void icmp_ping_close(const struct ctx *c, struct icmp_id_sock *id_sock) +static void icmp_ping_close(const struct ctx *c, + const struct icmp_ping_flow *pingf) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_sock->sock, NULL); - close(id_sock->sock); - id_sock->sock = -1; - id_sock->seq = -1; + uint16_t id = pingf->id; + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL); + close(pingf->sock); + + if (pingf->f.type == FLOW_PING4) + icmp_id_map[V4][id] = NULL; + else + icmp_id_map[V6][id] = NULL; } /** * icmp_ping_new() - Prepare a new ping socket for a new id * @c: Execution context - * @id_sock: Socket fd and other information + * @id_sock: Pointer to ping flow entry slot in icmp_id_map[] to update * @af: Address family, AF_INET or AF_INET6 * @id: ICMP id for the new socket * - * Return: Newly opened ping socket fd, or -1 on failure + * Return: Newly opened ping flow, or NULL on failure */ -static int icmp_ping_new(const struct ctx *c, struct icmp_id_sock *id_sock, - sa_family_t af, uint16_t id) +static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, + struct icmp_ping_flow **id_sock, + sa_family_t af, uint16_t id) { - uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6; const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; + uint8_t flowtype = af == AF_INET ? FLOW_PING4 : FLOW_PING6; union icmp_epoll_ref iref = { .id = id }; + union flow *flow = flow_alloc(); + struct icmp_ping_flow *pingf; const void *bind_addr; const char *bind_if; - int s; + + if (!flow) + return NULL; + + pingf = FLOW_START(flow, flowtype, ping, TAPSIDE); + + pingf->seq = -1; + pingf->id = id; if (af == AF_INET) { bind_addr = &c->ip4.addr_out; @@ -172,28 +183,28 @@ static int icmp_ping_new(const struct ctx *c, struct icmp_id_sock *id_sock, bind_if = c->ip6.ifname_out; } - s = sock_l4(c, af, proto, bind_addr, bind_if, 0, iref.u32); + pingf->sock = sock_l4(c, af, flow_proto[flowtype], bind_addr, bind_if, + 0, iref.u32); - if (s < 0) { + if (pingf->sock < 0) { warn("Cannot open \"ping\" socket. You might need to:"); warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\""); warn("...echo requests/replies will fail."); goto cancel; } - if (s > FD_REF_MAX) + if (pingf->sock > FD_REF_MAX) goto cancel; - id_sock->sock = s; + *id_sock = pingf; - debug("%s: new socket %i for echo ID %"PRIu16, pname, s, id); + debug("%s: new socket %i for echo ID %"PRIu16, pname, pingf->sock, id); - return s; + return pingf; cancel: - if (s >= 0) - close(s); - return -1; + flow_alloc_cancel(flow); + return NULL; } /** @@ -215,14 +226,13 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; union sockaddr_inany sa = { .sa_family = af }; const socklen_t sl = af == AF_INET ? sizeof(sa.sa4) : sizeof(sa.sa6); - struct icmp_id_sock *id_sock; + struct icmp_ping_flow *pingf, **id_sock; uint16_t id, seq; size_t plen; void *pkt; - int s; (void)saddr; - (void)pif; + ASSERT(pif == PIF_TAP); if (af == AF_INET) { const struct icmphdr *ih; @@ -261,13 +271,13 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, ASSERT(0); } - if ((s = id_sock->sock) < 0) - if ((s = icmp_ping_new(c, id_sock, af, id)) < 0) + if (!(pingf = *id_sock)) + if (!(pingf = icmp_ping_new(c, id_sock, af, id))) return 1; - id_sock->ts = now->tv_sec; + pingf->ts = now->tv_sec; - if (sendto(s, pkt, plen, MSG_NOSIGNAL, &sa.sa, sl) < 0) { + if (sendto(pingf->sock, pkt, plen, MSG_NOSIGNAL, &sa.sa, sl) < 0) { debug("%s: failed to relay request to socket: %s", pname, strerror(errno)); } else { @@ -279,44 +289,21 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, } /** - * icmp_timer_one() - Handler for timed events related to a given identifier + * icmp_ping_timer() - Handler for timed events related to a given flow * @c: Execution context - * @id_sock: Socket fd and activity timestamp + * @flow: flow table entry to check for timeout * @now: Current timestamp + * + * Return: true if the flow is ready to free, false otherwise */ -static void icmp_timer_one(const struct ctx *c, struct icmp_id_sock *id_sock, - const struct timespec *now) +bool icmp_ping_timer(const struct ctx *c, union flow *flow, + const struct timespec *now) { - if (id_sock->sock < 0 || now->tv_sec - id_sock->ts <= ICMP_ECHO_TIMEOUT) - return; + const struct icmp_ping_flow *pingf = &flow->ping; - icmp_ping_close(c, id_sock); -} - -/** - * icmp_timer() - Scan activity bitmap for identifiers with timed events - * @c: Execution context - * @now: Current timestamp - */ -void icmp_timer(const struct ctx *c, const struct timespec *now) -{ - unsigned int i; - - for (i = 0; i < ICMP_NUM_IDS; i++) { - icmp_timer_one(c, &icmp_id_map[V4][i], now); - icmp_timer_one(c, &icmp_id_map[V6][i], now); - } -} - -/** - * icmp_init() - Initialise sequences in ID map to -1 (no sequence sent yet) - */ -void icmp_init(void) -{ - unsigned i; - - for (i = 0; i < ICMP_NUM_IDS; i++) { - icmp_id_map[V4][i].seq = icmp_id_map[V6][i].seq = -1; - icmp_id_map[V4][i].sock = icmp_id_map[V6][i].sock = -1; - } + if (now->tv_sec - pingf->ts <= ICMP_ECHO_TIMEOUT) + return false; + + icmp_ping_close(c, pingf); + return true; } diff --git a/icmp.h b/icmp.h index 263101d..4f695da 100644 --- a/icmp.h +++ b/icmp.h @@ -9,12 +9,12 @@ #define ICMP_TIMER_INTERVAL 10000 /* ms */ struct ctx; +struct icmp_ping_flow; void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref); int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now); -void icmp_timer(const struct ctx *c, const struct timespec *now); void icmp_init(void); /** diff --git a/icmp_flow.h b/icmp_flow.h new file mode 100644 index 0000000..5a2eed9 --- /dev/null +++ b/icmp_flow.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson + * + * ICMP flow tracking data structures + */ +#ifndef ICMP_FLOW_H +#define ICMP_FLOW_H + +/** + * struct icmp_ping_flow - Descriptor for a flow of ping requests/replies + * @f: Generic flow information + * @seq: Last sequence number sent to tap, host order, -1: not sent yet + * @sock: "ping" socket + * @ts: Last associated activity from tap, seconds + * @id: ICMP id for the flow as seen by the guest + */ +struct icmp_ping_flow { + /* Must be first element */ + struct flow_common f; + + int seq; + int sock; + time_t ts; + uint16_t id; +}; + +bool icmp_ping_timer(const struct ctx *c, union flow *flow, + const struct timespec *now); + +#endif /* ICMP_FLOW_H */ diff --git a/passt.c b/passt.c index a061f2b..0f79492 100644 --- a/passt.c +++ b/passt.c @@ -106,8 +106,6 @@ static void post_handler(struct ctx *c, const struct timespec *now) CALL_PROTO_HANDLER(c, now, tcp, TCP); /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, udp, UDP); - /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ - CALL_PROTO_HANDLER(c, now, icmp, ICMP); flow_defer_handler(c, now); #undef CALL_PROTO_HANDLER @@ -288,9 +286,6 @@ int main(int argc, char **argv) if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c))) exit(EXIT_FAILURE); - if (!c.no_icmp) - icmp_init(); - proto_update_l2_buf(c.mac_guest, c.mac); if (c.ifi4 && !c.no_dhcp)