passt: Add PASTA mode, major rework

PASTA (Pack A Subtle Tap Abstraction) provides quasi-native host
connectivity to an otherwise disconnected, unprivileged network
and user namespace, similarly to slirp4netns. Given that the
implementation largely overlaps with PASST, no separate binary
is built: 'pasta' (and 'passt4netns', for clarity) are both symbolic
links to 'passt', and the mode of operation is selected depending on
how the binary is invoked. Usage example:

	$ unshare -rUn
	# echo $$
	1871759

	$ ./pasta 1871759	# From another terminal

	# udhcpc -i pasta0 2>/dev/null
	# ping -c1 pasta.pizza
	PING pasta.pizza (64.190.62.111) 56(84) bytes of data.
	64 bytes from 64.190.62.111 (64.190.62.111): icmp_seq=1 ttl=255 time=34.6 ms

	--- pasta.pizza ping statistics ---
	1 packets transmitted, 1 received, 0% packet loss, time 0ms
	rtt min/avg/max/mdev = 34.575/34.575/34.575/0.000 ms
	# ping -c1 spaghetti.pizza
	PING spaghetti.pizza(2606:4700:3034::6815:147a (2606:4700:3034::6815:147a)) 56 data bytes
	64 bytes from 2606:4700:3034::6815:147a (2606:4700:3034::6815:147a): icmp_seq=1 ttl=255 time=29.0 ms

	--- spaghetti.pizza ping statistics ---
	1 packets transmitted, 1 received, 0% packet loss, time 0ms
	rtt min/avg/max/mdev = 28.967/28.967/28.967/0.000 ms

This entails a major rework, especially with regard to the storage of
tracked connections and to the semantics of epoll(7) references.

Indexing TCP and UDP bindings merely by socket proved to be
inflexible and unsuitable for handling different connection flows: pasta
also provides Layer-4 to Layer-4 socket mapping between init and a
separate namespace for local connections, using a pair of splice()
system calls for TCP, and a recvmmsg()/sendmmsg() pair for UDP local
bindings. For instance, building on the previous example:

	# ip link set dev lo up
	# iperf3 -s

	$ iperf3 -c ::1 -Z -w 32M -l 1024k -P2 | tail -n4
	[SUM]   0.00-10.00  sec  52.3 GBytes  44.9 Gbits/sec  283             sender
	[SUM]   0.00-10.43  sec  52.3 GBytes  43.1 Gbits/sec                  receiver

	iperf Done.

epoll(7) references now include a generic part, used to demultiplex
data to the relevant protocol handler (8 bits for the IP protocol
number and 24 bits for the socket number), and an opaque portion
reserved for use by the individual protocol handlers, in order to
track sockets back to the corresponding connections and bindings.

A number of fixes pertaining to the TCP state machine and to congestion
window handling are also included here.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2021-07-17 08:34:53 +02:00
parent 28fca04eb9
commit 33482d5bf2
20 changed files with 2836 additions and 1335 deletions

View file

@ -1,13 +1,20 @@
CFLAGS += -Wall -Wextra -pedantic
CFLAGS += -DRLIMIT_STACK_VAL=$(shell ulimit -s)
all: passt qrap
all: passt pasta passt4netns qrap
passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h dhcpv6.c dhcpv6.h pcap.c pcap.h ndp.c ndp.h siphash.c siphash.h tap.c tap.h icmp.c icmp.h tcp.c tcp.h udp.c udp.h util.c util.h
$(CC) $(CFLAGS) passt.c arp.c dhcp.c dhcpv6.c pcap.c ndp.c siphash.c tap.c icmp.c tcp.c udp.c util.c -o passt
pasta: passt
ln -s passt pasta
passt4netns: passt
ln -s passt passt4netns
qrap: qrap.c passt.h
$(CC) $(CFLAGS) -DARCH=\"$(shell uname -m)\" qrap.c -o qrap
.PHONY: clean
clean:
-${RM} passt *.o qrap
-${RM} passt *.o qrap pasta passt4netns

9
arp.c
View file

@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* arp.c - ARP implementation
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
#include <stdio.h>
@ -22,9 +25,9 @@
#include <net/if_arp.h>
#include <arpa/inet.h>
#include "util.h"
#include "passt.h"
#include "dhcp.h"
#include "util.h"
#include "tap.h"
#include "arp.h"
@ -66,7 +69,7 @@ int arp(struct ctx *c, struct ethhdr *eh, size_t len)
memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
memcpy(eh->h_source, c->mac, ETH_ALEN);
if (tap_send(c->fd_unix, eh, len, 0) < 0)
if (tap_send(c, eh, len, 0) < 0)
perror("ARP: send");
return 1;

9
dhcp.c
View file

@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* dhcp.c - Minimalistic DHCP server for PASST
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
#include <stdio.h>
@ -21,9 +24,9 @@
#include <net/if.h>
#include <arpa/inet.h>
#include "util.h"
#include "passt.h"
#include "dhcp.h"
#include "util.h"
#include "tap.h"
/**
@ -322,7 +325,7 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
memcpy(eh->h_source, c->mac, ETH_ALEN);
if (tap_send(c->fd_unix, eh, len, 0) < 0)
if (tap_send(c, eh, len, 0) < 0)
perror("DHCP: send");
return 1;

View file

@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* dhcpv6.c - Minimalistic DHCPv6 server for PASST
*
* Copyright (c) 2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
#include <stdio.h>
@ -23,9 +26,9 @@
#include <net/if.h>
#include <net/if_arp.h>
#include "util.h"
#include "passt.h"
#include "tap.h"
#include "util.h"
/**
* struct opt_hdr - DHCPv6 option header

185
icmp.c
View file

@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* icmp.c - ICMP/ICMPv6 echo proxy
*
* Copyright (c) 2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
#include <stdio.h>
@ -28,57 +31,91 @@
#include <linux/icmpv6.h>
#include <time.h>
#include "util.h"
#include "passt.h"
#include "tap.h"
#include "util.h"
#include "icmp.h"
#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */
/**
* struct icmp_id - Tracking information for single ICMP echo identifier
* @sock: Bound socket for identifier; a value <= 0 is treated as "no socket
*        yet" by icmp_tap_handler(), and the timer resets it to 0 on expiry
* @ts: Last associated activity from tap, seconds; compared against
*      ICMP_ECHO_TIMEOUT by icmp_timer_one()
* @seq: Last sequence number sent to tap, host order; in pasta mode,
*       icmp_sock_handler() drops a message whose sequence equals this value
*/
struct icmp_id {
int sock;
time_t ts;
uint16_t seq;
};
/* Indexed by ICMP echo identifier */
static int icmp_s_v4[USHRT_MAX];
static int icmp_s_v6[USHRT_MAX];
/* One slot per possible 16-bit echo identifier: identifiers range from 0 to
 * USHRT_MAX inclusive, so USHRT_MAX + 1 entries are needed to index by id.
 */
static struct icmp_id icmp_id_map [IP_VERSIONS][USHRT_MAX + 1];
/* Bitmaps, activity monitoring needed for identifier: one bit per identifier,
 * that is, (USHRT_MAX + 1) / 8 bytes per IP version. This is a whole number
 * of long words, so each row can be scanned word by word.
 */
static uint8_t icmp_act [IP_VERSIONS][(USHRT_MAX + 1) / 8];
/**
* icmp_sock_handler() - Handle new data from socket
* @c: Execution context
* @s: File descriptor number for socket
* @ref: epoll reference
* @events: epoll events bitmap
* @pkt_buf: Buffer to receive packets, currently unused
* @now: Current timestamp, unused
*/
void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now)
{
struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0xff, 0xff,
0, 0, 0, 0 } };
struct sockaddr_storage sr, sl;
socklen_t slen = sizeof(sr);
struct sockaddr_storage sr;
socklen_t sl = sizeof(sr);
char buf[USHRT_MAX];
uint16_t seq, id;
ssize_t n;
(void)events;
(void)pkt_buf;
(void)now;
n = recvfrom(s, buf, sizeof(buf), MSG_DONTWAIT,
(struct sockaddr *)&sr, &slen);
n = recvfrom(ref.s, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
if (n < 0)
return;
if (getsockname(s, (struct sockaddr *)&sl, &slen))
if (ref.icmp.v6) {
struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr;
struct icmp6hdr *ih = (struct icmp6hdr *)buf;
/* In PASTA mode, we'll get any reply we send, discard them. */
if (c->mode == MODE_PASTA) {
seq = ntohs(ih->icmp6_sequence);
id = ntohs(ih->icmp6_identifier);
if (icmp_id_map[V6][id].seq == seq)
return;
if (sl.ss_family == AF_INET) {
icmp_id_map[V6][id].seq = seq;
}
tap_ip_send(c, &sr6->sin6_addr, IPPROTO_ICMPV6, buf, n);
} else {
struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr;
struct icmphdr *ih = (struct icmphdr *)buf;
if (c->mode == MODE_PASTA) {
seq = ntohs(ih->un.echo.sequence);
id = ntohs(ih->un.echo.id);
if (icmp_id_map[V4][id].seq == seq)
return;
icmp_id_map[V4][id].seq = seq;
}
memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr));
tap_ip_send(c, &a6, IPPROTO_ICMP, buf, n);
} else if (sl.ss_family == AF_INET6) {
struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr;
tap_ip_send(c, &sr6->sin6_addr, IPPROTO_ICMPV6, buf, n);
}
}
@ -86,101 +123,131 @@ void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
* icmp_tap_handler() - Handle packets from tap
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Destination address, pointer to in_addr or in6_addr depending on @af
* @msg: Input message
* @count: Message count (always 1 for ICMP)
* @now: Current timestamp, unused
* @now: Current timestamp
*
* Return: count of consumed packets (always 1, even if malformed)
*/
int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now)
{
int s;
(void)count;
(void)now;
(void)c;
if (af == AF_INET) {
struct icmphdr *ih = (struct icmphdr *)msg[0].l4h;
union icmp_epoll_ref iref = { .v6 = 0 };
struct sockaddr_in sa = {
.sin_family = AF_INET,
.sin_addr = { .s_addr = INADDR_ANY },
.sin_port = ih->un.echo.id,
};
int id, s;
if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO)
return 1;
if ((s = icmp_s_v4[ntohs(ih->un.echo.id)]) < 0)
return 1;
id = ntohs(ih->un.echo.id);
bind(s, (struct sockaddr *)&sa, sizeof(sa));
if ((s = icmp_id_map[V4][id].sock) <= 0) {
s = sock_l4(c, AF_INET, IPPROTO_ICMP, id, 0, iref.u32);
if (s < 0)
goto fail_sock;
icmp_id_map[V4][id].sock = s;
}
icmp_id_map[V4][id].ts = now->tv_sec;
bitmap_set(icmp_act[V4], id);
sa.sin_addr = *(struct in_addr *)addr;
sendto(s, msg[0].l4h, msg[0].l4_len,
MSG_DONTWAIT | MSG_NOSIGNAL,
sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
} else if (af == AF_INET6) {
struct icmp6hdr *ih = (struct icmp6hdr *)msg[0].l4h;
union icmp_epoll_ref iref = { .v6 = 1 };
struct sockaddr_in6 sa = {
.sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_ANY_INIT,
.sin6_port = ih->icmp6_identifier,
};
int id, s;
if (msg[0].l4_len < sizeof(*ih) ||
(ih->icmp6_type != 128 && ih->icmp6_type != 129))
return 1;
if ((s = icmp_s_v6[ntohs(ih->icmp6_identifier)]) < 0)
return 1;
id = ntohs(ih->icmp6_identifier);
if ((s = icmp_id_map[V6][id].sock) <= 0) {
s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, id, 0,
iref.u32);
if (s < 0)
goto fail_sock;
bind(s, (struct sockaddr *)&sa, sizeof(sa));
icmp_id_map[V6][id].sock = s;
}
icmp_id_map[V6][id].ts = now->tv_sec;
bitmap_set(icmp_act[V6], id);
sa.sin6_addr = *(struct in6_addr *)addr;
sendto(s, msg[0].l4h, msg[0].l4_len,
MSG_DONTWAIT | MSG_NOSIGNAL,
sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
}
return 1;
fail_sock:
warn("Cannot open \"ping\" socket. You might need to:");
warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\"");
warn("...echo requests/replies will fail.");
return 1;
}
/**
* icmp_sock_init() - Create ICMP, ICMPv6 sockets for echo requests and replies
* icmp_timer_one() - Handler for timed events related to a given identifier
* @c: Execution context
*
* Return: 0 on success, -1 on failure
* @v6: Set for IPv6 echo identifier bindings
* @id: Echo identifier, host order
* @ts: Timestamp from caller
*/
int icmp_sock_init(struct ctx *c)
static void icmp_timer_one(struct ctx *c, int v6, uint16_t id,
struct timespec *ts)
{
int i, fail = 0;
struct icmp_id *id_map = &icmp_id_map[v6 ? V6 : V4][id];
c->icmp.fd_min = INT_MAX;
c->icmp.fd_max = 0;
if (ts->tv_sec - id_map->ts <= ICMP_ECHO_TIMEOUT)
return;
if (c->v4) {
for (i = 0; i < USHRT_MAX; i++) {
icmp_s_v4[i] = sock_l4(c, AF_INET, IPPROTO_ICMP, i);
if (icmp_s_v4[i] < 0)
fail = 1;
bitmap_clear(icmp_act[v6 ? V6 : V4], id);
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_map->sock, NULL);
close(id_map->sock);
id_map->sock = 0;
}
/**
* icmp_timer() - Scan activity bitmap for identifiers with timed events
* @c: Execution context
* @ts: Timestamp from caller
*/
void icmp_timer(struct ctx *c, struct timespec *ts)
{
long *word, tmp;
unsigned int i;
int n, v6 = 0;
v6:
word = (long *)icmp_act[v6 ? V6 : V4];
for (i = 0; i < sizeof(icmp_act[0]) / sizeof(long); i++, word++) {
tmp = *word;
while ((n = ffsl(tmp))) {
tmp &= ~(1UL << (n - 1));
icmp_timer_one(c, v6, i * sizeof(long) * 8 + n - 1, ts);
}
}
if (c->v6) {
for (i = 0; i < USHRT_MAX; i++) {
icmp_s_v6[i] = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, i);
if (icmp_s_v6[i] < 0)
fail = 1;
if (!v6) {
v6 = 1;
goto v6;
}
}
if (fail) {
warn("Cannot open some \"ping\" sockets. You might need to:");
warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\"");
warn("...echo requests/replies might fail.");
}
return 0;
}

24
icmp.h
View file

@ -1,22 +1,34 @@
#ifndef ICMP_H
#define ICMP_H
#define ICMP_TIMER_INTERVAL 1000 /* ms */
struct ctx;
void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now);
int icmp_sock_init(struct ctx *c);
void icmp_timer(struct ctx *c, struct timespec *ts);
/**
* union icmp_epoll_ref - epoll reference portion for ICMP tracking
* @v6: Set for IPv6 sockets or connections; read back as ref.icmp.v6 by
*      icmp_sock_handler() to pick the ICMPv6 or ICMP path
* @u32: Opaque u32 value of reference, the form passed to sock_l4() when
*       icmp_tap_handler() opens a socket
*/
union icmp_epoll_ref {
struct {
uint32_t v6:1;
};
uint32_t u32;
};
/**
* struct icmp_ctx - Execution context for ICMP routines
* @fd_min: Lowest file descriptor number for ICMP/ICMPv6 ever used
* @fd_max: Highest file descriptor number for ICMP/ICMPv6 ever used
* @timer_run: Timestamp of most recent timer run
*/
struct icmp_ctx {
int fd_min;
int fd_max;
struct timespec timer_run;
};
#endif /* ICMP_H */

8
ndp.c
View file

@ -1,6 +1,10 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* ndp.c - NDP support for PASST
*
@ -23,8 +27,8 @@
#include <net/if.h>
#include <net/if_arp.h>
#include "passt.h"
#include "util.h"
#include "passt.h"
#include "tap.h"
#define RS 133
@ -175,7 +179,7 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
memcpy(ehr->h_source, c->mac, ETH_ALEN);
ehr->h_proto = htons(ETH_P_IPV6);
if (tap_send(c->fd_unix, ehr, len, 0) < 0)
if (tap_send(c, ehr, len, 0) < 0)
perror("NDP: send");
return 1;

543
passt.c
View file

@ -1,18 +1,26 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* passt.c - Daemon implementation
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
* Grab Ethernet frames via AF_UNIX socket, build SOCK_DGRAM/SOCK_STREAM sockets
* for each 5-tuple from TCP, UDP packets, perform connection tracking and
* forward them. Forward packets received on sockets back to the UNIX domain
* socket (typically, a socket virtio_net file descriptor from qemu).
* Grab Ethernet frames from AF_UNIX socket (in "passt" mode) or tap device (in
* "pasta" mode), build SOCK_DGRAM/SOCK_STREAM sockets for each 5-tuple from
* TCP, UDP packets, perform connection tracking and forward them. Forward
* packets received on sockets back to the UNIX domain socket (typically, a
* socket virtio_net file descriptor from qemu) or to the tap device (typically,
* created in a separate network namespace).
*/
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/socket.h>
@ -44,91 +52,32 @@
#include <syslog.h>
#include <sys/stat.h>
#include "passt.h"
#include "arp.h"
#include "dhcp.h"
#include "ndp.h"
#include "dhcpv6.h"
#include "util.h"
#include "passt.h"
#include "dhcpv6.h"
#include "icmp.h"
#include "tcp.h"
#include "udp.h"
#include "pcap.h"
#include "tap.h"
#define EPOLL_EVENTS 10
#define TAP_BUF_BYTES (ETH_MAX_MTU * 8)
#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t))
#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1)
#define __TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL)
#define TIMER_INTERVAL MIN(__TIMER_INTERVAL, ICMP_TIMER_INTERVAL)
#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, SOCK_BUF_BYTES)
static char pkt_buf [PKT_BUF_BYTES];
#define TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL)
char pkt_buf [PKT_BUF_BYTES];
#ifdef DEBUG
static char *ip_proto_str[IPPROTO_SCTP + 1] = {
char *ip_proto_str[IPPROTO_SCTP + 1] = {
[IPPROTO_ICMP] = "ICMP",
[IPPROTO_TCP] = "TCP",
[IPPROTO_UDP] = "UDP",
[IPPROTO_ICMPV6] = "ICMPV6",
[IPPROTO_SCTP] = "SCTP",
};
#define IP_PROTO_STR(n) \
(((n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ? ip_proto_str[(n)] : "?")
#endif
/**
 * sock_unix() - Create and bind AF_UNIX socket, add to epoll list
 * @index:	Index used in socket path, filled on success
 *
 * Socket paths are built from UNIX_SOCK_PATH with indices starting from 1. A
 * path is considered free if a probing connect() to it fails with ENOENT or
 * ECONNREFUSED: any stale file is then unlinked before binding.
 *
 * Return: newly created socket, doesn't return on error
 */
static int sock_unix(int *index)
{
	struct sockaddr_un addr = {
		.sun_family = AF_UNIX,
	};
	int fd, probe, slot;

	fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("UNIX socket");
		exit(EXIT_FAILURE);
	}

	for (slot = 1; slot < UNIX_SOCK_MAX; slot++) {
		int rc, busy;

		snprintf(addr.sun_path, UNIX_PATH_MAX, UNIX_SOCK_PATH, slot);

		/* Probe the path: somebody answering, or an unexpected error,
		 * means we should leave this slot alone.
		 */
		probe = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
		rc = connect(probe, (const struct sockaddr *)&addr,
			     sizeof(addr));
		busy = rc == 0 || (errno != ENOENT && errno != ECONNREFUSED);
		close(probe);

		if (!busy) {
			unlink(addr.sun_path);
			if (!bind(fd, (const struct sockaddr *)&addr,
				  sizeof(addr)))
				break;
		}
	}

	if (slot == UNIX_SOCK_MAX) {
		perror("UNIX socket bind");
		exit(EXIT_FAILURE);
	}

	info("UNIX domain socket bound at %s\n", addr.sun_path);

	/* Read and write access for user, group and others */
	chmod(addr.sun_path,
	      S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);

	*index = slot;
	return fd;
}
/**
* struct nl_request - Netlink request filled and sent by get_routes()
* @nlh: Netlink message header
@ -365,362 +314,76 @@ static void get_dns(struct ctx *c)
}
/**
* tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
* @c: Execution context
* @msg: Array of messages with the same L3 protocol
* @count: Count of messages with the same L3 protocol
* @now: Current timestamp
* get_bound_ports_ns() - Get TCP and UDP ports bound in namespace
* @arg: Execution context
*
* Return: count of packets consumed by handlers
* Return: 0
*/
static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count,
struct timespec *now)
static int get_bound_ports_ns(void *arg)
{
char buf_s[INET_ADDRSTRLEN] __attribute((__unused__));
char buf_d[INET_ADDRSTRLEN] __attribute((__unused__));
struct ethhdr *eh = (struct ethhdr *)msg[0].start;
struct iphdr *iph, *prev_iph = NULL;
struct udphdr *uh, *prev_uh = NULL;
size_t len = msg[0].len;
unsigned int i;
char *l4h;
struct ctx *c = (struct ctx *)arg;
if (!c->v4)
return count;
ns_enter(c->pasta_pid);
if (len < sizeof(*eh) + sizeof(*iph))
return 1;
if (arp(c, eh, len) || dhcp(c, eh, len))
return 1;
for (i = 0; i < count; i++) {
len = msg[i].len;
if (len < sizeof(*eh) + sizeof(*iph))
return 1;
eh = (struct ethhdr *)msg[i].start;
iph = (struct iphdr *)(eh + 1);
l4h = (char *)iph + iph->ihl * 4;
c->addr4_seen = iph->saddr;
msg[i].l4h = l4h;
msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
if (iph->protocol != IPPROTO_TCP &&
iph->protocol != IPPROTO_UDP)
break;
if (len < sizeof(*uh))
break;
uh = (struct udphdr *)l4h;
if (!i) {
prev_iph = iph;
prev_uh = uh;
continue;
if (c->v4) {
procfs_scan_listen("tcp", c->tcp.port_to_ns);
procfs_scan_listen("udp", c->udp.port_to_ns);
}
if (iph->tos != prev_iph->tos ||
iph->frag_off != prev_iph->frag_off ||
iph->protocol != prev_iph->protocol ||
iph->saddr != prev_iph->saddr ||
iph->daddr != prev_iph->daddr ||
uh->source != prev_uh->source ||
uh->dest != prev_uh->dest)
break;
prev_iph = iph;
prev_uh = uh;
if (c->v6) {
procfs_scan_listen("tcp6", c->tcp.port_to_ns);
procfs_scan_listen("udp6", c->udp.port_to_ns);
}
eh = (struct ethhdr *)msg[0].start;
iph = (struct iphdr *)(eh + 1);
if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP ||
iph->protocol == IPPROTO_SCTP) {
uh = (struct udphdr *)msg[0].l4h;
if (msg[0].len < sizeof(*uh))
return 1;
debug("%s (%i) from tap: %s:%i -> %s:%i (%i packet%s)",
IP_PROTO_STR(iph->protocol), iph->protocol,
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
ntohs(uh->source),
inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)),
ntohs(uh->dest),
i, i > 1 ? "s" : "");
} else if (iph->protocol == IPPROTO_ICMP) {
debug("icmp from tap: %s -> %s",
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)));
}
if (iph->protocol == IPPROTO_TCP)
return tcp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
if (iph->protocol == IPPROTO_UDP)
return udp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
if (iph->protocol == IPPROTO_ICMP)
icmp_tap_handler(c, AF_INET, &iph->daddr, msg, 1, now);
return 1;
return 0;
}
/**
* tap6_handler() - IPv6 packet handler for tap file descriptor
* get_bound_ports() - Get maps of ports that should have bound sockets
* @c: Execution context
* @msg: Array of messages with the same L3 protocol
* @count: Count of messages with the same L3 protocol
* @now: Current timestamp
*/
static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
struct timespec *now)
static void get_bound_ports(struct ctx *c)
{
char buf_s[INET6_ADDRSTRLEN] __attribute((__unused__));
char buf_d[INET6_ADDRSTRLEN] __attribute((__unused__));
struct ethhdr *eh = (struct ethhdr *)msg[0].start;
struct udphdr *uh, *prev_uh = NULL;
uint8_t proto = 0, prev_proto = 0;
size_t len = msg[0].len;
struct ipv6hdr *ip6h;
unsigned int i;
char *l4h;
char ns_fn_stack[NS_FN_STACK_SIZE];
if (!c->v6)
return count;
if (len < sizeof(*eh) + sizeof(*ip6h))
return 1;
if (ndp(c, eh, len) || dhcpv6(c, eh, len))
return 1;
for (i = 0; i < count; i++) {
struct ipv6hdr *p_ip6h;
len = msg[i].len;
if (len < sizeof(*eh) + sizeof(*ip6h))
return 1;
eh = (struct ethhdr *)msg[i].start;
ip6h = (struct ipv6hdr *)(eh + 1);
l4h = ipv6_l4hdr(ip6h, &proto);
msg[i].l4h = l4h;
msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
c->addr6_ll_seen = ip6h->saddr;
else
c->addr6_seen = ip6h->saddr;
ip6h->saddr = c->addr6;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
break;
if (len < sizeof(*uh))
break;
uh = (struct udphdr *)l4h;
if (!i) {
p_ip6h = ip6h;
prev_proto = proto;
prev_uh = uh;
continue;
if (c->mode == MODE_PASST) {
memset(c->tcp.port_to_tap, 0xff, PORT_EPHEMERAL_MIN / 8);
memset(c->udp.port_to_tap, 0xff, PORT_EPHEMERAL_MIN / 8);
return;
}
if (proto != prev_proto ||
memcmp(&ip6h->saddr, &p_ip6h->saddr, sizeof(ip6h->saddr)) ||
memcmp(&ip6h->daddr, &p_ip6h->daddr, sizeof(ip6h->daddr)) ||
uh->source != prev_uh->source ||
uh->dest != prev_uh->dest)
break;
clone(get_bound_ports_ns, ns_fn_stack + sizeof(ns_fn_stack) / 2,
CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, (void *)c);
p_ip6h = ip6h;
prev_proto = proto;
prev_uh = uh;
if (c->v4) {
procfs_scan_listen("tcp", c->tcp.port_to_init);
procfs_scan_listen("udp", c->udp.port_to_init);
}
if (prev_proto)
proto = prev_proto;
eh = (struct ethhdr *)msg[0].start;
ip6h = (struct ipv6hdr *)(eh + 1);
if (proto == IPPROTO_ICMPV6) {
debug("icmpv6 from tap: %s ->\n\t%s",
inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)));
} else if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
proto == IPPROTO_SCTP) {
uh = (struct udphdr *)msg[0].l4h;
if (msg[0].len < sizeof(*uh))
return 1;
debug("%s (%i) from tap: [%s]:%i\n\t-> [%s]:%i (%i packet%s)",
IP_PROTO_STR(proto), proto,
inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
ntohs(uh->source),
inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)),
ntohs(uh->dest),
i, i > 1 ? "s" : "");
if (c->v6) {
procfs_scan_listen("tcp6", c->tcp.port_to_init);
procfs_scan_listen("udp6", c->udp.port_to_init);
}
if (proto == IPPROTO_TCP)
return tcp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
if (proto == IPPROTO_UDP)
return udp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
if (proto == IPPROTO_ICMPV6)
icmp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, 1, now);
return 1;
}
/**
* tap_handler() - Packet handler for tap file descriptor
* @c: Execution context
* @now: Current timestamp
*
* Reads from c->fd_unix into pkt_buf without blocking. Each frame is expected
* to be preceded by a 32-bit length in network order. Frames from one read are
* collected in a local array, then dispatched by EtherType: consecutive IPv4 or
* IPv6 frames are handed to tap4_handler() or tap6_handler() as a batch.
*
* Return: -ECONNRESET if tap connection was lost, 0 otherwise
*/
static int tap_handler(struct ctx *c, struct timespec *now)
{
struct tap_msg msg[TAP_MSGS];
int msg_count, same, i;
struct ethhdr *eh;
char *p = pkt_buf;
ssize_t n, rem;
/* Keep reading until the socket has no more data, or an error occurs */
while ((n = recv(c->fd_unix, p, TAP_BUF_FILL, MSG_DONTWAIT)) > 0) {
msg_count = 0;
/* Split what was read into frames: length prefix, then frame data */
while (n > (ssize_t)sizeof(uint32_t)) {
ssize_t len = ntohl(*(uint32_t *)p);
p += sizeof(uint32_t);
n -= sizeof(uint32_t);
/* Too short to hold an Ethernet header: drop everything read so far */
if (len < (ssize_t)sizeof(*eh))
return 0;
/* At most one packet might not fit in a single read */
/* NOTE(review): len comes from the peer and is not checked against the
* space left in pkt_buf before this second recv() -- confirm it can't
* write past the end of the buffer.
*/
if (len > n) {
rem = recv(c->fd_unix, p + n, len - n,
MSG_DONTWAIT);
/* Still incomplete (or recv() failed): give up on this batch */
if ((n += rem) != len)
return 0;
}
pcap(p, len);
msg[msg_count].start = p;
msg[msg_count++].len = len;
n -= len;
p += len;
}
i = 0;
/* Dispatch collected frames by EtherType */
while (i < msg_count) {
eh = (struct ethhdr *)msg[i].start;
/* Remember the source MAC address of the latest frame as guest address */
memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
/* ARP frames are passed to the IPv4 handler one at a time */
tap4_handler(c, msg + i, 1, now);
i++;
break;
case ETH_P_IP:
/* Count consecutive IPv4 frames, up to UIO_MAXIOV, to pass as a batch */
for (same = 1; i + same < msg_count &&
same < UIO_MAXIOV; same++) {
struct tap_msg *next = &msg[i + same];
eh = (struct ethhdr *)next->start;
if (ntohs(eh->h_proto) != ETH_P_IP)
break;
}
/* Advance by the return value of the handler */
i += tap4_handler(c, msg + i, same, now);
break;
case ETH_P_IPV6:
/* Same batching as above, for consecutive IPv6 frames */
for (same = 1; i + same < msg_count &&
same < UIO_MAXIOV; same++) {
struct tap_msg *next = &msg[i + same];
eh = (struct ethhdr *)next->start;
if (ntohs(eh->h_proto) != ETH_P_IPV6)
break;
}
i += tap6_handler(c, msg + i, same, now);
break;
default:
/* Any other EtherType is skipped */
i++;
break;
}
}
/* Start over from the beginning of the buffer for the next read */
p = pkt_buf;
}
/* recv() returning 0, or failing with EINTR, EAGAIN or EWOULDBLOCK, is not
* reported as a lost connection
*/
if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
return 0;
/* Any other error: stop polling the descriptor, close it, report the loss */
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL);
close(c->fd_unix);
return -ECONNRESET;
}
/**
* sock_handler() - Event handler for L4 sockets
* @c: Execution context
* @s: Socket associated to event
* @ref: epoll reference
* @events: epoll events
* @now: Current timestamp
*/
static void sock_handler(struct ctx *c, int s, uint32_t events,
static void sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now)
{
socklen_t sl;
int proto;
debug("%s packet from socket %i", IP_PROTO_STR(ref.proto), ref.s);
sl = sizeof(proto);
if ( FD_PROTO(s, udp) && !FD_PROTO(s, icmp) && !FD_PROTO(s, tcp))
proto = IPPROTO_UDP;
else if (FD_PROTO(s, tcp) && !FD_PROTO(s, icmp) && !FD_PROTO(s, udp))
proto = IPPROTO_TCP;
else if (FD_PROTO(s, icmp) && !FD_PROTO(s, udp) && !FD_PROTO(s, tcp))
proto = IPPROTO_ICMP; /* Fits ICMPv6 below, too */
else if (getsockopt(s, SOL_SOCKET, SO_PROTOCOL, &proto, &sl))
proto = -1;
if (proto == -1) {
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
close(s);
return;
}
debug("%s (%i): packet from socket %i", IP_PROTO_STR(proto), proto, s);
if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
icmp_sock_handler(c, s, events, pkt_buf, now);
else if (proto == IPPROTO_TCP)
tcp_sock_handler( c, s, events, pkt_buf, now);
else if (proto == IPPROTO_UDP)
udp_sock_handler( c, s, events, pkt_buf, now);
if (ref.proto == IPPROTO_TCP)
tcp_sock_handler( c, ref, events, now);
else if (ref.proto == IPPROTO_UDP)
udp_sock_handler( c, ref, events, now);
else if (ref.proto == IPPROTO_ICMP || ref.proto == IPPROTO_ICMPV6)
icmp_sock_handler(c, ref, events, now);
}
/**
@ -739,39 +402,70 @@ static void timer_handler(struct ctx *c, struct timespec *now)
udp_timer(c, now);
c->udp.timer_run = *now;
}
if (timespec_diff_ms(now, &c->icmp.timer_run) >= ICMP_TIMER_INTERVAL) {
icmp_timer(c, now);
c->icmp.timer_run = *now;
}
}
/**
* usage() - Print usage and exit
* usage_passt() - Print usage for "passt" mode and exit
* @name: Executable name
*/
void usage(const char *name)
void usage_passt(const char *name)
{
fprintf(stderr, "Usage: %s\n", name);
exit(EXIT_FAILURE);
}
/**
 * usage_pasta() - Show command line synopsis for "pasta" mode, then terminate
 * @name:	Name the executable was invoked with
 *
 * The synopsis goes to standard error, and the process exits with
 * EXIT_FAILURE: this function doesn't return.
 */
void usage_pasta(const char *name)
{
	FILE *out = stderr;

	fprintf(out, "Usage: %s TARGET_PID\n", name);

	exit(EXIT_FAILURE);
}
/**
* main() - Entry point and main loop
* @argc: Argument count
* @argv: Interface names
* @argv: Target PID for pasta mode
*
* Return: 0 once interrupted, non-zero on failure
*/
int main(int argc, char **argv)
{
char buf6[INET6_ADDRSTRLEN], buf4[INET_ADDRSTRLEN], *log_name;
struct epoll_event events[EPOLL_EVENTS];
int nfds, i, fd_unix, sock_index;
char buf6[INET6_ADDRSTRLEN];
char buf4[INET_ADDRSTRLEN];
struct epoll_event ev = { 0 };
struct ctx c = { 0 };
struct rlimit limit;
struct timespec now;
int nfds, i;
if (strstr(argv[0], "pasta") || strstr(argv[0], "passt4netns")) {
if (argc != 2)
usage_pasta(argv[0]);
errno = 0;
c.pasta_pid = strtol(argv[1], NULL, 0);
if (c.pasta_pid < 0 || errno)
usage_pasta(argv[0]);
c.mode = MODE_PASTA;
log_name = "pasta";
} else {
if (argc != 1)
usage(argv[0]);
usage_passt(argv[0]);
c.mode = MODE_PASST;
log_name = "passt";
memset(&c.mac_guest, 0xff, sizeof(c.mac_guest));
}
if (clock_gettime(CLOCK_MONOTONIC, &now)) {
perror("clock_gettime");
@ -795,27 +489,22 @@ int main(int argc, char **argv)
}
#if DEBUG
openlog("passt", 0, LOG_DAEMON);
openlog(log_name, 0, LOG_DAEMON);
#else
openlog("passt", isatty(fileno(stdout)) ? 0 : LOG_PERROR, LOG_DAEMON);
openlog(log_name, isatty(fileno(stdout)) ? 0 : LOG_PERROR, LOG_DAEMON);
#endif
get_routes(&c);
get_addrs(&c);
get_dns(&c);
get_bound_ports(&c);
fd_unix = sock_unix(&sock_index);
if (icmp_sock_init(&c) || udp_sock_init(&c) || tcp_sock_init(&c))
if (udp_sock_init(&c) || tcp_sock_init(&c))
exit(EXIT_FAILURE);
if (c.v6)
dhcpv6_init(&c);
memset(&c.mac_guest, 0xff, sizeof(c.mac_guest));
pcap_init(sock_index);
if (c.v4) {
info("ARP:");
info(" address: %02x:%02x:%02x:%02x:%02x:%02x from %s",
@ -859,15 +548,7 @@ int main(int argc, char **argv)
}
}
listen:
listen(fd_unix, 0);
info("You can now start qrap:");
info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
info("or directly qemu, patched with:");
info(" qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch");
info("as follows:");
info(" kvm ... -net socket,connect=" UNIX_SOCK_PATH
" -net nic,model=virtio", sock_index);
tap_sock_init(&c);
#ifndef DEBUG
if (isatty(fileno(stdout)) && daemon(0, 0)) {
@ -876,12 +557,6 @@ listen:
}
#endif
c.fd_unix = accept(fd_unix, NULL, NULL);
ev.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = c.fd_unix;
epoll_ctl(c.epollfd, EPOLL_CTL_ADD, c.fd_unix, &ev);
loop:
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
if (nfds == -1 && errno != EINTR) {
@ -892,18 +567,12 @@ loop:
clock_gettime(CLOCK_MONOTONIC, &now);
for (i = 0; i < nfds; i++) {
if (events[i].data.fd == c.fd_unix) {
if (events[i].events & EPOLLRDHUP ||
events[i].events & EPOLLHUP ||
events[i].events & EPOLLERR ||
tap_handler(&c, &now)) {
close(c.fd_unix);
goto listen;
}
} else {
sock_handler(&c, events[i].data.fd, events[i].events,
&now);
}
union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
if (events[i].data.fd == c.fd_tap)
tap_handler(&c, events[i].events, &now);
else
sock_handler(&c, ref, events[i].events, &now);
}
timer_handler(&c, &now);

68
passt.h
View file

@ -15,27 +15,76 @@ struct tap_msg {
size_t l4_len;
};
#define SOCK_BUF_BYTES (ETH_MAX_MTU * 4)
union epoll_ref;
#include "icmp.h"
#include "tcp.h"
#include "udp.h"
/**
* union epoll_ref - Breakdown of reference for epoll socket bookkeeping
* @proto: IP protocol number, used by sock_handler() to select the TCP, UDP
*         or ICMP socket handler
* @s: Socket number (implies 2^24 limit on number of descriptors)
* @tcp: TCP-specific reference part
* @udp: UDP-specific reference part
* @icmp: ICMP-specific reference part
* @data: Data handled by protocol handlers, as a plain 32-bit value
*        overlapping the protocol-specific parts above
* @u64: Opaque reference for epoll_ctl() and epoll_wait(); the main loop
*       reinterprets the u64 event data as this union
*
* NOTE(review): the layout presumably relies on the protocol-specific unions
* being 32 bits wide, so that the whole structure fits in @u64 -- confirm
* against the definitions of union tcp_epoll_ref and union udp_epoll_ref.
*/
union epoll_ref {
struct {
uint32_t proto:8,
s:24;
union {
union tcp_epoll_ref tcp;
union udp_epoll_ref udp;
union icmp_epoll_ref icmp;
uint32_t data;
};
};
uint64_t u64;
};
#define TAP_BUF_BYTES (ETH_MAX_MTU * 3)
#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t))
#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1)
#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0)
extern char pkt_buf [PKT_BUF_BYTES];
#ifdef DEBUG
extern char *ip_proto_str[];
#define IP_PROTO_STR(n) \
(((n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ? ip_proto_str[(n)] : "?")
#endif
#include <resolv.h> /* For MAXNS below */
/**
* struct fqdn - Representation of fully-qualified domain name
* @n: Domain name string, held in a fixed buffer of NS_MAXDNAME bytes
*/
struct fqdn {
char n[NS_MAXDNAME];
};
#include <net/if.h>
/**
* enum passt_modes - Operation modes
* @MODE_PASST: qemu/UNIX domain socket mode
* @MODE_PASTA: Network namespace/tap device mode
*/
enum passt_modes {
MODE_PASST,
MODE_PASTA,
};
/**
* struct ctx - Execution context
* @epollfd: file descriptor for epoll instance
* @fd_unix: AF_UNIX socket for tap file descriptor
* @v4: Enable IPv4 transport
* @mode: Operation mode, qemu/UNIX domain socket or namespace/tap
* @pasta_pid: Target PID of namespace for pasta mode
* @epollfd: File descriptor for epoll instance
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
* @fd_tap: File descriptor for AF_UNIX socket or tuntap device
* @mac: Host MAC address
* @mac_guest: Guest MAC address
* @v4: Enable IPv4 transport
* @addr4: IPv4 address for external, routable interface
* @addr4_seen: Latest IPv4 address seen as source from tap
* @mask4: IPv4 netmask, network order
@ -49,10 +98,17 @@ struct fqdn {
* @gw6: Default IPv6 gateway
* @dns4: IPv4 DNS addresses, zero-terminated
* @ifn: Name of routable interface
* @tcp: Context for TCP protocol handler
* @udp: Context for UDP protocol handler
* @icmp: Context for ICMP protocol handler
*/
struct ctx {
enum passt_modes mode;
int pasta_pid;
int epollfd;
int fd_unix;
int fd_tap_listen;
int fd_tap;
unsigned char mac[ETH_ALEN];
unsigned char mac_guest[ETH_ALEN];
@ -74,7 +130,7 @@ struct ctx {
char ifn[IF_NAMESIZE];
struct icmp_ctx icmp;
struct tcp_ctx tcp;
struct udp_ctx udp;
struct icmp_ctx icmp;
};

34
pcap.c
View file

@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* pcap.c - Packet capture for PASST
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* pcap.c - Packet capture for PASST/PASTA
*
* Copyright (c) 2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
#include <stdio.h>
@ -22,18 +25,19 @@
#include <unistd.h>
#include <net/if.h>
#include "passt.h"
#include "util.h"
#include "passt.h"
#ifdef DEBUG
#define PCAP_PREFIX "/tmp/passt_"
#define PCAP_PREFIX_PASTA "/tmp/pasta_"
#define PCAP_ISO8601_FORMAT "%FT%H:%M:%SZ"
#define PCAP_ISO8601_STR "YYYY-MM-ddTHH:mm:ssZ"
#define PCAP_VERSION_MINOR 4
static int pcap_fd = 1;
static int pcap_fd = -1;
/* See pcap.h from libpcap, or pcap-savefile(5) */
static struct {
@ -64,6 +68,11 @@ struct pcap_pkthdr {
uint32_t len;
};
/**
* pcap() - Capture a single frame to pcap file
* @pkt: Pointer to data buffer, including L2 headers
* @len: L2 packet length
*/
void pcap(char *pkt, size_t len)
{
struct pcap_pkthdr h;
@ -81,12 +90,23 @@ void pcap(char *pkt, size_t len)
write(pcap_fd, pkt, len);
}
void pcap_init(int sock_index)
/**
* pcap_init() - Initialise pcap file
* @c: Execution context
* @index: pcap name index: passt instance number or pasta target pid
*/
void pcap_init(struct ctx *c, int index)
{
char name[] = PCAP_PREFIX PCAP_ISO8601_STR STR(UNIX_SOCK_MAX) ".pcap";
char name[] = PCAP_PREFIX PCAP_ISO8601_STR STR(UINT_MAX) ".pcap";
struct timeval tv;
struct tm *tm;
if (pcap_fd != -1)
close(pcap_fd);
if (c->mode == MODE_PASTA)
memcpy(name, PCAP_PREFIX_PASTA, sizeof(PCAP_PREFIX_PASTA));
gettimeofday(&tv, NULL);
tm = localtime(&tv.tv_sec);
strftime(name + strlen(PCAP_PREFIX), sizeof(PCAP_ISO8601_STR) - 1,
@ -94,7 +114,7 @@ void pcap_init(int sock_index)
snprintf(name + strlen(PCAP_PREFIX) + strlen(PCAP_ISO8601_STR),
sizeof(name) - strlen(PCAP_PREFIX) - strlen(PCAP_ISO8601_STR),
"_%i.pcap", sock_index);
"_%i.pcap", index);
pcap_fd = open(name, O_WRONLY | O_CREAT | O_APPEND | O_DSYNC,
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);

2
pcap.h
View file

@ -1,2 +1,2 @@
void pcap(char *pkt, size_t len);
void pcap_init(int sock_index);
void pcap_init(struct ctx *c, int sock_index);

View file

@ -1,6 +1,10 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* siphash.c - SipHash routines
*

590
tap.c
View file

@ -1,21 +1,39 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* tap.c - Functions to communicate with guest-facing tap interface
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* tap.c - Functions to communicate with guest- or namespace-facing interface
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <string.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/un.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
@ -23,26 +41,46 @@
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include "passt.h"
#include "util.h"
#include "passt.h"
#include "arp.h"
#include "dhcp.h"
#include "ndp.h"
#include "dhcpv6.h"
#include "pcap.h"
/**
* tap_send() - Send frame and qemu socket header with indication of length
* @fd: tap file descriptor
* tap_send() - Send frame, with qemu socket header if needed
* @c: Execution context
* @data: Packet buffer
* @len: Total L2 packet length
* @flags: Flags for send(), if any
* @vnet_pre: Buffer has four-byte headroom
*
* Return: return code from send()
* Return: return code from send() or write()
*/
int tap_send(int fd, void *data, size_t len, int flags)
int tap_send(struct ctx *c, void *data, size_t len, int vnet_pre)
{
uint32_t vnet_len = htonl(len);
send(fd, &vnet_len, 4, MSG_DONTWAIT | MSG_NOSIGNAL);
if (vnet_pre)
pcap((char *)data + 4, len);
else
pcap(data, len);
return send(fd, data, len, flags | MSG_DONTWAIT | MSG_NOSIGNAL);
if (c->mode == MODE_PASST) {
int flags = MSG_NOSIGNAL | MSG_DONTWAIT;
if (vnet_pre) {
*((uint32_t *)data) = htonl(len);
len += 4;
} else {
uint32_t vnet_len = htonl(len);
send(c->fd_tap, &vnet_len, 4, flags);
}
return send(c->fd_tap, data, len, flags);
}
return write(c->fd_tap, (char *)data + (vnet_pre ? 4 : 0), len);
}
/**
@ -56,7 +94,8 @@ int tap_send(int fd, void *data, size_t len, int flags)
void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
char *in, size_t len)
{
char pkt[USHRT_MAX];
char buf[USHRT_MAX];
char *pkt = buf + 4;
struct ethhdr *eh;
eh = (struct ethhdr *)pkt;
@ -95,7 +134,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
uh->check = 0;
}
tap_send(c->fd_unix, pkt, len + sizeof(*iph) + sizeof(*eh), 0);
tap_send(c, buf, len + sizeof(*iph) + sizeof(*eh), 1);
} else {
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
char *data = (char *)(ip6h + 1);
@ -137,6 +176,527 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
ip6h->nexthdr = proto;
ip6h->hop_limit = 255;
tap_send(c->fd_unix, pkt, len + sizeof(*ip6h) + sizeof(*eh), 0);
tap_send(c, buf, len + sizeof(*ip6h) + sizeof(*eh), 1);
}
}
/**
* tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
* @c: Execution context
* @msg: Array of messages with the same L3 protocol
* @count: Count of messages with the same L3 protocol
* @now: Current timestamp
*
* The first message is offered to the ARP and DHCP handlers. Otherwise,
* leading messages sharing the same ToS, fragment offset, protocol, addresses
* and ports are counted and passed as a single batch to the TCP or UDP
* handler. ICMP messages are passed one at a time.
*
* Return: count of packets consumed by handlers
*/
static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count,
struct timespec *now)
{
char buf_s[INET_ADDRSTRLEN] __attribute((__unused__));
char buf_d[INET_ADDRSTRLEN] __attribute((__unused__));
struct ethhdr *eh = (struct ethhdr *)msg[0].start;
struct iphdr *iph, *prev_iph = NULL;
struct udphdr *uh, *prev_uh = NULL;
size_t len = msg[0].len;
unsigned int i;
char *l4h;
/* IPv4 disabled: report the whole batch as consumed, without handling it */
if (!c->v4)
return count;
if (len < sizeof(*eh) + sizeof(*iph))
return 1;
/* Only the first message is checked against ARP and DHCP handlers */
if (arp(c, eh, len) || dhcp(c, eh, len))
return 1;
/* Find out how many leading messages belong to the same flow: on exit, i is
* the number of messages that can be handed over together
*/
for (i = 0; i < count; i++) {
len = msg[i].len;
if (len < sizeof(*eh) + sizeof(*iph))
return 1;
eh = (struct ethhdr *)msg[i].start;
iph = (struct iphdr *)(eh + 1);
/* NOTE(review): iph->ihl comes from the packet and is not checked against
* len here, l4h might point past the received data -- confirm
*/
l4h = (char *)iph + iph->ihl * 4;
c->addr4_seen = iph->saddr;
msg[i].l4h = l4h;
msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
if (iph->protocol != IPPROTO_TCP &&
iph->protocol != IPPROTO_UDP)
break;
/* NOTE(review): len is the L2 frame length, already checked above to cover
* Ethernet and IPv4 headers, so this looks like it can never be true;
* presumably msg[i].l4_len was meant -- confirm
*/
if (len < sizeof(*uh))
break;
/* Ports are read through struct udphdr for both TCP and UDP */
uh = (struct udphdr *)l4h;
if (!i) {
prev_iph = iph;
prev_uh = uh;
continue;
}
if (iph->tos != prev_iph->tos ||
iph->frag_off != prev_iph->frag_off ||
iph->protocol != prev_iph->protocol ||
iph->saddr != prev_iph->saddr ||
iph->daddr != prev_iph->daddr ||
uh->source != prev_uh->source ||
uh->dest != prev_uh->dest)
break;
prev_iph = iph;
prev_uh = uh;
}
/* Handlers are called with headers and addresses of the first message */
eh = (struct ethhdr *)msg[0].start;
iph = (struct iphdr *)(eh + 1);
if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP ||
iph->protocol == IPPROTO_SCTP) {
uh = (struct udphdr *)msg[0].l4h;
/* NOTE(review): same remark as in the loop, msg[0].len is the L2 length */
if (msg[0].len < sizeof(*uh))
return 1;
debug("%s (%i) from tap: %s:%i -> %s:%i (%i packet%s)",
IP_PROTO_STR(iph->protocol), iph->protocol,
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
ntohs(uh->source),
inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)),
ntohs(uh->dest),
i, i > 1 ? "s" : "");
} else if (iph->protocol == IPPROTO_ICMP) {
debug("icmp from tap: %s -> %s",
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)));
}
/* TCP and UDP handlers get the batch of i messages and report how many they
* consumed, anything else counts as a single consumed message
*/
if (iph->protocol == IPPROTO_TCP)
return tcp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
if (iph->protocol == IPPROTO_UDP)
return udp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
if (iph->protocol == IPPROTO_ICMP)
icmp_tap_handler(c, AF_INET, &iph->daddr, msg, 1, now);
return 1;
}
/**
 * tap6_handler() - IPv6 packet handler for tap file descriptor
 * @c:		Execution context
 * @msg:	Array of messages with the same L3 protocol
 * @count:	Count of messages with the same L3 protocol
 * @now:	Current timestamp
 *
 * The first message is offered to the NDP and DHCPv6 handlers. Otherwise,
 * leading messages sharing the same L4 protocol, addresses and ports are
 * counted and passed as a single batch to the TCP or UDP handler. ICMPv6
 * messages are passed one at a time.
 *
 * Return: count of packets consumed by handlers
 */
static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
			struct timespec *now)
{
	char buf_s[INET6_ADDRSTRLEN] __attribute((__unused__));
	char buf_d[INET6_ADDRSTRLEN] __attribute((__unused__));
	struct ethhdr *eh = (struct ethhdr *)msg[0].start;
	/* The previous header needs to survive across loop iterations: keep it
	 * at function scope, a loop-local variable is indeterminate on each
	 * new iteration.
	 */
	struct ipv6hdr *ip6h, *p_ip6h = NULL;
	struct udphdr *uh, *prev_uh = NULL;
	uint8_t proto = 0, prev_proto = 0;
	size_t len = msg[0].len;
	unsigned int i;
	char *l4h;

	/* IPv6 disabled: report the whole batch as consumed */
	if (!c->v6)
		return count;

	if (len < sizeof(*eh) + sizeof(*ip6h))
		return 1;

	/* Only the first message is checked against NDP and DHCPv6 handlers */
	if (ndp(c, eh, len) || dhcpv6(c, eh, len))
		return 1;

	/* Count leading messages of the same flow: on exit, i is the number of
	 * messages that can be handed over together
	 */
	for (i = 0; i < count; i++) {
		len = msg[i].len;
		if (len < sizeof(*eh) + sizeof(*ip6h))
			return 1;

		eh = (struct ethhdr *)msg[i].start;
		ip6h = (struct ipv6hdr *)(eh + 1);
		l4h = ipv6_l4hdr(ip6h, &proto);

		msg[i].l4h = l4h;
		msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);

		/* Remember the source address seen from tap before the header
		 * is rewritten with our own address
		 */
		if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
			c->addr6_ll_seen = ip6h->saddr;
		else
			c->addr6_seen = ip6h->saddr;
		ip6h->saddr = c->addr6;

		if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
			break;

		/* NOTE(review): len is the L2 frame length here, presumably
		 * msg[i].l4_len was meant -- confirm
		 */
		if (len < sizeof(*uh))
			break;

		/* Ports are read through struct udphdr for TCP as well */
		uh = (struct udphdr *)l4h;

		if (!i) {
			p_ip6h = ip6h;
			prev_proto = proto;
			prev_uh = uh;
			continue;
		}

		if (proto != prev_proto ||
		    memcmp(&ip6h->saddr, &p_ip6h->saddr, sizeof(ip6h->saddr)) ||
		    memcmp(&ip6h->daddr, &p_ip6h->daddr, sizeof(ip6h->daddr)) ||
		    uh->source != prev_uh->source ||
		    uh->dest != prev_uh->dest)
			break;

		p_ip6h = ip6h;
		prev_proto = proto;
		prev_uh = uh;
	}

	/* If a batch was started, dispatch on its protocol, not on the one of
	 * the message that ended it
	 */
	if (prev_proto)
		proto = prev_proto;

	eh = (struct ethhdr *)msg[0].start;
	ip6h = (struct ipv6hdr *)(eh + 1);

	if (proto == IPPROTO_ICMPV6) {
		debug("icmpv6 from tap: %s ->\n\t%s",
		      inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
		      inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)));
	} else if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
		   proto == IPPROTO_SCTP) {
		uh = (struct udphdr *)msg[0].l4h;

		if (msg[0].len < sizeof(*uh))
			return 1;

		debug("%s (%i) from tap: [%s]:%i\n\t-> [%s]:%i (%i packet%s)",
		      IP_PROTO_STR(proto), proto,
		      inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
		      ntohs(uh->source),
		      inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)),
		      ntohs(uh->dest),
		      i, i > 1 ? "s" : "");
	}

	if (proto == IPPROTO_TCP)
		return tcp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);

	if (proto == IPPROTO_UDP)
		return udp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);

	if (proto == IPPROTO_ICMPV6)
		icmp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, 1, now);

	return 1;
}
/**
 * tap_handler_passt() - Packet handler for AF_UNIX file descriptor
 * @c:		Execution context
 * @now:	Current timestamp
 *
 * Read as many frames as available from the socket into pkt_buf, each one
 * prefixed by a 32-bit length in network order, then dispatch runs of frames
 * with the same L3 protocol to the IPv4/ARP and IPv6 handlers.
 *
 * Return: -ECONNRESET on receive error, 0 otherwise
 */
static int tap_handler_passt(struct ctx *c, struct timespec *now)
{
	int msg_count = 0, same, i = 0;
	struct tap_msg msg[TAP_MSGS];
	struct ethhdr *eh;
	char *p = pkt_buf;
	ssize_t n, rem;

	n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT);
	if (n < 0) {
		if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
			return 0;

		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
		close(c->fd_tap);

		return -ECONNRESET;
	}

	while (n > (ssize_t)sizeof(uint32_t)) {
		uint32_t vnet_len;
		ssize_t len;

		/* The length prefix might not be aligned in the buffer */
		memcpy(&vnet_len, p, sizeof(vnet_len));
		len = ntohl(vnet_len);

		p += sizeof(uint32_t);
		n -= sizeof(uint32_t);

		/* The length comes from the peer: frames shorter than an
		 * Ethernet header are useless, and anything bigger than
		 * ETH_MAX_MTU wouldn't fit in the room left in pkt_buf after
		 * TAP_BUF_FILL bytes, so the recv() below would overflow it.
		 */
		if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU)
			return 0;

		/* At most one packet might not fit in a single read */
		if (len > n) {
			rem = recv(c->fd_tap, p + n, len - n, MSG_DONTWAIT);
			if (rem < 0 || n + rem != len)
				return 0;

			n = len;
		}

		pcap(p, len);

		msg[msg_count].start = p;
		msg[msg_count++].len = len;

		n -= len;
		p += len;
	}

	while (i < msg_count) {
		eh = (struct ethhdr *)msg[i].start;

		memcpy(c->mac_guest, eh->h_source, ETH_ALEN);

		switch (ntohs(eh->h_proto)) {
		case ETH_P_ARP:
			tap4_handler(c, msg + i, 1, now);
			i++;
			break;
		case ETH_P_IP:
			/* Count following frames carrying IPv4 as well */
			for (same = 1; i + same < msg_count &&
				       same < UIO_MAXIOV; same++) {
				struct tap_msg *next = &msg[i + same];

				eh = (struct ethhdr *)next->start;
				if (ntohs(eh->h_proto) != ETH_P_IP)
					break;
			}

			i += tap4_handler(c, msg + i, same, now);
			break;
		case ETH_P_IPV6:
			/* Count following frames carrying IPv6 as well */
			for (same = 1; i + same < msg_count &&
				       same < UIO_MAXIOV; same++) {
				struct tap_msg *next = &msg[i + same];

				eh = (struct ethhdr *)next->start;
				if (ntohs(eh->h_proto) != ETH_P_IPV6)
					break;
			}

			i += tap6_handler(c, msg + i, same, now);
			break;
		default:
			i++;
			break;
		}
	}

	return 0;
}
/**
 * tap_handler_pasta() - Packet handler for tuntap file descriptor
 * @c:		Execution context
 * @now:	Current timestamp
 *
 * Read frames from the tuntap file descriptor, one per read(), until no more
 * data is available, and pass each of them to the IPv4/ARP or IPv6 handler
 * depending on its Ethernet protocol type.
 *
 * Return: -ECONNRESET on receive error, 0 otherwise
 */
static int tap_handler_pasta(struct ctx *c, struct timespec *now)
{
	struct tap_msg msg = { .start = pkt_buf };
	ssize_t n;

	while ((n = read(c->fd_tap, pkt_buf, TAP_BUF_BYTES)) > 0) {
		struct ethhdr *eh = (struct ethhdr *)pkt_buf;

		msg.len = n;

		pcap(msg.start, msg.len);

		/* Without a complete Ethernet header, source address and
		 * protocol type would be read from stale buffer contents
		 */
		if (n < (ssize_t)sizeof(*eh))
			continue;

		memcpy(c->mac_guest, eh->h_source, ETH_ALEN);

		switch (ntohs(eh->h_proto)) {
		case ETH_P_ARP:
		case ETH_P_IP:
			tap4_handler(c, &msg, 1, now);
			break;
		case ETH_P_IPV6:
			tap6_handler(c, &msg, 1, now);
			break;
		default:
			break;
		}
	}

	/* End of file, or nothing more to read for now: not an error */
	if (!n || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
		return 0;

	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
	close(c->fd_tap);

	return -ECONNRESET;
}
/**
* tap_sock_init_unix() - Create and bind AF_UNIX socket, wait for connection
* @c: Execution context
*
* Pick the first socket path, with index from 1 to UNIX_SOCK_MAX - 1, that can
* be bound, start listening on it and block until a peer connects. The
* listening socket is stored in @c->fd_tap_listen, the accepted one in
* @c->fd_tap. The process exits if no socket can be created or bound.
*/
static void tap_sock_init_unix(struct ctx *c)
{
int fd = socket(AF_UNIX, SOCK_STREAM, 0), ex;
struct sockaddr_un addr = {
.sun_family = AF_UNIX,
};
int i, ret;
/* Drop the listening socket from a previous call, if any */
if (c->fd_tap_listen)
close(c->fd_tap_listen);
if (fd < 0) {
perror("UNIX socket");
exit(EXIT_FAILURE);
}
c->fd_tap_listen = fd;
for (i = 1; i < UNIX_SOCK_MAX; i++) {
snprintf(addr.sun_path, UNIX_PATH_MAX, UNIX_SOCK_PATH, i);
/* Probe the path with a throw-away socket: if connect() succeeds, or fails
* with anything other than ENOENT or ECONNREFUSED, skip this index --
* presumably the path is in use by another instance
* NOTE(review): the return value of this socket() call is not checked
*/
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
if (!ret || (errno != ENOENT && errno != ECONNREFUSED)) {
close(ex);
continue;
}
close(ex);
/* Nobody answered on this path: remove it, if it exists, and try to bind */
unlink(addr.sun_path);
if (!bind(fd, (const struct sockaddr *)&addr, sizeof(addr)))
break;
}
/* Loop ran to completion without a successful bind() */
if (i == UNIX_SOCK_MAX) {
perror("UNIX socket bind");
exit(EXIT_FAILURE);
}
info("UNIX domain socket bound at %s\n", addr.sun_path);
/* Socket path is made readable and writable by user, group and others */
chmod(addr.sun_path,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
/* The socket index is also used as index for the pcap file name */
pcap_init(c, i);
listen(fd, 0);
info("You can now start qrap:");
info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
info("or directly qemu, patched with:");
info(" qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch");
info("as follows:");
info(" kvm ... -net socket,connect=" UNIX_SOCK_PATH
" -net nic,model=virtio", i);
/* Blocks until a peer connects
* NOTE(review): return values of listen() and accept() are not checked
*/
c->fd_tap = accept(fd, NULL, NULL);
}
static int tun_ns_fd = -1;
/**
 * tap_sock_init_tun_ns() - Open tuntap device from inside target namespace
 * @target_pid:	Pointer to int, PID of the process whose namespaces are joined
 *
 * Meant to be used as clone() entry point: the outcome is reported via
 * tun_ns_fd, set to the non-blocking descriptor for /dev/net/tun on success,
 * and to -1 on failure.
 *
 * Return: 0, always
 */
static int tap_sock_init_tun_ns(void *target_pid)
{
	int pid = *(int *)target_pid;
	int tun;

	if (ns_enter(pid) || (tun = open("/dev/net/tun", O_RDWR)) < 0) {
		tun_ns_fd = -1;
		return 0;
	}

	fcntl(tun, F_SETFL, O_NONBLOCK);
	tun_ns_fd = tun;

	return 0;
}
/**
 * tap_sock_init_tun() - Set up tuntap file descriptor
 * @c:		Execution context
 *
 * Open the tuntap device from a cloned child that joins the namespaces of
 * @c->pasta_pid, then configure it as "pasta0" tap interface and store the
 * descriptor in @c->fd_tap. The process exits if any step fails.
 */
static void tap_sock_init_tun(struct ctx *c)
{
	char child_stack[NS_FN_STACK_SIZE];
	/* The child is given the middle of the buffer as stack pointer */
	void *stack_mid = child_stack + sizeof(child_stack) / 2;
	struct ifreq req = {
		.ifr_name = "pasta0",
		.ifr_flags = IFF_TAP | IFF_NO_PI,
	};

	/* The child shares memory and file descriptors with us, and reports
	 * the opened descriptor, or -1, in tun_ns_fd
	 */
	clone(tap_sock_init_tun_ns, stack_mid,
	      CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
	      (void *)&c->pasta_pid);

	if (tun_ns_fd == -1) {
		err("Failed to open tun socket in namespace");
		exit(EXIT_FAILURE);
	}

	if (ioctl(tun_ns_fd, TUNSETIFF, &req)) {
		perror("TUNSETIFF ioctl");
		exit(EXIT_FAILURE);
	}

	/* The target PID is also used as index for the pcap file name */
	pcap_init(c, c->pasta_pid);

	c->fd_tap = tun_ns_fd;
}
/**
 * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor
 * @c:		Execution context
 *
 * Tear down the current tap descriptor, if set, create a new one depending on
 * the operation mode, and add it to the epoll list, referenced by descriptor
 * number.
 */
void tap_sock_init(struct ctx *c)
{
	struct epoll_event ev = { 0 };

	if (c->fd_tap) {
		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
		close(c->fd_tap);
	}

	switch (c->mode) {
	case MODE_PASST:
		tap_sock_init_unix(c);
		break;
	default:
		tap_sock_init_tun(c);
		break;
	}

	ev.data.fd = c->fd_tap;
	ev.events = EPOLLIN | EPOLLRDHUP;
	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
}
/**
 * tap_handler() - Packet handler for AF_UNIX or tuntap file descriptor
 * @c:		Execution context
 * @events:	epoll events
 * @now:	Current timestamp
 *
 * On hangup or error events, or if the mode-specific handler reports a
 * failure, the tap descriptor is set up again from scratch.
 */
void tap_handler(struct ctx *c, uint32_t events, struct timespec *now)
{
	int failed;

	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
		failed = 1;
	else if (c->mode == MODE_PASST)
		failed = tap_handler_passt(c, now);
	else if (c->mode == MODE_PASTA)
		failed = tap_handler_pasta(c, now);
	else
		failed = 0;

	if (failed)
		tap_sock_init(c);
}

4
tap.h
View file

@ -1,3 +1,5 @@
void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
char *in, size_t len);
int tap_send(int fd, void *data, size_t len, int flags);
int tap_send(struct ctx *c, void *data, size_t len, int vnet_pre);
void tap_handler(struct ctx *c, uint32_t events, struct timespec *now);
void tap_sock_init(struct ctx *c);

1545
tcp.c

File diff suppressed because it is too large Load diff

45
tcp.h
View file

@ -3,34 +3,53 @@
#define TCP_TIMER_INTERVAL 20 /* ms */
#define TCP_MAX_CONNS (128 * 1024)
#define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2)
struct ctx;
void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
int tcp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now);
int tcp_sock_init(struct ctx *c);
void tcp_timer(struct ctx *c, struct timespec *ts);
/**
* union tcp_epoll_ref - epoll reference portion for TCP connections
* @listen: Set if this file descriptor is a listening socket
* @splice: Set if descriptor is associated to a spliced connection
* @v6: Set for IPv6 sockets or connections
* @index: Index of connection in table, or port for bound sockets
* @u32: Opaque u32 value of reference
*
* The flags and @index are bit-fields of a single uint32_t, with @u32
* overlaying them. @index is 20 bits wide, so it can represent values up to
* 2^20 - 1.
*/
union tcp_epoll_ref {
struct {
uint32_t listen:1,
splice:1,
v6:1,
index:20;
};
uint32_t u32;
};
/**
 * struct tcp_ctx - Execution context for TCP routines
 * @hash_secret:	128-bit secret for hash functions, ISN and hash table
 * @fd_min:		Lowest file descriptor number for TCP ever used
 * @fd_max:		Highest file descriptor number for TCP ever used
 * @fd_listen_min:	Lowest file descriptor number for listening sockets
 * @fd_listen_max:	Highest file descriptor number for listening sockets
 * @fd_conn_min:	Lowest file descriptor number for connected sockets
 * @fd_conn_max:	Highest file descriptor number for connected sockets
 * @tap_conn_count:	Count of tap connections in connection table
 * @splice_conn_count:	Count of spliced connections in connection table
 * @port_to_tap:	Ports bound host/init-side, packets to guest/tap
 * @port_to_init:	Ports bound namespace-side, spliced to init
 * @port_to_ns:		Ports bound init-side, spliced to namespace
 * @timer_run:		Timestamp of most recent timer run
 *
 * Port bitmaps hold one bit per port number, from 0 to USHRT_MAX included:
 * USHRT_MAX / 8 bytes alone would leave out the bits for the eight highest
 * port numbers, hence the additional byte.
 */
struct tcp_ctx {
	uint64_t hash_secret[2];
	int fd_min;
	int fd_max;
	int fd_listen_min;
	int fd_listen_max;
	int fd_conn_min;
	int fd_conn_max;
	int tap_conn_count;
	int splice_conn_count;
	uint8_t port_to_tap	[USHRT_MAX / 8 + 1];
	uint8_t port_to_init	[USHRT_MAX / 8 + 1];
	uint8_t port_to_ns	[USHRT_MAX / 8 + 1];
	struct timespec timer_run;
};

830
udp.c

File diff suppressed because it is too large Load diff

38
udp.h
View file

@ -3,24 +3,48 @@
#define UDP_TIMER_INTERVAL 1000 /* ms */
void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
int udp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now);
int udp_sock_init(struct ctx *c);
void udp_timer(struct ctx *c, struct timespec *ts);
/**
* union udp_epoll_ref - epoll reference portion for UDP sockets
* @bound: Set if this file descriptor is a bound socket
* @splice: Set if descriptor is associated to "spliced" connection, to one
* of the UDP_TO_NS, UDP_TO_INIT, UDP_BACK_TO_NS, UDP_BACK_TO_INIT
* values defined below (zero presumably means not spliced --
* TODO confirm against udp.c)
* @v6: Set for IPv6 sockets or connections
* @port: Source port for connected sockets, bound port otherwise
* @u32: Opaque u32 value of reference
*/
union udp_epoll_ref {
struct {
uint32_t bound:1,
splice:3,
/* Values for @splice, which is three bits wide */
#define UDP_TO_NS 1
#define UDP_TO_INIT 2
#define UDP_BACK_TO_NS 3
#define UDP_BACK_TO_INIT 4
v6:1,
port:16;
};
uint32_t u32;
};
/**
 * struct udp_ctx - Execution context for UDP
 * @fd_min:		Lowest file descriptor number for UDP ever used
 * @fd_max:		Highest file descriptor number for UDP ever used
 * @fd_in_seq:		1 if all socket numbers are in sequence, 0 otherwise
 * @port_to_tap:	Ports bound host/init-side, packets to guest/tap
 * @port_to_init:	Ports bound namespace-side, spliced to init
 * @port_to_ns:		Ports bound init-side, spliced to namespace
 * @timer_run:		Timestamp of most recent timer run
 *
 * Port bitmaps hold one bit per port number, from 0 to USHRT_MAX included:
 * USHRT_MAX / 8 bytes alone would leave out the bits for the eight highest
 * port numbers, hence the additional byte.
 */
struct udp_ctx {
	int fd_min;
	int fd_max;
	int fd_in_seq;
	uint8_t port_to_tap	[USHRT_MAX / 8 + 1];
	uint8_t port_to_init	[USHRT_MAX / 8 + 1];
	uint8_t port_to_ns	[USHRT_MAX / 8 + 1];
	struct timespec timer_run;
};

158
util.c
View file

@ -1,14 +1,19 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* util.c - Convenience helpers
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
@ -20,13 +25,16 @@
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <syslog.h>
#include <stdarg.h>
#include <string.h>
#include <time.h>
#include "passt.h"
#include "util.h"
#include "passt.h"
#ifdef DEBUG
#define logfn(name, level) \
@ -183,74 +191,73 @@ char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto)
* sock_l4() - Create and bind socket for given L4, add to epoll list
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @proto: Protocol number, host order
* @proto: Protocol number
* @port: Port, host order
* @lo: Bind to loopback address only, if set
* @data: epoll reference portion for protocol handlers
*
* Return: newly created socket, -1 on error
*/
int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port)
int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo,
uint32_t data)
{
union epoll_ref ref = { .proto = proto, .data = data };
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
.sin_port = htons(port),
.sin_addr = { .s_addr = INADDR_ANY },
};
struct sockaddr_in6 addr6 = {
.sin6_family = AF_INET6,
.sin6_port = htons(port),
.sin6_addr = IN6ADDR_ANY_INIT,
};
struct epoll_event ev = { 0 };
const struct sockaddr *sa;
struct epoll_event ev;
int fd, sl, one = 1;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6)
return -1; /* Not implemented. */
fd = socket(af, proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM, proto);
if (proto == IPPROTO_TCP)
fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto);
else
fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto);
if (fd < 0) {
perror("L4 socket");
return -1;
}
ref.s = fd;
if (af == AF_INET) {
if (lo)
addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
else
addr4.sin_addr.s_addr = htonl(INADDR_ANY);
sa = (const struct sockaddr *)&addr4;
sl = sizeof(addr4);
} else {
if (lo)
addr6.sin6_addr = in6addr_loopback;
else
addr6.sin6_addr = in6addr_any;
sa = (const struct sockaddr *)&addr6;
sl = sizeof(addr6);
setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one));
}
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMP, icmp, fd);
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMPV6, icmp, fd);
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_TCP, tcp, fd);
CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_UDP, udp, fd);
if (proto == IPPROTO_UDP && PORT_IS_EPHEMERAL(port))
goto epoll_add;
if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
goto epoll_add;
setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
if (bind(fd, sa, sl) < 0) {
/* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports,
* this is fine. If this isn't the socket with the lowest number
* for a given protocol, leave it open, to avoid unnecessary
* holes in the numbering.
* this is fine.
*/
if ((proto == IPPROTO_TCP && fd == c->tcp.fd_min) ||
(proto == IPPROTO_UDP && fd == c->udp.fd_min) ||
((proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) &&
fd == c->icmp.fd_min)) {
close(fd);
return 0;
}
return fd;
}
if (proto == IPPROTO_TCP && listen(fd, 128) < 0) {
perror("TCP socket listen");
@ -258,9 +265,8 @@ int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port)
return -1;
}
epoll_add:
ev.events = EPOLLIN;
ev.data.fd = fd;
ev.data.u64 = ref.u64;
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
perror("L4 epoll_ctl");
return -1;
@ -286,3 +292,97 @@ int timespec_diff_ms(struct timespec *a, struct timespec *b)
return (a->tv_nsec - b->tv_nsec) / 1000000 +
(a->tv_sec - b->tv_sec) * 1000;
}
/**
 * bitmap_set() - Set single bit in bitmap
 * @map:	Pointer to bitmap, bytes indexed by bit number divided by eight
 * @bit:	Number of the bit to set, counting from the lowest bit of @map[0]
 */
void bitmap_set(uint8_t *map, int bit)
{
	uint8_t *byte = map + bit / 8;

	*byte |= 1 << (bit % 8);
}
/**
 * bitmap_clear() - Clear single bit in bitmap
 * @map:	Pointer to bitmap, bytes indexed by bit number divided by eight
 * @bit:	Number of the bit to clear, counting from the lowest bit of @map[0]
 */
void bitmap_clear(uint8_t *map, int bit)
{
	uint8_t *byte = map + bit / 8;

	*byte &= ~(1 << (bit % 8));
}
/**
 * bitmap_isset() - Check for set bit in bitmap
 * @map:	Pointer to bitmap, bytes indexed by bit number divided by eight
 * @bit:	Number of the bit to check
 *
 * Return: value of the byte holding the bit, masked by that bit: non-zero if
 *	   the bit is set, zero if it's not
 */
int bitmap_isset(uint8_t *map, int bit)
{
	int mask = 1 << bit % 8;
	uint8_t byte = map[bit / 8];

	return byte & mask;
}
/**
 * procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs
 * @name:	Corresponding name of file under /proc/net/
 * @map:	Bitmap where numbers of ports in listening state will be set
 *
 * Lines that can't be parsed, that report a state other than the one expected
 * for the protocol in @name, or a port number that doesn't fit 16 bits, are
 * skipped. Nothing is done if the file can't be opened.
 */
void procfs_scan_listen(char *name, uint8_t *map)
{
	char line[200], path[PATH_MAX];
	int is_tcp, is_udp;
	unsigned long port;
	unsigned int state;
	FILE *fp;

	snprintf(path, PATH_MAX, "/proc/net/%s", name);
	if (!(fp = fopen(path, "r")))
		return;

	/* The protocol doesn't change from line to line */
	is_tcp = strstr(name, "tcp") != NULL;
	is_udp = strstr(name, "udp") != NULL;

	/* First line is the header: no header, no entries */
	if (!fgets(line, sizeof(line), fp))
		goto out;

	while (fgets(line, sizeof(line), fp)) {
		if (sscanf(line, "%*u: %*x:%lx %*x:%*x %x", &port, &state) != 2)
			continue;

		/* See enum in kernel's include/net/tcp_states.h */
		if ((is_tcp && state != 0x0a) || (is_udp && state != 0x07))
			continue;

		/* Don't let a bogus value index past the end of the bitmap */
		if (port > USHRT_MAX)
			continue;

		bitmap_set(map, port);
	}

out:
	fclose(fp);
}
/**
 * ns_join_path() - Join the namespace referred to by a given path
 * @path:	Path of namespace file
 *
 * Return: 0 on success, -1 on failure
 */
static int ns_join_path(const char *path)
{
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;

	if (setns(fd, 0)) {
		close(fd);
		return -1;
	}

	close(fd);
	return 0;
}

/**
 * ns_enter() - Enter user and network namespaces of process with given PID
 * @target_pid:	Process PID
 *
 * The user namespace is joined first, then the network namespace: if the
 * first step fails, the second one is not attempted.
 *
 * Return: 0 on success, -1 on failure
 */
int ns_enter(int target_pid)
{
	char path[PATH_MAX];

	snprintf(path, PATH_MAX, "/proc/%i/ns/user", target_pid);
	if (ns_join_path(path))
		return -1;

	snprintf(path, PATH_MAX, "/proc/%i/ns/net", target_pid);
	if (ns_join_path(path))
		return -1;

	return 0;
}

25
util.h
View file

@ -29,24 +29,45 @@ void debug(const char *format, ...);
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#endif
#define SWAP(a, b) \
do { \
typeof(a) __x = (a); (a) = (b); (b) = __x; \
} while (0) \
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#define V4 0
#define V6 1
#define IP_VERSIONS 2
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
#define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b))
#define FD_PROTO(x, proto) \
(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))
#define PORT_IS_EPHEMERAL(port) ((port) >= (1 << 15) + (1 << 14)) /* RFC 6335 */
#define PORT_EPHEMERAL_MIN ((1 << 15) + (1 << 14)) /* RFC 6335 */
#define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN)
#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 4)
#include <linux/ipv6.h>
#include <net/if.h>
#include <linux/ip.h>
#include <limits.h>
struct ctx;
uint16_t csum_fold(uint32_t sum);
uint16_t csum_ip4(void *buf, size_t len);
void csum_tcp4(struct iphdr *iph);
char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto);
int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port);
int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo,
uint32_t data);
int timespec_diff_ms(struct timespec *a, struct timespec *b);
void bitmap_set(uint8_t *map, int bit);
void bitmap_clear(uint8_t *map, int bit);
int bitmap_isset(uint8_t *map, int bit);
void procfs_scan_listen(char *name, uint8_t *map);
int ns_enter(int target_pid);