passt/ndp.c

359 lines
7.8 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
passt: New design and implementation with native Layer 4 sockets This is a reimplementation, partially building on the earlier draft, that uses L4 sockets (SOCK_DGRAM, SOCK_STREAM) instead of SOCK_RAW, providing L4-L2 translation functionality without requiring any security capability. Conceptually, this follows the design presented at: https://gitlab.com/abologna/kubevirt-and-kvm/-/blob/master/Networking.md The most significant novelty here comes from TCP and UDP translation layers. In particular, the TCP state and translation logic follows the intent of being minimalistic, without reimplementing a full TCP stack in either direction, and synchronising as much as possible the TCP dynamic and flows between guest and host kernel. Another important introduction concerns addressing, port translation and forwarding. The Layer 4 implementations now attempt to bind on all unbound ports, in order to forward connections in a transparent way. While at it: - the qemu 'tap' back-end can't be used as-is by qrap anymore, because of explicit checks now introduced in qemu to ensure that the corresponding file descriptor is actually a tap device. For this reason, qrap now operates on a 'socket' back-end type, accounting for and building the additional header reporting frame length - provide a demo script that sets up namespaces, addresses and routes, and starts the daemon. A virtual machine started in the network namespace, wrapped by qrap, will now directly interface with passt and communicate using Layer 4 sockets provided by the host kernel. Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2021-02-16 07:25:09 +01:00
/* PASST - Plug A Simple Socket Transport
passt: Add PASTA mode, major rework PASTA (Pack A Subtle Tap Abstraction) provides quasi-native host connectivity to an otherwise disconnected, unprivileged network and user namespace, similarly to slirp4netns. Given that the implementation is largely overlapping with PASST, no separate binary is built: 'pasta' (and 'passt4netns' for clarity) both link to 'passt', and the mode of operation is selected depending on how the binary is invoked. Usage example: $ unshare -rUn # echo $$ 1871759 $ ./pasta 1871759 # From another terminal # udhcpc -i pasta0 2>/dev/null # ping -c1 pasta.pizza PING pasta.pizza (64.190.62.111) 56(84) bytes of data. 64 bytes from 64.190.62.111 (64.190.62.111): icmp_seq=1 ttl=255 time=34.6 ms --- pasta.pizza ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 34.575/34.575/34.575/0.000 ms # ping -c1 spaghetti.pizza PING spaghetti.pizza(2606:4700:3034::6815:147a (2606:4700:3034::6815:147a)) 56 data bytes 64 bytes from 2606:4700:3034::6815:147a (2606:4700:3034::6815:147a): icmp_seq=1 ttl=255 time=29.0 ms --- spaghetti.pizza ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 28.967/28.967/28.967/0.000 ms This entails a major rework, especially with regard to the storage of tracked connections and to the semantics of epoll(7) references. Indexing TCP and UDP bindings merely by socket proved to be inflexible and unsuitable to handle different connection flows: pasta also provides Layer-2 to Layer-2 socket mapping between init and a separate namespace for local connections, using a pair of splice() system calls for TCP, and a recvmmsg()/sendmmsg() pair for UDP local bindings. For instance, building on the previous example: # ip link set dev lo up # iperf3 -s $ iperf3 -c ::1 -Z -w 32M -l 1024k -P2 | tail -n4 [SUM] 0.00-10.00 sec 52.3 GBytes 44.9 Gbits/sec 283 sender [SUM] 0.00-10.43 sec 52.3 GBytes 43.1 Gbits/sec receiver iperf Done. 
epoll(7) references now include a generic part in order to demultiplex data to the relevant protocol handler, using 24 bits for the socket number, and an opaque portion reserved for usage by the single protocol handlers, in order to track sockets back to corresponding connections and bindings. A number of fixes pertaining to TCP state machine and congestion window handling are also included here. Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2021-07-17 08:34:53 +02:00
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* ndp.c - NDP support for PASST
*
passt: New design and implementation with native Layer 4 sockets This is a reimplementation, partially building on the earlier draft, that uses L4 sockets (SOCK_DGRAM, SOCK_STREAM) instead of SOCK_RAW, providing L4-L2 translation functionality without requiring any security capability. Conceptually, this follows the design presented at: https://gitlab.com/abologna/kubevirt-and-kvm/-/blob/master/Networking.md The most significant novelty here comes from TCP and UDP translation layers. In particular, the TCP state and translation logic follows the intent of being minimalistic, without reimplementing a full TCP stack in either direction, and synchronising as much as possible the TCP dynamic and flows between guest and host kernel. Another important introduction concerns addressing, port translation and forwarding. The Layer 4 implementations now attempt to bind on all unbound ports, in order to forward connections in a transparent way. While at it: - the qemu 'tap' back-end can't be used as-is by qrap anymore, because of explicit checks now introduced in qemu to ensure that the corresponding file descriptor is actually a tap device. For this reason, qrap now operates on a 'socket' back-end type, accounting for and building the additional header reporting frame length - provide a demo script that sets up namespaces, addresses and routes, and starts the daemon. A virtual machine started in the network namespace, wrapped by qrap, will now directly interface with passt and communicate using Layer 4 sockets provided by the host kernel. Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2021-02-16 07:25:09 +01:00
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/ip.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <netinet/if_ether.h>
#include <linux/icmpv6.h>
#include "checksum.h"
#include "util.h"
#include "ip.h"
passt: Add PASTA mode, major rework PASTA (Pack A Subtle Tap Abstraction) provides quasi-native host connectivity to an otherwise disconnected, unprivileged network and user namespace, similarly to slirp4netns. Given that the implementation is largely overlapping with PASST, no separate binary is built: 'pasta' (and 'passt4netns' for clarity) both link to 'passt', and the mode of operation is selected depending on how the binary is invoked. Usage example: $ unshare -rUn # echo $$ 1871759 $ ./pasta 1871759 # From another terminal # udhcpc -i pasta0 2>/dev/null # ping -c1 pasta.pizza PING pasta.pizza (64.190.62.111) 56(84) bytes of data. 64 bytes from 64.190.62.111 (64.190.62.111): icmp_seq=1 ttl=255 time=34.6 ms --- pasta.pizza ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 34.575/34.575/34.575/0.000 ms # ping -c1 spaghetti.pizza PING spaghetti.pizza(2606:4700:3034::6815:147a (2606:4700:3034::6815:147a)) 56 data bytes 64 bytes from 2606:4700:3034::6815:147a (2606:4700:3034::6815:147a): icmp_seq=1 ttl=255 time=29.0 ms --- spaghetti.pizza ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 28.967/28.967/28.967/0.000 ms This entails a major rework, especially with regard to the storage of tracked connections and to the semantics of epoll(7) references. Indexing TCP and UDP bindings merely by socket proved to be inflexible and unsuitable to handle different connection flows: pasta also provides Layer-2 to Layer-2 socket mapping between init and a separate namespace for local connections, using a pair of splice() system calls for TCP, and a recvmmsg()/sendmmsg() pair for UDP local bindings. For instance, building on the previous example: # ip link set dev lo up # iperf3 -s $ iperf3 -c ::1 -Z -w 32M -l 1024k -P2 | tail -n4 [SUM] 0.00-10.00 sec 52.3 GBytes 44.9 Gbits/sec 283 sender [SUM] 0.00-10.43 sec 52.3 GBytes 43.1 Gbits/sec receiver iperf Done. 
epoll(7) references now include a generic part in order to demultiplex data to the relevant protocol handler, using 24 bits for the socket number, and an opaque portion reserved for usage by the single protocol handlers, in order to track sockets back to corresponding connections and bindings. A number of fixes pertaining to TCP state machine and congestion window handling are also included here. Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2021-07-17 08:34:53 +02:00
#include "passt.h"
passt: New design and implementation with native Layer 4 sockets This is a reimplementation, partially building on the earlier draft, that uses L4 sockets (SOCK_DGRAM, SOCK_STREAM) instead of SOCK_RAW, providing L4-L2 translation functionality without requiring any security capability. Conceptually, this follows the design presented at: https://gitlab.com/abologna/kubevirt-and-kvm/-/blob/master/Networking.md The most significant novelty here comes from TCP and UDP translation layers. In particular, the TCP state and translation logic follows the intent of being minimalistic, without reimplementing a full TCP stack in either direction, and synchronising as much as possible the TCP dynamic and flows between guest and host kernel. Another important introduction concerns addressing, port translation and forwarding. The Layer 4 implementations now attempt to bind on all unbound ports, in order to forward connections in a transparent way. While at it: - the qemu 'tap' back-end can't be used as-is by qrap anymore, because of explicit checks now introduced in qemu to ensure that the corresponding file descriptor is actually a tap device. For this reason, qrap now operates on a 'socket' back-end type, accounting for and building the additional header reporting frame length - provide a demo script that sets up namespaces, addresses and routes, and starts the daemon. A virtual machine started in the network namespace, wrapped by qrap, will now directly interface with passt and communicate using Layer 4 sockets provided by the host kernel. Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2021-02-16 07:25:09 +01:00
#include "tap.h"
#include "log.h"
/*
 * ICMPv6 message types handled by ndp(): Router Solicitation (RS), Router
 * Advertisement (RA), Neighbor Solicitation (NS), Neighbor Advertisement (NA).
 * The values are contiguous, which ndp() relies on for its range check.
 */
#define RS 133
#define RA 134
#define NS 135
#define NA 136
/**
 * enum ndp_option_types - Values for the @type field of struct opt_header
 * @OPT_SRC_L2_ADDR:	Source link-layer address option
 * @OPT_TARGET_L2_ADDR:	Target link-layer address option
 * @OPT_PREFIX_INFO:	Prefix Information option
 * @OPT_MTU:		MTU option
 * @OPT_RDNSS_TYPE:	Recursive DNS Server (RDNSS) option
 * @OPT_DNSSL_TYPE:	DNS Search List (DNSSL) option
 */
enum ndp_option_types {
OPT_SRC_L2_ADDR = 1,
OPT_TARGET_L2_ADDR = 2,
OPT_PREFIX_INFO = 3,
OPT_MTU = 5,
OPT_RDNSS_TYPE = 25,
OPT_DNSSL_TYPE = 31,
};
/**
 * struct opt_header - Header common to all NDP options defined in this file
 * @type: Option type, one of enum ndp_option_types
 * @len: Option length, in units of 8 bytes, header included
 */
struct opt_header {
uint8_t type;
uint8_t len;
} __attribute__((packed));
/**
 * struct opt_l2_addr - Link-layer address option (source or target)
 * @header: Option header
 * @mac: MAC address, ETH_ALEN bytes
 */
struct opt_l2_addr {
struct opt_header header;
unsigned char mac[ETH_ALEN];
} __attribute__((packed));
/**
 * struct ndp_na - NDP Neighbor Advertisement (NA) message
 * @ih: ICMPv6 header
 * @target_addr: Target IPv6 address, copied from the solicitation by ndp()
 * @target_l2_addr: Target link-layer address option
 */
struct ndp_na {
struct icmp6hdr ih;
struct in6_addr target_addr;
struct opt_l2_addr target_l2_addr;
} __attribute__((packed));
/**
 * struct opt_prefix_info - Prefix Information option
 * @header: Option header
 * @prefix_len: The number of leading bits in the Prefix that are valid
 * @prefix_flags: Flags associated with the prefix
 * @valid_lifetime: Valid lifetime (s, as defined by RFC 4861, section 4.6.2)
 * @pref_lifetime: Preferred lifetime (s, as defined by RFC 4861, section 4.6.2)
 * @reserved: Unused
 *
 * The prefix itself is not part of this structure: it follows the option
 * fields as a separate member of struct ndp_ra.
 */
struct opt_prefix_info {
struct opt_header header;
uint8_t prefix_len;
uint8_t prefix_flags;
uint32_t valid_lifetime;
uint32_t pref_lifetime;
uint32_t reserved;
} __attribute__((packed));
/**
 * struct opt_mtu - Maximum transmission unit (MTU) option
 * @header: Option header
 * @reserved: Unused
 * @value: MTU value, network order
 */
struct opt_mtu {
struct opt_header header;
uint16_t reserved;
uint32_t value;
} __attribute__((packed));
/**
 * struct opt_rdnss - Recursive DNS Server (RDNSS) option
 * @header: Option header
 * @reserved: Unused
 * @lifetime: Validity time (s)
 * @dns: List of DNS server addresses; only the entries actually filled
 *       in by ndp() are sent on the wire
 */
struct opt_rdnss {
struct opt_header header;
uint16_t reserved;
uint32_t lifetime;
struct in6_addr dns[MAXNS + 1];
} __attribute__((packed));
/**
 * struct opt_dnssl - DNS Search List (DNSSL) option
 * @header: Option header
 * @reserved: Unused
 * @lifetime: Validity time (s)
 * @domains: List of search domains, each written by ndp() as a sequence of
 *           length-prefixed labels terminated by a zero byte
 */
struct opt_dnssl {
struct opt_header header;
uint16_t reserved;
uint32_t lifetime;
unsigned char domains[MAXDNSRCH * NS_MAXDNAME];
} __attribute__((packed));
/**
 * struct ndp_ra - NDP Router Advertisement (RA) message
 * @ih: ICMPv6 header
 * @reachable: Reachability time, after confirmation (ms)
 * @retrans: Time between retransmitted NS messages (ms)
 * @prefix_info: Prefix Information option
 * @prefix: IPv6 prefix
 * @source_ll: Source link-layer address option
 * @var: Variable part: room for the MTU, RDNSS and DNSSL options, which
 *       ndp() appends in that order, each only if applicable
 */
struct ndp_ra {
struct icmp6hdr ih;
uint32_t reachable;
uint32_t retrans;
struct opt_prefix_info prefix_info;
struct in6_addr prefix;
struct opt_l2_addr source_ll;
unsigned char var[sizeof(struct opt_mtu) + sizeof(struct opt_rdnss) +
sizeof(struct opt_dnssl)];
} __attribute__((packed));
/**
 * struct ndp_ns - NDP Neighbor Solicitation (NS) message
 * @ih: ICMPv6 header
 * @target_addr: Target IPv6 address being solicited
 *
 * Options that may follow the target address are not described here.
 */
struct ndp_ns {
struct icmp6hdr ih;
struct in6_addr target_addr;
} __attribute__((packed));
/**
 * ndp() - Check for NDP solicitations, reply as needed
 * @c:		Execution context
 * @ih:		ICMPv6 header
 * @saddr:	Source IPv6 address
 * @p:		Packet pool
 *
 * A Neighbor Solicitation is answered with a Neighbor Advertisement for the
 * solicited target, carrying c->mac as link-layer address. A Router
 * Solicitation is answered with a Router Advertisement carrying the prefix
 * taken from c->ip6.addr and, depending on configuration, MTU, RDNSS and DNSSL
 * options. Any other ICMPv6 type between RS and NA is consumed without a
 * reply.
 *
 * Return: 0 if not handled here, 1 if handled, -1 on failure
 */
int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
	const struct pool *p)
{
	struct ndp_na na = {
		.ih = {
			.icmp6_type		= NA,
			.icmp6_code		= 0,
			.icmp6_router		= 1,
			.icmp6_solicited	= 1,
			.icmp6_override		= 1,
		},
		.target_l2_addr = {
			.header	= {
				.type		= OPT_TARGET_L2_ADDR,
				.len		= 1,
			},
		}
	};
	struct ndp_ra ra = {
		.ih = {
			.icmp6_type		= RA,
			.icmp6_code		= 0,
			.icmp6_hop_limit	= 255,
			/* RFC 8319 */
			.icmp6_rt_lifetime	= htons_constant(65535),
			.icmp6_addrconf_managed	= 1,
		},
		.prefix_info = {
			.header = {
				.type		= OPT_PREFIX_INFO,
				.len		= 4,
			},
			.prefix_len		= 64,
			.prefix_flags		= 0xc0,	/* prefix flags: L, A */
			.valid_lifetime		= ~0U,
			.pref_lifetime		= ~0U,
		},
		.source_ll = {
			.header = {
				.type		= OPT_SRC_L2_ADDR,
				.len		= 1,
			},
		},
	};
	const struct in6_addr *rsaddr; /* src addr for reply */
	unsigned char *ptr = NULL;
	size_t dlen;

	/* RS, RA, NS, NA are contiguous: anything outside is not for us */
	if (ih->icmp6_type < RS || ih->icmp6_type > NA)
		return 0;

	/* NDP disabled: swallow the message without replying */
	if (c->no_ndp)
		return 1;

	if (ih->icmp6_type == NS) {
		struct ndp_ns *ns = packet_get(p, 0, 0, sizeof(struct ndp_ns),
					       NULL);

		if (!ns)
			return -1;

		/* No reply is sent to solicitations from the unspecified
		 * address
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(saddr))
			return 1;

		info("NDP: received NS, sending NA");

		memcpy(&na.target_addr, &ns->target_addr,
		       sizeof(na.target_addr));
		memcpy(na.target_l2_addr.mac, c->mac, ETH_ALEN);
	} else if (ih->icmp6_type == RS) {
		size_t dns_s_len = 0;
		int i, n;

		if (c->no_ra)
			return 1;

		info("NDP: received RS, sending RA");
		memcpy(&ra.prefix, &c->ip6.addr, sizeof(ra.prefix));

		/* Optional options are appended one after another in ra.var,
		 * ptr always points right after the last byte to be sent
		 */
		ptr = &ra.var[0];

		/* NOTE(review): -1 looks like "no MTU to advertise" - confirm */
		if (c->mtu != -1) {
			struct opt_mtu *mtu = (struct opt_mtu *)ptr;
			*mtu = (struct opt_mtu) {
				.header = {
					.type		= OPT_MTU,
					.len		= 1,
				},
				.value = htonl(c->mtu),
			};
			ptr += sizeof(struct opt_mtu);
		}

		if (c->no_dhcp_dns)
			goto dns_done;

		/* Count DNS servers: list ends at first unspecified address */
		for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++);
		if (n) {
			struct opt_rdnss *rdnss = (struct opt_rdnss *)ptr;
			*rdnss = (struct opt_rdnss) {
				.header = {
					.type		= OPT_RDNSS_TYPE,
					/* 8 bytes of fixed part, 16 bytes for
					 * each address
					 */
					.len		= 1 + 2 * n,
				},
				.lifetime = ~0U,
			};

			for (i = 0; i < n; i++) {
				memcpy(&rdnss->dns[i], &c->ip6.dns[i],
				       sizeof(rdnss->dns[i]));
			}
			ptr += offsetof(struct opt_rdnss, dns) +
			       i * sizeof(rdnss->dns[0]);

			/* From here on, n counts search domains. Each domain
			 * needs its length, plus one byte for the first label
			 * length and one for the terminating zero. Search
			 * domains are only considered if at least one DNS
			 * server is advertised.
			 */
			for (n = 0; *c->dns_search[n].n; n++)
				dns_s_len += strlen(c->dns_search[n].n) + 2;
		}

		if (!c->no_dhcp_dns_search && dns_s_len) {
			struct opt_dnssl *dnssl = (struct opt_dnssl *)ptr;
			size_t pad;

			*dnssl = (struct opt_dnssl) {
				.header = {
					.type = OPT_DNSSL_TYPE,
					.len  = DIV_ROUND_UP(dns_s_len, 8) + 1,
				},
				.lifetime = ~0U,
			};
			ptr = dnssl->domains;

			for (i = 0; i < n; i++) {
				size_t len;
				char *dot;

				/* Placeholder for the first label length */
				*(ptr++) = '.';

				len = sizeof(dnssl->domains) -
				      (ptr - dnssl->domains);

				strncpy((char *)ptr, c->dns_search[i].n, len);

				/* Turn every dot into the length of the label
				 * that follows it
				 */
				for (dot = (char *)ptr - 1; *dot; dot++) {
					if (*dot == '.')
						*dot = strcspn(dot + 1, ".");
				}
				ptr += strlen(c->dns_search[i].n);
				*(ptr++) = 0;
			}

			/* Pad to the 8-byte boundary declared in header.len:
			 * no padding at all if the domains already end on it,
			 * otherwise the extra 8 zero bytes would look like a
			 * trailing option with length 0, which makes the whole
			 * RA invalid for receivers (RFC 4861, section 4.6).
			 */
			pad = (8 - dns_s_len % 8) % 8;
			memset(ptr, 0, pad);
			ptr += pad;
		}

dns_done:
		memcpy(&ra.source_ll.mac, c->mac, ETH_ALEN);
	} else {
		/* RA or NA from the guest: nothing to reply */
		return 1;
	}

	/* Remember the address the solicitation came from */
	if (IN6_IS_ADDR_LINKLOCAL(saddr))
		c->ip6.addr_ll_seen = *saddr;
	else
		c->ip6.addr_seen = *saddr;

	/* Reply from a link-local address: the gateway if it is one,
	 * otherwise c->ip6.addr_ll
	 */
	if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
		rsaddr = &c->ip6.gw;
	else
		rsaddr = &c->ip6.addr_ll;

	if (ih->icmp6_type == NS) {
		dlen = sizeof(struct ndp_na);
		tap_icmp6_send(c, rsaddr, saddr, &na, dlen);
	} else if (ih->icmp6_type == RS) {
		/* Send only up to the end of the last option appended */
		dlen = ptr - (unsigned char *)&ra;
		tap_icmp6_send(c, rsaddr, saddr, &ra, dlen);
	}

	return 1;
}