passt/dhcp.c
Stefano Brivio 33482d5bf2 passt: Add PASTA mode, major rework
PASTA (Pack A Subtle Tap Abstraction) provides quasi-native host
connectivity to an otherwise disconnected, unprivileged network
and user namespace, similarly to slirp4netns. Given that the
implementation is largely overlapping with PASST, no separate binary
is built: 'pasta' (and 'passt4netns' for clarity) both link to
'passt', and the mode of operation is selected depending on how the
binary is invoked. Usage example:

	$ unshare -rUn
	# echo $$
	1871759

	$ ./pasta 1871759	# From another terminal

	# udhcpc -i pasta0 2>/dev/null
	# ping -c1 pasta.pizza
	PING pasta.pizza (64.190.62.111) 56(84) bytes of data.
	64 bytes from 64.190.62.111 (64.190.62.111): icmp_seq=1 ttl=255 time=34.6 ms

	--- pasta.pizza ping statistics ---
	1 packets transmitted, 1 received, 0% packet loss, time 0ms
	rtt min/avg/max/mdev = 34.575/34.575/34.575/0.000 ms
	# ping -c1 spaghetti.pizza
	PING spaghetti.pizza(2606:4700:3034::6815:147a (2606:4700:3034::6815:147a)) 56 data bytes
	64 bytes from 2606:4700:3034::6815:147a (2606:4700:3034::6815:147a): icmp_seq=1 ttl=255 time=29.0 ms

	--- spaghetti.pizza ping statistics ---
	1 packets transmitted, 1 received, 0% packet loss, time 0ms
	rtt min/avg/max/mdev = 28.967/28.967/28.967/0.000 ms

This entails a major rework, especially with regard to the storage of
tracked connections and to the semantics of epoll(7) references.

Indexing TCP and UDP bindings merely by socket proved to be
inflexible and unsuitable to handle different connection flows: pasta
also provides Layer-4 to Layer-4 socket mapping between init and a
separate namespace for local connections, using a pair of splice()
system calls for TCP, and a recvmmsg()/sendmmsg() pair for UDP local
bindings. For instance, building on the previous example:

	# ip link set dev lo up
	# iperf3 -s

	$ iperf3 -c ::1 -Z -w 32M -l 1024k -P2 | tail -n4
	[SUM]   0.00-10.00  sec  52.3 GBytes  44.9 Gbits/sec  283             sender
	[SUM]   0.00-10.43  sec  52.3 GBytes  43.1 Gbits/sec                  receiver

	iperf Done.

epoll(7) references now include a generic part in order to
demultiplex data to the relevant protocol handler, using 24
bits for the socket number, and an opaque portion reserved for
usage by the single protocol handlers, in order to track sockets
back to corresponding connections and bindings.

A number of fixes pertaining to TCP state machine and congestion
window handling are also included here.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2021-07-17 11:04:22 +02:00

332 lines
7.5 KiB
C

// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* dhcp.c - Minimalistic DHCP server for PASST
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>
#include <net/if.h>
#include <arpa/inet.h>
#include "util.h"
#include "passt.h"
#include "dhcp.h"
#include "tap.h"
/**
 * struct opt - Server-side and client-side state for one DHCP option code
 * @sent:	Convenience flag, set while filling replies so that an option
 *		already written to the reply is not written again
 * @slen:	Length in bytes of the payload in @s; options with a zero
 *		length are left out of replies
 * @s:		Option payload sent by the server
 * @clen:	Length of option received from client
 * @c:		Option payload from client
 *
 * Member order matters: the opts[] table in this file initialises its
 * entries positionally, as { sent, slen, s, clen, c }.
 */
struct opt {
int sent;
int slen;
unsigned char s[255];
int clen;
unsigned char c[255];
};
/*
 * Option table, indexed by DHCP option code. Initialisers are positional,
 * in struct opt member order: { sent, slen, s, clen, c }. Codes not listed
 * here start out zeroed, that is, with no server payload; some of them
 * (6 and 119) get their payload and length assigned at run time.
 *
 * NOTE(review): the table has 255 entries, so valid indexes are 0 to 254
 * and option code 255 has no slot. Code that indexes this table with a
 * value taken from a packet needs to reject 255 first.
 */
static struct opt opts[255] = {
[1] = { 0, 4, { 0 }, 0, { 0 }, }, /* Mask, payload set for each reply */
[3] = { 0, 4, { 0 }, 0, { 0 }, }, /* Router, payload set for each reply */
[51] = { 0, 4, { 0xff, 0xff, 0xff, 0xff }, 0, { 0 }, }, /* Lease time, all-ones */
[53] = { 0, 1, { 0 }, 0, { 0 }, }, /* Type, one of the values below */
/* DHCP message types, as carried in the single payload byte of option 53 */
#define DHCPDISCOVER 1
#define DHCPOFFER 2
#define DHCPREQUEST 3
#define DHCPDECLINE 4
#define DHCPACK 5
#define DHCPNAK 6
#define DHCPRELEASE 7
#define DHCPINFORM 8
#define DHCPFORCERENEW 9
[54] = { 0, 4, { 0 }, 0, { 0 }, }, /* Server ID, payload set for each reply */
};
/**
 * struct msg - BOOTP/DHCP message
 * @op:		BOOTP message type, BOOTREQUEST or BOOTREPLY
 * @htype:	Hardware address type
 * @hlen:	Hardware address length
 * @hops:	DHCP relay hops
 * @xid:	Transaction ID randomly chosen by client
 * @secs:	Seconds elapsed since beginning of acquisition or renewal
 * @flags:	DHCP message flags
 * @ciaddr:	Client IP address in BOUND, RENEW, REBINDING
 * @yiaddr:	IP address being offered or assigned
 * @siaddr:	Next server to use in bootstrap
 * @giaddr:	Relay agent IP address
 * @chaddr:	Client hardware address
 * @sname:	Server host name
 * @file:	Boot file name
 * @magic:	Magic cookie prefix before options
 * @o:		Options, as a sequence of code, length, payload entries
 *
 * The structure is packed because it is laid directly over the UDP payload
 * of a packet: the fixed part, up to and excluding @o, is 240 bytes long.
 * Multi-byte members are accessed as found in the packet, without byte
 * order conversion.
 */
struct msg {
uint8_t op;
/* Values for @op */
#define BOOTREQUEST 1
#define BOOTREPLY 2
uint8_t htype;
uint8_t hlen;
uint8_t hops;
uint32_t xid;
uint16_t secs;
uint16_t flags;
uint32_t ciaddr;
uint32_t yiaddr;
uint32_t siaddr;
uint32_t giaddr;
uint8_t chaddr[16];
uint8_t sname[64];
uint8_t file[128];
uint32_t magic;
uint8_t o[308];
} __attribute__((__packed__));
/**
* fill_one() - Fill a single option in message
* @m: Message to fill
* @o: Option number
* @offset: Current offset within options field, updated on insertion
*/
static void fill_one(struct msg *m, int o, int *offset)
{
m->o[*offset] = o;
m->o[*offset + 1] = opts[o].slen;
memcpy(&m->o[*offset + 2], opts[o].s, opts[o].slen);
opts[o].sent = 1;
*offset += 2 + opts[o].slen;
}
/**
* fill() - Fill options in message
* @m: Message to fill
*
* Return: current size of options field
*/
static int fill(struct msg *m)
{
int i, o, offset = 0;
m->op = BOOTREPLY;
m->secs = 0;
for (o = 0; o < 255; o++)
opts[o].sent = 0;
for (i = 0; i < opts[55].clen; i++) {
o = opts[55].c[i];
if (opts[o].slen)
fill_one(m, o, &offset);
}
for (o = 0; o < 255; o++) {
if (opts[o].slen && !opts[o].sent)
fill_one(m, o, &offset);
}
m->o[offset++] = 255;
m->o[offset++] = 0;
if (offset < 62 /* RFC 951 */) {
memset(&m->o[offset], 0, 62 - offset);
offset = 62;
}
return offset;
}
/**
 * opt_dns_search_match() - Check for a complete domain name at given offset
 * @buf:	Option buffer with existing labels
 * @len:	Length of option buffer
 * @off:	Offset in buffer where the name would start
 * @cmp:	Domain name to look for
 * @clen:	Length of @cmp, without terminator
 *
 * The name only matches if it's followed, within the buffer, by a zero
 * byte: a compression pointer stands for everything up to the end of the
 * name it points to, so matching a mere prefix ("com" in "company.org")
 * would produce a different domain.
 *
 * Return: 1 on match, 0 otherwise
 */
static int opt_dns_search_match(const unsigned char *buf, size_t len,
				size_t off, const char *cmp, size_t clen)
{
	/* Room is needed for the name and for the byte terminating it */
	if (off >= len || len - off <= clen)
		return 0;

	return !memcmp(buf + off, cmp, clen) && buf[off + clen] == 0;
}

/**
 * opt_dns_search_dup_ptr() - Look for possible domain name compression pointer
 * @buf:	Current option buffer with existing labels
 * @cmp:	Portion of domain name being added
 * @len:	Length of current option buffer
 *
 * A candidate position is the byte after a zero byte, or two bytes after a
 * byte with the two highest bits set. Nothing beyond @len is read.
 *
 * Return: offset to corresponding compression pointer if any, -1 if not found
 */
static int opt_dns_search_dup_ptr(unsigned char *buf, char *cmp, size_t len)
{
	size_t clen = strlen(cmp);
	size_t i;

	for (i = 0; i < len; i++) {
		if (buf[i] == 0 &&
		    opt_dns_search_match(buf, len, i + 1, cmp, clen))
			return (int)i;

		if ((buf[i] & 0xc0) == 0xc0 &&
		    opt_dns_search_match(buf, len, i + 2, cmp, clen))
			return (int)i + 1;
	}

	return -1;
}
/**
 * opt_set_dns_search() - Fill data and set length for Domain Search option
 * @c:		Execution context
 * @max_len:	Maximum total length of option buffer
 *
 * Search domains are taken from @c in order, until an empty entry is found
 * or the next domain doesn't fit anymore. What's available to this option
 * is @max_len, minus what fill() writes for all other options (payload and
 * two-byte header each), minus the header of this option and the two bytes
 * closing the options field, and never more than the payload storage of
 * the option itself.
 */
static void opt_set_dns_search(struct ctx *c, size_t max_len)
{
	unsigned char *s = opts[119].s;
	char buf[NS_MAXDNAME];
	size_t used = 0;
	int i;

	opts[119].slen = 0;

	for (i = 0; i < 255; i++) {
		if (opts[i].slen)
			used += opts[i].slen + 2;
	}
	used += 2;	/* Code and length of this option */
	used += 2;	/* End option and byte following it */

	if (used >= max_len)
		return;

	max_len -= used;
	if (max_len > sizeof(opts[119].s))
		max_len = sizeof(opts[119].s);

	for (i = 0; *c->dns_search[i].n; i++) {
		unsigned int n;
		int dup = -1;
		char *p;

		/* Placeholder for the length of the first label */
		buf[0] = 0;

		/* Stopping once the name can't fit anymore also keeps writes
		 * within buf, as max_len is far smaller than its size.
		 */
		for (p = c->dns_search[i].n, n = 1; *p && n <= max_len; p++) {
			if (*p == '.') {
				/* RFC 1035 4.1.4 Message compression */
				dup = opt_dns_search_dup_ptr(s, p + 1,
							     opts[119].slen);

				if (dup >= 0) {
					buf[n++] = '\xc0';
					buf[n++] = dup;
					break;
				} else {
					buf[n++] = '.';
				}
			} else {
				buf[n++] = *p;
			}
		}

		/* The compression pointer is also an end of label */
		if (dup < 0)
			buf[n++] = 0;

		if (n > max_len)
			break;

		memcpy(s + opts[119].slen, buf, n);
		opts[119].slen += n;
		max_len -= n;
	}

	/* Turn placeholders and dots into label lengths: the length is the
	 * number of bytes up to the next dot, zero byte, compression pointer
	 * or end of data.
	 */
	for (i = 0; i < opts[119].slen; i++) {
		int j;

		if (s[i] == 0xc0) {
			/* Next byte is a pointer offset, not a separator,
			 * even if its value happens to be 0 or '.'
			 */
			i++;
			continue;
		}

		if (s[i] && s[i] != '.')
			continue;

		for (j = i + 1; j < opts[119].slen; j++) {
			if (!s[j] || s[j] == '.' || s[j] == 0xc0)
				break;
		}

		s[i] = j - i - 1;
	}
}
/**
* dhcp() - Check if this is a DHCP message, reply as needed
* @c: Execution context
* @len: Total L2 packet length
* @eh: Packet buffer, Ethernet header
*
* Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure
*/
int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
{
struct iphdr *iph = (struct iphdr *)(eh + 1);
size_t mlen, olen;
struct udphdr *uh;
unsigned int i;
struct msg *m;
if (len < sizeof(*eh) + sizeof(*iph))
return 0;
if (len < sizeof(*eh) + iph->ihl * 4 + sizeof(*uh))
return 0;
uh = (struct udphdr *)((char *)iph + iph->ihl * 4);
m = (struct msg *)(uh + 1);
if (uh->dest != htons(67))
return 0;
mlen = len - sizeof(*eh) - iph->ihl * 4 - sizeof(*uh);
if (mlen != ntohs(uh->len) - sizeof(*uh) ||
mlen < offsetof(struct msg, o) ||
m->op != BOOTREQUEST)
return -1;
olen = mlen - offsetof(struct msg, o);
for (i = 0; i + 2 < olen; i += m->o[i + 1] + 2) {
if (m->o[i + 1] + i + 2 >= olen)
return -1;
memcpy(&opts[m->o[i]].c, &m->o[i + 2], m->o[i + 1]);
}
if (opts[53].c[0] == DHCPDISCOVER) {
info("DHCP: offer to discover");
opts[53].s[0] = DHCPOFFER;
} else if (opts[53].c[0] == DHCPREQUEST) {
info("DHCP: ack to request");
opts[53].s[0] = DHCPACK;
} else {
return -1;
}
info(" from %02x:%02x:%02x:%02x:%02x:%02x",
m->chaddr[0], m->chaddr[1], m->chaddr[2],
m->chaddr[3], m->chaddr[4], m->chaddr[5]);
m->yiaddr = c->addr4;
*(unsigned long *)opts[1].s = c->mask4;
*(unsigned long *)opts[3].s = c->gw4;
*(unsigned long *)opts[54].s = c->gw4;
for (i = 0, opts[6].slen = 0; c->dns4[i]; i++) {
((uint32_t *)opts[6].s)[i] = c->dns4[i];
opts[6].slen += sizeof(uint32_t);
}
opt_set_dns_search(c, sizeof(m->o));
uh->len = htons(len = offsetof(struct msg, o) + fill(m) + sizeof(*uh));
uh->check = 0;
uh->source = htons(67);
uh->dest = htons(68);
iph->tot_len = htons(len += sizeof(*iph));
iph->daddr = c->addr4;
iph->saddr = c->gw4;
iph->check = 0;
iph->check = csum_ip4(iph, iph->ihl * 4);
len += sizeof(*eh);
memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
memcpy(eh->h_source, c->mac, ETH_ALEN);
if (tap_send(c, eh, len, 0) < 0)
perror("DHCP: send");
return 1;
}