passt/tap.c
Stefano Brivio 38b50dba47 passt: Spare some syscalls, add some optimisations from profiling
Avoid a bunch of syscalls on forwarding paths by:

- storing minimum and maximum file descriptor numbers for each
  protocol, fall back to SO_PROTOCOL query only on overlaps

- allocating a larger receive buffer -- this can result in more
  coalesced packets than sendmmsg() can take (UIO_MAXIOV, i.e. 1024),
  so make sure we don't exceed that within a single call to protocol
  tap handlers

- nesting the handling loop in tap_handler() in the receive loop,
  so that we have better chances of filling our receive buffer in
  fewer calls

- skipping the recvfrom() in the UDP handler on EPOLLERR -- there's
  nothing to be done in that case

and while at it:

- restore the 20ms timer interval for periodic (TCP) events, I
  accidentally changed that to 100ms in an earlier commit

- attempt using SO_ZEROCOPY for UDP -- if it's not available,
  sendmmsg() will succeed anyway

- fix the handling of the status code from sendmmsg(): if it fails,
  we'll try to discard the first message, hence return 1 from the
  UDP handler

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2021-04-23 22:22:37 +02:00

136 lines
3.2 KiB
C

// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
*
* tap.c - Functions to communicate with guest-facing tap interface
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
*/
#include <stdio.h>
#include <limits.h>
#include <string.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <stdint.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include "passt.h"
#include "util.h"
/**
 * tap_send() - Send frame and qemu socket header with indication of length
 * @fd:		tap file descriptor
 * @data:	Frame to send, starting from the L2 header
 * @len:	Total L2 packet length
 * @flags:	Flags for send(), if any
 *
 * The frame is preceded on the socket by a 32-bit length in network byte
 * order. If that length header can't be sent, the frame itself is not sent
 * either, so that the receiver never sees frame data where it expects a
 * length.
 *
 * Return: return code from send() for the frame, -1 if the length header
 *	   could not be sent
 */
int tap_send(int fd, void *data, size_t len, int flags)
{
	uint32_t vnet_len = htonl(len);

	/* Don't send a frame body without its length prefix */
	if (send(fd, &vnet_len, sizeof(vnet_len),
		 MSG_DONTWAIT | MSG_NOSIGNAL) != (ssize_t)sizeof(vnet_len))
		return -1;

	return send(fd, data, len, flags | MSG_DONTWAIT | MSG_NOSIGNAL);
}
/**
 * tap_ip_send() - Send IP packet, with L2 headers, calculating L3/L4 checksums
 * @c:		Execution context
 * @src:	IPv6 source address, IPv4-mapped for IPv4 sources
 * @proto:	L4 protocol number
 * @in:		Payload
 * @len:	L4 payload length
 *
 * The frame is built in a local buffer of USHRT_MAX bytes: payloads that
 * don't fit there together with the Ethernet and IP headers are silently
 * dropped instead of being copied past the end of the buffer.
 */
void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
		 char *in, size_t len)
{
	char pkt[USHRT_MAX];
	struct ethhdr *eh;

	eh = (struct ethhdr *)pkt;

	/* TODO: ARP table lookup */
	memcpy(eh->h_dest, c->mac_guest, ETH_ALEN);
	memcpy(eh->h_source, c->mac, ETH_ALEN);

	if (IN6_IS_ADDR_V4MAPPED(src)) {
		struct iphdr *iph = (struct iphdr *)(eh + 1);
		char *data = (char *)(iph + 1);

		/* Payload plus headers must fit in pkt (and in tot_len) */
		if (len > sizeof(pkt) - sizeof(*eh) - sizeof(*iph))
			return;

		eh->h_proto = htons(ETH_P_IP);

		iph->version = 4;
		iph->ihl = 5;
		iph->tos = 0;
		iph->tot_len = htons(len + sizeof(*iph));
		iph->id = 0;
		iph->frag_off = 0;
		iph->ttl = 255;
		iph->protocol = proto;
		iph->daddr = c->addr4;
		/* Last four bytes of the IPv4-mapped address are the IPv4 one */
		memcpy(&iph->saddr, &src->s6_addr[12], 4);

		/* Checksum field must be zero while the header is summed */
		iph->check = 0;
		iph->check = csum_ip4(iph, iph->ihl * 4);

		memcpy(data, in, len);

		if (iph->protocol == IPPROTO_TCP) {
			csum_tcp4(iph);
		} else if (iph->protocol == IPPROTO_UDP) {
			struct udphdr *uh = (struct udphdr *)(iph + 1);

			/* Zero means "no checksum" for UDP over IPv4 */
			uh->check = 0;
		}

		tap_send(c->fd_unix, pkt, len + sizeof(*iph) + sizeof(*eh), 0);
	} else {
		struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
		char *data = (char *)(ip6h + 1);

		/* Payload plus headers must fit in pkt (and in payload_len) */
		if (len > sizeof(pkt) - sizeof(*eh) - sizeof(*ip6h))
			return;

		eh->h_proto = htons(ETH_P_IPV6);

		memset(ip6h->flow_lbl, 0, 3);
		ip6h->payload_len = htons(len);
		ip6h->priority = 0;

		ip6h->saddr = *src;
		ip6h->daddr = c->addr6_guest;

		memcpy(data, in, len);

		/* The L4 checksum below is taken over the whole IPv6 header
		 * plus payload. For that pass only, version and next header
		 * are zeroed and the protocol number is stored in the hop
		 * limit byte; the real values are written once the checksum
		 * is done. NOTE(review): presumably this makes the header sum
		 * match the IPv6 pseudo-header -- confirm against csum_ip4().
		 */
		ip6h->hop_limit = proto;
		ip6h->version = 0;
		ip6h->nexthdr = 0;

		if (proto == IPPROTO_TCP) {
			struct tcphdr *th = (struct tcphdr *)(ip6h + 1);

			th->check = 0;
			th->check = csum_ip4(ip6h, len + sizeof(*ip6h));
		} else if (proto == IPPROTO_UDP) {
			struct udphdr *uh = (struct udphdr *)(ip6h + 1);

			uh->check = 0;
			uh->check = csum_ip4(ip6h, len + sizeof(*ip6h));

			/* UDP over IPv6 can't carry a zero checksum: a
			 * computed value of zero is sent as all ones
			 * (RFC 8200, section 8.1)
			 */
			if (!uh->check)
				uh->check = 0xffff;
		} else if (proto == IPPROTO_ICMPV6) {
			struct icmp6hdr *ih = (struct icmp6hdr *)(ip6h + 1);

			ih->icmp6_cksum = 0;
			ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h));
		}

		/* Now fill in the actual header values */
		ip6h->version = 6;
		ip6h->nexthdr = proto;
		ip6h->hop_limit = 255;

		tap_send(c->fd_unix, pkt, len + sizeof(*ip6h) + sizeof(*eh), 0);
	}
}