Compare commits

...

16 commits

Author SHA1 Message Date
Laurent Vivier
60e35ab2bd vhost-user: remove tap_send_frames_vu()
As TCP and UDP now use vhost-user directly, we don't need this function
anymore. Other protocols (ICMP, ARP, DHCP, ...) use tap_send()/vu_send().

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 13:58:47 +01:00
Laurent Vivier
95aebad0a4 udp: vhost-user RX nocopy
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 13:58:34 +01:00
Laurent Vivier
2d5528c9be tcp: vhost-user RX nocopy
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 13:58:27 +01:00
Laurent Vivier
1bf4abe402 vhost-user: use guest buffer directly in vu_handle_tx()
Check that the buffer address lies within the mmap'ed memory.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 11:54:26 +01:00
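
For illustration of the range check described above, here is a minimal sketch of validating that a guest buffer lies entirely inside an mmap'ed region; the structure and function names are hypothetical and do not come from this series (the actual check is vu_packet_check_range(), declared in the packet.h hunk below):

/* Illustrative sketch only: the region layout and names below are
 * assumptions, not definitions introduced by this series.
 */
#include <stdbool.h>
#include <stddef.h>

struct vu_region_example {
        char *mmap_addr;        /* start of the mmap'ed guest memory */
        size_t size;            /* length of the mapping */
};

/* Return true if [start, start + len) falls inside the mapping */
static bool vu_buf_in_region(const struct vu_region_example *r,
                             const char *start, size_t len)
{
        if (start < r->mmap_addr || len > r->size)
                return false;
        return (size_t)(start - r->mmap_addr) <= r->size - len;
}

A check along these lines is what allows vu_handle_tx() to hand guest buffers directly to the protocol handlers without copying.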
Laurent Vivier
37f457a76c vhost-user: add vhost-user
Add virtio and vhost-user functions to connect with QEMU.

  $ ./passt --vhost-user

and

  # qemu-system-x86_64 ... -m 4G \
        -object memory-backend-memfd,id=memfd0,share=on,size=4G \
        -numa node,memdev=memfd0 \
        -chardev socket,id=chr0,path=/tmp/passt_1.socket \
        -netdev vhost-user,id=netdev0,chardev=chr0 \
        -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \
        ...

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 11:54:26 +01:00
Laurent Vivier
b2229bd24f vhost-user: introduce vhost-user API
Add vhost_user.c and vhost_user.h that define the functions needed
to implement the vhost-user backend.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 11:54:26 +01:00
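
For background on the API this commit introduces, vhost-user control messages travel over the UNIX socket as a fixed header followed by a payload. A rough sketch of that header, following the vhost-user protocol specification, is shown below; the names are illustrative and may not match the definitions in vhost_user.h:

/* Header of a vhost-user control message, per the vhost-user protocol
 * specification; names are illustrative only.
 */
#include <stdint.h>

struct vhost_user_hdr_example {
        uint32_t request;       /* message type, e.g. GET_FEATURES */
        uint32_t flags;         /* protocol version and reply flags */
        uint32_t size;          /* number of payload bytes following */
};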
Laurent Vivier
45b1403f42 vhost-user: introduce virtio API
Add virtio.c and virtio.h that define the functions needed
to manage virtqueues.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 11:54:26 +01:00
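
For orientation, a split virtqueue is a ring of descriptors with the layout below, as defined by the virtio specification (compare struct vring_desc in <linux/virtio_ring.h>); how virtio.c walks these rings is specific to the code added here:

/* Split-virtqueue descriptor layout from the virtio specification,
 * shown for orientation; see struct vring_desc in <linux/virtio_ring.h>
 * for the canonical definition.
 */
#include <stdint.h>

struct vring_desc_example {
        uint64_t addr;          /* guest-physical address of the buffer */
        uint32_t len;           /* buffer length in bytes */
        uint16_t flags;         /* e.g. VRING_DESC_F_NEXT, VRING_DESC_F_WRITE */
        uint16_t next;          /* chained descriptor index when F_NEXT is set */
};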
Laurent Vivier
bb3877dde3 vhost-user: compare mode MODE_PASTA and not MODE_PASST
As we are going to introduce MODE_VU, which will behave like MODE_PASST,
compare against MODE_PASTA rather than adding a comparison with MODE_VU
wherever we check for MODE_PASST.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 11:54:26 +01:00
Laurent Vivier
27a713947c packet: replace struct desc by struct iovec
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 11:54:26 +01:00
Laurent Vivier
0938100596 udp: rename udp_sock_handler() to udp_buf_sock_handler()
We are going to introduce a variant of the function to use
vhost-user buffers rather than passt internal buffers.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 10:54:19 +01:00
Laurent Vivier
72cadf34ad udp: move udpX_l2_buf_t and udpX_l2_mh_sock out of udp_update_hdrX()
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 10:54:19 +01:00
Laurent Vivier
4d7ca742ef tap: export pool_flush()/tapX_handler()/packet_add()
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 10:54:19 +01:00
Laurent Vivier
9cc20cbdb1 tcp: move buffers management functions to their own file
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-12 10:54:16 +01:00
Laurent Vivier
c38f260820 tcp: rename functions that manage buffers
To separate these functions from the ones specific to TCP management,
we are going to move them to a new file; before that, update their names
to reflect their role.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-11 19:47:54 +01:00
Laurent Vivier
576c1cca2c tcp: extract buffer management from tcp_send_flag()
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-11 19:27:39 +01:00
Laurent Vivier
a66fceb280 tcp: Replace TCP buffer structure by an iovec array
To be able to provide pointers to TCP headers and IP headers without
worrying about alignment in the structure, split the structure into
several arrays and point to each part of the frame using an iovec array.

Using iovec also allows us to simply ignore the first entry when the
vnet length header is not needed. And as the payload buffer contains
only the TCP header and the TCP data we can increase the size of the
TCP data to USHRT_MAX - sizeof(struct tcphdr).

As a side effect, these changes improve performance by a factor of 1.5.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
2024-03-11 15:58:29 +01:00
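
A minimal sketch of the per-frame iovec layout this change moves to; the index constants mirror the TCP_IOV_* values added to tap.h in this diff, while the helper itself is illustrative only:

/* Per-frame iovec layout; indices mirror the TCP_IOV_* constants added
 * to tap.h below. The helper is an illustrative sketch, not code from
 * the series.
 */
#include <stddef.h>
#include <sys/uio.h>

#define TCP_IOV_VNET    0       /* 4-byte vnet length (qemu socket only) */
#define TCP_IOV_ETH     1       /* Ethernet header */
#define TCP_IOV_IP      2       /* IPv4 or IPv6 header */
#define TCP_IOV_PAYLOAD 3       /* TCP header plus data */
#define TCP_IOV_NUM     4

/* Frame length on the wire, ignoring the vnet length entry (pasta case) */
static size_t frame_len(const struct iovec iov[TCP_IOV_NUM])
{
        size_t len = 0;
        int i;

        for (i = TCP_IOV_ETH; i < TCP_IOV_NUM; i++)
                len += iov[i].iov_len;
        return len;
}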
28 changed files with 3631 additions and 865 deletions

Makefile

@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
tcp_splice.c udp.c util.c
tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_vu.c util.c vhost_user.c virtio.c
QRAP_SRCS = qrap.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h inany.h iov.h ip.h isolation.h lineread.h log.h \
ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h siphash.h tap.h \
tcp.h tcp_conn.h tcp_splice.h udp.h util.h
tcp.h tcp_buf.h tcp_conn.h tcp_splice.h tcp_vu.h udp.h udp_internal.h \
udp_vu.h util.h vhost_user.h virtio.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };

32
conf.c

@ -44,6 +44,7 @@
#include "lineread.h"
#include "isolation.h"
#include "log.h"
#include "vhost_user.h"
/**
* next_chunk - Return the next piece of a string delimited by a character
@ -146,7 +147,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
if (fwd->mode)
goto mode_conflict;
if (c->mode != MODE_PASST)
if (c->mode == MODE_PASTA)
die("'all' port forwarding is only allowed for passt");
fwd->mode = FWD_ALL;
@ -721,9 +722,12 @@ static void print_usage(const char *name, int status)
info( " -I, --ns-ifname NAME namespace interface name");
info( " default: same interface name as external one");
} else {
info( " -s, --socket PATH UNIX domain socket path");
info( " -s, --socket, --socket-path PATH UNIX domain socket path");
info( " default: probe free path starting from "
UNIX_SOCK_PATH, 1);
info( " --vhost-user Enable vhost-user mode");
info( " UNIX domain socket is provided by -s option");
info( " --print-capabilities print back-end capabilities in JSON format");
}
info( " -F, --fd FD Use FD as pre-opened connected socket");
@ -1109,6 +1113,7 @@ void conf(struct ctx *c, int argc, char **argv)
{"help", no_argument, NULL, 'h' },
{"socket", required_argument, NULL, 's' },
{"fd", required_argument, NULL, 'F' },
{"socket-path", required_argument, NULL, 's' }, /* vhost-user mandatory */
{"ns-ifname", required_argument, NULL, 'I' },
{"pcap", required_argument, NULL, 'p' },
{"pid", required_argument, NULL, 'P' },
@ -1155,6 +1160,8 @@ void conf(struct ctx *c, int argc, char **argv)
{"config-net", no_argument, NULL, 17 },
{"no-copy-routes", no_argument, NULL, 18 },
{"no-copy-addrs", no_argument, NULL, 19 },
{"vhost-user", no_argument, NULL, 20 },
{"print-capabilities", no_argument, NULL, 21 }, /* vhost-user mandatory */
{ 0 },
};
char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
@ -1226,7 +1233,7 @@ void conf(struct ctx *c, int argc, char **argv)
c->no_dhcp_dns = 0;
break;
case 6:
if (c->mode != MODE_PASST)
if (c->mode == MODE_PASTA)
die("--no-dhcp-dns is for passt mode only");
c->no_dhcp_dns = 1;
@ -1238,7 +1245,7 @@ void conf(struct ctx *c, int argc, char **argv)
c->no_dhcp_dns_search = 0;
break;
case 8:
if (c->mode != MODE_PASST)
if (c->mode == MODE_PASTA)
die("--no-dhcp-search is for passt mode only");
c->no_dhcp_dns_search = 1;
@ -1293,7 +1300,7 @@ void conf(struct ctx *c, int argc, char **argv)
break;
case 14:
fprintf(stdout,
c->mode == MODE_PASST ? "passt " : "pasta ");
c->mode == MODE_PASTA ? "pasta " : "passt ");
fprintf(stdout, VERSION_BLOB);
exit(EXIT_SUCCESS);
case 15:
@ -1314,7 +1321,6 @@ void conf(struct ctx *c, int argc, char **argv)
sizeof(c->ip6.ifname_out), "%s", optarg);
if (ret <= 0 || ret >= (int)sizeof(c->ip6.ifname_out))
die("Invalid interface name: %s", optarg);
break;
case 17:
if (c->mode != MODE_PASTA)
@ -1336,6 +1342,16 @@ void conf(struct ctx *c, int argc, char **argv)
warn("--no-copy-addrs will be dropped soon");
c->no_copy_addrs = copy_addrs_opt = true;
break;
case 20:
if (c->mode == MODE_PASTA) {
err("--vhost-user is for passt mode only");
usage(argv[0]);
}
c->mode = MODE_VU;
break;
case 21:
vu_print_capabilities();
break;
case 'd':
if (c->debug)
die("Multiple --debug options given");
@ -1596,7 +1612,7 @@ void conf(struct ctx *c, int argc, char **argv)
v6_only = true;
break;
case '1':
if (c->mode != MODE_PASST)
if (c->mode == MODE_PASTA)
die("--one-off is for passt mode only");
if (c->one_off)
@ -1643,7 +1659,7 @@ void conf(struct ctx *c, int argc, char **argv)
conf_ugid(runas, &uid, &gid);
if (logfile) {
logfile_init(c->mode == MODE_PASST ? "passt" : "pasta",
logfile_init(c->mode == MODE_PASTA ? "pasta" : "passt",
logfile, logsize);
}

39
iov.c

@ -156,42 +156,3 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)
return len;
}
/**
* iov_copy - Copy data from one scatter/gather I/O vector (struct iovec) to
* another.
*
* @dst_iov: Pointer to the destination array of struct iovec describing
* the scatter/gather I/O vector to copy to.
* @dst_iov_cnt: Number of elements in the destination iov array.
* @iov: Pointer to the source array of struct iovec describing
* the scatter/gather I/O vector to copy from.
* @iov_cnt: Number of elements in the source iov array.
* @offset: Offset within the source iov from where copying should start.
* @bytes: Total number of bytes to copy from iov to dst_iov.
*
* Returns: The number of elements successfully copied to the destination
* iov array.
*/
/* cppcheck-suppress unusedFunction */
unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt,
const struct iovec *iov, size_t iov_cnt,
size_t offset, size_t bytes)
{
unsigned int i, j;
i = iov_skip_bytes(iov, iov_cnt, offset, &offset);
/* copying data */
for (j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) {
size_t len = MIN(bytes, iov[i].iov_len - offset);
dst_iov[j].iov_base = (char *)iov[i].iov_base + offset;
dst_iov[j].iov_len = len;
j++;
bytes -= len;
offset = 0;
}
return j;
}

3
iov.h

@ -25,7 +25,4 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
size_t offset, void *buf, size_t bytes);
size_t iov_size(const struct iovec *iov, size_t iov_cnt);
unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt,
const struct iovec *iov, size_t iov_cnt,
size_t offset, size_t bytes);
#endif /* IOVEC_H */

isolation.c

@ -312,7 +312,7 @@ int isolate_prefork(const struct ctx *c)
* PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
* ever gets around seccomp profiles -- there's no harm in passing it.
*/
if (!c->foreground || c->mode == MODE_PASST)
if (!c->foreground || c->mode != MODE_PASTA)
flags |= CLONE_NEWPID;
if (unshare(flags)) {
@ -379,12 +379,12 @@ void isolate_postfork(const struct ctx *c)
prctl(PR_SET_DUMPABLE, 0);
if (c->mode == MODE_PASST) {
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
prog.filter = filter_passt;
} else {
if (c->mode == MODE_PASTA) {
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
prog.filter = filter_pasta;
} else {
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
prog.filter = filter_passt;
}
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||

packet.c

@ -22,6 +22,42 @@
#include "util.h"
#include "log.h"
static int packet_check_range(const struct pool *p, size_t offset, size_t len,
const char *start, const char *func, int line)
{
ASSERT(p->buf);
if (p->buf_size == 0)
return vu_packet_check_range((void *)p->buf, offset, len, start,
func, line);
if (start < p->buf) {
if (func) {
trace("add packet start %p before buffer start %p, "
"%s:%i", (void *)start, (void *)p->buf, func, line);
}
return -1;
}
if (start + len + offset > p->buf + p->buf_size) {
if (func) {
trace("packet offset plus length %lu from size %lu, "
"%s:%i", start - p->buf + len + offset,
p->buf_size, func, line);
}
return -1;
}
#if UINTPTR_MAX == UINT64_MAX
if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
trace("add packet start %p, buffer start %p, %s:%i",
(void *)start, (void *)p->buf, func, line);
return -1;
}
#endif
return 0;
}
/**
* packet_add_do() - Add data as packet descriptor to given pool
* @p: Existing pool
@ -41,34 +77,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
return;
}
if (start < p->buf) {
trace("add packet start %p before buffer start %p, %s:%i",
(void *)start, (void *)p->buf, func, line);
if (packet_check_range(p, 0, len, start, func, line))
return;
}
if (start + len > p->buf + p->buf_size) {
trace("add packet start %p, length: %zu, buffer end %p, %s:%i",
(void *)start, len, (void *)(p->buf + p->buf_size),
func, line);
return;
}
if (len > UINT16_MAX) {
trace("add packet length %zu, %s:%i", len, func, line);
return;
}
#if UINTPTR_MAX == UINT64_MAX
if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
trace("add packet start %p, buffer start %p, %s:%i",
(void *)start, (void *)p->buf, func, line);
return;
}
#endif
p->pkt[idx].offset = start - p->buf;
p->pkt[idx].len = len;
p->pkt[idx].iov_base = (void *)start;
p->pkt[idx].iov_len = len;
p->count++;
}
@ -104,28 +122,23 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
return NULL;
}
if (p->pkt[idx].offset + len + offset > p->buf_size) {
if (len + offset > p->pkt[idx].iov_len) {
if (func) {
trace("packet offset plus length %zu from size %zu, "
"%s:%i", p->pkt[idx].offset + len + offset,
p->buf_size, func, line);
}
return NULL;
}
if (len + offset > p->pkt[idx].len) {
if (func) {
trace("data length %zu, offset %zu from length %u, "
"%s:%i", len, offset, p->pkt[idx].len,
trace("data length %zu, offset %zu from length %zu, "
"%s:%i", len, offset, p->pkt[idx].iov_len,
func, line);
}
return NULL;
}
if (left)
*left = p->pkt[idx].len - offset - len;
if (packet_check_range(p, offset, len, p->pkt[idx].iov_base,
func, line))
return NULL;
return p->buf + p->pkt[idx].offset + offset;
if (left)
*left = p->pkt[idx].iov_len - offset - len;
return (char *)p->pkt[idx].iov_base + offset;
}
/**

packet.h

@ -6,16 +6,6 @@
#ifndef PACKET_H
#define PACKET_H
/**
* struct desc - Generic offset-based descriptor within buffer
* @offset: Offset of descriptor relative to buffer start, 32-bit limit
* @len: Length of descriptor, host order, 16-bit limit
*/
struct desc {
uint32_t offset;
uint16_t len;
};
/**
* struct pool - Generic pool of packets stored in a buffer
* @buf: Buffer storing packet descriptors
@ -29,9 +19,11 @@ struct pool {
size_t buf_size;
size_t size;
size_t count;
struct desc pkt[1];
struct iovec pkt[1];
};
int vu_packet_check_range(void *buf, size_t offset, size_t len,
const char *start, const char *func, int line);
void packet_add_do(struct pool *p, size_t len, const char *start,
const char *func, int line);
void *packet_get_do(const struct pool *p, const size_t idx,
@ -54,7 +46,7 @@ struct _name ## _t { \
size_t buf_size; \
size_t size; \
size_t count; \
struct desc pkt[_size]; \
struct iovec pkt[_size]; \
}
#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \

18
passt.c

@ -73,6 +73,8 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device",
[EPOLL_TYPE_TAP_PASST] = "connected qemu socket",
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@ -165,7 +167,7 @@ static void timer_init(struct ctx *c, const struct timespec *now)
*/
void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
{
tcp_update_l2_buf(eth_d, eth_s);
tcp_buf_update_l2(eth_d, eth_s);
udp_update_l2_buf(eth_d, eth_s);
}
@ -278,6 +280,7 @@ int main(int argc, char **argv)
pasta_netns_quit_init(&c);
tap_sock_init(&c);
vu_init(&c);
secret_init(&c);
@ -348,7 +351,7 @@ loop:
uint32_t eventmask = events[i].events;
trace("%s: epoll event on %s %i (events: 0x%08x)",
c.mode == MODE_PASST ? "passt" : "pasta",
c.mode == MODE_PASTA ? "pasta" : "passt",
EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
switch (ref.type) {
@ -380,7 +383,10 @@ loop:
tcp_timer_handler(&c, ref);
break;
case EPOLL_TYPE_UDP:
udp_sock_handler(&c, ref, eventmask, &now);
if (c.mode == MODE_VU)
udp_vu_sock_handler(&c, ref, eventmask, &now);
else
udp_buf_sock_handler(&c, ref, eventmask, &now);
break;
case EPOLL_TYPE_ICMP:
icmp_sock_handler(&c, AF_INET, ref);
@ -388,6 +394,12 @@ loop:
case EPOLL_TYPE_ICMPV6:
icmp_sock_handler(&c, AF_INET6, ref);
break;
case EPOLL_TYPE_VHOST_CMD:
tap_handler_vu(&c, eventmask);
break;
case EPOLL_TYPE_VHOST_KICK:
vu_kick_cb(&c, ref);
break;
default:
/* Can't happen */
ASSERT(0);

10
passt.h

@ -42,6 +42,8 @@ union epoll_ref;
#include "fwd.h"
#include "tcp.h"
#include "udp.h"
#include "udp_vu.h"
#include "vhost_user.h"
/**
* enum epoll_type - Different types of fds we poll over
@ -73,6 +75,10 @@ enum epoll_type {
EPOLL_TYPE_TAP_PASST,
/* socket listening for qemu socket connections */
EPOLL_TYPE_TAP_LISTEN,
/* vhost-user command socket */
EPOLL_TYPE_VHOST_CMD,
/* vhost-user kick event socket */
EPOLL_TYPE_VHOST_KICK,
EPOLL_NUM_TYPES,
};
@ -140,6 +146,7 @@ struct fqdn {
enum passt_modes {
MODE_PASST,
MODE_PASTA,
MODE_VU,
};
/**
@ -307,6 +314,9 @@ struct ctx {
int low_wmem;
int low_rmem;
/* vhost-user */
struct VuDev vdev;
};
void proto_update_l2_buf(const unsigned char *eth_d,

313
tap.c

@ -58,6 +58,7 @@
#include "packet.h"
#include "tap.h"
#include "log.h"
#include "vhost_user.h"
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
@ -76,19 +77,22 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
*/
int tap_send(const struct ctx *c, const void *data, size_t len)
{
pcap(data, len);
if (c->mode == MODE_PASST) {
int flags = MSG_NOSIGNAL | MSG_DONTWAIT;
uint32_t vnet_len = htonl(len);
pcap(data, len);
switch (c->mode) {
case MODE_PASST:
if (send(c->fd_tap, &vnet_len, 4, flags) < 0)
return -1;
return send(c->fd_tap, data, len, flags);
}
case MODE_PASTA:
return write(c->fd_tap, (char *)data, len);
case MODE_VU:
return vu_send(c, data, len);
}
return 0;
}
/**
@ -350,6 +354,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c,
return i;
}
/**
* tap_send_iov_pasta() - Send out multiple prepared frames
* @c: Execution context
* @iov: Array of frames, each frames is divided in an array of iovecs.
* The first entry of the iovec is ignored
* @n: Number of frames in @iov
*
* Return: number of frames actually sent
*/
static size_t tap_send_iov_pasta(const struct ctx *c,
struct iovec iov[][TCP_IOV_NUM], size_t n)
{
unsigned int i;
for (i = 0; i < n; i++) {
if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH],
TCP_IOV_NUM - TCP_IOV_ETH))
break;
}
return i;
}
/**
* tap_send_frames_passt() - Send multiple frames to the passt tap
* @c: Execution context
@ -390,6 +418,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
return i;
}
/**
* tap_send_iov_passt() - Send out multiple prepared frames
* @c: Execution context
* @iov: Array of frames, each frames is divided in an array of iovecs.
* The first entry of the iovec is updated to point to an
* uint32_t storing the frame length.
* @n: Number of frames in @iov
*
* Return: number of frames actually sent
*/
static size_t tap_send_iov_passt(const struct ctx *c,
struct iovec iov[][TCP_IOV_NUM],
size_t n)
{
unsigned int i;
for (i = 0; i < n; i++) {
uint32_t vnet_len;
int j;
vnet_len = 0;
for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
vnet_len += iov[i][j].iov_len;
vnet_len = htonl(vnet_len);
iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))
break;
}
return i;
}
/**
* tap_send_frames() - Send out multiple prepared frames
* @c: Execution context
@ -405,10 +469,19 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n)
if (!n)
return 0;
if (c->mode == MODE_PASST)
m = tap_send_frames_passt(c, iov, n);
else
switch (c->mode) {
case MODE_PASTA:
m = tap_send_frames_pasta(c, iov, n);
break;
case MODE_PASST:
m = tap_send_frames_passt(c, iov, n);
break;
case MODE_VU:
ASSERT(0);
default:
m = 0;
break;
}
if (m < n)
debug("tap: failed to send %zu frames of %zu", n - m, n);
@ -418,6 +491,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n)
return m;
}
/**
* tap_send_iov() - Send out multiple prepared frames
* @c: Execution context
* @iov: Array of frames, each frames is divided in an array of iovecs.
* iovec array is:
* TCP_IOV_VNET (0) vnet length
* TCP_IOV_ETH (1) ethernet header
* TCP_IOV_IP (2) IP (v4/v6) header
* TCP_IOV_PAYLOAD (3) IP payload (TCP header + data)
* TCP_IOV_NUM (4) is the number of entries in the iovec array
* TCP_IOV_VNET entry is updated with passt, ignored with pasta.
* @n: Number of frames in @iov
*
* Return: number of frames actually sent
*/
size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
size_t n)
{
size_t m;
unsigned int i;
if (!n)
return 0;
switch (c->mode) {
case MODE_PASST:
m = tap_send_iov_passt(c, iov, n);
break;
case MODE_PASTA:
m = tap_send_iov_pasta(c, iov, n);
break;
default:
ASSERT(0);
}
if (m < n)
debug("tap: failed to send %zu frames of %zu", n - m, n);
for (i = 0; i < m; i++)
pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH);
return m;
}
/**
* eth_update_mac() - Update tap L2 header with new Ethernet addresses
* @eh: Ethernet headers to update
@ -589,7 +706,7 @@ resume:
if (!eh)
continue;
if (ntohs(eh->h_proto) == ETH_P_ARP) {
PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
packet_add(pkt, l2_len, (char *)eh);
arp(c, pkt);
@ -629,7 +746,7 @@ resume:
continue;
if (iph->protocol == IPPROTO_ICMP) {
PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
if (c->no_icmp)
continue;
@ -648,7 +765,7 @@ resume:
continue;
if (iph->protocol == IPPROTO_UDP) {
PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
packet_add(pkt, l2_len, (char *)eh);
if (dhcp(c, pkt))
@ -797,7 +914,7 @@ resume:
}
if (proto == IPPROTO_ICMPV6) {
PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
if (c->no_icmp)
continue;
@ -821,7 +938,7 @@ resume:
uh = (struct udphdr *)l4h;
if (proto == IPPROTO_UDP) {
PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
packet_add(pkt, l4_len, l4h);
@ -907,11 +1024,50 @@ append:
return in->count;
}
void pool_flush_all(void)
{
pool_flush(pool_tap4);
pool_flush(pool_tap6);
}
void tap_handler_all(struct ctx *c, const struct timespec *now)
{
tap4_handler(c, pool_tap4, now);
tap6_handler(c, pool_tap6, now);
}
void packet_add_all_do(struct ctx *c, ssize_t len, char *p,
const char *func, int line)
{
const struct ethhdr *eh;
pcap(p, len);
eh = (struct ethhdr *)p;
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
proto_update_l2_buf(c->mac_guest, NULL);
}
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
case ETH_P_IP:
packet_add_do(pool_tap4, len, p, func, line);
break;
case ETH_P_IPV6:
packet_add_do(pool_tap6, len, p, func, line);
break;
default:
break;
}
}
/**
* tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket
* @c: Execution context
*/
static void tap_sock_reset(struct ctx *c)
void tap_sock_reset(struct ctx *c)
{
if (c->one_off) {
info("Client closed connection, exiting");
@ -933,7 +1089,6 @@ static void tap_sock_reset(struct ctx *c)
void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now)
{
const struct ethhdr *eh;
ssize_t n, rem;
char *p;
@ -946,8 +1101,7 @@ redo:
p = pkt_buf;
rem = 0;
pool_flush(pool_tap4);
pool_flush(pool_tap6);
pool_flush_all();
n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT);
if (n < 0) {
@ -974,37 +1128,18 @@ redo:
/* Complete the partial read above before discarding a malformed
* frame, otherwise the stream will be inconsistent.
*/
if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU)
if (len < (ssize_t)sizeof(struct ethhdr) ||
len > (ssize_t)ETH_MAX_MTU)
goto next;
pcap(p, len);
eh = (struct ethhdr *)p;
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
proto_update_l2_buf(c->mac_guest, NULL);
}
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
case ETH_P_IP:
packet_add(pool_tap4, len, p);
break;
case ETH_P_IPV6:
packet_add(pool_tap6, len, p);
break;
default:
break;
}
packet_add_all(c, len, p);
next:
p += len;
n -= len;
}
tap4_handler(c, pool_tap4, now);
tap6_handler(c, pool_tap6, now);
tap_handler_all(c, now);
/* We can't use EPOLLET otherwise. */
if (rem)
@ -1029,35 +1164,18 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
redo:
n = 0;
pool_flush(pool_tap4);
pool_flush(pool_tap6);
pool_flush_all();
restart:
while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
const struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n);
if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU) {
if (len < (ssize_t)sizeof(struct ethhdr) ||
len > (ssize_t)ETH_MAX_MTU) {
n += len;
continue;
}
pcap(pkt_buf + n, len);
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
proto_update_l2_buf(c->mac_guest, NULL);
}
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
case ETH_P_IP:
packet_add(pool_tap4, len, pkt_buf + n);
break;
case ETH_P_IPV6:
packet_add(pool_tap6, len, pkt_buf + n);
break;
default:
break;
}
packet_add_all(c, len, pkt_buf + n);
if ((n += len) == TAP_BUF_BYTES)
break;
@ -1068,8 +1186,7 @@ restart:
ret = errno;
tap4_handler(c, pool_tap4, now);
tap6_handler(c, pool_tap6, now);
tap_handler_all(c, now);
if (len > 0 || ret == EAGAIN)
return;
@ -1145,11 +1262,17 @@ static void tap_sock_unix_init(struct ctx *c)
ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
if (c->mode == MODE_VU) {
info("You can start qemu with:");
info(" kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n",
addr.sun_path);
} else {
info("You can now start qemu (>= 7.2, with commit 13c6be96618c):");
info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
addr.sun_path);
info("or qrap, for earlier qemu versions:");
info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
}
}
/**
@ -1159,7 +1282,7 @@ static void tap_sock_unix_init(struct ctx *c)
*/
void tap_listen_handler(struct ctx *c, uint32_t events)
{
union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST };
union epoll_ref ref;
struct epoll_event ev = { 0 };
int v = INT_MAX / 2;
struct ucred ucred;
@ -1200,7 +1323,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
trace("tap: failed to set SO_SNDBUF to %i", v);
ref.fd = c->fd_tap;
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
if (c->mode == MODE_VU) {
ref.type = EPOLL_TYPE_VHOST_CMD;
ev.events = EPOLLIN | EPOLLRDHUP;
} else {
ref.type = EPOLL_TYPE_TAP_PASST;
ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
}
ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
}
@ -1261,6 +1390,23 @@ static void tap_sock_tun_init(struct ctx *c)
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
}
void tap_sock_update_buf(void *base, size_t size)
{
int i;
pool_tap4_storage.buf = base;
pool_tap4_storage.buf_size = size;
pool_tap6_storage.buf = base;
pool_tap6_storage.buf_size = size;
for (i = 0; i < TAP_SEQS; i++) {
tap4_l4[i].p.buf = base;
tap4_l4[i].p.buf_size = size;
tap6_l4[i].p.buf = base;
tap6_l4[i].p.buf_size = size;
}
}
/**
* tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor
* @c: Execution context
@ -1272,10 +1418,22 @@ void tap_sock_init(struct ctx *c)
pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz);
pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz);
if (c->mode == MODE_VU) {
pool_tap4_storage.buf = NULL;
pool_tap4_storage.buf_size = 0;
pool_tap6_storage.buf = NULL;
pool_tap6_storage.buf_size = 0;
}
for (i = 0; i < TAP_SEQS; i++) {
tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
if (c->mode == MODE_VU) {
tap4_l4[i].p.buf = NULL;
tap4_l4[i].p.buf_size = 0;
tap6_l4[i].p.buf = NULL;
tap6_l4[i].p.buf_size = 0;
}
}
if (c->fd_tap != -1) { /* Passed as --fd */
@ -1284,21 +1442,30 @@ void tap_sock_init(struct ctx *c)
ASSERT(c->one_off);
ref.fd = c->fd_tap;
if (c->mode == MODE_PASST)
switch (c->mode) {
case MODE_PASST:
ref.type = EPOLL_TYPE_TAP_PASST;
else
ref.type = EPOLL_TYPE_TAP_PASTA;
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
break;
case MODE_PASTA:
ref.type = EPOLL_TYPE_TAP_PASTA;
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
break;
case MODE_VU:
ref.type = EPOLL_TYPE_VHOST_CMD;
ev.events = EPOLLIN | EPOLLRDHUP;
break;
}
ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
return;
}
if (c->mode == MODE_PASST) {
if (c->mode == MODE_PASTA) {
tap_sock_tun_init(c);
} else {
if (c->fd_tap_listen == -1)
tap_sock_unix_init(c);
} else {
tap_sock_tun_init(c);
}
}

27
tap.h

@ -6,6 +6,20 @@
#ifndef TAP_H
#define TAP_H
/*
* TCP frame iovec array:
* TCP_IOV_VNET vnet length
* TCP_IOV_ETH ethernet header
* TCP_IOV_IP IP (v4/v6) header
* TCP_IOV_PAYLOAD IP payload (TCP header + data)
* TCP_IOV_NUM is the number of entries in the iovec array
*/
#define TCP_IOV_VNET 0
#define TCP_IOV_ETH 1
#define TCP_IOV_IP 2
#define TCP_IOV_PAYLOAD 3
#define TCP_IOV_NUM 4
/**
* struct tap_hdr - L2 and tap specific headers
* @vnet_len: Frame length (for qemu socket transport)
@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c,
const void *in, size_t len);
int tap_send(const struct ctx *c, const void *data, size_t len);
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n);
size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
size_t n);
void eth_update_mac(struct ethhdr *eh,
const unsigned char *eth_d, const unsigned char *eth_s);
void tap_listen_handler(struct ctx *c, uint32_t events);
@ -81,6 +97,17 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
const struct timespec *now);
void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now);
void tap_sock_reset(struct ctx *c);
void tap_sock_update_buf(void *base, size_t size);
void tap_sock_init(struct ctx *c);
void pool_flush_all(void);
void tap_handler_all(struct ctx *c, const struct timespec *now);
void packet_add_do(struct pool *p, size_t len, const char *start,
const char *func, int line);
void packet_add_all_do(struct ctx *c, ssize_t len, char *p,
const char *func, int line);
#define packet_add_all(p, len, start) \
packet_add_all_do(p, len, start, __func__, __LINE__)
#endif /* TAP_H */

672
tcp.c

@ -301,57 +301,20 @@
#include "flow.h"
#include "flow_table.h"
#include "tcp_internal.h"
#include "tcp_buf.h"
#include "tcp_vu.h"
/* Sides of a flow as we use them in "tap" connections */
#define SOCKSIDE 0
#define TAPSIDE 1
#define TCP_FRAMES_MEM 128
#define TCP_FRAMES \
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
#define MAX_WS 8
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
#ifdef __AVX2__
uint8_t pad[26];
#else
uint8_t pad[2];
#endif
struct tap_hdr taph;
struct iphdr iph;
struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#ifdef __AVX2__
uint8_t pad[14];
#else
uint8_t pad[2];
#endif
struct tap_hdr taph;
struct ipv6hdr ip6h;
struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
#ifdef HAS_SND_WND
# define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd)
@ -373,31 +336,9 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
*/
#define SOL_TCP IPPROTO_TCP
#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
#define ACK_IF_NEEDED 0 /* See tcp_buf_send_flag() */
#define FIN (1 << 0)
#define SYN (1 << 1)
#define RST (1 << 2)
#define ACK (1 << 4)
/* Flags for internal usage */
#define DUP_ACK (1 << 5)
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define OPT_EOL 0
#define OPT_NOP 1
#define OPT_MSS 2
#define OPT_MSS_LEN 4
#define OPT_WS 3
#define OPT_WS_LEN 3
#define OPT_SACKP 4
#define OPT_SACK 5
#define OPT_TS 8
#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
#define CONN_V6(conn) (!CONN_V4(conn))
#define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \
(conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@ -434,144 +375,11 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
/**
* tcp_buf_seq_update - Sequences to update with length of frames once sent
* @seq: Pointer to sequence number sent to tap-side, to be updated
* @len: TCP payload length
*/
struct tcp_buf_seq_update {
uint32_t *seq;
uint16_t len;
};
/* Static buffers */
/**
* tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
* @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr)
* @uh: Headroom for TCP header
* @data: Storage for TCP payload
*/
static struct tcp4_l2_buf_t {
#ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */
#else
uint8_t pad[2]; /* align iph to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */
struct tcphdr th; /* 64 40 */
uint8_t data[MSS4]; /* 84 60 */
/* 65536 65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_FRAMES_MEM];
static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp4_l2_buf_used;
/**
* tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
* @taph: Tap-level headers (partially pre-filled)
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
* @th: Headroom for TCP header
* @data: Storage for TCP payload
*/
struct tcp6_l2_buf_t {
#ifdef __AVX2__
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
#else
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 14 2 */
struct ipv6hdr ip6h; /* 32 20 */
struct tcphdr th; /* 72 60 */
uint8_t data[MSS6]; /* 92 80 */
/* 65536 65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_FRAMES_MEM];
static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_buf_used;
/* recvmsg()/sendmsg() data for tap */
static char tcp_buf_discard [MAX_WINDOW];
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM];
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM];
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM];
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM];
char tcp_buf_discard [MAX_WINDOW];
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
/**
* tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
* @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr)
* @th: Headroom for TCP header
* @opts: Headroom for TCP options
*/
static struct tcp4_l2_flags_buf_t {
#ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */
#else
uint8_t pad[2]; /* align iph to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */
struct tcphdr th; /* 64 40 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_flags_buf[TCP_FRAMES_MEM];
static unsigned int tcp4_l2_flags_buf_used;
/**
* tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
* @taph: Tap-level headers (partially pre-filled)
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
* @th: Headroom for TCP header
* @opts: Headroom for TCP options
*/
static struct tcp6_l2_flags_buf_t {
#ifdef __AVX2__
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
#else
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 14 2 */
struct ipv6hdr ip6h; /* 32 20 */
struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_flags_buf[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_flags_buf_used;
#define CONN(idx) (&(FLOW(idx)->tcp))
/* Table for lookup from remote address, local port, remote port */
@ -612,14 +420,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
return EPOLLRDHUP;
}
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag);
#define conn_flag(c, conn, flag) \
do { \
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
conn_flag_do(c, conn, flag); \
} while (0)
/**
* tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
* @c: Execution context
@ -731,7 +531,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
* @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset
*/
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag)
{
if (flag & (flag - 1)) {
@ -782,7 +582,7 @@ static void tcp_hash_remove(const struct ctx *c,
* @conn: Connection pointer
* @event: Connection event
*/
static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long event)
{
int prev, new, num = fls(event);
@ -831,12 +631,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_timer_ctl(c, conn);
}
#define conn_event(c, conn, event) \
do { \
flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
conn_event_do(c, conn, event); \
} while (0)
/**
* tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
* @conn: Connection pointer
@ -966,91 +760,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
th->check = csum(th, payload_len, sum);
}
/**
* tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
* @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged
*/
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
{
int i;
for (i = 0; i < TCP_FRAMES_MEM; i++) {
struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
eth_update_mac(&b4->taph.eh, eth_d, eth_s);
eth_update_mac(&b6->taph.eh, eth_d, eth_s);
eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
}
}
/**
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
* @c: Execution context
*/
static void tcp_sock4_iov_init(const struct ctx *c)
{
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
struct iovec *iov;
int i;
for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IP),
.iph = iph,
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
};
}
for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IP),
.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
};
}
for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph);
for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph);
}
/**
* tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
* @c: Execution context
*/
static void tcp_sock6_iov_init(const struct ctx *c)
{
struct iovec *iov;
int i;
for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IPV6),
.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
};
}
for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IPV6),
.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
};
}
for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph);
for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph);
}
/**
* tcp_opt_get() - Get option, and value if any, from TCP header
* @opts: Pointer to start of TCP options in header
@ -1276,46 +985,6 @@ bool tcp_flow_defer(union flow *flow)
return true;
}
static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
#define tcp_rst(c, conn) \
do { \
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
tcp_rst_do(c, conn); \
} while (0)
/**
* tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags)
* @c: Execution context
*/
static void tcp_l2_flags_buf_flush(const struct ctx *c)
{
tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
tcp6_l2_flags_buf_used = 0;
tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
tcp4_l2_flags_buf_used = 0;
}
/**
* tcp_l2_data_buf_flush() - Send out buffers for segments with data
* @c: Execution context
*/
static void tcp_l2_data_buf_flush(const struct ctx *c)
{
unsigned i;
size_t m;
m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
for (i = 0; i < m; i++)
*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
tcp6_l2_buf_used = 0;
m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
for (i = 0; i < m; i++)
*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
tcp4_l2_buf_used = 0;
}
/**
* tcp_defer_handler() - Handler for TCP deferred tasks
* @c: Execution context
@ -1323,8 +992,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
void tcp_defer_handler(struct ctx *c)
{
tcp_l2_flags_buf_flush(c);
tcp_l2_data_buf_flush(c);
tcp_buf_l2_flags_flush(c);
tcp_buf_l2_data_flush(c);
}
/**
@ -1362,7 +1031,7 @@ static void tcp_fill_header(struct tcphdr *th,
*
* Return: The total length of the IPv4 packet, host order
*/
static size_t tcp_fill_headers4(const struct ctx *c,
size_t tcp_fill_headers4(const struct ctx *c,
const struct tcp_tap_conn *conn,
struct iphdr *iph, struct tcphdr *th,
size_t plen, const uint16_t *check,
@ -1383,6 +1052,7 @@ static size_t tcp_fill_headers4(const struct ctx *c,
tcp_fill_header(th, conn, seq);
if (c->mode != MODE_VU)
tcp_update_check_tcp4(iph, th);
return ip_len;
@ -1400,7 +1070,7 @@ static size_t tcp_fill_headers4(const struct ctx *c,
*
* Return: The total length of the IPv6 packet, host order
*/
static size_t tcp_fill_headers6(const struct ctx *c,
size_t tcp_fill_headers6(const struct ctx *c,
const struct tcp_tap_conn *conn,
struct ipv6hdr *ip6h, struct tcphdr *th,
size_t plen, uint32_t seq)
@ -1424,49 +1094,12 @@ static size_t tcp_fill_headers6(const struct ctx *c,
tcp_fill_header(th, conn, seq);
if (c->mode != MODE_VU)
tcp_update_check_tcp6(ip6h, th);
return ip_len;
}
/**
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
* @c: Execution context
* @conn: Connection pointer
* @p: Pointer to any type of TCP pre-cooked buffer
* @plen: Payload length (including TCP header options)
* @check: Checksum, if already known
* @seq: Sequence number for this segment
*
* Return: frame length including L2 headers, host order
*/
static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
const struct tcp_tap_conn *conn,
void *p, size_t plen,
const uint16_t *check, uint32_t seq)
{
const struct in_addr *a4 = inany_v4(&conn->faddr);
size_t ip_len, tlen;
if (a4) {
struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
check, seq);
tlen = tap_iov_len(c, &b->taph, ip_len);
} else {
struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
seq);
tlen = tap_iov_len(c, &b->taph, ip_len);
}
return tlen;
}
/**
* tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
* @c: Execution context
@ -1476,7 +1109,7 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
*
* Return: 1 if sequence or window were updated, 0 otherwise
*/
static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
int force_seq, struct tcp_info *tinfo)
{
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
@ -1584,27 +1217,27 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
}
/**
* tcp_send_flag() - Send segment with flags to tap (no payload)
* tcp_fill_flag_header() - Prepare header for flags-only segment (no payload)
* @c: Execution context
* @conn: Connection pointer
* @flags: TCP flags: if not set, send segment only if ACK is due
* @th: TCP header to update
* @opts: buffer to store TCP option
* @optlen: size of the TCP option buffer
*
* Return: negative error code on connection reset, 0 otherwise
* Return: < 0 error code on connection reset,
* 0 if there is no flag to send
* 1 otherwise
*/
static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, char *opts,
size_t *optlen)
{
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
struct tcp4_l2_flags_buf_t *b4 = NULL;
struct tcp6_l2_flags_buf_t *b6 = NULL;
struct tcp_info tinfo = { 0 };
socklen_t sl = sizeof(tinfo);
int s = conn->sock;
size_t optlen = 0;
struct iovec *iov;
struct tcphdr *th;
char *data;
void *p;
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
!flags && conn->wnd_to_tap)
@ -1626,37 +1259,19 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
return 0;
if (CONN_V4(conn)) {
iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used;
p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
th = &b4->th;
/* gcc 11.2 would complain on data = (char *)(th + 1); */
data = b4->opts;
} else {
iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used;
p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
th = &b6->th;
data = b6->opts;
}
if (flags & SYN) {
int mss;
/* Options: MSS, NOP and window scale (8 bytes) */
optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
*optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
*data++ = OPT_MSS;
*data++ = OPT_MSS_LEN;
*opts++ = OPT_MSS;
*opts++ = OPT_MSS_LEN;
if (c->mtu == -1) {
mss = tinfo.tcpi_snd_mss;
} else {
mss = c->mtu - sizeof(struct tcphdr);
if (CONN_V4(conn))
mss -= sizeof(struct iphdr);
else
mss -= sizeof(struct ipv6hdr);
if (c->low_wmem &&
!(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn))
@ -1664,16 +1279,16 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
else if (mss > PAGE_SIZE)
mss = ROUND_DOWN(mss, PAGE_SIZE);
}
*(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
*(uint16_t *)opts = htons(MIN(USHRT_MAX, mss));
data += OPT_MSS_LEN - 2;
opts += OPT_MSS_LEN - 2;
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
*data++ = OPT_NOP;
*data++ = OPT_WS;
*data++ = OPT_WS_LEN;
*data++ = conn->ws_to_tap;
*opts++ = OPT_NOP;
*opts++ = OPT_WS;
*opts++ = OPT_WS_LEN;
*opts++ = conn->ws_to_tap;
th->ack = !!(flags & ACK);
} else {
@ -1682,15 +1297,12 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
!prev_wnd_to_tap;
}
th->doff = (sizeof(*th) + optlen) / 4;
th->doff = (sizeof(*th) + *optlen) / 4;
th->rst = !!(flags & RST);
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
NULL, conn->seq_to_tap);
if (th->ack) {
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
conn_flag(c, conn, ~ACK_TO_TAP_DUE);
@ -1705,27 +1317,14 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (th->fin || th->syn)
conn->seq_to_tap++;
if (CONN_V4(conn)) {
if (flags & DUP_ACK) {
memcpy(b4 + 1, b4, sizeof(*b4));
(iov + 1)->iov_len = iov->iov_len;
tcp4_l2_flags_buf_used++;
}
return 1;
}
if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
tcp_l2_flags_buf_flush(c);
} else {
if (flags & DUP_ACK) {
memcpy(b6 + 1, b6, sizeof(*b6));
(iov + 1)->iov_len = iov->iov_len;
tcp6_l2_flags_buf_used++;
}
if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
tcp_l2_flags_buf_flush(c);
}
return 0;
int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
if (c->mode == MODE_VU)
return tcp_vu_send_flag(c, conn, flags);
return tcp_buf_send_flag(c, conn, flags);
}
/**
@ -1733,7 +1332,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @c: Execution context
* @conn: Connection pointer
*/
static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
{
if (conn->events == CLOSED)
return;
@ -1881,21 +1480,22 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
*
* Return: clamped MSS value
*/
static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
static uint16_t tcp_conn_tap_mss(const struct ctx *c,
const struct tcp_tap_conn *conn,
const char *opts, size_t optlen)
{
unsigned int mss;
int ret;
(void)c; /* unused */
(void)conn; /* unused */
if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
mss = MSS_DEFAULT;
else
mss = ret;
if (CONN_V4(conn))
mss = MIN(MSS4, mss);
else
mss = MIN(MSS6, mss);
mss = MIN(MSS, mss);
return MIN(mss, USHRT_MAX);
}
@ -2051,7 +1651,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
conn->wnd_to_tap = WINDOW_DEFAULT;
mss = tcp_conn_tap_mss(conn, opts, optlen);
mss = tcp_conn_tap_mss(c, conn, opts, optlen);
if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)))
flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s);
MSS_SET(conn, mss);
@ -2156,174 +1756,12 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
return 0;
}
/**
* tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
* @c: Execution context
* @conn: Connection pointer
* @plen: Payload length at L4
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
* @seq: Sequence number to be sent
*/
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
ssize_t plen, int no_csum, uint32_t seq)
{
uint32_t *seq_update = &conn->seq_to_tap;
struct iovec *iov;
if (CONN_V4(conn)) {
struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
iov = tcp4_l2_iov + tcp4_l2_buf_used++;
iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
check, seq);
if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
tcp_l2_data_buf_flush(c);
} else if (CONN_V6(conn)) {
struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
iov = tcp6_l2_iov + tcp6_l2_buf_used++;
iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
NULL, seq);
if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
tcp_l2_data_buf_flush(c);
}
}
/**
* tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
* @c: Execution context
* @conn: Connection pointer
*
* Return: negative on connection reset, 0 otherwise
*
* #syscalls recvmsg
*/
static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
int sendlen, len, plen, v4 = CONN_V4(conn);
int s = conn->sock, i, ret = 0;
struct msghdr mh_sock = { 0 };
uint16_t mss = MSS_GET(conn);
uint32_t already_sent, seq;
struct iovec *iov;
if (c->mode == MODE_VU)
return tcp_vu_data_from_sock(c, conn);
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
if (SEQ_LT(already_sent, 0)) {
/* RFC 761, section 2.1. */
flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
conn->seq_ack_from_tap, conn->seq_to_tap);
conn->seq_to_tap = conn->seq_ack_from_tap;
already_sent = 0;
}
if (!wnd_scaled || already_sent >= wnd_scaled) {
conn_flag(c, conn, STALLED);
conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0;
}
/* Set up buffer descriptors we'll fill completely and partially. */
fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
if (fill_bufs > TCP_FRAMES) {
fill_bufs = TCP_FRAMES;
iov_rem = 0;
} else {
iov_rem = (wnd_scaled - already_sent) % mss;
}
mh_sock.msg_iov = iov_sock;
mh_sock.msg_iovlen = fill_bufs + 1;
iov_sock[0].iov_base = tcp_buf_discard;
iov_sock[0].iov_len = already_sent;
if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
(!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
tcp_l2_data_buf_flush(c);
/* Silence Coverity CWE-125 false positive */
tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
}
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
if (v4)
iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
else
iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
iov_sock[fill_bufs].iov_len = iov_rem;
/* Receive into buffers, don't dequeue until acknowledged by guest. */
do
len = recvmsg(s, &mh_sock, MSG_PEEK);
while (len < 0 && errno == EINTR);
if (len < 0)
goto err;
if (!len) {
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
if ((ret = tcp_send_flag(c, conn, FIN | ACK))) {
tcp_rst(c, conn);
return ret;
}
conn_event(c, conn, TAP_FIN_SENT);
}
return 0;
}
sendlen = len - already_sent;
if (sendlen <= 0) {
conn_flag(c, conn, STALLED);
return 0;
}
conn_flag(c, conn, ~STALLED);
send_bufs = DIV_ROUND_UP(sendlen, mss);
last_len = sendlen - (send_bufs - 1) * mss;
/* Likely, some new data was acked too. */
tcp_update_seqack_wnd(c, conn, 0, NULL);
/* Finally, queue to tap */
plen = mss;
seq = conn->seq_to_tap;
for (i = 0; i < send_bufs; i++) {
int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
if (i == send_bufs - 1)
plen = last_len;
tcp_data_to_tap(c, conn, plen, no_csum, seq);
seq += plen;
}
conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0;
err:
if (errno != EAGAIN && errno != EWOULDBLOCK) {
ret = -errno;
tcp_rst(c, conn);
}
return ret;
return tcp_buf_data_from_sock(c, conn);
}
/**
@ -2542,7 +1980,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
conn->wnd_from_tap = 1;
MSS_SET(conn, tcp_conn_tap_mss(conn, opts, optlen));
MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
conn->seq_init_from_tap = ntohl(th->seq) + 1;
conn->seq_from_tap = conn->seq_init_from_tap;
@ -3179,10 +2617,10 @@ int tcp_init(struct ctx *c)
tc_hash[b] = FLOW_SIDX_NONE;
if (c->ifi4)
tcp_sock4_iov_init(c);
tcp_buf_sock4_iov_init(c);
if (c->ifi6)
tcp_sock6_iov_init(c);
tcp_buf_sock6_iov_init(c);
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));

2
tcp.h

@ -23,7 +23,7 @@ int tcp_init(struct ctx *c);
void tcp_timer(struct ctx *c, const struct timespec *now);
void tcp_defer_handler(struct ctx *c);
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s);
/**
* union tcp_epoll_ref - epoll reference portion for TCP connections

494
tcp_buf.c Normal file

@@ -0,0 +1,494 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* tcp_buf.c - TCP L2-L4 translation state machine
*
* Copyright (c) 2020-2022 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <stddef.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <errno.h>
#include <netinet/ip.h>
#include <linux/tcp.h>
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "tap.h"
#include "siphash.h"
#include "inany.h"
#include "tcp_conn.h"
#include "tcp_internal.h"
#include "tcp_buf.h"
#define TCP_FRAMES_MEM 128
#define TCP_FRAMES \
(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
/**
* struct tcp_buf_seq_update - Sequence number to update by the frame length once it is sent
* @seq: Pointer to sequence number sent to tap-side, to be updated
* @len: TCP payload length
*/
struct tcp_buf_seq_update {
uint32_t *seq;
uint16_t len;
};
/* Static buffers */
/**
* struct tcp_l2_flags_t - TCP header and option data for flags-only segments
* @th: TCP header
* @opts: TCP options
*/
struct tcp_l2_flags_t {
struct tcphdr th;
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
};
/**
* struct tcp_l2_payload_t - TCP header and payload data for data segments,
* 32-byte aligned to allow use of the AVX2 checksum routines
* @th: TCP header
* @data: TCP data
*/
struct tcp_l2_payload_t {
struct tcphdr th; /* 20 bytes */
uint8_t data[MSS]; /* 65516 bytes */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/* Ethernet header for IPv4 frames */
static struct ethhdr tcp4_eth_src;
/* IPv4 headers */
static struct iphdr tcp4_l2_ip[TCP_FRAMES_MEM];
/* TCP headers and data for IPv4 frames */
static struct tcp_l2_payload_t tcp4_l2_payload[TCP_FRAMES_MEM];
static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp4_l2_buf_used;
/* IPv4 headers for TCP option flags frames */
static struct iphdr tcp4_l2_flags_ip[TCP_FRAMES_MEM];
/* TCP headers and option flags for IPv4 frames */
static struct tcp_l2_flags_t tcp4_l2_flags[TCP_FRAMES_MEM];
static unsigned int tcp4_l2_flags_buf_used;
/* Ethernet header for IPv6 frames */
static struct ethhdr tcp6_eth_src;
/* IPv6 headers */
static struct ipv6hdr tcp6_l2_ip[TCP_FRAMES_MEM];
/* TCP headers and data for IPv6 frames */
static struct tcp_l2_payload_t tcp6_l2_payload[TCP_FRAMES_MEM];
static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_buf_used;
/* IPv6 headers for TCP option flags frames */
static struct ipv6hdr tcp6_l2_flags_ip[TCP_FRAMES_MEM];
/* TCP headers and option flags for IPv6 frames */
static struct tcp_l2_flags_t tcp6_l2_flags[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_flags_buf_used;
/* recvmsg()/sendmsg() data for tap */
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM];
/**
* tcp_buf_update_l2() - Update Ethernet header buffers with addresses
* @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged
*/
void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s)
{
eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
}
/**
* tcp_buf_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
* @c: Execution context
*/
void tcp_buf_sock4_iov_init(const struct ctx *c)
{
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
int i;
(void)c;
tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
for (i = 0; i < TCP_FRAMES_MEM; i++) {
struct iovec *iov;
/* headers */
tcp4_l2_ip[i] = iph;
tcp4_l2_payload[i].th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
tcp4_l2_flags_ip[i] = iph;
tcp4_l2_flags[i].th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
/* iovecs */
iov = tcp4_l2_iov[i];
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i];
iov = tcp4_l2_flags_iov[i];
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i];
}
}
/**
* tcp_buf_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
* @c: Execution context
*/
void tcp_buf_sock6_iov_init(const struct ctx *c)
{
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
int i;
(void)c;
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
for (i = 0; i < TCP_FRAMES_MEM; i++) {
struct iovec *iov;
/* headers */
tcp6_l2_ip[i] = ip6;
tcp6_l2_payload[i].th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
tcp6_l2_flags_ip[i] = ip6;
tcp6_l2_flags[i].th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
/* iovecs */
iov = tcp6_l2_iov[i];
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i];
iov = tcp6_l2_flags_iov[i];
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i];
}
}
/**
* tcp_buf_l2_flags_flush() - Send out buffers for segments with no data (flags)
* @c: Execution context
*/
void tcp_buf_l2_flags_flush(const struct ctx *c)
{
tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
tcp6_l2_flags_buf_used = 0;
tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
tcp4_l2_flags_buf_used = 0;
}
/**
* tcp_buf_l2_data_flush() - Send out buffers for segments with data
* @c: Execution context
*/
void tcp_buf_l2_data_flush(const struct ctx *c)
{
unsigned i;
size_t m;
m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used);
for (i = 0; i < m; i++)
*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
tcp6_l2_buf_used = 0;
m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used);
for (i = 0; i < m; i++)
*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
tcp4_l2_buf_used = 0;
}
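/**
 * tcp_buf_send_flag() - Send segment with flags to tap (no payload)
 * @c: Execution context
 * @conn: Connection pointer
 * @flags: TCP flags: if not set, send segment only if ACK is due
 *
 * Return: negative error code on failure, 0 otherwise
 */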
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
struct tcp_l2_flags_t *payload;
struct iovec *dup_iov;
struct iovec *iov;
struct tcphdr *th;
size_t optlen = 0;
size_t ip_len;
char *data;
int ret;
if (CONN_V4(conn)) {
iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used];
} else {
iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];
dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used];
}
payload = iov[TCP_IOV_PAYLOAD].iov_base;
th = &payload->th;
data = payload->opts;
ret = tcp_fill_flag_header(c, conn, flags, th, data, &optlen);
if (ret <= 0)
return ret;
if (CONN_V4(conn)) {
struct iphdr *iph = iov[TCP_IOV_IP].iov_base;
ip_len = tcp_fill_headers4(c, conn, iph, th, optlen, NULL,
conn->seq_to_tap);
} else {
struct ipv6hdr *ip6h = iov[TCP_IOV_IP].iov_base;
ip_len = tcp_fill_headers6(c, conn, ip6h, th, optlen,
conn->seq_to_tap);
}
iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
if (flags & DUP_ACK) {
int i;
for (i = 0; i < TCP_IOV_NUM; i++) {
memcpy(dup_iov[i].iov_base, iov[i].iov_base,
iov[i].iov_len);
dup_iov[i].iov_len = iov[i].iov_len;
}
}
if (CONN_V4(conn)) {
if (flags & DUP_ACK)
tcp4_l2_flags_buf_used++;
if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
tcp_buf_l2_flags_flush(c);
} else {
if (flags & DUP_ACK)
tcp6_l2_flags_buf_used++;
if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
tcp_buf_l2_flags_flush(c);
}
return 0;
}
/**
* tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
* @c: Execution context
* @conn: Connection pointer
* @plen: Payload length at L4
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
* @seq: Sequence number to be sent
*/
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
ssize_t plen, int no_csum, uint32_t seq)
{
uint32_t *seq_update = &conn->seq_to_tap;
struct iovec *iov;
if (CONN_V4(conn)) {
struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1];
const uint16_t *check = NULL;
if (no_csum) {
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
check = &iph->check;
}
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
iov = tcp4_l2_iov[tcp4_l2_buf_used++];
iov[TCP_IOV_PAYLOAD].iov_len = tcp_fill_headers4(c, conn,
iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base,
plen, check, seq);
if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1)
tcp_buf_l2_data_flush(c);
} else if (CONN_V6(conn)) {
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
iov = tcp6_l2_iov[tcp6_l2_buf_used++];
iov[TCP_IOV_PAYLOAD].iov_len = tcp_fill_headers6(c, conn,
iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base,
plen, seq);
if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1)
tcp_buf_l2_data_flush(c);
}
}
/**
* tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
* @c: Execution context
* @conn: Connection pointer
*
* Return: negative on connection reset, 0 otherwise
*
* #syscalls recvmsg
*/
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
int sendlen, len, plen, v4 = CONN_V4(conn);
int s = conn->sock, i, ret = 0;
struct msghdr mh_sock = { 0 };
uint16_t mss = MSS_GET(conn);
uint32_t already_sent, seq;
struct iovec *iov;
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
if (SEQ_LT(already_sent, 0)) {
/* RFC 761, section 2.1. */
flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
conn->seq_ack_from_tap, conn->seq_to_tap);
conn->seq_to_tap = conn->seq_ack_from_tap;
already_sent = 0;
}
if (!wnd_scaled || already_sent >= wnd_scaled) {
conn_flag(c, conn, STALLED);
conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0;
}
/* Set up buffer descriptors we'll fill completely and partially. */
fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
if (fill_bufs > TCP_FRAMES) {
fill_bufs = TCP_FRAMES;
iov_rem = 0;
} else {
iov_rem = (wnd_scaled - already_sent) % mss;
}
mh_sock.msg_iov = iov_sock;
mh_sock.msg_iovlen = fill_bufs + 1;
iov_sock[0].iov_base = tcp_buf_discard;
iov_sock[0].iov_len = already_sent;
if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) ||
(!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) {
tcp_buf_l2_data_flush(c);
/* Silence Coverity CWE-125 false positive */
tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
}
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
if (v4)
iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data;
else
iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
iov_sock[fill_bufs].iov_len = iov_rem;
/* Receive into buffers, don't dequeue until acknowledged by guest. */
do
len = recvmsg(s, &mh_sock, MSG_PEEK);
while (len < 0 && errno == EINTR);
if (len < 0)
goto err;
if (!len) {
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
tcp_rst(c, conn);
return ret;
}
conn_event(c, conn, TAP_FIN_SENT);
}
return 0;
}
sendlen = len - already_sent;
if (sendlen <= 0) {
conn_flag(c, conn, STALLED);
return 0;
}
conn_flag(c, conn, ~STALLED);
send_bufs = DIV_ROUND_UP(sendlen, mss);
last_len = sendlen - (send_bufs - 1) * mss;
/* Likely, some new data was acked too. */
tcp_update_seqack_wnd(c, conn, 0, NULL);
/* Finally, queue to tap */
plen = mss;
seq = conn->seq_to_tap;
for (i = 0; i < send_bufs; i++) {
int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
if (i == send_bufs - 1)
plen = last_len;
tcp_data_to_tap(c, conn, plen, no_csum, seq);
seq += plen;
}
conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0;
err:
if (errno != EAGAIN && errno != EWOULDBLOCK) {
ret = -errno;
tcp_rst(c, conn);
}
return ret;
}
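One detail worth spelling out from the functions above: conn->seq_to_tap is not advanced when a data frame is queued, only once tap_send_iov() confirms the frame actually went out. Condensed from tcp_data_to_tap() and tcp_buf_l2_data_flush() above:

  /* queue time: record the pending advance for this frame */
  tcp4_l2_buf_seq_update[n].seq = &conn->seq_to_tap;
  tcp4_l2_buf_seq_update[n].len = plen;

  /* flush time: apply it only for the m frames actually sent */
  m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used);
  for (i = 0; i < m; i++)
          *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;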

17
tcp_buf.h Normal file

@@ -0,0 +1,17 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright (c) 2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#ifndef TCP_BUF_H
#define TCP_BUF_H
void tcp_buf_sock4_iov_init(const struct ctx *c);
void tcp_buf_sock6_iov_init(const struct ctx *c);
void tcp_buf_l2_flags_flush(const struct ctx *c);
void tcp_buf_l2_data_flush(const struct ctx *c);
uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn);
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
#endif /*TCP_BUF_H */

81
tcp_internal.h Normal file

@@ -0,0 +1,81 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright (c) 2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#ifndef TCP_INTERNAL_H
#define TCP_INTERNAL_H
#define MAX_WS 8
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
#define MSS (USHRT_MAX - sizeof(struct tcphdr))
#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
#define FIN (1 << 0)
#define SYN (1 << 1)
#define RST (1 << 2)
#define ACK (1 << 4)
/* Flags for internal usage */
#define DUP_ACK (1 << 5)
#define OPT_EOL 0
#define OPT_NOP 1
#define OPT_MSS 2
#define OPT_MSS_LEN 4
#define OPT_WS 3
#define OPT_WS_LEN 3
#define OPT_SACKP 4
#define OPT_SACK 5
#define OPT_TS 8
#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
#define CONN_V6(conn) (!CONN_V4(conn))
extern char tcp_buf_discard[MAX_WINDOW];
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag);
#define conn_flag(c, conn, flag) \
do { \
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
conn_flag_do(c, conn, flag); \
} while (0)
void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long event);
#define conn_event(c, conn, event) \
do { \
flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
conn_event_do(c, conn, event); \
} while (0)
void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
#define tcp_rst(c, conn) \
do { \
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
tcp_rst_do(c, conn); \
} while (0)
size_t tcp_fill_headers4(const struct ctx *c,
const struct tcp_tap_conn *conn,
struct iphdr *iph, struct tcphdr *th,
size_t plen, const uint16_t *check,
uint32_t seq);
size_t tcp_fill_headers6(const struct ctx *c,
const struct tcp_tap_conn *conn,
struct ipv6hdr *ip6h, struct tcphdr *th,
size_t plen, uint32_t seq);
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
int force_seq, struct tcp_info *tinfo);
int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags,
struct tcphdr *th, char *opts, size_t *optlen);
#endif /* TCP_INTERNAL_H */
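The SEQ_* macros compare 32-bit sequence numbers modulo 2^32, so they stay correct across wraparound as long as the two values are within MAX_WINDOW of each other. A small worked example (arbitrary values, assuming uint32_t operands as in the flow code):

  uint32_t a = 0xfffffff0, b = 0x00000010;  /* b is 0x20 bytes ahead of a */

  SEQ_LT(a, b);  /* true:  (b - a) - 1 == 0x1f, below MAX_WINDOW (1 << 24) */
  SEQ_GE(b, a);  /* true:  (b - a)     == 0x20, below MAX_WINDOW */
  SEQ_GT(a, b);  /* false: (a - b) - 1 wraps to ~0xffffffdf, above MAX_WINDOW */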

460
tcp_vu.c Normal file

@@ -0,0 +1,460 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <netinet/ip.h>
#include <sys/socket.h>
#include <linux/tcp.h>
#include <linux/virtio_net.h>
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "siphash.h"
#include "inany.h"
#include "vhost_user.h"
#include "tcp.h"
#include "pcap.h"
#include "flow.h"
#include "tcp_conn.h"
#include "flow_table.h"
#include "tcp_vu.h"
#include "tcp_internal.h"
#include "checksum.h"
#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
#define CONN_V6(conn) (!CONN_V4(conn))
/* vhost-user */
static const struct virtio_net_hdr vu_header = {
.flags = VIRTIO_NET_HDR_F_DATA_VALID,
.gso_type = VIRTIO_NET_HDR_GSO_NONE,
};
static unsigned char buffer[65536];
static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE];
static unsigned int indexes [VIRTQUEUE_MAX_SIZE];
uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn)
{
(void)conn;
return USHRT_MAX;
}
int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
VuDev *vdev = (VuDev *)&c->vdev;
VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
size_t tlen, vnet_hdrlen, ip_len, optlen = 0;
struct virtio_net_hdr_mrg_rxbuf *vh;
VuVirtqElement *elem;
struct ethhdr *eh;
int nb_ack;
int ret;
elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
if (!elem)
return 0;
if (elem->in_num < 1) {
err("virtio-net receive queue contains no in buffers");
vu_queue_rewind(vdev, vq, 1);
return 0;
}
vh = elem->in_sg[0].iov_base;
vh->hdr = vu_header;
if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
vh->num_buffers = htole16(1);
} else {
vnet_hdrlen = sizeof(struct virtio_net_hdr);
}
eh = (struct ethhdr *)((char *)elem->in_sg[0].iov_base + vnet_hdrlen);
memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
if (CONN_V4(conn)) {
struct iphdr *iph = (struct iphdr *)(eh + 1);
struct tcphdr *th = (struct tcphdr *)(iph + 1);
char *data = (char *)(th + 1);
eh->h_proto = htons(ETH_P_IP);
*th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
ret = tcp_fill_flag_header(c, conn, flags, th, data, &optlen);
if (ret <= 0) {
vu_queue_rewind(vdev, vq, 1);
return ret;
}
ip_len = tcp_fill_headers4(c, conn, iph,
(struct tcphdr *)(iph + 1), optlen,
NULL, conn->seq_to_tap);
tlen = ip_len + sizeof(struct ethhdr);
if (*c->pcap) {
uint32_t sum = proto_ipv4_header_psum(iph->tot_len,
IPPROTO_TCP,
(struct in_addr){ .s_addr = iph->saddr },
(struct in_addr){ .s_addr = iph->daddr });
th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
}
} else {
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
char *data = (char *)(th + 1);
eh->h_proto = htons(ETH_P_IPV6);
*th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
ret = tcp_fill_flag_header(c, conn, flags, th, data, &optlen);
if (ret <= 0) {
vu_queue_rewind(vdev, vq, 1);
return ret;
}
ip_len = tcp_fill_headers6(c, conn, ip6h,
(struct tcphdr *)(ip6h + 1),
optlen, conn->seq_to_tap);
tlen = ip_len + sizeof(struct ethhdr);
if (*c->pcap) {
uint32_t sum = proto_ipv6_header_psum(ip6h->payload_len,
IPPROTO_TCP,
&ip6h->saddr,
&ip6h->daddr);
th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
}
}
pcap((void *)eh, tlen);
tlen += vnet_hdrlen;
vu_queue_fill(vdev, vq, elem, tlen, 0);
nb_ack = 1;
if (flags & DUP_ACK) {
elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
if (elem) {
if (elem->in_num < 1 || elem->in_sg[0].iov_len < tlen) {
vu_queue_rewind(vdev, vq, 1);
} else {
memcpy(elem->in_sg[0].iov_base, vh, tlen);
nb_ack++;
}
}
}
vu_queue_flush(vdev, vq, nb_ack);
vu_queue_notify(vdev, vq);
return 0;
}
int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
uint32_t already_sent;
VuDev *vdev = (VuDev *)&c->vdev;
VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
int s = conn->sock, v4 = CONN_V4(conn);
int i, ret = 0, iov_count, iov_used;
struct msghdr mh_sock = { 0 };
size_t l2_hdrlen, vnet_hdrlen, fillsize;
ssize_t len;
uint16_t *check;
uint16_t mss = MSS_GET(conn);
int num_buffers;
int segment_size;
struct iovec *first;
bool has_mrg_rxbuf;
if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
err("Got packet, but no available descriptors on RX virtq.");
return 0;
}
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
if (SEQ_LT(already_sent, 0)) {
/* RFC 761, section 2.1. */
flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
conn->seq_ack_from_tap, conn->seq_to_tap);
conn->seq_to_tap = conn->seq_ack_from_tap;
already_sent = 0;
}
if (!wnd_scaled || already_sent >= wnd_scaled) {
conn_flag(c, conn, STALLED);
conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0;
}
/* Set up buffer descriptors we'll fill completely and partially. */
fillsize = wnd_scaled;
iov_vu[0].iov_base = tcp_buf_discard;
iov_vu[0].iov_len = already_sent;
fillsize -= already_sent;
has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
if (has_mrg_rxbuf) {
vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
} else {
vnet_hdrlen = sizeof(struct virtio_net_hdr);
}
l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr);
if (v4) {
l2_hdrlen += sizeof(struct iphdr);
} else {
l2_hdrlen += sizeof(struct ipv6hdr);
}
iov_count = 0;
segment_size = 0;
while (fillsize > 0 && iov_count < VIRTQUEUE_MAX_SIZE - 1) {
VuVirtqElement *elem;
elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
if (!elem)
break;
if (elem->in_num < 1) {
err("virtio-net receive queue contains no in buffers");
goto err;
}
ASSERT(elem->in_num == 1);
ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen);
indexes[iov_count] = elem->index;
if (segment_size == 0) {
iov_vu[iov_count + 1].iov_base =
(char *)elem->in_sg[0].iov_base + l2_hdrlen;
iov_vu[iov_count + 1].iov_len =
elem->in_sg[0].iov_len - l2_hdrlen;
} else {
iov_vu[iov_count + 1].iov_base = elem->in_sg[0].iov_base;
iov_vu[iov_count + 1].iov_len = elem->in_sg[0].iov_len;
}
if (iov_vu[iov_count + 1].iov_len > fillsize)
iov_vu[iov_count + 1].iov_len = fillsize;
segment_size += iov_vu[iov_count + 1].iov_len;
if (!has_mrg_rxbuf) {
segment_size = 0;
} else if (segment_size >= mss) {
iov_vu[iov_count + 1].iov_len -= segment_size - mss;
segment_size = 0;
}
fillsize -= iov_vu[iov_count + 1].iov_len;
iov_count++;
}
if (iov_count == 0)
return 0;
mh_sock.msg_iov = iov_vu;
mh_sock.msg_iovlen = iov_count + 1;
do
len = recvmsg(s, &mh_sock, MSG_PEEK);
while (len < 0 && errno == EINTR);
if (len < 0)
goto err;
if (!len) {
vu_queue_rewind(vdev, vq, iov_count);
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
if ((ret = tcp_vu_send_flag(c, conn, FIN | ACK))) {
tcp_rst(c, conn);
return ret;
}
conn_event(c, conn, TAP_FIN_SENT);
}
return 0;
}
len -= already_sent;
if (len <= 0) {
conn_flag(c, conn, STALLED);
vu_queue_rewind(vdev, vq, iov_count);
return 0;
}
conn_flag(c, conn, ~STALLED);
/* Likely, some new data was acked too. */
tcp_update_seqack_wnd(c, conn, 0, NULL);
/* initialize headers */
iov_used = 0;
num_buffers = 0;
check = NULL;
segment_size = 0;
for (i = 0; i < iov_count && len; i++) {
if (segment_size == 0)
first = &iov_vu[i + 1];
if (iov_vu[i + 1].iov_len > (size_t)len)
iov_vu[i + 1].iov_len = len;
len -= iov_vu[i + 1].iov_len;
iov_used++;
segment_size += iov_vu[i + 1].iov_len;
num_buffers++;
if (segment_size >= mss || len == 0 ||
i + 1 == iov_count || !has_mrg_rxbuf) {
struct ethhdr *eh;
struct virtio_net_hdr_mrg_rxbuf *vh;
char *base = (char *)first->iov_base - l2_hdrlen;
size_t size = first->iov_len + l2_hdrlen;
vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
vh->hdr = vu_header;
if (has_mrg_rxbuf)
vh->num_buffers = htole16(num_buffers);
eh = (struct ethhdr *)((char *)base + vnet_hdrlen);
memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
/* initialize header */
if (v4) {
struct iphdr *iph = (struct iphdr *)(eh + 1);
struct tcphdr *th = (struct tcphdr *)(iph + 1);
eh->h_proto = htons(ETH_P_IP);
*th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
tcp_fill_headers4(c, conn, iph,
(struct tcphdr *)(iph + 1),
segment_size, len ? check : NULL,
conn->seq_to_tap);
if (*c->pcap) {
uint32_t sum = proto_ipv4_header_psum(iph->tot_len,
IPPROTO_TCP,
(struct in_addr){ .s_addr = iph->saddr },
(struct in_addr){ .s_addr = iph->daddr });
first->iov_base = th;
first->iov_len = size - l2_hdrlen + sizeof(*th);
th->check = csum_iov(first, num_buffers, sum);
}
check = &iph->check;
} else {
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
eh->h_proto = htons(ETH_P_IPV6);
*th = (struct tcphdr){
.doff = sizeof(struct tcphdr) / 4,
.ack = 1
};
*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
tcp_fill_headers6(c, conn, ip6h,
(struct tcphdr *)(ip6h + 1),
segment_size, conn->seq_to_tap);
if (*c->pcap) {
uint32_t sum = proto_ipv6_header_psum(ip6h->payload_len,
IPPROTO_TCP,
&ip6h->saddr,
&ip6h->daddr);
first->iov_base = th;
first->iov_len = size - l2_hdrlen + sizeof(*th);
th->check = csum_iov(first, num_buffers, sum);
}
}
/* set iov for pcap logging */
first->iov_base = eh;
first->iov_len = size - vnet_hdrlen;
pcap_iov(first, num_buffers);
/* set iov_len for vu_queue_fill_by_index(); */
first->iov_base = base;
first->iov_len = size;
conn->seq_to_tap += segment_size;
segment_size = 0;
num_buffers = 0;
}
}
/* release unused buffers */
vu_queue_rewind(vdev, vq, iov_count - iov_used);
/* send packets */
for (i = 0; i < iov_used; i++) {
vu_queue_fill_by_index(vdev, vq, indexes[i],
iov_vu[i + 1].iov_len, i);
}
vu_queue_flush(vdev, vq, iov_used);
vu_queue_notify(vdev, vq);
conn_flag(c, conn, ACK_FROM_TAP_DUE);
return 0;
err:
vu_queue_rewind(vdev, vq, iov_count);
if (errno != EAGAIN && errno != EWOULDBLOCK) {
ret = -errno;
tcp_rst(c, conn);
}
return ret;
}

9
tcp_vu.h Normal file

@@ -0,0 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#ifndef TCP_VU_H
#define TCP_VU_H
int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
#endif /*TCP_VU_H */

117
udp.c

@@ -120,9 +120,7 @@
#include "tap.h"
#include "pcap.h"
#include "log.h"
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
#include "udp_internal.h"
/**
* struct udp_tap_port - Port tracking based on tap-facing source port
@@ -230,11 +228,11 @@ static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES];
static struct iovec udp4_iov_splice [UDP_MAX_FRAMES];
static struct iovec udp6_iov_splice [UDP_MAX_FRAMES];
static struct sockaddr_in udp4_localname = {
struct sockaddr_in udp4_localname = {
.sin_family = AF_INET,
.sin_addr = IN4ADDR_LOOPBACK_INIT,
};
static struct sockaddr_in6 udp6_localname = {
struct sockaddr_in6 udp6_localname = {
.sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_LOOPBACK_INIT,
};
@@ -567,21 +565,22 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
*
* Return: size of tap frame with headers
*/
static size_t udp_update_hdr4(const struct ctx *c, int n, in_port_t dstport,
const struct timespec *now)
size_t udp_update_hdr4(const struct ctx *c, struct iphdr *iph,
size_t data_len, struct sockaddr_in *s_in,
in_port_t dstport, const struct timespec *now)
{
struct udp4_l2_buf_t *b = &udp4_l2_buf[n];
struct udphdr *uh = (struct udphdr *)(iph + 1);
const struct in_addr *src;
in_port_t src_port;
size_t ip_len;
ip_len = udp4_l2_mh_sock[n].msg_len + sizeof(b->iph) + sizeof(b->uh);
ip_len = data_len + sizeof(struct iphdr) + sizeof(struct udphdr);
b->iph.tot_len = htons(ip_len);
b->iph.daddr = c->ip4.addr_seen.s_addr;
iph->tot_len = htons(ip_len);
iph->daddr = c->ip4.addr_seen.s_addr;
src = &b->s_in.sin_addr;
src_port = ntohs(b->s_in.sin_port);
src = &s_in->sin_addr;
src_port = ntohs(s_in->sin_port);
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) &&
IN4_ARE_ADDR_EQUAL(src, &c->ip4.dns_host) && src_port == 53) {
@@ -600,15 +599,16 @@ static size_t udp_update_hdr4(const struct ctx *c, int n, in_port_t dstport,
src = &c->ip4.gw;
}
b->iph.saddr = src->s_addr;
iph->saddr = src->s_addr;
b->iph.check = csum_ip4_header(b->iph.tot_len, IPPROTO_UDP,
iph->check = csum_ip4_header(iph->tot_len, IPPROTO_UDP,
*src, c->ip4.addr_seen);
b->uh.source = b->s_in.sin_port;
b->uh.dest = htons(dstport);
b->uh.len = htons(udp4_l2_mh_sock[n].msg_len + sizeof(b->uh));
uh->source = s_in->sin_port;
uh->dest = htons(dstport);
uh->len = htons(data_len + sizeof(struct udphdr));
uh->check = 0;
return tap_iov_len(c, &b->taph, ip_len);
return ip_len;
}
/**
@@ -620,23 +620,24 @@ static size_t udp_update_hdr4(const struct ctx *c, int n, in_port_t dstport,
*
* Return: size of tap frame with headers
*/
static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
const struct timespec *now)
size_t udp_update_hdr6(const struct ctx *c, struct ipv6hdr *ip6h,
size_t data_len, struct sockaddr_in6 *s_in6,
in_port_t dstport, const struct timespec *now)
{
struct udp6_l2_buf_t *b = &udp6_l2_buf[n];
struct udphdr *uh = (struct udphdr *)(ip6h + 1);
const struct in6_addr *src, *dst;
uint16_t payload_len;
in_port_t src_port;
size_t ip_len;
dst = &c->ip6.addr_seen;
src = &b->s_in6.sin6_addr;
src_port = ntohs(b->s_in6.sin6_port);
src = &s_in6->sin6_addr;
src_port = ntohs(s_in6->sin6_port);
ip_len = udp6_l2_mh_sock[n].msg_len + sizeof(b->ip6h) + sizeof(b->uh);
ip_len = data_len + sizeof(struct ipv6hdr) + sizeof(struct udphdr);
payload_len = udp6_l2_mh_sock[n].msg_len + sizeof(b->uh);
b->ip6h.payload_len = htons(payload_len);
payload_len = data_len + sizeof(struct udphdr);
ip6h->payload_len = htons(payload_len);
if (IN6_IS_ADDR_LINKLOCAL(src)) {
dst = &c->ip6.addr_ll_seen;
@@ -668,23 +669,25 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
src = &c->ip6.gw;
else
src = &c->ip6.addr_ll;
}
b->ip6h.daddr = *dst;
b->ip6h.saddr = *src;
b->ip6h.version = 6;
b->ip6h.nexthdr = IPPROTO_UDP;
b->ip6h.hop_limit = 255;
ip6h->daddr = *dst;
ip6h->saddr = *src;
ip6h->version = 6;
ip6h->nexthdr = IPPROTO_UDP;
ip6h->hop_limit = 255;
b->uh.source = b->s_in6.sin6_port;
b->uh.dest = htons(dstport);
b->uh.len = b->ip6h.payload_len;
b->uh.check = 0;
b->uh.check = csum(&b->uh, payload_len,
uh->source = s_in6->sin6_port;
uh->dest = htons(dstport);
uh->len = ip6h->payload_len;
uh->check = 0;
if (c->mode != MODE_VU)
uh->check = csum(uh, payload_len,
proto_ipv6_header_psum(payload_len, IPPROTO_UDP,
src, dst));
else
uh->check = 0xffff; /* zero checksum is invalid with IPv6 */
return tap_iov_len(c, &b->taph, ip_len);
return ip_len;
}
/**
@@ -698,6 +701,11 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
*
* Return: size of tap frame with headers
*/
#pragma GCC diagnostic push
/* ignore unaligned pointer value warning for &udp6_l2_buf[i].ip6h and
* &udp4_l2_buf[i].iph
*/
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
static void udp_tap_send(const struct ctx *c,
unsigned int start, unsigned int n,
in_port_t dstport, bool v6, const struct timespec *now)
@@ -711,21 +719,34 @@ static void udp_tap_send(const struct ctx *c,
tap_iov = udp4_l2_iov_tap;
for (i = start; i < start + n; i++) {
size_t buf_len;
size_t ip_len;
if (v6)
buf_len = udp_update_hdr6(c, i, dstport, now);
else
buf_len = udp_update_hdr4(c, i, dstport, now);
if (v6) {
ip_len = udp_update_hdr6(c, &udp6_l2_buf[i].ip6h,
udp6_l2_mh_sock[i].msg_len,
&udp6_l2_buf[i].s_in6, dstport,
now);
tap_iov[i].iov_len = tap_iov_len(c,
&udp6_l2_buf[i].taph,
ip_len);
} else {
ip_len = udp_update_hdr4(c, &udp4_l2_buf[i].iph,
udp4_l2_mh_sock[i].msg_len,
&udp4_l2_buf[i].s_in,
dstport, now);
tap_iov[i].iov_len = buf_len;
tap_iov[i].iov_len = tap_iov_len(c,
&udp4_l2_buf[i].taph,
ip_len);
}
}
tap_send_frames(c, tap_iov + start, n);
}
#pragma GCC diagnostic pop
/**
* udp_sock_handler() - Handle new data from socket
* udp_buf_sock_handler() - Handle new data from socket
* @c: Execution context
* @ref: epoll reference
* @events: epoll events bitmap
@@ -733,7 +754,7 @@ static void udp_tap_send(const struct ctx *c,
*
* #syscalls recvmmsg
*/
void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now)
{
/* For not entirely clear reasons (data locality?) pasta gets
@@ -744,7 +765,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
* whether we'll use tap or splice, always go one at a time
* for pasta mode.
*/
ssize_t n = (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1);
ssize_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
in_port_t dstport = ref.udp.port;
bool v6 = ref.udp.v6;
struct mmsghdr *mmh_recv;

2
udp.h

@@ -9,7 +9,7 @@
#define UDP_TIMER_INTERVAL 1000 /* ms */
void udp_portmap_clear(void);
void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now);
int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,

21
udp_internal.h Normal file

@@ -0,0 +1,21 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright (c) 2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#ifndef UDP_INTERNAL_H
#define UDP_INTERNAL_H
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
extern struct sockaddr_in udp4_localname;
extern struct sockaddr_in6 udp6_localname;
size_t udp_update_hdr4(const struct ctx *c, struct iphdr *iph,
size_t data_len, struct sockaddr_in *s_in,
in_port_t dstport, const struct timespec *now);
size_t udp_update_hdr6(const struct ctx *c, struct ipv6hdr *ip6h,
size_t data_len, struct sockaddr_in6 *s_in6,
in_port_t dstport, const struct timespec *now);
#endif /* UDP_INTERNAL_H */
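The call sites visible in this series use these helpers the same way: the caller owns the frame layout, writes the static IP header template, and the helper fills in addresses, lengths and checksums, returning the IP length (IP plus UDP headers plus payload). A sketch condensed from the callers in udp.c and udp_vu.c:

  struct iphdr *iph = (struct iphdr *)(eh + 1);       /* eh: Ethernet header */
  size_t ip_len;

  *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP);  /* header template */
  ip_len = udp_update_hdr4(c, iph, data_len, &udp4_localname, dstport, now);
  /* the caller adds the Ethernet (and, where needed, vnet) header on top */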

218
udp_vu.c Normal file

@@ -0,0 +1,218 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <unistd.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
#include <stdint.h>
#include <stddef.h>
#include <sys/uio.h>
#include <linux/virtio_net.h>
#include "checksum.h"
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "pcap.h"
#include "log.h"
#include "vhost_user.h"
#include "udp_internal.h"
#include "udp_vu.h"
/* vhost-user */
static const struct virtio_net_hdr vu_header = {
.flags = VIRTIO_NET_HDR_F_DATA_VALID,
.gso_type = VIRTIO_NET_HDR_GSO_NONE,
};
static unsigned char buffer[65536];
static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE];
static unsigned int indexes [VIRTQUEUE_MAX_SIZE];
void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now)
{
VuDev *vdev = (VuDev *)&c->vdev;
VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
size_t l2_hdrlen, vnet_hdrlen, fillsize;
ssize_t data_len;
in_port_t dstport = ref.udp.port;
bool has_mrg_rxbuf, v6 = ref.udp.v6;
struct msghdr msg;
int i, iov_count, iov_used, virtqueue_max;
if (c->no_udp || !(events & EPOLLIN))
return;
has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
if (has_mrg_rxbuf) {
vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
virtqueue_max = VIRTQUEUE_MAX_SIZE;
} else {
vnet_hdrlen = sizeof(struct virtio_net_hdr);
virtqueue_max = 1;
}
l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct udphdr);
if (v6) {
l2_hdrlen += sizeof(struct ipv6hdr);
udp6_localname.sin6_port = htons(dstport);
msg.msg_name = &udp6_localname;
msg.msg_namelen = sizeof(udp6_localname);
} else {
l2_hdrlen += sizeof(struct iphdr);
udp4_localname.sin_port = htons(dstport);
msg.msg_name = &udp4_localname;
msg.msg_namelen = sizeof(udp4_localname);
}
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
for (i = 0; i < UDP_MAX_FRAMES; i++) {
struct virtio_net_hdr_mrg_rxbuf *vh;
struct ethhdr *eh;
char *base;
size_t size;
fillsize = USHRT_MAX;
iov_count = 0;
while (fillsize && iov_count < virtqueue_max) {
VuVirtqElement *elem;
elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
if (!elem)
break;
if (elem->in_num < 1) {
err("virtio-net receive queue contains no in buffers");
vu_queue_rewind(vdev, vq, iov_count);
return;
}
ASSERT(elem->in_num == 1);
ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen);
indexes[iov_count] = elem->index;
if (iov_count == 0) {
iov_vu[0].iov_base = (char *)elem->in_sg[0].iov_base + l2_hdrlen;
iov_vu[0].iov_len = elem->in_sg[0].iov_len - l2_hdrlen;
} else {
iov_vu[iov_count].iov_base = elem->in_sg[0].iov_base;
iov_vu[iov_count].iov_len = elem->in_sg[0].iov_len;
}
if (iov_vu[iov_count].iov_len > fillsize)
iov_vu[iov_count].iov_len = fillsize;
fillsize -= iov_vu[iov_count].iov_len;
iov_count++;
}
if (iov_count == 0)
break;
msg.msg_iov = iov_vu;
msg.msg_iovlen = iov_count;
data_len = recvmsg(ref.fd, &msg, 0);
if (data_len < 0) {
vu_queue_rewind(vdev, vq, iov_count);
return;
}
iov_used = 0;
size = data_len;
while (size) {
if (iov_vu[iov_used].iov_len > size)
iov_vu[iov_used].iov_len = size;
size -= iov_vu[iov_used].iov_len;
iov_used++;
}
base = (char *)iov_vu[0].iov_base - l2_hdrlen;
size = iov_vu[0].iov_len + l2_hdrlen;
/* release unused buffers */
vu_queue_rewind(vdev, vq, iov_count - iov_used);
/* vnet_header */
vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
vh->hdr = vu_header;
if (has_mrg_rxbuf)
vh->num_buffers = htole16(iov_used);
/* ethernet header */
eh = (struct ethhdr *)(base + vnet_hdrlen);
memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
/* initialize header */
if (v6) {
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
struct udphdr *uh = (struct udphdr *)(ip6h + 1);
uint32_t sum;
eh->h_proto = htons(ETH_P_IPV6);
*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP);
udp_update_hdr6(c, ip6h, data_len, &udp6_localname,
dstport, now);
if (*c->pcap) {
sum = proto_ipv6_header_psum(ip6h->payload_len,
IPPROTO_UDP,
&ip6h->saddr,
&ip6h->daddr);
iov_vu[0].iov_base = uh;
iov_vu[0].iov_len = size - l2_hdrlen + sizeof(*uh);
uh->check = csum_iov(iov_vu, iov_used, sum);
}
} else {
struct iphdr *iph = (struct iphdr *)(eh + 1);
struct udphdr *uh = (struct udphdr *)(iph + 1);
uint32_t sum;
eh->h_proto = htons(ETH_P_IP);
*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP);
udp_update_hdr4(c, iph, data_len, &udp4_localname,
dstport, now);
if (*c->pcap) {
sum = proto_ipv4_header_psum(iph->tot_len,
IPPROTO_UDP,
(struct in_addr){ .s_addr = iph->saddr },
(struct in_addr){ .s_addr = iph->daddr });
iov_vu[0].iov_base = uh;
iov_vu[0].iov_len = size - l2_hdrlen + sizeof(*uh);
uh->check = csum_iov(iov_vu, iov_used, sum);
}
}
/* set iov for pcap logging */
iov_vu[0].iov_base = base + vnet_hdrlen;
iov_vu[0].iov_len = size - vnet_hdrlen;
pcap_iov(iov_vu, iov_used);
/* set iov_len for vu_queue_fill_by_index(); */
iov_vu[0].iov_base = base;
iov_vu[0].iov_len = size;
/* send packets */
for (i = 0; i < iov_used; i++)
vu_queue_fill_by_index(vdev, vq, indexes[i],
iov_vu[i].iov_len, i);
vu_queue_flush(vdev, vq, iov_used);
vu_queue_notify(vdev, vq);
}
}

8
udp_vu.h Normal file

@@ -0,0 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#ifndef UDP_VU_H
#define UDP_VU_H
void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
#endif /* UDP_VU_H */
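The call site that picks between udp_buf_sock_handler() and the handler declared above is not part of this excerpt; based on the MODE_VU checks used elsewhere in the series, it plausibly has this shape (sketch only, the surrounding dispatcher is assumed):

  /* Assumed dispatch, mirroring the TCP-side c->mode == MODE_VU check */
  if (c->mode == MODE_VU)
          udp_vu_sock_handler(c, ref, events, now);
  else
          udp_buf_sock_handler(c, ref, events, now);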

11
util.h

@@ -43,6 +43,9 @@
#define ROUND_DOWN(x, y) ((x) & ~((y) - 1))
#define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
#define MAX_FROM_BITS(n) (((1U << (n)) - 1))
#define BIT(n) (1UL << (n))
@@ -110,6 +113,14 @@
#define htonl_constant(x) (__bswap_constant_32(x))
#endif
#define barrier() do { __asm__ __volatile__("" ::: "memory"); } while (0)
#define smp_mb() do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0)
#define smp_mb_release() do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0)
#define smp_mb_acquire() do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0)
#define smp_wmb() smp_mb_release()
#define smp_rmb() smp_mb_acquire()
#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8)
int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
void *arg);
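The new barrier macros follow the usual release/acquire pairing; virtio.c below relies on exactly this in vu_queue_flush(), where the used ring entry must become visible before the used index that publishes it. A condensed sketch of the pattern (helper names as in virtio.c):

  /* device/producer side: element first, then the index that exposes it */
  vring_used_write(vq, &uelem, idx);              /* write used element */
  smp_wmb();                                      /* element before index */
  vring_used_idx_set(vq, vq->used_idx + count);   /* guest may now read it */

  /* the guest/consumer side pairs this with: read index, smp_rmb(), read entries */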

1050
vhost_user.c Normal file

File diff suppressed because it is too large

137
vhost_user.h Normal file

@@ -0,0 +1,137 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* some parts from subprojects/libvhost-user/libvhost-user.h */
#ifndef VHOST_USER_H
#define VHOST_USER_H
#include "virtio.h"
#include "iov.h"
#define VHOST_USER_F_PROTOCOL_FEATURES 30
#define VHOST_MEMORY_BASELINE_NREGIONS 8
enum vhost_user_protocol_feature {
VHOST_USER_PROTOCOL_F_MQ = 0,
VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
VHOST_USER_PROTOCOL_F_RARP = 2,
VHOST_USER_PROTOCOL_F_REPLY_ACK = 3,
VHOST_USER_PROTOCOL_F_NET_MTU = 4,
VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5,
VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6,
VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
VHOST_USER_PROTOCOL_F_CONFIG = 9,
VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,
VHOST_USER_PROTOCOL_F_MAX
};
enum vhost_user_request {
VHOST_USER_NONE = 0,
VHOST_USER_GET_FEATURES = 1,
VHOST_USER_SET_FEATURES = 2,
VHOST_USER_SET_OWNER = 3,
VHOST_USER_RESET_OWNER = 4,
VHOST_USER_SET_MEM_TABLE = 5,
VHOST_USER_SET_LOG_BASE = 6,
VHOST_USER_SET_LOG_FD = 7,
VHOST_USER_SET_VRING_NUM = 8,
VHOST_USER_SET_VRING_ADDR = 9,
VHOST_USER_SET_VRING_BASE = 10,
VHOST_USER_GET_VRING_BASE = 11,
VHOST_USER_SET_VRING_KICK = 12,
VHOST_USER_SET_VRING_CALL = 13,
VHOST_USER_SET_VRING_ERR = 14,
VHOST_USER_GET_PROTOCOL_FEATURES = 15,
VHOST_USER_SET_PROTOCOL_FEATURES = 16,
VHOST_USER_GET_QUEUE_NUM = 17,
VHOST_USER_SET_VRING_ENABLE = 18,
VHOST_USER_SEND_RARP = 19,
VHOST_USER_NET_SET_MTU = 20,
VHOST_USER_SET_BACKEND_REQ_FD = 21,
VHOST_USER_IOTLB_MSG = 22,
VHOST_USER_SET_VRING_ENDIAN = 23,
VHOST_USER_GET_CONFIG = 24,
VHOST_USER_SET_CONFIG = 25,
VHOST_USER_CREATE_CRYPTO_SESSION = 26,
VHOST_USER_CLOSE_CRYPTO_SESSION = 27,
VHOST_USER_POSTCOPY_ADVISE = 28,
VHOST_USER_POSTCOPY_LISTEN = 29,
VHOST_USER_POSTCOPY_END = 30,
VHOST_USER_GET_INFLIGHT_FD = 31,
VHOST_USER_SET_INFLIGHT_FD = 32,
VHOST_USER_GPU_SET_SOCKET = 33,
VHOST_USER_VRING_KICK = 35,
VHOST_USER_GET_MAX_MEM_SLOTS = 36,
VHOST_USER_ADD_MEM_REG = 37,
VHOST_USER_REM_MEM_REG = 38,
VHOST_USER_MAX
};
typedef struct {
enum vhost_user_request request;
#define VHOST_USER_VERSION_MASK 0x3
#define VHOST_USER_REPLY_MASK (0x1 << 2)
#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3)
uint32_t flags;
uint32_t size; /* the following payload size */
} __attribute__ ((__packed__)) vhost_user_header;
typedef struct VhostUserMemory_region {
uint64_t guest_phys_addr;
uint64_t memory_size;
uint64_t userspace_addr;
uint64_t mmap_offset;
} VhostUserMemory_region;
struct VhostUserMemory {
uint32_t nregions;
uint32_t padding;
struct VhostUserMemory_region regions[VHOST_MEMORY_BASELINE_NREGIONS];
};
typedef union {
#define VHOST_USER_VRING_IDX_MASK 0xff
#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8)
uint64_t u64;
struct vhost_vring_state state;
struct vhost_vring_addr addr;
struct VhostUserMemory memory;
} vhost_user_payload;
typedef struct VhostUserMsg {
vhost_user_header hdr;
vhost_user_payload payload;
int fds[VHOST_MEMORY_BASELINE_NREGIONS];
int fd_num;
uint8_t *data;
} __attribute__ ((__packed__)) VhostUserMsg;
#define VHOST_USER_HDR_SIZE sizeof(vhost_user_header)
#define VHOST_USER_RX_QUEUE 0
#define VHOST_USER_TX_QUEUE 1
static inline bool vu_queue_enabled(VuVirtq *vq)
{
return vq->enable;
}
static inline bool vu_queue_started(const VuVirtq *vq)
{
return vq->started;
}
int vu_send(const struct ctx *c, const void *data, size_t len);
void vu_print_capabilities(void);
void vu_init(struct ctx *c);
void vu_kick_cb(struct ctx *c, union epoll_ref ref);
void tap_handler_vu(struct ctx *c, uint32_t events);
#endif /* VHOST_USER_H */
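A hedged sketch of how a request is read off the vhost-user control socket with the types above (simplified: return values are unchecked and the SCM_RIGHTS handling for passed file descriptors is omitted; this is not the vhost_user.c code, whose diff is suppressed above):

  VhostUserMsg msg = { 0 };

  /* fixed-size header first, then hdr.size bytes of payload */
  recv(fd, &msg.hdr, VHOST_USER_HDR_SIZE, MSG_WAITALL);
  recv(fd, &msg.payload, msg.hdr.size, MSG_WAITALL);

  if (msg.hdr.request == VHOST_USER_SET_VRING_KICK) {
          int idx  = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
          int nofd = msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK;
          /* the kick eventfd, if any, arrives as SCM_RIGHTS ancillary data */
          (void)idx; (void)nofd;
  }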

484
virtio.c Normal file

@@ -0,0 +1,484 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */
#include <stddef.h>
#include <endian.h>
#include <string.h>
#include <errno.h>
#include <sys/eventfd.h>
#include <sys/socket.h>
#include "util.h"
#include "virtio.h"
#define VIRTQUEUE_MAX_SIZE 1024
/* Translate guest physical address to our virtual address. */
static void *vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
{
unsigned int i;
if (*plen == 0) {
return NULL;
}
/* Find matching memory region. */
for (i = 0; i < dev->nregions; i++) {
VuDevRegion *r = &dev->regions[i];
if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
if ((guest_addr + *plen) > (r->gpa + r->size)) {
*plen = r->gpa + r->size - guest_addr;
}
return (void *)(guest_addr - (uintptr_t)r->gpa +
(uintptr_t)r->mmap_addr + r->mmap_offset);
}
}
return NULL;
}
static inline uint16_t vring_avail_flags(VuVirtq *vq)
{
return le16toh(vq->vring.avail->flags);
}
static inline uint16_t vring_avail_idx(VuVirtq *vq)
{
vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
return vq->shadow_avail_idx;
}
static inline uint16_t vring_avail_ring(VuVirtq *vq, int i)
{
return le16toh(vq->vring.avail->ring[i]);
}
static inline uint16_t vring_get_used_event(VuVirtq *vq)
{
return vring_avail_ring(vq, vq->vring.num);
}
static bool virtqueue_get_head(VuDev *dev, VuVirtq *vq,
unsigned int idx, unsigned int *head)
{
/* Grab the next descriptor number they're advertising, and increment
* the index we've seen. */
*head = vring_avail_ring(vq, idx % vq->vring.num);
/* If their number is silly, that's a fatal mistake. */
if (*head >= vq->vring.num) {
vu_panic(dev, "Guest says index %u is available", *head);
return false;
}
return true;
}
static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
uint64_t addr, size_t len)
{
struct vring_desc *ori_desc;
uint64_t read_len;
if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
return -1;
}
if (len == 0) {
return -1;
}
while (len) {
read_len = len;
ori_desc = vu_gpa_to_va(dev, &read_len, addr);
if (!ori_desc) {
return -1;
}
memcpy(desc, ori_desc, read_len);
len -= read_len;
addr += read_len;
desc += read_len;
}
return 0;
}
enum {
VIRTQUEUE_READ_DESC_ERROR = -1,
VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */
VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */
};
static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
int i, unsigned int max, unsigned int *next)
{
/* If this descriptor says it doesn't chain, we're done. */
if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
return VIRTQUEUE_READ_DESC_DONE;
}
/* Check they're not leading us off end of descriptors. */
*next = le16toh(desc[i].next);
/* Make sure compiler knows to grab that: we don't want it changing! */
smp_wmb();
if (*next >= max) {
vu_panic(dev, "Desc next is %u", *next);
return VIRTQUEUE_READ_DESC_ERROR;
}
return VIRTQUEUE_READ_DESC_MORE;
}
bool vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
if (dev->broken ||
!vq->vring.avail) {
return true;
}
if (vq->shadow_avail_idx != vq->last_avail_idx) {
return false;
}
return vring_avail_idx(vq) == vq->last_avail_idx;
}
static bool vring_notify(VuDev *dev, VuVirtq *vq)
{
uint16_t old, new;
bool v;
/* We need to expose used array entries before checking used event. */
smp_mb();
/* Always notify when queue is empty (when feature acknowledge) */
if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
!vq->inuse && vu_queue_empty(dev, vq)) {
return true;
}
if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
}
v = vq->signalled_used_valid;
vq->signalled_used_valid = true;
old = vq->signalled_used;
new = vq->signalled_used = vq->used_idx;
return !v || vring_need_event(vring_get_used_event(vq), new, old);
}
void vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
if (dev->broken || !vq->vring.avail) {
return;
}
if (!vring_notify(dev, vq)) {
debug("skipped notify...");
return;
}
if (eventfd_write(vq->call_fd, 1) < 0) {
vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
}
}
static inline void vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
uint16_t val_le = htole16(val);
if (!vq->notification) {
return;
}
memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t));
}
static bool virtqueue_map_desc(VuDev *dev,
unsigned int *p_num_sg, struct iovec *iov,
unsigned int max_num_sg,
uint64_t pa, size_t sz)
{
unsigned num_sg = *p_num_sg;
ASSERT(num_sg <= max_num_sg);
if (!sz) {
vu_panic(dev, "virtio: zero sized buffers are not allowed");
return false;
}
while (sz) {
uint64_t len = sz;
if (num_sg == max_num_sg) {
vu_panic(dev, "virtio: too many descriptors in indirect table");
return false;
}
iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
if (iov[num_sg].iov_base == NULL) {
vu_panic(dev, "virtio: invalid address for buffers");
return false;
}
iov[num_sg].iov_len = len;
num_sg++;
sz -= len;
pa += len;
}
*p_num_sg = num_sg;
return true;
}
static void * virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num, unsigned char *buffer)
{
VuVirtqElement *elem;
size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
if (out_sg_end > 65536)
return NULL;
elem = (void *)buffer;
elem->out_num = out_num;
elem->in_num = in_num;
elem->in_sg = (struct iovec *)((uintptr_t)elem + in_sg_ofs);
elem->out_sg = (struct iovec *)((uintptr_t)elem + out_sg_ofs);
return elem;
}
static void *
vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz, unsigned char *buffer)
{
struct vring_desc *desc = vq->vring.desc;
uint64_t desc_addr, read_len;
unsigned int desc_len;
unsigned int max = vq->vring.num;
unsigned int i = idx;
VuVirtqElement *elem;
unsigned int out_num = 0, in_num = 0;
struct iovec iov[VIRTQUEUE_MAX_SIZE];
struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
int rc;
if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
vu_panic(dev, "Invalid size for indirect buffer table");
return NULL;
}
/* loop over the indirect descriptor table */
desc_addr = le64toh(desc[i].addr);
desc_len = le32toh(desc[i].len);
max = desc_len / sizeof(struct vring_desc);
read_len = desc_len;
desc = vu_gpa_to_va(dev, &read_len, desc_addr);
if (desc && read_len != desc_len) {
/* Failed to use zero copy */
desc = NULL;
if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) {
desc = desc_buf;
}
}
if (!desc) {
vu_panic(dev, "Invalid indirect buffer table");
return NULL;
}
i = 0;
}
/* Collect all the descriptors */
do {
if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
VIRTQUEUE_MAX_SIZE - out_num,
le64toh(desc[i].addr),
le32toh(desc[i].len))) {
return NULL;
}
} else {
if (in_num) {
vu_panic(dev, "Incorrect order for descriptors");
return NULL;
}
if (!virtqueue_map_desc(dev, &out_num, iov,
VIRTQUEUE_MAX_SIZE,
le64toh(desc[i].addr),
le32toh(desc[i].len))) {
return NULL;
}
}
/* If we've got too many, that implies a descriptor loop. */
if ((in_num + out_num) > max) {
vu_panic(dev, "Looped descriptor");
return NULL;
}
rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
} while (rc == VIRTQUEUE_READ_DESC_MORE);
if (rc == VIRTQUEUE_READ_DESC_ERROR) {
vu_panic(dev, "read descriptor error");
return NULL;
}
/* Now copy what we have collected and mapped */
elem = virtqueue_alloc_element(sz, out_num, in_num, buffer);
if (!elem) {
return NULL;
}
elem->index = idx;
for (i = 0; i < out_num; i++) {
elem->out_sg[i] = iov[i];
}
for (i = 0; i < in_num; i++) {
elem->in_sg[i] = iov[out_num + i];
}
return elem;
}
void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz, unsigned char *buffer)
{
unsigned int head;
VuVirtqElement *elem;
if (dev->broken || !vq->vring.avail) {
return NULL;
}
if (vu_queue_empty(dev, vq)) {
return NULL;
}
/*
* Needed after virtio_queue_empty(), see comment in
* virtqueue_num_heads().
*/
smp_rmb();
if (vq->inuse >= vq->vring.num) {
vu_panic(dev, "Virtqueue size exceeded");
return NULL;
}
if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
return NULL;
}
if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
vring_set_avail_event(vq, vq->last_avail_idx);
}
elem = vu_queue_map_desc(dev, vq, head, sz, buffer);
if (!elem) {
return NULL;
}
vq->inuse++;
return elem;
}
void vu_queue_detach_element(VuDev *dev, VuVirtq *vq,
unsigned int index, size_t len)
{
(void)dev;
(void)index;
(void)len;
vq->inuse--;
/* unmap, when DMA support is added */
}
void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len)
{
vq->last_avail_idx--;
vu_queue_detach_element(dev, vq, index, len);
}
bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
(void)dev;
if (num > vq->inuse) {
return false;
}
vq->last_avail_idx -= num;
vq->inuse -= num;
return true;
}
static inline void vring_used_write(VuVirtq *vq,
struct vring_used_elem *uelem, int i)
{
struct vring_used *used = vq->vring.used;
used->ring[i] = *uelem;
}
void vu_queue_fill_by_index(VuDev *dev, VuVirtq *vq, unsigned int index,
unsigned int len, unsigned int idx)
{
struct vring_used_elem uelem;
if (dev->broken || !vq->vring.avail)
return;
idx = (idx + vq->used_idx) % vq->vring.num;
uelem.id = htole32(index);
uelem.len = htole32(len);
vring_used_write(vq, &uelem, idx);
}
void vu_queue_fill(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
unsigned int len, unsigned int idx)
{
vu_queue_fill_by_index(dev, vq, elem->index, len, idx);
}
static inline void vring_used_idx_set(VuVirtq *vq, uint16_t val)
{
vq->vring.used->idx = htole16(val);
vq->used_idx = val;
}
void vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
uint16_t old, new;
if (dev->broken ||
!vq->vring.avail) {
return;
}
/* Make sure buffer is written before we update index. */
smp_wmb();
old = vq->used_idx;
new = old + count;
vring_used_idx_set(vq, new);
vq->inuse -= count;
if ((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) {
vq->signalled_used_valid = false;
}
}
void vu_queue_push(VuDev *dev, VuVirtq *vq,
VuVirtqElement *elem, unsigned int len)
{
vu_queue_fill(dev, vq, elem, len, 0);
vu_queue_flush(dev, vq, 1);
}
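Taken together, the helpers above are driven in a fixed pattern by the RX paths in tcp_vu.c and udp_vu.c earlier in this series. Condensed (frame_len stands for the vnet header plus frame length, tlen in tcp_vu.c):

  VuVirtqElement *elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);

  if (!elem)
          return 0;               /* no guest buffers available right now */

  /* ... write vnet header + Ethernet/IP/L4 frame into elem->in_sg[0] ... */

  vu_queue_fill(vdev, vq, elem, frame_len, 0);    /* fill used ring slot 0 */
  vu_queue_flush(vdev, vq, 1);                    /* publish new used index */
  vu_queue_notify(vdev, vq);                      /* kick the guest if needed */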

121
virtio.h Normal file

@@ -0,0 +1,121 @@
// SPDX-License-Identifier: GPL-2.0-or-later
//
/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.h */
#ifndef VIRTIO_H
#define VIRTIO_H
#include <stdbool.h>
#include <linux/vhost_types.h>
#define VIRTQUEUE_MAX_SIZE 1024
#define vu_panic(vdev, ...) \
do { \
(vdev)->broken = true; \
err( __VA_ARGS__ ); \
} while (0)
typedef struct VuRing {
unsigned int num;
struct vring_desc *desc;
struct vring_avail *avail;
struct vring_used *used;
uint64_t log_guest_addr;
uint32_t flags;
} VuRing;
typedef struct VuVirtq {
VuRing vring;
/* Next head to pop */
uint16_t last_avail_idx;
/* Last avail_idx read from VQ. */
uint16_t shadow_avail_idx;
uint16_t used_idx;
/* Last used index value we have signalled on */
uint16_t signalled_used;
/* Whether signalled_used is valid */
bool signalled_used_valid;
bool notification;
unsigned int inuse;
int call_fd;
int kick_fd;
int err_fd;
unsigned int enable;
bool started;
/* Guest addresses of our ring */
struct vhost_vring_addr vra;
} VuVirtq;
typedef struct VuDevRegion {
uint64_t gpa;
uint64_t size;
uint64_t qva;
uint64_t mmap_offset;
uint64_t mmap_addr;
} VuDevRegion;
#define VHOST_USER_MAX_QUEUES 2
/*
* Set a reasonable maximum number of ram slots, which will be supported by
* any architecture.
*/
#define VHOST_USER_MAX_RAM_SLOTS 32
typedef struct VuDev {
uint32_t nregions;
VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS];
VuVirtq vq[VHOST_USER_MAX_QUEUES];
uint64_t features;
uint64_t protocol_features;
bool broken;
int hdrlen;
} VuDev;
typedef struct VuVirtqElement {
unsigned int index;
unsigned int out_num;
unsigned int in_num;
struct iovec *in_sg;
struct iovec *out_sg;
} VuVirtqElement;
static inline bool has_feature(uint64_t features, unsigned int fbit)
{
return !!(features & (1ULL << fbit));
}
static inline bool vu_has_feature(VuDev *vdev, unsigned int fbit)
{
return has_feature(vdev->features, fbit);
}
static inline bool vu_has_protocol_feature(VuDev *vdev, unsigned int fbit)
{
return has_feature(vdev->protocol_features, fbit);
}
bool vu_queue_empty(VuDev *dev, VuVirtq *vq);
void vu_queue_notify(VuDev *dev, VuVirtq *vq);
void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz, unsigned char *buffer);
void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len);
void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len);
bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num);
void vu_queue_fill_by_index(VuDev *dev, VuVirtq *vq, unsigned int index,
unsigned int len, unsigned int idx);
void vu_queue_fill(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem, unsigned int len,
unsigned int idx);
void vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count);
void vu_queue_push(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem, unsigned int len);
#endif /* VIRTIO_H */