mirror of https://passt.top/passt
synced 2025-04-08 22:55:01 +02:00

Compare commits: master...vhost-user (11 commits)

71a16dbc49, 3958736de5, 9392ea7e5a, 92fe7e967a, 007af94bb9, 1ceee36c57,
7f6b184fb8, 23cc8f892f, 119b45358c, 8ac20f4795, 7f6c10626d
40 changed files with 4117 additions and 216 deletions
Makefile (9 lines changed)

@@ -37,7 +37,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
 	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
-	tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c
+	tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
+	vhost_user.c virtio.c vu_common.c
 QRAP_SRCS = qrap.c
 SRCS = $(PASST_SRCS) $(QRAP_SRCS)

@@ -47,7 +48,8 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
 	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
 	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
-	udp.h udp_flow.h util.h
+	tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \
+	virtio.h vu_common.h
 HEADERS = $(PASST_HEADERS) seccomp.h

 C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}

@@ -99,7 +101,8 @@ qrap: $(QRAP_SRCS) passt.h

 valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
 	rt_sigreturn getpid gettid kill clock_gettime mmap \
-	mmap2 munmap open unlink gettimeofday futex
+	mmap2 munmap open unlink gettimeofday futex statx \
+	readlink
 valgrind: FLAGS += -g -DVALGRIND
 valgrind: all
conf.c (21 lines changed)

@@ -45,6 +45,7 @@
 #include "lineread.h"
 #include "isolation.h"
 #include "log.h"
+#include "vhost_user.h"

 #define NETNS_RUN_DIR "/run/netns"

@@ -769,9 +770,14 @@ static void usage(const char *name, FILE *f, int status)
 			"    default: same interface name as external one\n");
 	} else {
 		FPRINTF(f,
-			"  -s, --socket PATH	UNIX domain socket path\n"
+			"  -s, --socket, --socket-path PATH	UNIX domain socket path\n"
 			"    default: probe free path starting from "
 			UNIX_SOCK_PATH "\n", 1);
+		FPRINTF(f,
+			"  --vhost-user		Enable vhost-user mode\n"
+			"    UNIX domain socket is provided by -s option\n"
+			"  --print-capabilities	print back-end capabilities in JSON format,\n"
+			"    only meaningful for vhost-user mode\n");
 	}

 	FPRINTF(f,

@@ -1305,6 +1311,10 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"map-guest-addr",	required_argument,	NULL,	22 },
 		{"host-lo-to-ns-lo",	no_argument,		NULL,	23 },
 		{"dns-host",		required_argument,	NULL,	24 },
+		{"vhost-user",		no_argument,		NULL,	25 },
+		/* vhost-user backend program convention */
+		{"print-capabilities",	no_argument,		NULL,	26 },
+		{"socket-path",		required_argument,	NULL,	's' },
 		{ 0 },
 	};
 	const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";

@@ -1498,6 +1508,15 @@ void conf(struct ctx *c, int argc, char **argv)
 			break;

 			die("Invalid host nameserver address: %s", optarg);
+		case 25:
+			if (c->mode == MODE_PASTA) {
+				err("--vhost-user is for passt mode only");
+				usage(argv[0], stdout, EXIT_SUCCESS);
+			}
+			c->mode = MODE_VU;
+			break;
+		case 26:
+			vu_print_capabilities();
+			break;
 		case 'd':
 			c->debug = 1;
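The two long-only options above use a standard getopt_long() pattern: entries with a NULL flag pointer make getopt_long() return the val member (25, 26) directly, which the switch in conf() then dispatches on. A minimal, self-contained sketch of that mechanism (illustrative only, not part of the patch):

#include <getopt.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	static const struct option options[] = {
		{"vhost-user",		no_argument,	NULL,	25 },
		{"print-capabilities",	no_argument,	NULL,	26 },
		{ 0 },
	};
	int name;

	while ((name = getopt_long(argc, argv, "s:", options, NULL)) != -1) {
		if (name == 25)		/* would set c->mode = MODE_VU */
			printf("vhost-user mode requested\n");
		else if (name == 26)	/* would call vu_print_capabilities() */
			printf("print capabilities and exit\n");
	}
	return 0;
}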

@@ -47,8 +47,6 @@ require {
 type port_t;
 type http_port_t;

-type passwd_file_t;
-
 class netlink_route_socket { bind create nlmsg_read };
 type sysctl_net_t;

@@ -96,8 +94,7 @@ allow passt_t self:capability { sys_tty_config setpcap net_bind_service setuid s
 allow passt_t self:cap_userns { setpcap sys_admin sys_ptrace };
 allow passt_t self:user_namespace create;

-allow passt_t passwd_file_t:file read_file_perms;
-sssd_search_lib(passt_t)
+auth_read_passwd(passt_t)

 allow passt_t proc_net_t:file read;
 allow passt_t net_conf_t:file { open read };

@@ -68,9 +68,6 @@ require {
 type system_dbusd_t;
 type systemd_hostnamed_t;
 type systemd_systemctl_exec_t;
-type passwd_file_t;
-type sssd_public_t;
-type sssd_var_lib_t;
 class dbus send_msg;
 class system module_request;
 class system status;

@@ -115,8 +112,7 @@ allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read
 allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service };
 allow pasta_t self:user_namespace create;

-allow pasta_t passwd_file_t:file read_file_perms;
-sssd_search_lib(pasta_t)
+auth_read_passwd(pasta_t)

 domain_auto_trans(pasta_t, bin_t, unconfined_t);
 domain_auto_trans(pasta_t, shell_exec_t, unconfined_t);

@@ -178,12 +174,9 @@ allow pasta_t init_t:system status;
 allow pasta_t unconfined_t:dir search;
 allow pasta_t unconfined_t:file read;
 allow pasta_t unconfined_t:lnk_file read;
-allow pasta_t passwd_file_t:file { getattr open read };
 allow pasta_t self:process { setpgid setcap };
 allow pasta_t shell_exec_t:file { execute execute_no_trans map };

-allow pasta_t sssd_var_lib_t:dir search;
-allow pasta_t sssd_public_t:dir search;
 allow pasta_t hostname_exec_t:file { execute execute_no_trans getattr open read map };
 allow pasta_t system_dbusd_t:unix_stream_socket connectto;
 allow pasta_t system_dbusd_t:dbus send_msg;
@@ -36,6 +36,10 @@ enum epoll_type {
 	EPOLL_TYPE_TAP_PASST,
 	/* socket listening for qemu socket connections */
 	EPOLL_TYPE_TAP_LISTEN,
+	/* vhost-user command socket */
+	EPOLL_TYPE_VHOST_CMD,
+	/* vhost-user kick event socket */
+	EPOLL_TYPE_VHOST_KICK,

 	EPOLL_NUM_TYPES,
 };
iov.c (1 line changed)

@@ -68,7 +68,6 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
  *
  * Returns: The number of bytes successfully copied.
  */
-/* cppcheck-suppress unusedFunction */
 size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
 		    size_t offset, const void *buf, size_t bytes)
 {
isolation.c (17 lines changed)

@@ -379,12 +379,21 @@ void isolate_postfork(const struct ctx *c)

 	prctl(PR_SET_DUMPABLE, 0);

-	if (c->mode == MODE_PASTA) {
-		prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
-		prog.filter = filter_pasta;
-	} else {
+	switch (c->mode) {
+	case MODE_PASST:
 		prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
 		prog.filter = filter_passt;
+		break;
+	case MODE_PASTA:
+		prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
+		prog.filter = filter_pasta;
+		break;
+	case MODE_VU:
+		prog.len = (unsigned short)ARRAY_SIZE(filter_vu);
+		prog.filter = filter_vu;
+		break;
+	default:
+		ASSERT(0);
 	}

 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
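For context, prog here is a classic seccomp BPF program; once the mode-specific filter (now including filter_vu) is selected, isolate_postfork() installs it with prctl(), as the trailing context line above shows. A hedged sketch of just that installation step (seccomp_install() is a hypothetical name, not from the patch):

#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>

static int seccomp_install(struct sock_fprog *prog)
{
	/* mandatory before SECCOMP_MODE_FILTER without CAP_SYS_ADMIN */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;

	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prog);
}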
packet.c (95 lines changed)

@@ -22,6 +22,46 @@
 #include "util.h"
 #include "log.h"

+/**
+ * packet_check_range() - Check if a packet memory range is valid
+ * @p:		Packet pool
+ * @offset:	Offset of data range in packet descriptor
+ * @len:	Length of desired data range
+ * @start:	Start of the packet descriptor
+ * @func:	For tracing: name of calling function
+ * @line:	For tracing: caller line of function call
+ *
+ * Return: 0 if the range is valid, -1 otherwise
+ */
+static int packet_check_range(const struct pool *p, size_t offset, size_t len,
+			      const char *start, const char *func, int line)
+{
+	if (p->buf_size == 0) {
+		int ret;
+
+		ret = vu_packet_check_range((void *)p->buf, offset, len, start);
+
+		if (ret == -1)
+			trace("cannot find region, %s:%i", func, line);
+
+		return ret;
+	}
+
+	if (start < p->buf) {
+		trace("packet start %p before buffer start %p, "
+		      "%s:%i", (void *)start, (void *)p->buf, func, line);
+		return -1;
+	}
+
+	if (start + len + offset > p->buf + p->buf_size) {
+		trace("packet offset plus length %lu from size %lu, "
+		      "%s:%i", start - p->buf + len + offset,
+		      p->buf_size, func, line);
+		return -1;
+	}
+
+	return 0;
+}
 /**
  * packet_add_do() - Add data as packet descriptor to given pool
  * @p:		Existing pool

@@ -41,34 +81,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 		return;
 	}

-	if (start < p->buf) {
-		trace("add packet start %p before buffer start %p, %s:%i",
-		      (void *)start, (void *)p->buf, func, line);
+	if (packet_check_range(p, 0, len, start, func, line))
 		return;
-	}
-
-	if (start + len > p->buf + p->buf_size) {
-		trace("add packet start %p, length: %zu, buffer end %p, %s:%i",
-		      (void *)start, len, (void *)(p->buf + p->buf_size),
-		      func, line);
-		return;
-	}

 	if (len > UINT16_MAX) {
 		trace("add packet length %zu, %s:%i", len, func, line);
 		return;
 	}

-#if UINTPTR_MAX == UINT64_MAX
-	if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
-		trace("add packet start %p, buffer start %p, %s:%i",
-		      (void *)start, (void *)p->buf, func, line);
-		return;
-	}
-#endif
-
-	p->pkt[idx].offset = start - p->buf;
-	p->pkt[idx].len = len;
+	p->pkt[idx].iov_base = (void *)start;
+	p->pkt[idx].iov_len = len;

 	p->count++;
 }

@@ -96,36 +118,31 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}

-	if (len > UINT16_MAX || len + offset > UINT32_MAX) {
+	if (len > UINT16_MAX) {
 		if (func) {
-			trace("packet data length %zu, offset %zu, %s:%i",
-			      len, offset, func, line);
+			trace("packet data length %zu, %s:%i",
+			      len, func, line);
 		}
 		return NULL;
 	}

-	if (p->pkt[idx].offset + len + offset > p->buf_size) {
+	if (len + offset > p->pkt[idx].iov_len) {
 		if (func) {
-			trace("packet offset plus length %zu from size %zu, "
-			      "%s:%i", p->pkt[idx].offset + len + offset,
-			      p->buf_size, func, line);
-		}
-		return NULL;
-	}
-
-	if (len + offset > p->pkt[idx].len) {
-		if (func) {
-			trace("data length %zu, offset %zu from length %u, "
-			      "%s:%i", len, offset, p->pkt[idx].len,
+			trace("data length %zu, offset %zu from length %zu, "
+			      "%s:%i", len, offset, p->pkt[idx].iov_len,
 			      func, line);
 		}
 		return NULL;
 	}

-	if (left)
-		*left = p->pkt[idx].len - offset - len;
+	if (packet_check_range(p, offset, len, p->pkt[idx].iov_base,
+			       func, line))
+		return NULL;

-	return p->buf + p->pkt[idx].offset + offset;
+	if (left)
+		*left = p->pkt[idx].iov_len - offset - len;
+
+	return (char *)p->pkt[idx].iov_base + offset;
 }

 /**
packet.h (22 lines changed)

@@ -6,20 +6,12 @@
 #ifndef PACKET_H
 #define PACKET_H

-/**
- * struct desc - Generic offset-based descriptor within buffer
- * @offset:	Offset of descriptor relative to buffer start, 32-bit limit
- * @len:	Length of descriptor, host order, 16-bit limit
- */
-struct desc {
-	uint32_t offset;
-	uint16_t len;
-};
-
 /**
  * struct pool - Generic pool of packets stored in a buffer
- * @buf:	Buffer storing packet descriptors
- * @buf_size:	Total size of buffer
+ * @buf:	Buffer storing packet descriptors,
+ *		a struct vu_dev_region array for passt vhost-user mode
+ * @buf_size:	Total size of buffer,
+ *		0 for passt vhost-user mode
  * @size:	Number of usable descriptors for the pool
  * @count:	Number of used descriptors for the pool
  * @pkt:	Descriptors: see macros below

@@ -29,9 +21,11 @@ struct pool {
 	size_t buf_size;
 	size_t size;
 	size_t count;
-	struct desc pkt[1];
+	struct iovec pkt[1];
 };

+int vu_packet_check_range(void *buf, size_t offset, size_t len,
+			  const char *start);
 void packet_add_do(struct pool *p, size_t len, const char *start,
 		   const char *func, int line);
 void *packet_get_do(const struct pool *p, const size_t idx,

@@ -54,7 +48,7 @@ struct _name ## _t {					\
 	size_t buf_size;				\
 	size_t size;					\
 	size_t count;					\
-	struct desc pkt[_size];				\
+	struct iovec pkt[_size];			\
 }

 #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size)	\
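The switch from struct desc (32-bit offset and 16-bit length relative to the pool buffer) to struct iovec means a descriptor now carries an absolute pointer, so packet data can live in guest memory mapped via vhost-user rather than only inside pkt_buf. A sketch of the consumer side under that assumption (pool_fetch() is a hypothetical helper mirroring what packet_get_do() does above):

#include <stddef.h>
#include <sys/uio.h>

static void *pool_fetch(const struct iovec *pkt, size_t count,
			size_t idx, size_t offset, size_t len)
{
	if (idx >= count || offset + len > pkt[idx].iov_len)
		return NULL;	/* out of range */

	/* pointer arithmetic replaces the old buf + offset computation */
	return (char *)pkt[idx].iov_base + offset;
}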
passt.1 (10 lines changed)

@@ -397,12 +397,20 @@ interface address are configured on a given host interface.
 .SS \fBpasst\fR-only options

 .TP
-.BR \-s ", " \-\-socket " " \fIpath
+.BR \-s ", " \-\-socket-path ", " \-\-socket " " \fIpath
 Path for UNIX domain socket used by \fBqemu\fR(1) or \fBqrap\fR(1) to connect to
 \fBpasst\fR.
 Default is to probe a free socket, not accepting connections, starting from
 \fI/tmp/passt_1.socket\fR to \fI/tmp/passt_64.socket\fR.

+.TP
+.BR \-\-vhost-user
+Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
+
+.TP
+.BR \-\-print-capabilities
+Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
+
 .TP
 .BR \-F ", " \-\-fd " " \fIFD
 Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
passt.c (11 lines changed)

@@ -50,6 +50,7 @@
 #include "log.h"
 #include "tcp_splice.h"
 #include "ndp.h"
+#include "vu_common.h"

 #define EPOLL_EVENTS 8

@@ -72,6 +73,8 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TAP_PASTA]		= "/dev/net/tun device",
 	[EPOLL_TYPE_TAP_PASST]		= "connected qemu socket",
 	[EPOLL_TYPE_TAP_LISTEN]		= "listening qemu socket",
+	[EPOLL_TYPE_VHOST_CMD]		= "vhost-user command socket",
+	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");

@@ -244,7 +247,7 @@ int main(int argc, char **argv)

 	pasta_netns_quit_init(&c);

-	tap_sock_init(&c);
+	tap_backend_init(&c);

 	random_init(&c);

@@ -346,6 +349,12 @@ loop:
 		case EPOLL_TYPE_PING:
 			icmp_sock_handler(&c, ref);
 			break;
+		case EPOLL_TYPE_VHOST_CMD:
+			vu_control_handler(c.vdev, c.fd_tap, eventmask);
+			break;
+		case EPOLL_TYPE_VHOST_KICK:
+			vu_kick_cb(c.vdev, ref, &now);
+			break;
 		default:
 			/* Can't happen */
 			ASSERT(0);
passt.h (7 lines changed)

@@ -25,6 +25,7 @@ union epoll_ref;
 #include "fwd.h"
 #include "tcp.h"
 #include "udp.h"
+#include "vhost_user.h"

 /* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0
  * (unicast) and bit 1 of byte 1 must be 1 (locally administered). Otherwise

@@ -43,6 +44,7 @@ union epoll_ref;
  * @icmp:	ICMP-specific reference part
  * @data:	Data handled by protocol handlers
  * @nsdir_fd:	netns dirfd for fallback timer checking if namespace is gone
+ * @queue:	vhost-user queue index for this fd
  * @u64:	Opaque reference for epoll_ctl() and epoll_wait()
  */
 union epoll_ref {

@@ -58,6 +60,7 @@ union epoll_ref {
 			union udp_listen_epoll_ref udp;
 			uint32_t data;
 			int nsdir_fd;
+			int queue;
 		};
 	};
 	uint64_t u64;

@@ -94,6 +97,7 @@ struct fqdn {
 enum passt_modes {
 	MODE_PASST,
 	MODE_PASTA,
+	MODE_VU,
 };

 /**

@@ -229,6 +233,7 @@ struct ip6_ctx {
  * @freebind:	Allow binding of non-local addresses for forwarding
  * @low_wmem:	Low probed net.core.wmem_max
  * @low_rmem:	Low probed net.core.rmem_max
+ * @vdev:	vhost-user device
  */
 struct ctx {
 	enum passt_modes mode;

@@ -291,6 +296,8 @@ struct ctx {

 	int low_wmem;
 	int low_rmem;
+
+	struct vu_dev *vdev;
 };

 void proto_update_l2_buf(const unsigned char *eth_d,
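The new @queue member rides inside union epoll_ref like the other per-type fields: everything shares storage with the 64-bit u64 word that travels through epoll_ctl() and epoll_wait(). A simplified, self-contained illustration of that round trip (layout reduced to two fields; the real union also packs the type and per-protocol parts):

#include <assert.h>
#include <stdint.h>

union ref_sketch {
	struct {
		int32_t fd;
		int32_t queue;	/* vhost-user queue index for this fd */
	};
	uint64_t u64;		/* what epoll actually stores */
};

int main(void)
{
	union ref_sketch a = { .fd = 42, .queue = 1 }, b;

	b.u64 = a.u64;	/* as returned in epoll_event.data.u64 */
	assert(b.fd == 42 && b.queue == 1);
	return 0;
}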
pcap.c (1 line changed)

@@ -143,7 +143,6 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
  * @iovcnt:	Number of buffers (@iov entries)
  * @offset:	Offset of the L2 frame within the full data length
  */
-/* cppcheck-suppress unusedFunction */
 void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
 {
 	struct timespec now = { 0 };
tap.c (129 lines changed)

@@ -58,6 +58,8 @@
 #include "packet.h"
 #include "tap.h"
 #include "log.h"
+#include "vhost_user.h"
+#include "vu_common.h"

 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
 static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);

@@ -78,16 +80,22 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
 	struct iovec iov[2];
 	size_t iovcnt = 0;

-	if (c->mode == MODE_PASST) {
+	switch (c->mode) {
+	case MODE_PASST:
 		iov[iovcnt] = IOV_OF_LVALUE(vnet_len);
 		iovcnt++;
-	}
-
-	iov[iovcnt].iov_base = (void *)data;
-	iov[iovcnt].iov_len = l2len;
-	iovcnt++;
-
-	tap_send_frames(c, iov, iovcnt, 1);
+		/* fall through */
+	case MODE_PASTA:
+		iov[iovcnt].iov_base = (void *)data;
+		iov[iovcnt].iov_len = l2len;
+		iovcnt++;
+
+		tap_send_frames(c, iov, iovcnt, 1);
+		break;
+	case MODE_VU:
+		vu_send_single(c, data, l2len);
+		break;
+	}
 }

 /**

@@ -414,10 +422,18 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
 	if (!nframes)
 		return 0;

-	if (c->mode == MODE_PASTA)
+	switch (c->mode) {
+	case MODE_PASTA:
 		m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes);
-	else
+		break;
+	case MODE_PASST:
 		m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
+		break;
+	case MODE_VU:
+		/* fall through */
+	default:
+		ASSERT(0);
+	}

 	if (m < nframes)
 		debug("tap: failed to send %zu frames of %zu",

@@ -976,7 +992,7 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
  * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket
  * @c:		Execution context
  */
-static void tap_sock_reset(struct ctx *c)
+void tap_sock_reset(struct ctx *c)
 {
 	info("Client connection closed%s", c->one_off ? ", exiting" : "");

@@ -987,6 +1003,8 @@ static void tap_sock_reset(struct ctx *c)
 	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
 	close(c->fd_tap);
 	c->fd_tap = -1;
+	if (c->mode == MODE_VU)
+		vu_cleanup(c->vdev);
 }

 /**

@@ -1190,11 +1208,36 @@ int tap_sock_unix_open(char *sock_path)
 	return fd;
 }

+/**
+ * tap_backend_show_hints() - Give help information to start QEMU
+ * @c:		Execution context
+ */
+static void tap_backend_show_hints(struct ctx *c)
+{
+	switch (c->mode) {
+	case MODE_PASTA:
+		/* No hints */
+		break;
+	case MODE_PASST:
+		info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
+		info("    kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
+		     c->sock_path);
+		info("or qrap, for earlier qemu versions:");
+		info("    ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
+		break;
+	case MODE_VU:
+		info("You can start qemu with:");
+		info("    kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n",
+		     c->sock_path);
+		break;
+	}
+}
+
 /**
  * tap_sock_unix_init() - Start listening for connections on AF_UNIX socket
  * @c:		Execution context
  */
-static void tap_sock_unix_init(struct ctx *c)
+static void tap_sock_unix_init(const struct ctx *c)
 {
 	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
 	struct epoll_event ev = { 0 };

@@ -1205,12 +1248,6 @@ static void tap_sock_unix_init(struct ctx *c)
 	ev.events = EPOLLIN | EPOLLET;
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
-
-	info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
-	info("    kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
-	     c->sock_path);
-	info("or qrap, for earlier qemu versions:");
-	info("    ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
 }

 /**

@@ -1220,8 +1257,8 @@ static void tap_sock_unix_init(struct ctx *c)
  */
 void tap_listen_handler(struct ctx *c, uint32_t events)
 {
-	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST };
 	struct epoll_event ev = { 0 };
+	union epoll_ref ref = { 0 };
 	int v = INT_MAX / 2;
 	struct ucred ucred;
 	socklen_t len;

@@ -1261,6 +1298,10 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
 		trace("tap: failed to set SO_SNDBUF to %i", v);

 	ref.fd = c->fd_tap;
+	if (c->mode == MODE_VU)
+		ref.type = EPOLL_TYPE_VHOST_CMD;
+	else
+		ref.type = EPOLL_TYPE_TAP_PASST;
 	ev.events = EPOLLIN | EPOLLRDHUP;
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);

@@ -1323,21 +1364,34 @@ static void tap_sock_tun_init(struct ctx *c)
 }

 /**
- * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor
- * @c:		Execution context
+ * tap_sock_update_pool() - Set the buffer base and size for the pool of packets
+ * @base:	Buffer base
+ * @size:	Buffer size
  */
-void tap_sock_init(struct ctx *c)
+void tap_sock_update_pool(void *base, size_t size)
 {
-	size_t sz = sizeof(pkt_buf);
 	int i;

-	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz);
-	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz);
+	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size);
+	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size);

 	for (i = 0; i < TAP_SEQS; i++) {
-		tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
-		tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
+		tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
+		tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
 	}
+}
+
+/**
+ * tap_backend_init() - Create and set up AF_UNIX socket or
+ *                      tuntap file descriptor
+ * @c:		Execution context
+ */
+void tap_backend_init(struct ctx *c)
+{
+	if (c->mode == MODE_VU)
+		tap_sock_update_pool(NULL, 0);
+	else
+		tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));

 	if (c->fd_tap != -1) {	/* Passed as --fd */
 		struct epoll_event ev = { 0 };

@@ -1345,10 +1399,17 @@ void tap_sock_init(struct ctx *c)

 		ASSERT(c->one_off);
 		ref.fd = c->fd_tap;
-		if (c->mode == MODE_PASST)
+		switch (c->mode) {
+		case MODE_PASST:
 			ref.type = EPOLL_TYPE_TAP_PASST;
-		else
+			break;
+		case MODE_PASTA:
 			ref.type = EPOLL_TYPE_TAP_PASTA;
+			break;
+		case MODE_VU:
+			ref.type = EPOLL_TYPE_VHOST_CMD;
+			break;
+		}

 		ev.events = EPOLLIN | EPOLLRDHUP;
 		ev.data.u64 = ref.u64;

@@ -1356,9 +1417,14 @@ void tap_sock_init(struct ctx *c)
 		return;
 	}

-	if (c->mode == MODE_PASTA) {
+	switch (c->mode) {
+	case MODE_PASTA:
 		tap_sock_tun_init(c);
-	} else {
+		break;
+	case MODE_VU:
+		vu_init(c);
+		/* fall through */
+	case MODE_PASST:
 		tap_sock_unix_init(c);

 		/* In passt mode, we don't know the guest's MAC address until it

@@ -1366,5 +1432,8 @@ void tap_sock_init(struct ctx *c)
 		 * first packets will reach it.
 		 */
 		memset(&c->guest_mac, 0xff, sizeof(c->guest_mac));
+		break;
 	}
+
+	tap_backend_show_hints(c);
 }
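The MODE_PASST branch of tap_send_single() above reflects qemu's stream-socket framing: each Ethernet frame sent over the "-netdev stream" UNIX socket is preceded by a 32-bit big-endian length, hence the extra vnet_len iovec entry, while vhost-user frames need no such prefix. A standalone sketch of that framing (frame_for_qemu() is a hypothetical helper, not from the patch):

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/uio.h>

static void frame_for_qemu(struct iovec iov[2], uint32_t *vnet_len,
			   const void *data, size_t l2len)
{
	*vnet_len = htonl(l2len);	/* length prefix, network order */
	iov[0].iov_base = vnet_len;
	iov[0].iov_len = sizeof(*vnet_len);
	iov[1].iov_base = (void *)data;	/* the Ethernet frame itself */
	iov[1].iov_len = l2len;
}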
tap.h (7 lines changed)

@@ -40,7 +40,8 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c,
  */
 static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 {
-	thdr->vnet_len = htonl(l2len);
+	if (thdr)
+		thdr->vnet_len = htonl(l2len);
 }

 void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,

@@ -68,7 +69,9 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
 void tap_handler_passt(struct ctx *c, uint32_t events,
 		       const struct timespec *now);
 int tap_sock_unix_open(char *sock_path);
-void tap_sock_init(struct ctx *c);
+void tap_sock_reset(struct ctx *c);
+void tap_sock_update_pool(void *base, size_t size);
+void tap_backend_init(struct ctx *c);
 void tap_flush_pools(void);
 void tap_handler(struct ctx *c, const struct timespec *now);
 void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
tcp.c (76 lines changed)

@@ -304,6 +304,7 @@
 #include "flow_table.h"
 #include "tcp_internal.h"
 #include "tcp_buf.h"
+#include "tcp_vu.h"

 /* MSS rounding: see SET_MSS() */
 #define MSS_DEFAULT 536

@@ -758,9 +759,9 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
 * @iov_cnt:	Length of the array
 * @l4offset:	IPv4 payload offset in the iovec array
 */
-static void tcp_update_check_tcp4(const struct iphdr *iph,
-				  const struct iovec *iov, int iov_cnt,
-				  size_t l4offset)
+void tcp_update_check_tcp4(const struct iphdr *iph,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset)
 {
 	uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
 	struct in_addr saddr = { .s_addr = iph->saddr };

@@ -810,9 +811,9 @@ static void tcp_update_check_tcp4(const struct iphdr *iph,
 * @iov_cnt:	Length of the array
 * @l4offset:	IPv6 payload offset in the iovec array
 */
-static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
-				  const struct iovec *iov, int iov_cnt,
-				  size_t l4offset)
+void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset)
 {
 	uint16_t l4len = ntohs(ip6h->payload_len);
 	size_t check_ofs;

@@ -975,14 +976,11 @@ static void tcp_fill_header(struct tcphdr *th,
 * @check:	Checksum, if already known
 * @seq:	Sequence number for this segment
 * @no_tcp_csum: Do not set TCP checksum
- *
- * Return: The IPv4 payload length, host order
 */
-static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
-				struct tap_hdr *taph,
-				struct iphdr *iph, struct tcp_payload_t *bp,
-				size_t dlen, const uint16_t *check,
-				uint32_t seq, bool no_tcp_csum)
+void tcp_fill_headers4(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct iphdr *iph,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       const uint16_t *check, uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	const struct in_addr *src4 = inany_v4(&tapside->oaddr);

@@ -1013,8 +1011,6 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 	}

 	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
-
-	return l4len;
 }

 /**

@@ -1027,13 +1023,11 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
 * @check:	Checksum, if already known
 * @seq:	Sequence number for this segment
 * @no_tcp_csum: Do not set TCP checksum
- *
- * Return: The IPv6 payload length, host order
 */
-static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
-				struct tap_hdr *taph,
-				struct ipv6hdr *ip6h, struct tcp_payload_t *bp,
-				size_t dlen, uint32_t seq, bool no_tcp_csum)
+void tcp_fill_headers6(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct ipv6hdr *ip6h,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       uint32_t seq, bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	size_t l4len = dlen + sizeof(bp->th);

@@ -1064,40 +1058,6 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
 	}

 	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
-
-	return l4len;
 }

-/**
- * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
- * @conn:	Connection pointer
- * @iov:	Pointer to an array of iovec of TCP pre-cooked buffers
- * @dlen:	TCP payload length
- * @check:	Checksum, if already known
- * @seq:	Sequence number for this segment
- * @no_tcp_csum: Do not set TCP checksum
- *
- * Return: IP payload length, host order
- */
-size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
-			       struct iovec *iov, size_t dlen,
-			       const uint16_t *check, uint32_t seq,
-			       bool no_tcp_csum)
-{
-	const struct flowside *tapside = TAPFLOW(conn);
-	const struct in_addr *a4 = inany_v4(&tapside->oaddr);
-
-	if (a4) {
-		return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
-					 iov[TCP_IOV_IP].iov_base,
-					 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-					 check, seq, no_tcp_csum);
-	}
-
-	return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
-				 iov[TCP_IOV_IP].iov_base,
-				 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-				 seq, no_tcp_csum);
-}
-
 /**

@@ -1312,6 +1272,9 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
 			 int flags)
 {
+	if (c->mode == MODE_VU)
+		return tcp_vu_send_flag(c, conn, flags);
+
 	return tcp_buf_send_flag(c, conn, flags);
 }

@@ -1705,6 +1668,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 */
 static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 {
+	if (c->mode == MODE_VU)
+		return tcp_vu_data_from_sock(c, conn);
+
 	return tcp_buf_data_from_sock(c, conn);
 }
tcp_buf.c (39 lines changed)

@@ -147,6 +147,36 @@ void tcp_payload_flush(const struct ctx *c)
 	tcp_payload_used = 0;
 }

+/**
+ * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
+ * @conn:	Connection pointer
+ * @iov:	Pointer to an array of iovec of TCP pre-cooked buffers
+ * @dlen:	TCP payload length
+ * @check:	Checksum, if already known
+ * @seq:	Sequence number for this segment
+ * @no_tcp_csum: Do not set TCP checksum
+ */
+static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
+				    struct iovec *iov, size_t dlen,
+				    const uint16_t *check, uint32_t seq,
+				    bool no_tcp_csum)
+{
+	const struct flowside *tapside = TAPFLOW(conn);
+	const struct in_addr *a4 = inany_v4(&tapside->oaddr);
+
+	if (a4) {
+		tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
+				  iov[TCP_IOV_IP].iov_base,
+				  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+				  check, seq, no_tcp_csum);
+	} else {
+		tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
+				  iov[TCP_IOV_IP].iov_base,
+				  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+				  seq, no_tcp_csum);
+	}
+}
+
 /**
  * tcp_buf_send_flag() - Send segment with flags to tap (no payload)
  * @c:		Execution context

@@ -181,8 +211,10 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		return ret;

 	tcp_payload_used++;
-	l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);
+	l4len = optlen + sizeof(struct tcphdr);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+	tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false);

 	if (flags & DUP_ACK) {
 		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];

@@ -215,7 +247,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	struct tcp_payload_t *payload;
 	const uint16_t *check = NULL;
 	struct iovec *iov;
-	size_t l4len;

 	conn->seq_to_tap = seq + dlen;
 	tcp_frame_conns[tcp_payload_used] = conn;

@@ -238,8 +269,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	payload->th.th_x2 = 0;
 	payload->th.th_flags = 0;
 	payload->th.ack = 1;
-	l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
-	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+	iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
+	tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, false);
 	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
 		tcp_payload_flush(c);
 }
@@ -162,10 +162,21 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);

 struct tcp_info_linux;

-size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
-			       struct iovec *iov, size_t dlen,
-			       const uint16_t *check, uint32_t seq,
-			       bool no_tcp_csum);
+void tcp_update_check_tcp4(const struct iphdr *iph,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset);
+void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
+			   const struct iovec *iov, int iov_cnt,
+			   size_t l4offset);
+void tcp_fill_headers4(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct iphdr *iph,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       const uint16_t *check, uint32_t seq, bool no_tcp_csum);
+void tcp_fill_headers6(const struct tcp_tap_conn *conn,
+		       struct tap_hdr *taph, struct ipv6hdr *ip6h,
+		       struct tcp_payload_t *bp, size_t dlen,
+		       uint32_t seq, bool no_tcp_csum);

 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 			  bool force_seq, struct tcp_info_linux *tinfo);
 int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_vu.c (new file, 494 lines)

@@ -0,0 +1,494 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* tcp_vu.c - TCP L2 vhost-user management functions
 *
 * Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 */

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <sys/socket.h>

#include <linux/virtio_net.h>

#include "util.h"
#include "ip.h"
#include "passt.h"
#include "siphash.h"
#include "inany.h"
#include "vhost_user.h"
#include "tcp.h"
#include "pcap.h"
#include "flow.h"
#include "tcp_conn.h"
#include "flow_table.h"
#include "tcp_vu.h"
#include "tap.h"
#include "tcp_internal.h"
#include "checksum.h"
#include "vu_common.h"
#include <time.h>

static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];

/**
 * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP)
 * @v6:		Set for IPv6 packet
 *
 * Return: Return the size of the header
 */
static size_t tcp_vu_hdrlen(bool v6)
{
	size_t hdrlen;

	hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) +
		 sizeof(struct ethhdr) + sizeof(struct tcphdr);

	if (v6)
		hdrlen += sizeof(struct ipv6hdr);
	else
		hdrlen += sizeof(struct iphdr);

	return hdrlen;
}

/**
 * tcp_vu_update_check() - Calculate TCP checksum
 * @tapside:	Address information for one side of the flow
 * @iov:	Pointer to the array of IO vectors
 * @iov_used:	Length of the array
 */
static void tcp_vu_update_check(const struct flowside *tapside,
				struct iovec *iov, int iov_used)
{
	char *base = iov[0].iov_base;

	if (inany_v4(&tapside->oaddr)) {
		const struct iphdr *iph = vu_ip(base);

		tcp_update_check_tcp4(iph, iov, iov_used,
				      (char *)vu_payloadv4(base) - base);
	} else {
		const struct ipv6hdr *ip6h = vu_ip(base);

		tcp_update_check_tcp6(ip6h, iov, iov_used,
				      (char *)vu_payloadv6(base) - base);
	}
}

/**
 * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flags:	TCP flags: if not set, send segment only if ACK is due
 *
 * Return: negative error code on connection reset, 0 otherwise
 */
int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	const struct flowside *tapside = TAPFLOW(conn);
	size_t optlen, hdrlen;
	struct vu_virtq_element flags_elem[2];
	struct tcp_payload_t *payload;
	struct ipv6hdr *ip6h = NULL;
	struct iovec flags_iov[2];
	struct iphdr *iph = NULL;
	struct ethhdr *eh;
	uint32_t seq;
	int elem_cnt;
	int nb_ack;
	int ret;

	hdrlen = tcp_vu_hdrlen(CONN_V6(conn));

	vu_set_element(&flags_elem[0], NULL, &flags_iov[0]);

	elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1,
			      hdrlen + sizeof(struct tcp_syn_opts), NULL);
	if (elem_cnt != 1)
		return -1;

	vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1);

	eh = vu_eth(flags_elem[0].in_sg[0].iov_base);

	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));

	if (CONN_V4(conn)) {
		eh->h_proto = htons(ETH_P_IP);

		iph = vu_ip(flags_elem[0].in_sg[0].iov_base);
		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);

		payload = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
	} else {
		eh->h_proto = htons(ETH_P_IPV6);

		ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base);
		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
		payload = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
	}

	memset(&payload->th, 0, sizeof(payload->th));
	payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
	payload->th.ack = 1;

	seq = conn->seq_to_tap;
	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
				(struct tcp_syn_opts *)payload->data,
				&optlen);
	if (ret <= 0) {
		vu_queue_rewind(vq, 1);
		return ret;
	}

	flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;

	if (CONN_V4(conn)) {
		tcp_fill_headers4(conn, NULL, iph, payload, optlen, NULL, seq,
				  true);
	} else {
		tcp_fill_headers6(conn, NULL, ip6h, payload, optlen, seq, true);
	}

	if (*c->pcap) {
		tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1);
		pcap_iov(&flags_elem[0].in_sg[0], 1,
			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
	}
	nb_ack = 1;

	if (flags & DUP_ACK) {
		vu_set_element(&flags_elem[1], NULL, &flags_iov[1]);

		elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1,
				      flags_elem[0].in_sg[0].iov_len, NULL);
		if (elem_cnt == 1) {
			memcpy(flags_elem[1].in_sg[0].iov_base,
			       flags_elem[0].in_sg[0].iov_base,
			       flags_elem[0].in_sg[0].iov_len);
			nb_ack++;

			if (*c->pcap)
				pcap_iov(&flags_elem[1].in_sg[0], 1, 0);
		}
	}

	vu_flush(vdev, vq, flags_elem, nb_ack);

	return 0;
}

/** tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers
 * @c:		Execution context
 * @conn:	Connection pointer
 * @v6:		Set for IPv6 connections
 * @already_sent: Number of bytes already sent
 * @fillsize:	Number of bytes we can receive
 * @iov_cnt:	number of iov (output)
 *
 * Return: Number of iov entries used to store the data
 */
static ssize_t tcp_vu_sock_recv(const struct ctx *c,
				const struct tcp_tap_conn *conn, bool v6,
				uint32_t already_sent, size_t fillsize,
				int *iov_cnt)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	struct msghdr mh_sock = { 0 };
	uint16_t mss = MSS_GET(conn);
	int s = conn->sock;
	size_t hdrlen;
	int elem_cnt;
	ssize_t ret;

	*iov_cnt = 0;

	hdrlen = tcp_vu_hdrlen(v6);

	vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);

	elem_cnt = 0;

	while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) {
		struct iovec *iov;
		size_t frame_size;
		int cnt;

		if (mss > fillsize)
			mss = fillsize;

		cnt = vu_collect(vdev, vq, &elem[elem_cnt],
				 VIRTQUEUE_MAX_SIZE - elem_cnt,
				 mss + hdrlen, &frame_size);
		if (cnt == 0)
			break;

		frame_size -= hdrlen;
		iov = &elem[elem_cnt].in_sg[0];
		iov->iov_base = (char *)iov->iov_base + hdrlen;
		iov->iov_len -= hdrlen;

		fillsize -= frame_size;
		elem_cnt += cnt;

		/* All the frames must have the same size (except the last
		 * one), otherwise we will not be able to scan the iov array
		 * to find iov entries with headers (headers are spread every
		 * frame_size in the array).
		 */
		if (frame_size < mss)
			break;
	}

	if (peek_offset_cap) {
		mh_sock.msg_iov = iov_vu + 1;
		mh_sock.msg_iovlen = elem_cnt;
	} else {
		iov_vu[0].iov_base = tcp_buf_discard;
		iov_vu[0].iov_len = already_sent;

		mh_sock.msg_iov = iov_vu;
		mh_sock.msg_iovlen = elem_cnt + 1;
	}

	do
		ret = recvmsg(s, &mh_sock, MSG_PEEK);
	while (ret < 0 && errno == EINTR);

	*iov_cnt = elem_cnt;

	return ret;
}

/**
 * tcp_vu_prepare() - Prepare the frame header
 * @c:		Execution context
 * @conn:	Connection pointer
 * @first:	Pointer to the array of IO vectors
 * @dlen:	Packet data length
 * @check:	Checksum, if already known
 */
static void tcp_vu_prepare(const struct ctx *c,
			   struct tcp_tap_conn *conn, struct iovec *first,
			   size_t dlen, const uint16_t **check)
{
	const struct flowside *toside = TAPFLOW(conn);
	struct tcp_payload_t *payload;
	char *base = first->iov_base;
	struct ipv6hdr *ip6h = NULL;
	struct iphdr *iph = NULL;
	struct ethhdr *eh;

	/* we guess the first iovec provided by the guest can embed
	 * all the headers needed by the L2 frame
	 */

	eh = vu_eth(base);

	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));

	/* initialize header */

	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
		ASSERT(first[0].iov_len >= tcp_vu_hdrlen(false));

		eh->h_proto = htons(ETH_P_IP);

		iph = vu_ip(base);
		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
		payload = vu_payloadv4(base);
	} else {
		ASSERT(first[0].iov_len >= tcp_vu_hdrlen(true));

		eh->h_proto = htons(ETH_P_IPV6);

		ip6h = vu_ip(base);
		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);

		payload = vu_payloadv6(base);
	}

	memset(&payload->th, 0, sizeof(payload->th));
	payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
	payload->th.ack = 1;

	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
		tcp_fill_headers4(conn, NULL, iph, payload, dlen,
				  *check, conn->seq_to_tap, true);
		*check = &iph->check;
	} else {
		tcp_fill_headers6(conn, NULL, ip6h, payload, dlen,
				  conn->seq_to_tap, true);
	}
}

/**
 * tcp_vu_data_from_sock() - Handle new data from socket, queue to vhost-user,
 *			     in window
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: Negative on connection reset, 0 otherwise
 */
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	const struct flowside *tapside = TAPFLOW(conn);
	uint16_t mss = MSS_GET(conn);
	size_t hdrlen, fillsize;
	int i, iov_cnt, iov_used;
	int v6 = CONN_V6(conn);
	uint32_t already_sent = 0;
	const uint16_t *check;
	struct iovec *first;
	int frame_size;
	int num_buffers;
	ssize_t len;

	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
		flow_err(conn,
			 "Got packet, but RX virtqueue not usable yet");
		return 0;
	}

	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;

	if (SEQ_LT(already_sent, 0)) {
		/* RFC 761, section 2.1. */
		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
			   conn->seq_ack_from_tap, conn->seq_to_tap);
		conn->seq_to_tap = conn->seq_ack_from_tap;
		already_sent = 0;
		if (tcp_set_peek_offset(conn->sock, 0)) {
			tcp_rst(c, conn);
			return -1;
		}
	}

	if (!wnd_scaled || already_sent >= wnd_scaled) {
		conn_flag(c, conn, STALLED);
		conn_flag(c, conn, ACK_FROM_TAP_DUE);
		return 0;
	}

	/* Set up buffer descriptors we'll fill completely and partially. */

	fillsize = wnd_scaled - already_sent;

	/* collect the buffers from vhost-user and fill them with the
	 * data from the socket
	 */
	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt);
	if (len < 0) {
		vu_queue_rewind(vq, iov_cnt);
		if (errno != EAGAIN && errno != EWOULDBLOCK) {
			tcp_rst(c, conn);
			return -errno;
		}
		return 0;
	}

	if (!len) {
		vu_queue_rewind(vq, iov_cnt);
		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
			int ret = tcp_vu_send_flag(c, conn, FIN | ACK);
			if (ret) {
				tcp_rst(c, conn);
				return ret;
			}

			conn_event(c, conn, TAP_FIN_SENT);
		}

		return 0;
	}

	if (!peek_offset_cap)
		len -= already_sent;

	if (len <= 0) {
		vu_queue_rewind(vq, iov_cnt);
		conn_flag(c, conn, STALLED);
		return 0;
	}

	conn_flag(c, conn, ~STALLED);

	/* Likely, some new data was acked too. */
	tcp_update_seqack_wnd(c, conn, false, NULL);

	/* initialize headers */
	hdrlen = tcp_vu_hdrlen(v6);
	iov_used = 0;
	num_buffers = 0;
	check = NULL;
	frame_size = 0;

	/* iov_vu is an array of buffers and the buffer size can be
	 * smaller than the frame size we want to use, but with
	 * num_buffers we can merge several virtio iov buffers into one packet:
	 * we need only to set the packet headers in the first iov and
	 * num_buffers to the number of iov entries
	 */
	for (i = 0; i < iov_cnt && len; i++) {

		if (frame_size == 0)
			first = &iov_vu[i + 1];

		if (iov_vu[i + 1].iov_len > (size_t)len)
			iov_vu[i + 1].iov_len = len;

		len -= iov_vu[i + 1].iov_len;
		iov_used++;

		frame_size += iov_vu[i + 1].iov_len;
		num_buffers++;

		if (frame_size >= mss || len == 0 ||
		    i + 1 == iov_cnt || !vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
			if (i + 1 == iov_cnt)
				check = NULL;

			/* restore first iovec base: point to vnet header */
			first->iov_base = (char *)first->iov_base - hdrlen;
			first->iov_len += hdrlen;
			vu_set_vnethdr(vdev, first->iov_base, num_buffers);

			tcp_vu_prepare(c, conn, first, frame_size, &check);
			if (*c->pcap) {
				tcp_vu_update_check(tapside, first, num_buffers);
				pcap_iov(first, num_buffers,
					 sizeof(struct virtio_net_hdr_mrg_rxbuf));
			}

			conn->seq_to_tap += frame_size;

			frame_size = 0;
			num_buffers = 0;
		}
	}

	/* release unused buffers */
	vu_queue_rewind(vq, iov_cnt - iov_used);

	/* send packets */
	vu_flush(vdev, vq, elem, iov_used);

	conn_flag(c, conn, ACK_FROM_TAP_DUE);

	return 0;
}
tcp_vu.h (new file, 12 lines)

@@ -0,0 +1,12 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 */

#ifndef TCP_VU_H
#define TCP_VU_H

int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);

#endif /* TCP_VU_H */
@@ -49,6 +49,21 @@ td:empty { visibility: hidden; }
 __passt_tcp_LINE__ __passt_udp_LINE__
 </table>

+</li><li><p>passt with vhost-user support</p>
+<table class="passt" width="70%">
+	<tr>
+		<th/>
+		<th id="perf_passt_vu_tcp" colspan="__passt_vu_tcp_cols__">TCP, __passt_vu_tcp_threads__ at __passt_vu_tcp_freq__ GHz</th>
+		<th id="perf_passt_vu_udp" colspan="__passt_vu_udp_cols__">UDP, __passt_vu_udp_threads__ at __passt_vu_udp_freq__ GHz</th>
+	</tr>
+	<tr>
+		<td align="right">MTU:</td>
+		__passt_vu_tcp_header__
+		__passt_vu_udp_header__
+	</tr>
+	__passt_vu_tcp_LINE__ __passt_vu_udp_LINE__
+</table>
+
 <style type="text/CSS">
 table.pasta_local td { border: 0px solid; padding: 6px; line-height: 1; }
 table.pasta_local td { text-align: right; }
@ -15,8 +15,7 @@
|
|||
|
||||
INITRAMFS="${BASEPATH}/mbuto.img"
|
||||
VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )"
|
||||
__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
|
||||
VMEM="$((${__mem_kib} / 1024 / 4))"
|
||||
MEM_KIB="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
|
||||
QEMU_ARCH="$(uname -m)"
|
||||
[ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386
|
||||
|
||||
|
@ -46,6 +45,7 @@ setup_passt() {
|
|||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
|
||||
|
||||
context_run passt "make clean"
|
||||
context_run passt "make valgrind"
|
||||
|
@ -54,16 +54,29 @@ setup_passt() {
|
|||
# pidfile isn't created until passt is listening
|
||||
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
||||
|
||||
__vmem="$((${MEM_KIB} / 1024 / 4))"
|
||||
if [ ${VHOST_USER} -eq 1 ]; then
|
||||
__vmem="$(((${__vmem} + 500) / 1000))G"
|
||||
__qemu_netdev=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
else
|
||||
__qemu_netdev="-device virtio-net-pci,netdev=s \
|
||||
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket"
|
||||
fi
|
||||
|
||||
GUEST_CID=94557
|
||||
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -machine accel=kvm' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
' -device virtio-net-pci,netdev=s0 ' \
|
||||
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \
|
||||
" ${__qemu_netdev}" \
|
||||
" -pidfile ${STATESETUP}/qemu.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_CID"
|
||||
|
||||
|
@ -142,6 +155,7 @@ setup_passt_in_ns() {
|
|||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_in_pasta.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
|
||||
|
||||
if [ ${VALGRIND} -eq 1 ]; then
|
||||
context_run passt "make clean"
|
||||
|
@ -154,17 +168,30 @@ setup_passt_in_ns() {
|
|||
fi
|
||||
wait_for [ -f "${STATESETUP}/passt.pid" ]
|
||||
|
||||
__vmem="$((${MEM_KIB} / 1024 / 4))"
|
||||
if [ ${VHOST_USER} -eq 1 ]; then
|
||||
__vmem="$(((${__vmem} + 500) / 1000))G"
|
||||
__qemu_netdev=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
else
|
||||
__qemu_netdev="-device virtio-net-pci,netdev=s \
|
||||
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket"
|
||||
fi
|
||||
|
||||
GUEST_CID=94557
|
||||
context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -machine accel=kvm' \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
' -device virtio-net-pci,netdev=s0 ' \
|
||||
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \
|
||||
" ${__qemu_netdev}" \
|
||||
" -pidfile ${STATESETUP}/qemu.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_CID"
|
||||
|
||||
|
@ -214,6 +241,7 @@ setup_two_guests() {
|
|||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
|
||||
|
||||
context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
|
||||
wait_for [ -f "${STATESETUP}/passt_1.pid" ]
|
||||
|
@ -222,33 +250,54 @@ setup_two_guests() {
|
|||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
[ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user"
|
||||
|
||||
context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
|
||||
wait_for [ -f "${STATESETUP}/passt_2.pid" ]
|
||||
|
||||
__vmem="$((${MEM_KIB} / 1024 / 4))"
|
||||
if [ ${VHOST_USER} -eq 1 ]; then
|
||||
__vmem="$(((${__vmem} + 500) / 1000))G"
|
||||
__qemu_netdev1=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
__qemu_netdev2=" \
|
||||
-chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
|
||||
-netdev vhost-user,id=v,chardev=c \
|
||||
-device virtio-net,netdev=v \
|
||||
-object memory-backend-memfd,id=m,share=on,size=${__vmem} \
|
||||
-numa node,memdev=m"
|
||||
else
|
||||
__qemu_netdev1="-device virtio-net-pci,netdev=s \
|
||||
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket"
|
||||
__qemu_netdev2="-device virtio-net-pci,netdev=s \
|
||||
-netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket"
|
||||
fi
|
||||
|
||||
GUEST_1_CID=94557
|
||||
context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
' -device virtio-net-pci,netdev=s0 ' \
|
||||
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket " \
|
||||
" ${__qemu_netdev1}" \
|
||||
" -pidfile ${STATESETUP}/qemu_1.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
|
||||
|
||||
GUEST_2_CID=94558
|
||||
context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
|
||||
' -M accel=kvm:tcg' \
|
||||
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
|
||||
' -m '${__vmem}' -cpu host -smp '${VCPUS} \
|
||||
' -kernel '"${KERNEL}" \
|
||||
' -initrd '${INITRAMFS}' -nographic -serial stdio' \
|
||||
' -nodefaults' \
|
||||
' -append "console=ttyS0 mitigations=off apparmor=0" ' \
|
||||
' -device virtio-net-pci,netdev=s0 ' \
|
||||
" -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket " \
|
||||
" ${__qemu_netdev2}" \
|
||||
" -pidfile ${STATESETUP}/qemu_2.pid" \
|
||||
" -device vhost-vsock-pci,guest-cid=$GUEST_2_CID"
|
||||
|
||||
|
|
|
@@ -33,7 +33,7 @@ setup_memory() {
 
 	pane_or_context_run guest 'qemu-system-$(uname -m)' \
 		' -machine accel=kvm' \
-		' -m '${VMEM}' -cpu host -smp '${VCPUS} \
+		' -m '$((${MEM_KIB} / 1024 / 4))' -cpu host -smp '${VCPUS} \
 		' -kernel ' "/boot/vmlinuz-$(uname -r)" \
 		' -initrd '${INITRAMFS_MEM}' -nographic -serial stdio' \
 		' -nodefaults' \
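As a worked example of the sizing arithmetic above (illustrative numbers, not taken from a test run): with MEM_KIB=16777216, i.e. 16 GiB of host memory, __vmem = 16777216 / 1024 / 4 = 4096 (MiB); in vhost-user mode this is then rounded to whole gigabytes by integer arithmetic, (4096 + 500) / 1000 = 4, so the memfd backend is created with size=4G.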
test/passt_vu (new symbolic link)
@@ -0,0 +1 @@
passt
test/passt_vu_in_ns (new symbolic link)
@@ -0,0 +1 @@
passt_in_ns
test/perf/passt_vu_tcp (new file, 211 lines)
@@ -0,0 +1,211 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
#  for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
#  for network namespace/tap device mode
#
# test/perf/passt_vu_tcp - Check TCP performance in passt vhost-user mode
#
# Copyright (c) 2021 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>

gtools	/sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr # From neper
nstools	/sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr
htools	bc head sed seq

set	MAP_NS4 192.0.2.2
set	MAP_NS6 2001:db8:9a55::2

test	passt: throughput and latency

guest	/sbin/sysctl -w net.core.rmem_max=536870912
guest	/sbin/sysctl -w net.core.wmem_max=536870912
guest	/sbin/sysctl -w net.core.rmem_default=33554432
guest	/sbin/sysctl -w net.core.wmem_default=33554432
guest	/sbin/sysctl -w net.ipv4.tcp_rmem="4096 131072 268435456"
guest	/sbin/sysctl -w net.ipv4.tcp_wmem="4096 131072 268435456"
guest	/sbin/sysctl -w net.ipv4.tcp_timestamps=0

ns	/sbin/sysctl -w net.ipv4.tcp_rmem="4096 524288 134217728"
ns	/sbin/sysctl -w net.ipv4.tcp_wmem="4096 524288 134217728"
ns	/sbin/sysctl -w net.ipv4.tcp_timestamps=0

gout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'

hout	FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
hout	FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
hout	FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__

set	THREADS 4
set	TIME 5
set	OMIT 0.1
set	OPTS -Z -P __THREADS__ -l 1M -O__OMIT__ -N

info	Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
report	passt_vu tcp __THREADS__ __FREQ__

th	MTU 256B 576B 1280B 1500B 9000B 65520B


tr	TCP throughput over IPv6: guest to host
iperf3s	ns 10002

bw	-
bw	-
guest	ip link set dev __IFNAME__ mtu 1280
iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M
bw	__BW__ 1.2 1.5
guest	ip link set dev __IFNAME__ mtu 1500
iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M
bw	__BW__ 1.6 1.8
guest	ip link set dev __IFNAME__ mtu 9000
iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M
bw	__BW__ 4.0 5.0
guest	ip link set dev __IFNAME__ mtu 65520
iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M
bw	__BW__ 7.0 8.0

iperf3k	ns

tl	TCP RR latency over IPv6: guest to host
lat	-
lat	-
lat	-
lat	-
lat	-
nsb	tcp_rr --nolog -6
gout	LAT tcp_rr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 200 150

tl	TCP CRR latency over IPv6: guest to host
lat	-
lat	-
lat	-
lat	-
lat	-
nsb	tcp_crr --nolog -6
gout	LAT tcp_crr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 500 400

tr	TCP throughput over IPv4: guest to host
iperf3s	ns 10002

guest	ip link set dev __IFNAME__ mtu 256
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M
bw	__BW__ 0.2 0.3
guest	ip link set dev __IFNAME__ mtu 576
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M
bw	__BW__ 0.5 0.8
guest	ip link set dev __IFNAME__ mtu 1280
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M
bw	__BW__ 1.2 1.5
guest	ip link set dev __IFNAME__ mtu 1500
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M
bw	__BW__ 1.6 1.8
guest	ip link set dev __IFNAME__ mtu 9000
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M
bw	__BW__ 4.0 5.0
guest	ip link set dev __IFNAME__ mtu 65520
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M
bw	__BW__ 7.0 8.0

iperf3k	ns

# Reducing MTU below 1280 deconfigures IPv6, get our address back
guest	dhclient -6 -x
guest	dhclient -6 __IFNAME__

tl	TCP RR latency over IPv4: guest to host
lat	-
lat	-
lat	-
lat	-
lat	-
nsb	tcp_rr --nolog -4
gout	LAT tcp_rr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 200 150

tl	TCP CRR latency over IPv4: guest to host
lat	-
lat	-
lat	-
lat	-
lat	-
nsb	tcp_crr --nolog -4
gout	LAT tcp_crr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 500 400

tr	TCP throughput over IPv6: host to guest
iperf3s	guest 10001

bw	-
bw	-
bw	-
bw	-
bw	-
iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -w 32M
bw	__BW__ 6.0 6.8

iperf3k	guest

tl	TCP RR latency over IPv6: host to guest
lat	-
lat	-
lat	-
lat	-
lat	-
guestb	tcp_rr --nolog -P 10001 -C 10011 -6
sleep	1
nsout	LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 200 150

tl	TCP CRR latency over IPv6: host to guest
lat	-
lat	-
lat	-
lat	-
lat	-
guestb	tcp_crr --nolog -P 10001 -C 10011 -6
sleep	1
nsout	LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 500 350


tr	TCP throughput over IPv4: host to guest
iperf3s	guest 10001

bw	-
bw	-
bw	-
bw	-
bw	-
iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 32M
bw	__BW__ 6.0 6.8

iperf3k	guest

tl	TCP RR latency over IPv4: host to guest
lat	-
lat	-
lat	-
lat	-
lat	-
guestb	tcp_rr --nolog -P 10001 -C 10011 -4
sleep	1
nsout	LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 200 150

tl	TCP CRR latency over IPv4: host to guest
lat	-
lat	-
lat	-
lat	-
lat	-
guestb	tcp_crr --nolog -P 10001 -C 10011 -4
sleep	1
nsout	LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 500 300

te
test/perf/passt_vu_udp (new file, 159 lines)
@@ -0,0 +1,159 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# PASST - Plug A Simple Socket Transport
#  for qemu/UNIX domain socket mode
#
# PASTA - Pack A Subtle Tap Abstraction
#  for network namespace/tap device mode
#
# test/perf/passt_vu_udp - Check UDP performance in passt vhost-user mode
#
# Copyright (c) 2021 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>

gtools	/sbin/sysctl ip jq nproc sleep iperf3 udp_rr # From neper
nstools	ip jq sleep iperf3 udp_rr
htools	bc head sed

set	MAP_NS4 192.0.2.2
set	MAP_NS6 2001:db8:9a55::2

test	passt: throughput and latency

guest	/sbin/sysctl -w net.core.rmem_max=16777216
guest	/sbin/sysctl -w net.core.wmem_max=16777216
guest	/sbin/sysctl -w net.core.rmem_default=16777216
guest	/sbin/sysctl -w net.core.wmem_default=16777216

hout	FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
hout	FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
hout	FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__

set	THREADS 2
set	TIME 1
set	OPTS -u -P __THREADS__ --pacing-timer 1000

info	Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz

report	passt_vu udp __THREADS__ __FREQ__

th	pktlen 256B 576B 1280B 1500B 9000B 65520B

tr	UDP throughput over IPv6: guest to host
iperf3s	ns 10002
# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header

bw	-
bw	-
iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 3G -l 1232
bw	__BW__ 0.8 1.2
iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 4G -l 1452
bw	__BW__ 1.0 1.5
iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 10G -l 8952
bw	__BW__ 4.0 5.0
iperf3	BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 20G -l 64372
bw	__BW__ 4.0 5.0

iperf3k	ns

tl	UDP RR latency over IPv6: guest to host
lat	-
lat	-
lat	-
lat	-
lat	-
nsb	udp_rr --nolog -6
gout	LAT udp_rr --nolog -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 200 150


tr	UDP throughput over IPv4: guest to host
iperf3s	ns 10002
# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header

iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 1G -l 228
bw	__BW__ 0.0 0.0
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 2G -l 548
bw	__BW__ 0.4 0.6
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 3G -l 1252
bw	__BW__ 0.8 1.2
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 4G -l 1472
bw	__BW__ 1.0 1.5
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 10G -l 8972
bw	__BW__ 4.0 5.0
iperf3	BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 20G -l 65492
bw	__BW__ 4.0 5.0

iperf3k	ns

tl	UDP RR latency over IPv4: guest to host
lat	-
lat	-
lat	-
lat	-
lat	-
nsb	udp_rr --nolog -4
gout	LAT udp_rr --nolog -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 200 150


tr	UDP throughput over IPv6: host to guest
iperf3s	guest 10001
# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header

bw	-
bw	-
iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -b 3G -l 1232
bw	__BW__ 0.8 1.2
iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -b 4G -l 1452
bw	__BW__ 1.0 1.5
iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -b 10G -l 8952
bw	__BW__ 3.0 4.0
iperf3	BW ns ::1 10001 __TIME__ __OPTS__ -b 20G -l 64372
bw	__BW__ 3.0 4.0

iperf3k	guest

tl	UDP RR latency over IPv6: host to guest
lat	-
lat	-
lat	-
lat	-
lat	-
guestb	udp_rr --nolog -P 10001 -C 10011 -6
sleep	1
nsout	LAT udp_rr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 200 150


tr	UDP throughput over IPv4: host to guest
iperf3s	guest 10001
# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header

iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 1G -l 228
bw	__BW__ 0.0 0.0
iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 2G -l 548
bw	__BW__ 0.4 0.6
iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 3G -l 1252
bw	__BW__ 0.8 1.2
iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 4G -l 1472
bw	__BW__ 1.0 1.5
iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 10G -l 8972
bw	__BW__ 3.0 4.0
iperf3	BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 20G -l 65492
bw	__BW__ 3.0 4.0

iperf3k	guest

tl	UDP RR latency over IPv4: host to guest
lat	-
lat	-
lat	-
lat	-
lat	-
guestb	udp_rr --nolog -P 10001 -C 10011 -4
sleep	1
nsout	LAT udp_rr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat	__LAT__ 200 150

te
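The -l payload sizes in these UDP tests follow from the datagram-size comments in the file: iperf3 sends (packet size) minus the IP and UDP headers. For example, a 1500-byte IPv4 packet carries 1500 - 20 - 8 = 1472 bytes (-l 1472), the same packet size over IPv6 carries 1500 - 40 - 8 = 1452 bytes (-l 1452), and the 65520-byte IPv4 case gives 65520 - 28 = 65492 (-l 65492).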
test/run (25 lines changed)
@@ -93,6 +93,7 @@ run() {
 	test memory/passt
 	teardown memory
 
+	VHOST_USER=0
 	setup passt
 	test passt/ndp
 	test passt/dhcp
@@ -115,7 +116,22 @@ run() {
 	test two_guests/basic
 	teardown two_guests
 
+	VHOST_USER=1
+	setup passt_in_ns
+	test passt_vu/ndp
+	test passt_vu_in_ns/dhcp
+	test passt_vu_in_ns/icmp
+	test passt_vu_in_ns/tcp
+	test passt_vu_in_ns/udp
+	test passt_vu_in_ns/shutdown
+	teardown passt_in_ns
+
+	setup two_guests
+	test two_guests_vu/basic
+	teardown two_guests
+
 	VALGRIND=0
+	VHOST_USER=0
 	setup passt_in_ns
 	test passt/ndp
 	test passt_in_ns/dhcp
@@ -126,6 +142,15 @@ run() {
 	test passt_in_ns/shutdown
 	teardown passt_in_ns
 
+	VHOST_USER=1
+	setup passt_in_ns
+	test passt_vu/ndp
+	test passt_vu_in_ns/dhcp
+	test perf/passt_vu_tcp
+	test perf/passt_vu_udp
+	test passt_vu_in_ns/shutdown
+	teardown passt_in_ns
+
 	# TODO: Make those faster by at least pre-installing gcc and make on
 	# non-x86 images, then re-enable.
 	skip_distro() {
test/two_guests_vu (new symbolic link)
@@ -0,0 +1 @@
two_guests
udp.c (85 lines changed)
@@ -109,8 +109,8 @@
 #include "pcap.h"
 #include "log.h"
 #include "flow_table.h"
-
-#define UDP_MAX_FRAMES 32  /* max # of frames to receive at once */
+#include "udp_internal.h"
+#include "udp_vu.h"
 
 /* "Spliced" sockets indexed by bound port (host order) */
 static int udp_splice_ns [IP_VERSIONS][NUM_PORTS];
@@ -118,20 +118,8 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
 
 /* Static buffers */
 
-/**
- * struct udp_payload_t - UDP header and data for inbound messages
- * @uh:		UDP header
- * @data:	UDP data
- */
-static struct udp_payload_t {
-	struct udphdr uh;
-	char data[USHRT_MAX - sizeof(struct udphdr)];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-udp_payload[UDP_MAX_FRAMES];
+/* UDP header and data for inbound messages */
+static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
 
 /* Ethernet header for IPv4 frames */
 static struct ethhdr udp4_eth_hdr;
@@ -302,9 +290,9 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
  *
  * Return: size of IPv4 payload (UDP header + data)
  */
-static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
-			      const struct flowside *toside, size_t dlen,
-			      bool no_udp_csum)
+size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
+		       const struct flowside *toside, size_t dlen,
+		       bool no_udp_csum)
 {
 	const struct in_addr *src = inany_v4(&toside->oaddr);
 	const struct in_addr *dst = inany_v4(&toside->eaddr);
@@ -345,9 +333,9 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
  *
  * Return: size of IPv6 payload (UDP header + data)
  */
-static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
-			      const struct flowside *toside, size_t dlen,
-			      bool no_udp_csum)
+size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
+		       const struct flowside *toside, size_t dlen,
+		       bool no_udp_csum)
 {
 	uint16_t l4len = dlen + sizeof(bp->uh);
 
@@ -477,7 +465,7 @@ static int udp_sock_recverr(int s)
  *
  * Return: Number of errors handled, or < 0 if we have an unrecoverable error
  */
-static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
+int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
 {
 	unsigned n_err = 0;
 	socklen_t errlen;
@@ -554,7 +542,7 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
 }
 
 /**
- * udp_listen_sock_handler() - Handle new data from socket
+ * udp_buf_listen_sock_handler() - Handle new data from socket
  * @c:		Execution context
  * @ref:	epoll reference
  * @events:	epoll events bitmap
@@ -562,8 +550,9 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
  *
  * #syscalls recvmmsg
  */
-void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
-			     uint32_t events, const struct timespec *now)
+static void udp_buf_listen_sock_handler(const struct ctx *c,
+					union epoll_ref ref, uint32_t events,
+					const struct timespec *now)
 {
 	const socklen_t sasize = sizeof(udp_meta[0].s_in);
 	int n, i;
@@ -630,7 +619,26 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * udp_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_listen_sock_handler() - Handle new data from socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ */
+void udp_listen_sock_handler(const struct ctx *c,
+			     union epoll_ref ref, uint32_t events,
+			     const struct timespec *now)
+{
+	if (c->mode == MODE_VU) {
+		udp_vu_listen_sock_handler(c, ref, events, now);
+		return;
+	}
+
+	udp_buf_listen_sock_handler(c, ref, events, now);
+}
+
+/**
+ * udp_buf_reply_sock_handler() - Handle new data from flow specific socket
  * @c:		Execution context
  * @ref:	epoll reference
  * @events:	epoll events bitmap
@@ -638,8 +646,9 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
 *
 * #syscalls recvmmsg
 */
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
-			    uint32_t events, const struct timespec *now)
+static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+				       uint32_t events,
+				       const struct timespec *now)
 {
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
 	const struct flowside *toside = flowside_at_sidx(tosidx);
@@ -685,6 +694,24 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	}
 }
 
+/**
+ * udp_reply_sock_handler() - Handle new data from flow specific socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ */
+void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
+			    uint32_t events, const struct timespec *now)
+{
+	if (c->mode == MODE_VU) {
+		udp_vu_reply_sock_handler(c, ref, events, now);
+		return;
+	}
+
+	udp_buf_reply_sock_handler(c, ref, events, now);
+}
+
 /**
  * udp_tap_handler() - Handle packets from tap
  * @c:		Execution context
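The pattern in the udp.c hunks above keeps the externally visible entry points (udp_listen_sock_handler() and udp_reply_sock_handler()) stable while renaming the existing socket-buffer implementations to udp_buf_*; the new wrappers only test c->mode == MODE_VU and dispatch to the udp_vu_* handlers, so epoll registration and callers need no changes.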
udp_internal.h (new file, 34 lines)
@@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
 * Copyright (c) 2021 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

#ifndef UDP_INTERNAL_H
#define UDP_INTERNAL_H

#include "tap.h" /* needed by udp_meta_t */

#define UDP_MAX_FRAMES 32  /* max # of frames to receive at once */

/**
 * struct udp_payload_t - UDP header and data for inbound messages
 * @uh:		UDP header
 * @data:	UDP data
 */
struct udp_payload_t {
	struct udphdr uh;
	char data[USHRT_MAX - sizeof(struct udphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
		       const struct flowside *toside, size_t dlen,
		       bool no_udp_csum);
size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
		       const struct flowside *toside, size_t dlen,
		       bool no_udp_csum);
int udp_sock_errs(const struct ctx *c, int s, uint32_t events);
#endif /* UDP_INTERNAL_H */
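The AVX2 branch of struct udp_payload_t above requests 32-byte alignment so checksum routines can use aligned 256-bit loads over the payload, while the packed attribute keeps the data flush against the 8-byte UDP header. A small standalone check of those layout properties (a hypothetical test program, not part of the patch):

/* Hypothetical layout check for a struct shaped like udp_payload_t:
 * packed, so data starts right after the UDP header, and aligned(32)
 * as in the __AVX2__ branch of the real definition.
 */
#include <assert.h>
#include <limits.h>
#include <stddef.h>
#include <netinet/udp.h>

struct payload_sketch {
	struct udphdr uh;
	char data[USHRT_MAX - sizeof(struct udphdr)];
} __attribute__ ((packed, aligned(32)));

int main(void)
{
	assert(offsetof(struct payload_sketch, uh) == 0);
	assert(offsetof(struct payload_sketch, data) == sizeof(struct udphdr));
	assert(__alignof__(struct payload_sketch) == 32);
	assert(sizeof(struct payload_sketch) % 32 == 0); /* padded to alignment */
	return 0;
}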
udp_vu.c (new file, 336 lines)
@@ -0,0 +1,336 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* udp_vu.c - UDP L2 vhost-user management functions
 *
 * Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 */

#include <unistd.h>
#include <assert.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
#include <stdint.h>
#include <stddef.h>
#include <sys/uio.h>
#include <linux/virtio_net.h>

#include "checksum.h"
#include "util.h"
#include "ip.h"
#include "siphash.h"
#include "inany.h"
#include "passt.h"
#include "pcap.h"
#include "log.h"
#include "vhost_user.h"
#include "udp_internal.h"
#include "flow.h"
#include "flow_table.h"
#include "udp_flow.h"
#include "udp_vu.h"
#include "vu_common.h"

static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE];
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];

/**
 * udp_vu_hdrlen() - return the size of the header in level 2 frame (UDP)
 * @v6:		Set for IPv6 packet
 *
 * Return: Return the size of the header
 */
static size_t udp_vu_hdrlen(bool v6)
{
	size_t hdrlen;

	hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) +
		 sizeof(struct ethhdr) + sizeof(struct udphdr);

	if (v6)
		hdrlen += sizeof(struct ipv6hdr);
	else
		hdrlen += sizeof(struct iphdr);

	return hdrlen;
}

static int udp_vu_sock_init(int s, union sockaddr_inany *s_in)
{
	struct msghdr msg = {
		.msg_name = s_in,
		.msg_namelen = sizeof(union sockaddr_inany),
	};

	return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
}

/**
 * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
 * @c:		Execution context
 * @s:		Socket to receive from
 * @events:	epoll events bitmap
 * @v6:		Set for IPv6 connections
 * @dlen:	Size of received data (output)
 *
 * Return: Number of iov entries used to store the datagram
 */
static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
			    bool v6, ssize_t *dlen)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	int iov_cnt, idx, iov_used;
	struct msghdr msg = { 0 };
	size_t off, hdrlen;

	ASSERT(!c->no_udp);

	if (!(events & EPOLLIN))
		return 0;

	/* compute L2 header length */
	hdrlen = udp_vu_hdrlen(v6);

	vu_init_elem(elem, iov_vu, VIRTQUEUE_MAX_SIZE);

	iov_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE,
			     IP_MAX_MTU - sizeof(struct udphdr) + hdrlen,
			     NULL);
	if (iov_cnt == 0)
		return 0;

	/* reserve space for the headers */
	iov_vu[0].iov_base = (char *)iov_vu[0].iov_base + hdrlen;
	iov_vu[0].iov_len -= hdrlen;

	/* read data from the socket */
	msg.msg_iov = iov_vu;
	msg.msg_iovlen = iov_cnt;

	*dlen = recvmsg(s, &msg, 0);
	if (*dlen < 0) {
		vu_queue_rewind(vq, iov_cnt);
		return 0;
	}

	/* restore the pointer to the headers address */
	iov_vu[0].iov_base = (char *)iov_vu[0].iov_base - hdrlen;
	iov_vu[0].iov_len += hdrlen;

	/* count the numbers of buffer filled by recvmsg() */
	idx = iov_skip_bytes(iov_vu, iov_cnt, *dlen + hdrlen, &off);

	/* adjust last iov length */
	if (idx < iov_cnt)
		iov_vu[idx].iov_len = off;
	iov_used = idx + !!off;

	vu_set_vnethdr(vdev, iov_vu[0].iov_base, iov_used);

	/* release unused buffers */
	vu_queue_rewind(vq, iov_cnt - iov_used);

	return iov_used;
}

/**
 * udp_vu_prepare() - Prepare the packet header
 * @c:		Execution context
 * @toside:	Address information for one side of the flow
 * @dlen:	Packet data length
 *
 * Return: Layer-4 length
 */
static size_t udp_vu_prepare(const struct ctx *c,
			     const struct flowside *toside, ssize_t dlen)
{
	struct ethhdr *eh;
	size_t l4len;

	/* ethernet header */
	eh = vu_eth(iov_vu[0].iov_base);

	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));

	/* initialize header */
	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
		struct iphdr *iph = vu_ip(iov_vu[0].iov_base);
		struct udp_payload_t *bp = vu_payloadv4(iov_vu[0].iov_base);

		eh->h_proto = htons(ETH_P_IP);

		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP);

		l4len = udp_update_hdr4(iph, bp, toside, dlen, true);
	} else {
		struct ipv6hdr *ip6h = vu_ip(iov_vu[0].iov_base);
		struct udp_payload_t *bp = vu_payloadv6(iov_vu[0].iov_base);

		eh->h_proto = htons(ETH_P_IPV6);

		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP);

		l4len = udp_update_hdr6(ip6h, bp, toside, dlen, true);
	}

	return l4len;
}

/**
 * udp_vu_csum() - Calculate and set checksum for a UDP packet
 * @toside:	Address information for one side of the flow
 * @iov_used:	Length of the array
 */
static void udp_vu_csum(const struct flowside *toside, int iov_used)
{
	const struct in_addr *src4 = inany_v4(&toside->oaddr);
	const struct in_addr *dst4 = inany_v4(&toside->eaddr);
	char *base = iov_vu[0].iov_base;
	struct udp_payload_t *bp;

	if (src4 && dst4) {
		bp = vu_payloadv4(base);
		csum_udp4(&bp->uh, *src4, *dst4, iov_vu, iov_used,
			  (char *)&bp->data - base);
	} else {
		bp = vu_payloadv6(base);
		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
			  iov_vu, iov_used, (char *)&bp->data - base);
	}
}

/**
 * udp_vu_listen_sock_handler() - Handle new data from socket
 * @c:		Execution context
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 * @now:	Current timestamp
 */
void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
				uint32_t events, const struct timespec *now)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	int i;

	if (udp_sock_errs(c, ref.fd, events) < 0) {
		err("UDP: Unrecoverable error on listening socket:"
		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
		return;
	}

	for (i = 0; i < UDP_MAX_FRAMES; i++) {
		const struct flowside *toside;
		union sockaddr_inany s_in;
		flow_sidx_t sidx;
		uint8_t pif;
		ssize_t dlen;
		int iov_used;
		bool v6;

		if (udp_vu_sock_init(ref.fd, &s_in) < 0)
			break;

		sidx = udp_flow_from_sock(c, ref, &s_in, now);
		pif = pif_at_sidx(sidx);

		if (pif != PIF_TAP) {
			if (flow_sidx_valid(sidx)) {
				flow_sidx_t fromsidx = flow_sidx_opposite(sidx);
				struct udp_flow *uflow = udp_at_sidx(sidx);

				flow_err(uflow,
					 "No support for forwarding UDP from %s to %s",
					 pif_name(pif_at_sidx(fromsidx)),
					 pif_name(pif));
			} else {
				debug("Discarding 1 datagram without flow");
			}

			continue;
		}

		toside = flowside_at_sidx(sidx);

		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));

		iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen);
		if (iov_used <= 0)
			break;

		udp_vu_prepare(c, toside, dlen);
		if (*c->pcap) {
			udp_vu_csum(toside, iov_used);
			pcap_iov(iov_vu, iov_used,
				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
		}
		vu_flush(vdev, vq, elem, iov_used);
	}
}

/**
 * udp_vu_reply_sock_handler() - Handle new data from flow specific socket
 * @c:		Execution context
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 * @now:	Current timestamp
 */
void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
			       uint32_t events, const struct timespec *now)
{
	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
	const struct flowside *toside = flowside_at_sidx(tosidx);
	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
	int from_s = uflow->s[ref.flowside.sidei];
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	int i;

	ASSERT(!c->no_udp);

	if (udp_sock_errs(c, from_s, events) < 0) {
		flow_err(uflow, "Unrecoverable error on reply socket");
		flow_err_details(uflow);
		udp_flow_close(c, uflow);
		return;
	}

	for (i = 0; i < UDP_MAX_FRAMES; i++) {
		uint8_t topif = pif_at_sidx(tosidx);
		ssize_t dlen;
		int iov_used;
		bool v6;

		ASSERT(uflow);

		if (topif != PIF_TAP) {
			uint8_t frompif = pif_at_sidx(ref.flowside);

			flow_err(uflow,
				 "No support for forwarding UDP from %s to %s",
				 pif_name(frompif), pif_name(topif));
			continue;
		}

		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));

		iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen);
		if (iov_used <= 0)
			break;
		flow_trace(uflow, "Received 1 datagram on reply socket");
		uflow->ts = now->tv_sec;

		udp_vu_prepare(c, toside, dlen);
		if (*c->pcap) {
			udp_vu_csum(toside, iov_used);
			pcap_iov(iov_vu, iov_used,
				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
		}
		vu_flush(vdev, vq, elem, iov_used);
	}
}
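The reserve/restore dance in udp_vu_sock_recv() above is worth spelling out: the first guest buffer is temporarily advanced by the L2 header length so recvmsg() deposits the datagram after a hole, and the hole is filled with headers afterwards, in place. A minimal standalone sketch of just that trick (names here are illustrative, not from the patch):

/* Receive into an iovec array while leaving hdrlen bytes free at the
 * start of the first buffer, so protocol headers can be written there
 * later without copying the payload.
 */
#include <sys/socket.h>
#include <sys/uio.h>

ssize_t recv_behind_headers(int s, struct iovec *iov, int cnt, size_t hdrlen)
{
	struct msghdr msg = { .msg_iov = iov, .msg_iovlen = cnt };
	ssize_t dlen;

	iov[0].iov_base = (char *)iov[0].iov_base + hdrlen;	/* skip hole */
	iov[0].iov_len -= hdrlen;

	dlen = recvmsg(s, &msg, 0);

	iov[0].iov_base = (char *)iov[0].iov_base - hdrlen;	/* restore */
	iov[0].iov_len += hdrlen;

	return dlen;	/* headers can now be written at iov[0].iov_base */
}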
udp_vu.h (new file, 13 lines)
@@ -0,0 +1,13 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 */

#ifndef UDP_VU_H
#define UDP_VU_H

void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
				uint32_t events, const struct timespec *now);
void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
			       uint32_t events, const struct timespec *now);
#endif /* UDP_VU_H */
util.h (9 lines changed)
@@ -144,7 +144,16 @@ static inline uint32_t ntohl_unaligned(const void *p)
 	return ntohl(val);
 }
 
+static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); }
+#define smp_mb()		do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0)
+#define smp_mb_release()	do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0)
+#define smp_mb_acquire()	do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0)
+
+#define smp_wmb()	smp_mb_release()
+#define smp_rmb()	smp_mb_acquire()
+
 #define NS_FN_STACK_SIZE	(1024 * 1024) /* 1MiB */
 
 int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
 	     void *arg);
 #define NS_CALL(fn, arg)						\
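These Linux-style fences back the virtqueue code: a producer must make slot data globally visible before it publishes the new ring index, and a consumer must read the index before it reads the data. A minimal sketch of that publish/consume pairing (an illustration of the ordering contract, not code from the patch):

/* Publish/consume with release/acquire fences, mirroring how a virtio
 * ring producer exposes a descriptor only after the slot is written.
 */
#include <stdint.h>

#define barrier()	__asm__ __volatile__("" ::: "memory")
#define smp_wmb()	do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0)
#define smp_rmb()	do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0)

static uint32_t ring[256];
static uint16_t avail_idx;

void publish(uint32_t v)
{
	ring[avail_idx % 256] = v;			/* fill the slot */
	smp_wmb();					/* data before index */
	__atomic_store_n(&avail_idx, (uint16_t)(avail_idx + 1),
			 __ATOMIC_RELAXED);		/* publish */
}

int poll_once(uint16_t *last, uint32_t *v)
{
	if (__atomic_load_n(&avail_idx, __ATOMIC_RELAXED) == *last)
		return 0;				/* nothing new */
	smp_rmb();					/* index before data */
	*v = ring[(*last)++ % 256];
	return 1;
}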
vhost_user.c (new file, 981 lines)
@@ -0,0 +1,981 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * vhost-user API, command management and virtio interface
 *
 * Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 *
 * Some parts from QEMU subprojects/libvhost-user/libvhost-user.c
 * licensed under the following terms:
 *
 * Copyright IBM, Corp. 2007
 * Copyright (c) 2016 Red Hat, Inc.
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *  Marc-André Lureau <mlureau@redhat.com>
 *  Victor Kaplansky <victork@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>
#include <net/ethernet.h>
#include <netinet/in.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <linux/vhost_types.h>
#include <linux/virtio_net.h>

#include "util.h"
#include "passt.h"
#include "tap.h"
#include "vhost_user.h"
#include "pcap.h"

/* vhost-user version we are compatible with */
#define VHOST_USER_VERSION 1

static struct vu_dev vdev_storage;

/**
 * vu_print_capabilities() - print vhost-user capabilities
 *			     this is part of the vhost-user backend
 *			     convention.
 */
void vu_print_capabilities(void)
{
	info("{");
	info(" \"type\": \"net\"");
	info("}");
	exit(EXIT_SUCCESS);
}

/**
 * vu_request_to_string() - convert a vhost-user request number to its name
 * @req:	request number
 *
 * Return: the name of request number
 */
static const char *vu_request_to_string(unsigned int req)
{
	if (req < VHOST_USER_MAX) {
#define REQ(req) [req] = #req
		static const char * const vu_request_str[VHOST_USER_MAX] = {
			REQ(VHOST_USER_NONE),
			REQ(VHOST_USER_GET_FEATURES),
			REQ(VHOST_USER_SET_FEATURES),
			REQ(VHOST_USER_SET_OWNER),
			REQ(VHOST_USER_RESET_OWNER),
			REQ(VHOST_USER_SET_MEM_TABLE),
			REQ(VHOST_USER_SET_LOG_BASE),
			REQ(VHOST_USER_SET_LOG_FD),
			REQ(VHOST_USER_SET_VRING_NUM),
			REQ(VHOST_USER_SET_VRING_ADDR),
			REQ(VHOST_USER_SET_VRING_BASE),
			REQ(VHOST_USER_GET_VRING_BASE),
			REQ(VHOST_USER_SET_VRING_KICK),
			REQ(VHOST_USER_SET_VRING_CALL),
			REQ(VHOST_USER_SET_VRING_ERR),
			REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
			REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
			REQ(VHOST_USER_GET_QUEUE_NUM),
			REQ(VHOST_USER_SET_VRING_ENABLE),
			REQ(VHOST_USER_SEND_RARP),
			REQ(VHOST_USER_NET_SET_MTU),
			REQ(VHOST_USER_SET_BACKEND_REQ_FD),
			REQ(VHOST_USER_IOTLB_MSG),
			REQ(VHOST_USER_SET_VRING_ENDIAN),
			REQ(VHOST_USER_GET_CONFIG),
			REQ(VHOST_USER_SET_CONFIG),
			REQ(VHOST_USER_POSTCOPY_ADVISE),
			REQ(VHOST_USER_POSTCOPY_LISTEN),
			REQ(VHOST_USER_POSTCOPY_END),
			REQ(VHOST_USER_GET_INFLIGHT_FD),
			REQ(VHOST_USER_SET_INFLIGHT_FD),
			REQ(VHOST_USER_GPU_SET_SOCKET),
			REQ(VHOST_USER_VRING_KICK),
			REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
			REQ(VHOST_USER_ADD_MEM_REG),
			REQ(VHOST_USER_REM_MEM_REG),
		};
#undef REQ
		return vu_request_str[req];
	}

	return "unknown";
}

/**
 * qva_to_va() - Translate front-end (QEMU) virtual address to our virtual
 *		 address
 * @dev:		vhost-user device
 * @qemu_addr:		front-end userspace address
 *
 * Return: the memory address in our process virtual address space.
 */
static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr)
{
	unsigned int i;

	/* Find matching memory region. */
	for (i = 0; i < dev->nregions; i++) {
		const struct vu_dev_region *r = &dev->regions[i];

		if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
			return (void *)(qemu_addr - r->qva + r->mmap_addr +
					r->mmap_offset);
		}
	}

	return NULL;
}

/**
 * vmsg_close_fds() - Close all file descriptors of a given message
 * @vmsg:	vhost-user message with the list of the file descriptors
 */
static void vmsg_close_fds(const struct vhost_user_msg *vmsg)
{
	int i;

	for (i = 0; i < vmsg->fd_num; i++)
		close(vmsg->fds[i]);
}

/**
 * vu_remove_watch() - Remove a file descriptor from our passt epoll
 *		       file descriptor
 * @vdev:	vhost-user device
 * @fd:		file descriptor to remove
 */
static void vu_remove_watch(const struct vu_dev *vdev, int fd)
{
	epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL);
}

/**
 * vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags
 *			  and fd_num
 * @vmsg:	vhost-user message
 * @val:	64-bit value to reply
 */
static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val)
{
	vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */
	vmsg->hdr.size = sizeof(vmsg->payload.u64);
	vmsg->payload.u64 = val;
	vmsg->fd_num = 0;
}

/**
 * vu_message_read_default() - Read incoming vhost-user message from the
 *			       front-end
 * @conn_fd:	vhost-user command socket
 * @vmsg:	vhost-user message
 *
 * Return: 0 if recvmsg() has been interrupted or if there's no data to read,
 *         1 if a message has been received
 */
static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg)
{
	char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS *
		     sizeof(int))] = { 0 };
	struct iovec iov = {
		.iov_base = (char *)vmsg,
		.iov_len = VHOST_USER_HDR_SIZE,
	};
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = control,
		.msg_controllen = sizeof(control),
	};
	ssize_t ret, sz_payload;
	struct cmsghdr *cmsg;

	ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
			return 0;
		die_perror("vhost-user message receive (recvmsg)");
	}

	vmsg->fd_num = 0;
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			size_t fd_size;

			ASSERT(cmsg->cmsg_len >= CMSG_LEN(0));
			fd_size = cmsg->cmsg_len - CMSG_LEN(0);
			ASSERT(fd_size <= sizeof(vmsg->fds));
			vmsg->fd_num = fd_size / sizeof(int);
			memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
			break;
		}
	}

	sz_payload = vmsg->hdr.size;
	if ((size_t)sz_payload > sizeof(vmsg->payload)) {
		die("vhost-user message request too big: %d,"
		    " size: vmsg->size: %zd, "
		    "while sizeof(vmsg->payload) = %zu",
		    vmsg->hdr.request, sz_payload, sizeof(vmsg->payload));
	}

	if (sz_payload) {
		do
			ret = recv(conn_fd, &vmsg->payload, sz_payload, 0);
		while (ret < 0 && errno == EINTR);

		if (ret < 0)
			die_perror("vhost-user message receive");

		if (ret == 0)
			die("EOF on vhost-user message receive");

		if (ret < sz_payload)
			die("Short-read on vhost-user message receive");
	}

	return 1;
}

/**
 * vu_message_write() - Send a message to the front-end
 * @conn_fd:	vhost-user command socket
 * @vmsg:	vhost-user message
 *
 * #syscalls:vu sendmsg
 */
static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg)
{
	char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 };
	struct iovec iov = {
		.iov_base = (char *)vmsg,
		.iov_len = VHOST_USER_HDR_SIZE + vmsg->hdr.size,
	};
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = control,
	};
	int rc;

	ASSERT(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
	if (vmsg->fd_num > 0) {
		size_t fdsize = vmsg->fd_num * sizeof(int);
		struct cmsghdr *cmsg;

		msg.msg_controllen = CMSG_SPACE(fdsize);
		cmsg = CMSG_FIRSTHDR(&msg);
		cmsg->cmsg_len = CMSG_LEN(fdsize);
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
	}

	do
		rc = sendmsg(conn_fd, &msg, 0);
	while (rc < 0 && errno == EINTR);

	if (rc < 0)
		die_perror("vhost-user message send");

	if ((uint32_t)rc < VHOST_USER_HDR_SIZE + vmsg->hdr.size)
		die("EOF on vhost-user message send");
}

/**
 * vu_send_reply() - Update message flags and send it to front-end
 * @conn_fd:	vhost-user command socket
 * @vmsg:	vhost-user message
 */
static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
{
	msg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
	msg->hdr.flags |= VHOST_USER_VERSION;
	msg->hdr.flags |= VHOST_USER_REPLY_MASK;

	vu_message_write(conn_fd, msg);
}

/**
 * vu_get_features_exec() - Provide back-end features bitmask to front-end
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: True as a reply is requested
 */
static bool vu_get_features_exec(struct vu_dev *vdev,
				 struct vhost_user_msg *msg)
{
	uint64_t features =
		1ULL << VIRTIO_F_VERSION_1 |
		1ULL << VIRTIO_NET_F_MRG_RXBUF |
		1ULL << VHOST_USER_F_PROTOCOL_FEATURES;

	(void)vdev;

	vmsg_set_reply_u64(msg, features);

	debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64);

	return true;
}

/**
 * vu_set_enable_all_rings() - Enable/disable all the virtqueues
 * @vdev:	vhost-user device
 * @enable:	New virtqueues state
 */
static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable)
{
	uint16_t i;

	for (i = 0; i < VHOST_USER_MAX_QUEUES; i++)
		vdev->vq[i].enable = enable;
}

/**
 * vu_set_features_exec() - Enable features of the back-end
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: False as no reply is requested
 */
static bool vu_set_features_exec(struct vu_dev *vdev,
				 struct vhost_user_msg *msg)
{
	debug("u64: 0x%016"PRIx64, msg->payload.u64);

	vdev->features = msg->payload.u64;
	/* We only support devices conforming to VIRTIO 1.0 or
	 * later
	 */
	if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1))
		die("virtio legacy devices aren't supported by passt");

	if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES))
		vu_set_enable_all_rings(vdev, true);

	return false;
}

/**
 * vu_set_owner_exec() - Session start flag, do nothing in our case
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: False as no reply is requested
 */
static bool vu_set_owner_exec(struct vu_dev *vdev,
			      struct vhost_user_msg *msg)
{
	(void)vdev;
	(void)msg;

	return false;
}

/**
 * map_ring() - Convert ring front-end (QEMU) addresses to our process
 *		virtual address space.
 * @vdev:	vhost-user device
 * @vq:		Virtqueue
 *
 * Return: True if ring cannot be mapped to our address space
 */
static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
{
	vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr);
	vq->vring.used = qva_to_va(vdev, vq->vra.used_user_addr);
	vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr);

	debug("Setting virtq addresses:");
	debug("    vring_desc  at %p", (void *)vq->vring.desc);
	debug("    vring_used  at %p", (void *)vq->vring.used);
	debug("    vring_avail at %p", (void *)vq->vring.avail);

	return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
}

/**
 * vu_set_mem_table_exec() - Sets the memory map regions to be able to
 *			     translate the vring addresses.
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: False as no reply is requested
 *
 * #syscalls:vu mmap munmap
 */
static bool vu_set_mem_table_exec(struct vu_dev *vdev,
				  struct vhost_user_msg *msg)
{
	struct vhost_user_memory m = msg->payload.memory, *memory = &m;
	unsigned int i;

	for (i = 0; i < vdev->nregions; i++) {
		const struct vu_dev_region *r = &vdev->regions[i];

		if (r->mmap_addr) {
			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
			munmap((void *)r->mmap_addr, r->size + r->mmap_offset);
		}
	}
	vdev->nregions = memory->nregions;

	debug("vhost-user nregions: %u", memory->nregions);
	for (i = 0; i < vdev->nregions; i++) {
		struct vhost_user_memory_region *msg_region = &memory->regions[i];
		struct vu_dev_region *dev_region = &vdev->regions[i];
		void *mmap_addr;

		debug("vhost-user region %d", i);
		debug("    guest_phys_addr: 0x%016"PRIx64,
		      msg_region->guest_phys_addr);
		debug("    memory_size:     0x%016"PRIx64,
		      msg_region->memory_size);
		debug("    userspace_addr   0x%016"PRIx64,
		      msg_region->userspace_addr);
		debug("    mmap_offset      0x%016"PRIx64,
		      msg_region->mmap_offset);

		dev_region->gpa = msg_region->guest_phys_addr;
		dev_region->size = msg_region->memory_size;
		dev_region->qva = msg_region->userspace_addr;
		dev_region->mmap_offset = msg_region->mmap_offset;

		/* We don't use offset argument of mmap() since the
		 * mapped address has to be page aligned.
		 */
		mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
				 PROT_READ | PROT_WRITE, MAP_SHARED |
				 MAP_NORESERVE, msg->fds[i], 0);

		if (mmap_addr == MAP_FAILED)
			die_perror("vhost-user region mmap error");

		dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
		debug("    mmap_addr:       0x%016"PRIx64,
		      dev_region->mmap_addr);

		close(msg->fds[i]);
	}

	for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
		if (vdev->vq[i].vring.desc) {
			if (map_ring(vdev, &vdev->vq[i]))
				die("remapping queue %d during setmemtable", i);
		}
	}

	/* As vu_packet_check_range() has no access to the number of
	 * memory regions, mark the end of the array with mmap_addr = 0
	 */
	ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1);
	vdev->regions[vdev->nregions].mmap_addr = 0;

	tap_sock_update_pool(vdev->regions, 0);

	return false;
}

/**
 * vu_set_vring_num_exec() - Set the size of the queue (vring size)
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: False as no reply is requested
 */
static bool vu_set_vring_num_exec(struct vu_dev *vdev,
				  struct vhost_user_msg *msg)
{
	unsigned int idx = msg->payload.state.index;
	unsigned int num = msg->payload.state.num;

	debug("State.index: %u", idx);
	debug("State.num:   %u", num);
	vdev->vq[idx].vring.num = num;

	return false;
}

/**
 * vu_set_vring_addr_exec() - Set the addresses of the vring
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: False as no reply is requested
 */
static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
				   struct vhost_user_msg *msg)
{
	/* We need to copy the payload to vhost_vring_addr structure
	 * to access index because address of msg->payload.addr
	 * can be unaligned as it is packed.
	 */
	struct vhost_vring_addr addr = msg->payload.addr;
	struct vu_virtq *vq = &vdev->vq[addr.index];

	debug("vhost_vring_addr:");
	debug("    index:           %d", addr.index);
	debug("    flags:           %d", addr.flags);
	debug("    desc_user_addr:  0x%016" PRIx64,
	      (uint64_t)addr.desc_user_addr);
	debug("    used_user_addr:  0x%016" PRIx64,
	      (uint64_t)addr.used_user_addr);
	debug("    avail_user_addr: 0x%016" PRIx64,
	      (uint64_t)addr.avail_user_addr);
	debug("    log_guest_addr:  0x%016" PRIx64,
	      (uint64_t)addr.log_guest_addr);

	vq->vra = msg->payload.addr;
	vq->vring.flags = addr.flags;
	vq->vring.log_guest_addr = addr.log_guest_addr;

	if (map_ring(vdev, vq))
		die("Invalid vring_addr message");

	vq->used_idx = le16toh(vq->vring.used->idx);

	if (vq->last_avail_idx != vq->used_idx) {
		debug("Last avail index != used index: %u != %u",
		      vq->last_avail_idx, vq->used_idx);
	}

	return false;
}
/**
 * vu_set_vring_base_exec() - Sets the next index to use for descriptors
 *			      in this vring
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: False as no reply is requested
 */
static bool vu_set_vring_base_exec(struct vu_dev *vdev,
				   struct vhost_user_msg *msg)
{
	unsigned int idx = msg->payload.state.index;
	unsigned int num = msg->payload.state.num;

	debug("State.index: %u", idx);
	debug("State.num:   %u", num);
	vdev->vq[idx].shadow_avail_idx = vdev->vq[idx].last_avail_idx = num;

	return false;
}

/**
 * vu_get_vring_base_exec() - Stops the vring and returns the current
 *			      descriptor index or indices
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: True as a reply is requested
 */
static bool vu_get_vring_base_exec(struct vu_dev *vdev,
				   struct vhost_user_msg *msg)
{
	unsigned int idx = msg->payload.state.index;

	debug("State.index: %u", idx);
	msg->payload.state.num = vdev->vq[idx].last_avail_idx;
	msg->hdr.size = sizeof(msg->payload.state);

	vdev->vq[idx].started = false;

	if (vdev->vq[idx].call_fd != -1) {
		close(vdev->vq[idx].call_fd);
		vdev->vq[idx].call_fd = -1;
	}
	if (vdev->vq[idx].kick_fd != -1) {
		vu_remove_watch(vdev, vdev->vq[idx].kick_fd);
		close(vdev->vq[idx].kick_fd);
		vdev->vq[idx].kick_fd = -1;
	}

	return true;
}

/**
 * vu_set_watch() - Add a file descriptor to the passt epoll file descriptor
 * @vdev:	vhost-user device
 * @idx:	queue index of the file descriptor to add
 */
static void vu_set_watch(const struct vu_dev *vdev, int idx)
{
	union epoll_ref ref = {
		.type = EPOLL_TYPE_VHOST_KICK,
		.fd = vdev->vq[idx].kick_fd,
		.queue = idx
	};
	struct epoll_event ev = { 0 };

	ev.data.u64 = ref.u64;
	ev.events = EPOLLIN;
	epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
}

/**
 * vu_check_queue_msg_file() - Check if a message is valid,
 *			       close fds if NOFD bit is set
 * @vmsg:	vhost-user message
 */
static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
{
	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

	if (idx >= VHOST_USER_MAX_QUEUES)
		die("Invalid vhost-user queue index: %u", idx);

	if (nofd) {
		vmsg_close_fds(msg);
		return;
	}

	if (msg->fd_num != 1)
		die("Invalid fds in vhost-user request: %d", msg->hdr.request);
}

/**
 * vu_set_vring_kick_exec() - Set the event file descriptor for adding buffers
 *			      to the vring
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: False as no reply is requested
 */
static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
				   struct vhost_user_msg *msg)
{
	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

	debug("u64: 0x%016"PRIx64, msg->payload.u64);

	vu_check_queue_msg_file(msg);

	if (vdev->vq[idx].kick_fd != -1) {
		vu_remove_watch(vdev, vdev->vq[idx].kick_fd);
		close(vdev->vq[idx].kick_fd);
		vdev->vq[idx].kick_fd = -1;
	}

	if (!nofd)
		vdev->vq[idx].kick_fd = msg->fds[0];

	debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx);

	vdev->vq[idx].started = true;

	if (vdev->vq[idx].kick_fd != -1 && VHOST_USER_IS_QUEUE_TX(idx)) {
		vu_set_watch(vdev, idx);
		debug("Waiting for kicks on fd: %d for vq: %d",
		      vdev->vq[idx].kick_fd, idx);
	}

	return false;
}

/**
 * vu_set_vring_call_exec() - Set the event file descriptor to signal when
 *			      buffers are used
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: False as no reply is requested
 */
static bool vu_set_vring_call_exec(struct vu_dev *vdev,
				   struct vhost_user_msg *msg)
{
	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

	debug("u64: 0x%016"PRIx64, msg->payload.u64);

	vu_check_queue_msg_file(msg);

	if (vdev->vq[idx].call_fd != -1) {
		close(vdev->vq[idx].call_fd);
		vdev->vq[idx].call_fd = -1;
	}

	if (!nofd)
		vdev->vq[idx].call_fd = msg->fds[0];

	/* in case of I/O hang after reconnecting */
	if (vdev->vq[idx].call_fd != -1)
		eventfd_write(msg->fds[0], 1);

	debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx);

	return false;
}

/**
 * vu_set_vring_err_exec() - Set the event file descriptor to signal when
 *			     error occurs
 * @vdev:	vhost-user device
 * @vmsg:	vhost-user message
 *
 * Return: False as no reply is requested
 */
static bool vu_set_vring_err_exec(struct vu_dev *vdev,
				  struct vhost_user_msg *msg)
{
	bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
	int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

	debug("u64: 0x%016"PRIx64, msg->payload.u64);

	vu_check_queue_msg_file(msg);

	if (vdev->vq[idx].err_fd != -1) {
		close(vdev->vq[idx].err_fd);
		vdev->vq[idx].err_fd = -1;
	}

	if (!nofd)
		vdev->vq[idx].err_fd = msg->fds[0];

	return false;
}

/**
|
||||
* vu_get_protocol_features_exec() - Provide the protocol (vhost-user) features
|
||||
* to the front-end
|
||||
* @vdev: vhost-user device
|
||||
* @vmsg: vhost-user message
|
||||
*
|
||||
* Return: True as a reply is requested
|
||||
*/
|
||||
static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
{
|
||||
uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK;
|
||||
|
||||
(void)vdev;
|
||||
vmsg_set_reply_u64(msg, features);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* vu_set_protocol_features_exec() - Enable protocol (vhost-user) features
|
||||
* @vdev: vhost-user device
|
||||
* @vmsg: vhost-user message
|
||||
*
|
||||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
{
|
||||
uint64_t features = msg->payload.u64;
|
||||
|
||||
debug("u64: 0x%016"PRIx64, features);
|
||||
|
||||
vdev->protocol_features = msg->payload.u64;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* vu_get_queue_num_exec() - Tell how many queues we support
|
||||
* @vdev: vhost-user device
|
||||
* @vmsg: vhost-user message
|
||||
*
|
||||
* Return: True as a reply is requested
|
||||
*/
|
||||
static bool vu_get_queue_num_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
{
|
||||
(void)vdev;
|
||||
|
||||
vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* vu_set_vring_enable_exec() - Enable or disable corresponding vring
|
||||
* @vdev: vhost-user device
|
||||
* @vmsg: vhost-user message
|
||||
*
|
||||
* Return: False as no reply is requested
|
||||
*/
|
||||
static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg)
|
||||
{
|
||||
unsigned int enable = msg->payload.state.num;
|
||||
unsigned int idx = msg->payload.state.index;
|
||||
|
||||
debug("State.index: %u", idx);
|
||||
debug("State.enable: %u", enable);
|
||||
|
||||
if (idx >= VHOST_USER_MAX_QUEUES)
|
||||
die("Invalid vring_enable index: %u", idx);
|
||||
|
||||
vdev->vq[idx].enable = enable;
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* vu_init() - Initialize vhost-user device structure
|
||||
* @c: execution context
|
||||
* @vdev: vhost-user device
|
||||
*/
|
||||
void vu_init(struct ctx *c)
|
||||
{
|
||||
int i;
|
||||
|
||||
c->vdev = &vdev_storage;
|
||||
c->vdev->context = c;
|
||||
for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
|
||||
c->vdev->vq[i] = (struct vu_virtq){
|
||||
.call_fd = -1,
|
||||
.kick_fd = -1,
|
||||
.err_fd = -1,
|
||||
.notification = true,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* vu_cleanup() - Reset vhost-user device
|
||||
* @vdev: vhost-user device
|
||||
*/
|
||||
void vu_cleanup(struct vu_dev *vdev)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
|
||||
struct vu_virtq *vq = &vdev->vq[i];
|
||||
|
||||
vq->started = false;
|
||||
vq->notification = true;
|
||||
|
||||
if (vq->call_fd != -1) {
|
||||
close(vq->call_fd);
|
||||
vq->call_fd = -1;
|
||||
}
|
||||
if (vq->err_fd != -1) {
|
||||
close(vq->err_fd);
|
||||
vq->err_fd = -1;
|
||||
}
|
||||
if (vq->kick_fd != -1) {
|
||||
vu_remove_watch(vdev, vq->kick_fd);
|
||||
close(vq->kick_fd);
|
||||
vq->kick_fd = -1;
|
||||
}
|
||||
|
||||
vq->vring.desc = 0;
|
||||
vq->vring.used = 0;
|
||||
vq->vring.avail = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < vdev->nregions; i++) {
|
||||
const struct vu_dev_region *r = &vdev->regions[i];
|
||||
|
||||
if (r->mmap_addr) {
|
||||
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
|
||||
munmap((void *)r->mmap_addr, r->size + r->mmap_offset);
|
||||
}
|
||||
}
|
||||
vdev->nregions = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* vu_sock_reset() - Reset connection socket
|
||||
* @vdev: vhost-user device
|
||||
*/
|
||||
static void vu_sock_reset(struct vu_dev *vdev)
|
||||
{
|
||||
tap_sock_reset(vdev->context);
|
||||
}
|
||||
|
||||
static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
|
||||
struct vhost_user_msg *msg) = {
|
||||
[VHOST_USER_GET_FEATURES] = vu_get_features_exec,
|
||||
[VHOST_USER_SET_FEATURES] = vu_set_features_exec,
|
||||
[VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec,
|
||||
[VHOST_USER_SET_PROTOCOL_FEATURES] = vu_set_protocol_features_exec,
|
||||
[VHOST_USER_GET_QUEUE_NUM] = vu_get_queue_num_exec,
|
||||
[VHOST_USER_SET_OWNER] = vu_set_owner_exec,
|
||||
[VHOST_USER_SET_MEM_TABLE] = vu_set_mem_table_exec,
|
||||
[VHOST_USER_SET_VRING_NUM] = vu_set_vring_num_exec,
|
||||
[VHOST_USER_SET_VRING_ADDR] = vu_set_vring_addr_exec,
|
||||
[VHOST_USER_SET_VRING_BASE] = vu_set_vring_base_exec,
|
||||
[VHOST_USER_GET_VRING_BASE] = vu_get_vring_base_exec,
|
||||
[VHOST_USER_SET_VRING_KICK] = vu_set_vring_kick_exec,
|
||||
[VHOST_USER_SET_VRING_CALL] = vu_set_vring_call_exec,
|
||||
[VHOST_USER_SET_VRING_ERR] = vu_set_vring_err_exec,
|
||||
[VHOST_USER_SET_VRING_ENABLE] = vu_set_vring_enable_exec,
|
||||
};
|
||||
|
||||
/**
|
||||
* vu_control_handler() - Handle control commands for vhost-user
|
||||
* @vdev: vhost-user device
|
||||
* @fd: vhost-user message socket
|
||||
* @events: epoll events
|
||||
*/
|
||||
void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
|
||||
{
|
||||
struct vhost_user_msg msg = { 0 };
|
||||
bool need_reply, reply_requested;
|
||||
int ret;
|
||||
|
||||
if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
|
||||
vu_sock_reset(vdev);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = vu_message_read_default(fd, &msg);
|
||||
if (ret == 0) {
|
||||
vu_sock_reset(vdev);
|
||||
return;
|
||||
}
|
||||
debug("================ Vhost user message ================");
|
||||
debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
|
||||
msg.hdr.request);
|
||||
debug("Flags: 0x%x", msg.hdr.flags);
|
||||
debug("Size: %u", msg.hdr.size);
|
||||
|
||||
need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
|
||||
|
||||
if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX &&
|
||||
vu_handle[msg.hdr.request])
|
||||
reply_requested = vu_handle[msg.hdr.request](vdev, &msg);
|
||||
else
|
||||
die("Unhandled request: %d", msg.hdr.request);
|
||||
|
||||
/* cppcheck-suppress legacyUninitvar */
|
||||
if (!reply_requested && need_reply) {
|
||||
msg.payload.u64 = 0;
|
||||
msg.hdr.flags = 0;
|
||||
msg.hdr.size = sizeof(msg.payload.u64);
|
||||
msg.fd_num = 0;
|
||||
reply_requested = true;
|
||||
}
|
||||
|
||||
if (reply_requested)
|
||||
vu_send_reply(fd, &msg);
|
||||
}
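As an aside, the control path above is easier to see from the front-end's side. The sketch below is not part of this series: it frames a bare VHOST_USER_GET_FEATURES request against the wire format defined in vhost_user.h (next file), assuming a socket already connected to the passt back-end. Only the packed header (plus payload, when there is one) travels on the wire; file descriptors go in ancillary data.

#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>

#include "vhost_user.h"

/* Hypothetical front-end helper: ask the back-end for its feature bits */
static ssize_t send_get_features(int sock)
{
	struct vhost_user_msg msg;

	memset(&msg, 0, sizeof(msg));
	msg.hdr.request = VHOST_USER_GET_FEATURES;
	msg.hdr.flags = 0x1;	/* protocol version, see VHOST_USER_VERSION_MASK */
	msg.hdr.size = 0;	/* GET_FEATURES carries no payload */

	/* only the header goes on the wire for this request */
	return send(sock, &msg, VHOST_USER_HDR_SIZE, 0);
}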
206
vhost_user.h
Normal file

@ -0,0 +1,206 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * vhost-user API, command management and virtio interface
 *
 * Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 */

/* some parts from subprojects/libvhost-user/libvhost-user.h */

#ifndef VHOST_USER_H
#define VHOST_USER_H

#include "virtio.h"
#include "iov.h"

#define VHOST_USER_F_PROTOCOL_FEATURES 30

#define VHOST_MEMORY_BASELINE_NREGIONS 8

/**
 * enum vhost_user_protocol_feature - List of available vhost-user features
 */
enum vhost_user_protocol_feature {
	VHOST_USER_PROTOCOL_F_MQ = 0,
	VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
	VHOST_USER_PROTOCOL_F_RARP = 2,
	VHOST_USER_PROTOCOL_F_REPLY_ACK = 3,
	VHOST_USER_PROTOCOL_F_NET_MTU = 4,
	VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5,
	VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6,
	VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
	VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
	VHOST_USER_PROTOCOL_F_CONFIG = 9,
	VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
	VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
	VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
	VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
	VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,

	VHOST_USER_PROTOCOL_F_MAX
};

/**
 * enum vhost_user_request - List of available vhost-user requests
 */
enum vhost_user_request {
	VHOST_USER_NONE = 0,
	VHOST_USER_GET_FEATURES = 1,
	VHOST_USER_SET_FEATURES = 2,
	VHOST_USER_SET_OWNER = 3,
	VHOST_USER_RESET_OWNER = 4,
	VHOST_USER_SET_MEM_TABLE = 5,
	VHOST_USER_SET_LOG_BASE = 6,
	VHOST_USER_SET_LOG_FD = 7,
	VHOST_USER_SET_VRING_NUM = 8,
	VHOST_USER_SET_VRING_ADDR = 9,
	VHOST_USER_SET_VRING_BASE = 10,
	VHOST_USER_GET_VRING_BASE = 11,
	VHOST_USER_SET_VRING_KICK = 12,
	VHOST_USER_SET_VRING_CALL = 13,
	VHOST_USER_SET_VRING_ERR = 14,
	VHOST_USER_GET_PROTOCOL_FEATURES = 15,
	VHOST_USER_SET_PROTOCOL_FEATURES = 16,
	VHOST_USER_GET_QUEUE_NUM = 17,
	VHOST_USER_SET_VRING_ENABLE = 18,
	VHOST_USER_SEND_RARP = 19,
	VHOST_USER_NET_SET_MTU = 20,
	VHOST_USER_SET_BACKEND_REQ_FD = 21,
	VHOST_USER_IOTLB_MSG = 22,
	VHOST_USER_SET_VRING_ENDIAN = 23,
	VHOST_USER_GET_CONFIG = 24,
	VHOST_USER_SET_CONFIG = 25,
	VHOST_USER_CREATE_CRYPTO_SESSION = 26,
	VHOST_USER_CLOSE_CRYPTO_SESSION = 27,
	VHOST_USER_POSTCOPY_ADVISE = 28,
	VHOST_USER_POSTCOPY_LISTEN = 29,
	VHOST_USER_POSTCOPY_END = 30,
	VHOST_USER_GET_INFLIGHT_FD = 31,
	VHOST_USER_SET_INFLIGHT_FD = 32,
	VHOST_USER_GPU_SET_SOCKET = 33,
	VHOST_USER_VRING_KICK = 35,
	VHOST_USER_GET_MAX_MEM_SLOTS = 36,
	VHOST_USER_ADD_MEM_REG = 37,
	VHOST_USER_REM_MEM_REG = 38,
	VHOST_USER_MAX
};

/**
 * struct vhost_user_header - vhost-user message header
 * @request:	Request type of the message
 * @flags:	Request flags
 * @size:	The following payload size
 */
struct vhost_user_header {
	enum vhost_user_request request;

#define VHOST_USER_VERSION_MASK     0x3
#define VHOST_USER_REPLY_MASK       (0x1 << 2)
#define VHOST_USER_NEED_REPLY_MASK  (0x1 << 3)
	uint32_t flags;
	uint32_t size;
} __attribute__ ((__packed__));

/**
 * struct vhost_user_memory_region - Front-end shared memory region information
 * @guest_phys_addr:	Guest physical address of the region
 * @memory_size:	Memory size
 * @userspace_addr:	Front-end (QEMU) userspace address
 * @mmap_offset:	Region offset in the shared memory area
 */
struct vhost_user_memory_region {
	uint64_t guest_phys_addr;
	uint64_t memory_size;
	uint64_t userspace_addr;
	uint64_t mmap_offset;
};

/**
 * struct vhost_user_memory - List of all the shared memory regions
 * @nregions:	Number of memory regions
 * @padding:	Padding
 * @regions:	Memory regions list
 */
struct vhost_user_memory {
	uint32_t nregions;
	uint32_t padding;
	struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS];
};

/**
 * union vhost_user_payload - vhost-user message payload
 * @u64:	64-bit payload
 * @state:	vring state payload
 * @addr:	vring addresses payload
 * @memory:	Memory regions information payload
 */
union vhost_user_payload {
#define VHOST_USER_VRING_IDX_MASK   0xff
#define VHOST_USER_VRING_NOFD_MASK  (0x1 << 8)
	uint64_t u64;
	struct vhost_vring_state state;
	struct vhost_vring_addr addr;
	struct vhost_user_memory memory;
};

/**
 * struct vhost_user_msg - vhost-user message
 * @hdr:	Message header
 * @payload:	Message payload
 * @fds:	File descriptors associated with the message
 *		in the ancillary data
 *		(shared memory or event file descriptors)
 * @fd_num:	Number of file descriptors
 */
struct vhost_user_msg {
	struct vhost_user_header hdr;
	union vhost_user_payload payload;

	int fds[VHOST_MEMORY_BASELINE_NREGIONS];
	int fd_num;
} __attribute__ ((__packed__));
#define VHOST_USER_HDR_SIZE sizeof(struct vhost_user_header)

/* index of the RX virtqueue */
#define VHOST_USER_RX_QUEUE 0
/* index of the TX virtqueue */
#define VHOST_USER_TX_QUEUE 1

/* in case of multiqueue, the RX and TX queues are interleaved */
#define VHOST_USER_IS_QUEUE_TX(n)	(n % 2)
#define VHOST_USER_IS_QUEUE_RX(n)	(!(n % 2))

/* Default virtio-net header for passt */
#define VU_HEADER ((struct virtio_net_hdr){	\
	.flags = VIRTIO_NET_HDR_F_DATA_VALID,	\
	.gso_type = VIRTIO_NET_HDR_GSO_NONE,	\
})

/**
 * vu_queue_enabled() - Return state of a virtqueue
 * @vq:		virtqueue to check
 *
 * Return: true if the virtqueue is enabled, false otherwise
 */
static inline bool vu_queue_enabled(const struct vu_virtq *vq)
{
	return vq->enable;
}

/**
 * vu_queue_started() - Return state of a virtqueue
 * @vq:		virtqueue to check
 *
 * Return: true if the virtqueue is started, false otherwise
 */
static inline bool vu_queue_started(const struct vu_virtq *vq)
{
	return vq->started;
}

void vu_print_capabilities(void);
void vu_init(struct ctx *c);
void vu_cleanup(struct vu_dev *vdev);
void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
#endif /* VHOST_USER_H */
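The interleaving macros above are worth a second look: queue 0 is RX, queue 1 is TX, and with multiqueue every even index stays RX while every odd index is TX. A minimal standalone check, with the macro copied from the header above:

#include <stdio.h>

#define VHOST_USER_IS_QUEUE_TX(n)	(n % 2)

int main(void)
{
	int n;

	for (n = 0; n < 4; n++)	/* prints RX, TX, RX, TX */
		printf("queue %d: %s\n", n,
		       VHOST_USER_IS_QUEUE_TX(n) ? "TX" : "RX");
	return 0;
}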
660
virtio.c
Normal file

@ -0,0 +1,660 @@
// SPDX-License-Identifier: GPL-2.0-or-later AND BSD-3-Clause
/*
 * virtio API, vring and virtqueue functions definition
 *
 * Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 */

/* Some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c
 * originally licensed under the following terms:
 *
 * --
 *
 * Copyright IBM, Corp. 2007
 * Copyright (c) 2016 Red Hat, Inc.
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *  Marc-André Lureau <mlureau@redhat.com>
 *  Victor Kaplansky <victork@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 *
 * Some parts copied from QEMU hw/virtio/virtio.c
 * licensed under the following terms:
 *
 * Copyright IBM, Corp. 2007
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * --
 *
 * virtq_used_event() and virtq_avail_event() from
 * https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-712000A
 * licensed under the following terms:
 *
 * --
 *
 * This header is BSD licensed so anyone can use the definitions
 * to implement compatible drivers/servers.
 *
 * Copyright 2007, 2009, IBM Corporation
 * Copyright 2011, Red Hat, Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of IBM nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <stddef.h>
#include <endian.h>
#include <string.h>
#include <errno.h>
#include <sys/eventfd.h>
#include <sys/socket.h>

#include "util.h"
#include "virtio.h"

#define VIRTQUEUE_MAX_SIZE 1024

/**
 * vu_gpa_to_va() - Translate guest physical address to our virtual address
 * @dev:	Vhost-user device
 * @plen:	Physical length to map (input), capped to region (output)
 * @guest_addr:	Guest physical address
 *
 * Return: virtual address in our address space of the guest physical address
 */
static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen,
			  uint64_t guest_addr)
{
	unsigned int i;

	if (*plen == 0)
		return NULL;

	/* Find matching memory region. */
	for (i = 0; i < dev->nregions; i++) {
		const struct vu_dev_region *r = &dev->regions[i];

		if ((guest_addr >= r->gpa) &&
		    (guest_addr < (r->gpa + r->size))) {
			if ((guest_addr + *plen) > (r->gpa + r->size))
				*plen = r->gpa + r->size - guest_addr;
			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
			return (void *)(guest_addr - r->gpa + r->mmap_addr +
					r->mmap_offset);
		}
	}

	return NULL;
}

/**
 * vring_avail_flags() - Read the available ring flags
 * @vq:		Virtqueue
 *
 * Return: the available ring descriptor flags of the given virtqueue
 */
static inline uint16_t vring_avail_flags(const struct vu_virtq *vq)
{
	return le16toh(vq->vring.avail->flags);
}

/**
 * vring_avail_idx() - Read the available ring index
 * @vq:		Virtqueue
 *
 * Return: the available ring index of the given virtqueue
 */
static inline uint16_t vring_avail_idx(struct vu_virtq *vq)
{
	vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);

	return vq->shadow_avail_idx;
}

/**
 * vring_avail_ring() - Read an available ring entry
 * @vq:		Virtqueue
 * @i:		Index of the entry to read
 *
 * Return: the ring entry content (head of the descriptor chain)
 */
static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i)
{
	return le16toh(vq->vring.avail->ring[i]);
}

/**
 * virtq_used_event() - Get location of used event indices
 *			(only with VIRTIO_F_EVENT_IDX)
 * @vq:		Virtqueue
 *
 * Return: the location of the used event index
 */
static inline uint16_t *virtq_used_event(const struct vu_virtq *vq)
{
	/* For backwards compat, used event index is at *end* of avail ring. */
	return &vq->vring.avail->ring[vq->vring.num];
}

/**
 * vring_get_used_event() - Get the used event from the available ring
 * @vq:		Virtqueue
 *
 * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set);
 *	   used_event is a performant alternative where the driver specifies
 *	   how far the device can progress before a notification is required
 */
static inline uint16_t vring_get_used_event(const struct vu_virtq *vq)
{
	return le16toh(*virtq_used_event(vq));
}

/**
 * virtqueue_get_head() - Get the head of the descriptor chain for a given
 *			  index
 * @vq:		Virtqueue
 * @idx:	Available ring entry index
 * @head:	Head of the descriptor chain
 */
static void virtqueue_get_head(const struct vu_virtq *vq,
			       unsigned int idx, unsigned int *head)
{
	/* Grab the next descriptor number they're advertising, and increment
	 * the index we've seen.
	 */
	*head = vring_avail_ring(vq, idx % vq->vring.num);

	/* If their number is silly, that's a fatal mistake. */
	if (*head >= vq->vring.num)
		die("vhost-user: Guest says index %u is available", *head);
}

/**
 * virtqueue_read_indirect_desc() - Copy virtio ring descriptors from guest
 *				    memory
 * @dev:	Vhost-user device
 * @desc:	Destination address to copy the descriptors to
 * @addr:	Guest memory address to copy from
 * @len:	Length of memory to copy
 *
 * Return: -1 if there is an error, 0 otherwise
 */
static int virtqueue_read_indirect_desc(struct vu_dev *dev,
					struct vring_desc *desc,
					uint64_t addr, size_t len)
{
	uint64_t read_len;

	if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc)))
		return -1;

	if (len == 0)
		return -1;

	while (len) {
		const struct vring_desc *orig_desc;

		read_len = len;
		orig_desc = vu_gpa_to_va(dev, &read_len, addr);
		if (!orig_desc)
			return -1;

		memcpy(desc, orig_desc, read_len);
		len -= read_len;
		addr += read_len;
		desc += read_len / sizeof(struct vring_desc);
	}

	return 0;
}

/**
 * enum virtqueue_read_desc_state - State in the descriptor chain
 * @VIRTQUEUE_READ_DESC_ERROR:	Found an invalid descriptor
 * @VIRTQUEUE_READ_DESC_DONE:	No more descriptors in the chain
 * @VIRTQUEUE_READ_DESC_MORE:	There are more descriptors in the chain
 */
enum virtqueue_read_desc_state {
	VIRTQUEUE_READ_DESC_ERROR = -1,
	VIRTQUEUE_READ_DESC_DONE = 0,	/* end of chain */
	VIRTQUEUE_READ_DESC_MORE = 1,	/* more buffers in chain */
};

/**
 * virtqueue_read_next_desc() - Read the next descriptor in the chain
 * @desc:	Virtio ring descriptors
 * @i:		Index of the current descriptor
 * @max:	Maximum value of the descriptor index
 * @next:	Index of the next descriptor in the chain (output value)
 *
 * Return: current chain descriptor state (error, next, done)
 */
static int virtqueue_read_next_desc(const struct vring_desc *desc,
				    int i, unsigned int max,
				    unsigned int *next)
{
	/* If this descriptor says it doesn't chain, we're done. */
	if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT))
		return VIRTQUEUE_READ_DESC_DONE;

	/* Check they're not leading us off end of descriptors. */
	*next = le16toh(desc[i].next);
	/* Make sure compiler knows to grab that: we don't want it changing! */
	smp_wmb();

	if (*next >= max)
		return VIRTQUEUE_READ_DESC_ERROR;

	return VIRTQUEUE_READ_DESC_MORE;
}

/**
 * vu_queue_empty() - Check if virtqueue is empty
 * @vq:		Virtqueue
 *
 * Return: true if the virtqueue is empty, false otherwise
 */
bool vu_queue_empty(struct vu_virtq *vq)
{
	if (!vq->vring.avail)
		return true;

	if (vq->shadow_avail_idx != vq->last_avail_idx)
		return false;

	return vring_avail_idx(vq) == vq->last_avail_idx;
}

/**
 * vring_can_notify() - Check if a notification can be sent
 * @dev:	Vhost-user device
 * @vq:		Virtqueue
 *
 * Return: true if notification can be sent
 */
static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq)
{
	uint16_t old, new;
	bool v;

	/* We need to expose used array entries before checking used event. */
	smp_mb();

	/* Always notify when queue is empty (when feature acknowledge) */
	if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
	    !vq->inuse && vu_queue_empty(vq))
		return true;

	if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX))
		return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);

	v = vq->signalled_used_valid;
	vq->signalled_used_valid = true;
	old = vq->signalled_used;
	new = vq->signalled_used = vq->used_idx;
	return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

/**
 * vu_queue_notify() - Send a notification to the given virtqueue
 * @dev:	Vhost-user device
 * @vq:		Virtqueue
 */
void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
{
	if (!vq->vring.avail)
		return;

	if (!vring_can_notify(dev, vq)) {
		debug("vhost-user: virtqueue can skip notify...");
		return;
	}

	if (eventfd_write(vq->call_fd, 1) < 0)
		die_perror("Error writing vhost-user queue eventfd");
}

/* virtq_avail_event() - Get location of available event indices
 *			 (only with VIRTIO_F_EVENT_IDX)
 * @vq:		Virtqueue
 *
 * Return: the location of the available event index
 */
static inline uint16_t *virtq_avail_event(const struct vu_virtq *vq)
{
	/* For backwards compat, avail event index is at *end* of used ring. */
	return (uint16_t *)&vq->vring.used->ring[vq->vring.num];
}

/**
 * vring_set_avail_event() - Set avail_event
 * @vq:		Virtqueue
 * @val:	Value to set to avail_event;
 *		avail_event is used in the same way the used_event is in the
 *		avail_ring: it advises the driver that notifications are
 *		unnecessary until the driver writes an entry with an index
 *		specified by avail_event into the available ring
 */
static inline void vring_set_avail_event(const struct vu_virtq *vq,
					 uint16_t val)
{
	uint16_t val_le = htole16(val);

	if (!vq->notification)
		return;

	memcpy(virtq_avail_event(vq), &val_le, sizeof(val_le));
}

/**
 * virtqueue_map_desc() - Translate descriptor ring physical address into our
 *			  virtual address space
 * @dev:	Vhost-user device
 * @p_num_sg:	First iov entry to use (input),
 *		first iov entry not used (output)
 * @iov:	Iov array to use to store buffer virtual addresses
 * @max_num_sg:	Maximum number of iov entries
 * @pa:		Guest physical address of the buffer to map into our virtual
 *		address
 * @sz:		Size of the buffer
 *
 * Return: false on error, true otherwise
 */
static bool virtqueue_map_desc(struct vu_dev *dev,
			       unsigned int *p_num_sg, struct iovec *iov,
			       unsigned int max_num_sg,
			       uint64_t pa, size_t sz)
{
	unsigned int num_sg = *p_num_sg;

	ASSERT(num_sg < max_num_sg);
	ASSERT(sz);

	while (sz) {
		uint64_t len = sz;

		iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
		if (iov[num_sg].iov_base == NULL)
			die("vhost-user: invalid address for buffers");
		iov[num_sg].iov_len = len;
		num_sg++;
		sz -= len;
		pa += len;
	}

	*p_num_sg = num_sg;
	return true;
}

/**
 * vu_queue_map_desc() - Map the virtqueue descriptor ring into our virtual
 *			 address space
 * @dev:	Vhost-user device
 * @vq:		Virtqueue
 * @idx:	First descriptor ring entry to map
 * @elem:	Virtqueue element to store descriptor ring iov
 *
 * Return: -1 if there is an error, 0 otherwise
 */
static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq,
			     unsigned int idx, struct vu_virtq_element *elem)
{
	const struct vring_desc *desc = vq->vring.desc;
	struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
	unsigned int out_num = 0, in_num = 0;
	unsigned int max = vq->vring.num;
	unsigned int i = idx;
	uint64_t read_len;
	int rc;

	if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
		unsigned int desc_len;
		uint64_t desc_addr;

		if (le32toh(desc[i].len) % sizeof(struct vring_desc))
			die("vhost-user: Invalid size for indirect buffer table");

		/* loop over the indirect descriptor table */
		desc_addr = le64toh(desc[i].addr);
		desc_len = le32toh(desc[i].len);
		max = desc_len / sizeof(struct vring_desc);
		read_len = desc_len;
		desc = vu_gpa_to_va(dev, &read_len, desc_addr);
		if (desc && read_len != desc_len) {
			/* Failed to use zero copy */
			desc = NULL;
			if (!virtqueue_read_indirect_desc(dev, desc_buf,
							  desc_addr, desc_len))
				desc = desc_buf;
		}
		if (!desc)
			die("vhost-user: Invalid indirect buffer table");
		i = 0;
	}

	/* Collect all the descriptors */
	do {
		if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
			if (!virtqueue_map_desc(dev, &in_num, elem->in_sg,
						elem->in_num,
						le64toh(desc[i].addr),
						le32toh(desc[i].len)))
				return -1;
		} else {
			if (in_num)
				die("Incorrect order for descriptors");
			if (!virtqueue_map_desc(dev, &out_num, elem->out_sg,
						elem->out_num,
						le64toh(desc[i].addr),
						le32toh(desc[i].len))) {
				return -1;
			}
		}

		/* If we've got too many, that implies a descriptor loop. */
		if ((in_num + out_num) > max)
			die("vhost-user: Loop in queue descriptor list");
		rc = virtqueue_read_next_desc(desc, i, max, &i);
	} while (rc == VIRTQUEUE_READ_DESC_MORE);

	if (rc == VIRTQUEUE_READ_DESC_ERROR)
		die("vhost-user: Failed to read descriptor list");

	elem->index = idx;
	elem->in_num = in_num;
	elem->out_num = out_num;

	return 0;
}

/**
 * vu_queue_pop() - Pop an entry from the virtqueue
 * @dev:	Vhost-user device
 * @vq:		Virtqueue
 * @elem:	Virtqueue element to fill with the entry information
 *
 * Return: -1 if there is an error, 0 otherwise
 */
int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq,
		 struct vu_virtq_element *elem)
{
	unsigned int head;
	int ret;

	if (!vq->vring.avail)
		return -1;

	if (vu_queue_empty(vq))
		return -1;

	/* Needed after vu_queue_empty(), see comment in
	 * virtqueue_num_heads().
	 */
	smp_rmb();

	if (vq->inuse >= vq->vring.num)
		die("vhost-user queue size exceeded");

	virtqueue_get_head(vq, vq->last_avail_idx++, &head);

	if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX))
		vring_set_avail_event(vq, vq->last_avail_idx);

	ret = vu_queue_map_desc(dev, vq, head, elem);

	if (ret < 0)
		return ret;

	vq->inuse++;

	return 0;
}

/**
 * vu_queue_detach_element() - Detach an element from the virtqueue
 * @vq:		Virtqueue
 */
void vu_queue_detach_element(struct vu_virtq *vq)
{
	vq->inuse--;
	/* unmap, when DMA support is added */
}

/**
 * vu_queue_unpop() - Push back the previously popped element from the
 *		      virtqueue
 * @vq:		Virtqueue
 */
/* cppcheck-suppress unusedFunction */
void vu_queue_unpop(struct vu_virtq *vq)
{
	vq->last_avail_idx--;
	vu_queue_detach_element(vq);
}

/**
 * vu_queue_rewind() - Push back a given number of popped elements
 * @vq:		Virtqueue
 * @num:	Number of elements to unpop
 *
 * Return: true on success, false if num is greater than the number of
 *	   elements in use
 */
bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
{
	if (num > vq->inuse)
		return false;

	vq->last_avail_idx -= num;
	vq->inuse -= num;
	return true;
}

/**
 * vring_used_write() - Write an entry in the used ring
 * @vq:		Virtqueue
 * @uelem:	Entry to write
 * @i:		Index of the entry in the used ring
 */
static inline void vring_used_write(struct vu_virtq *vq,
				    const struct vring_used_elem *uelem, int i)
{
	struct vring_used *used = vq->vring.used;

	used->ring[i] = *uelem;
}

/**
 * vu_queue_fill_by_index() - Update information of a descriptor ring entry
 *			      in the used ring
 * @vq:		Virtqueue
 * @index:	Descriptor ring index
 * @len:	Size of the element
 * @idx:	Used ring entry index
 */
void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
			    unsigned int len, unsigned int idx)
{
	struct vring_used_elem uelem;

	if (!vq->vring.avail)
		return;

	idx = (idx + vq->used_idx) % vq->vring.num;

	uelem.id = htole32(index);
	uelem.len = htole32(len);
	vring_used_write(vq, &uelem, idx);
}

/**
 * vu_queue_fill() - Update information of a given element in the used ring
 * @vq:		Virtqueue
 * @elem:	Element information to fill
 * @len:	Size of the element
 * @idx:	Used ring entry index
 */
void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem,
		   unsigned int len, unsigned int idx)
{
	vu_queue_fill_by_index(vq, elem->index, len, idx);
}

/**
 * vring_used_idx_set() - Set the descriptor ring current index
 * @vq:		Virtqueue
 * @val:	Value to set in the index
 */
static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val)
{
	vq->vring.used->idx = htole16(val);

	vq->used_idx = val;
}

/**
 * vu_queue_flush() - Flush the virtqueue
 * @vq:		Virtqueue
 * @count:	Number of entries to flush
 */
void vu_queue_flush(struct vu_virtq *vq, unsigned int count)
{
	uint16_t old, new;

	if (!vq->vring.avail)
		return;

	/* Make sure buffer is written before we update index. */
	smp_wmb();

	old = vq->used_idx;
	new = old + count;
	vring_used_idx_set(vq, new);
	vq->inuse -= count;
	if ((uint16_t)(new - vq->signalled_used) < (uint16_t)(new - old))
		vq->signalled_used_valid = false;
}
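vring_can_notify() above leans on vring_need_event() from <linux/virtio_ring.h>. Its wrap-safe 16-bit arithmetic is compact enough to miss; the standalone sketch below reproduces it (assuming the kernel definition stays as published in the virtio spec) and shows when a notification fires.

#include <stdint.h>
#include <stdio.h>

/* true if event_idx falls in the window (old, new_idx], modulo 2^16 */
static int need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
}

int main(void)
{
	/* used index moved from 10 to 15; driver asked for an event at 12 */
	printf("%d\n", need_event(12, 15, 10));	/* 1: notify */
	/* driver asked for an event at 20; not reached yet */
	printf("%d\n", need_event(20, 15, 10));	/* 0: suppress */
	return 0;
}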
184
virtio.h
Normal file

@ -0,0 +1,184 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * virtio API, vring and virtqueue functions definition
 *
 * Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 */

#ifndef VIRTIO_H
#define VIRTIO_H

#include <stdbool.h>
#include <linux/vhost_types.h>

/* Maximum size of a virtqueue */
#define VIRTQUEUE_MAX_SIZE 1024

/**
 * struct vu_ring - Virtqueue rings
 * @num:		Size of the queue
 * @desc:		Descriptor ring
 * @avail:		Available ring
 * @used:		Used ring
 * @log_guest_addr:	Guest address for logging
 * @flags:		Vring flags
 *			VHOST_VRING_F_LOG is set if log address is valid
 */
struct vu_ring {
	unsigned int num;
	struct vring_desc *desc;
	struct vring_avail *avail;
	struct vring_used *used;
	uint64_t log_guest_addr;
	uint32_t flags;
};

/**
 * struct vu_virtq - Virtqueue definition
 * @vring:		Virtqueue rings
 * @last_avail_idx:	Next head to pop
 * @shadow_avail_idx:	Last avail_idx read from the virtqueue
 * @used_idx:		Descriptor ring current index
 * @signalled_used:	Last used index value we have signalled on
 * @signalled_used_valid:	True if signalled_used is valid
 * @notification:	True if the queues notify (via event
 *			index or interrupt)
 * @inuse:		Number of entries in use
 * @call_fd:		The event file descriptor to signal when
 *			buffers are used
 * @kick_fd:		The event file descriptor for adding
 *			buffers to the vring
 * @err_fd:		The event file descriptor to signal when
 *			an error occurs
 * @enable:		True if the virtqueue is enabled
 * @started:		True if the virtqueue is started
 * @vra:		QEMU address of our rings
 */
struct vu_virtq {
	struct vu_ring vring;
	uint16_t last_avail_idx;
	uint16_t shadow_avail_idx;
	uint16_t used_idx;
	uint16_t signalled_used;
	bool signalled_used_valid;
	bool notification;
	unsigned int inuse;
	int call_fd;
	int kick_fd;
	int err_fd;
	unsigned int enable;
	bool started;
	struct vhost_vring_addr vra;
};

/**
 * struct vu_dev_region - Guest shared memory region
 * @gpa:		Guest physical address of the region
 * @size:		Memory size in bytes
 * @qva:		QEMU virtual address
 * @mmap_offset:	Offset where the region starts in the mapped memory
 * @mmap_addr:		Address of the mapped memory
 */
struct vu_dev_region {
	uint64_t gpa;
	uint64_t size;
	uint64_t qva;
	uint64_t mmap_offset;
	uint64_t mmap_addr;
};

#define VHOST_USER_MAX_QUEUES		2

/*
 * Set a reasonable maximum number of ram slots, which will be supported by
 * any architecture.
 */
#define VHOST_USER_MAX_RAM_SLOTS	32

/**
 * struct vu_dev - vhost-user device information
 * @context:		Execution context
 * @nregions:		Number of shared memory regions
 * @regions:		Guest shared memory regions
 * @vq:			Virtqueues
 * @features:		Vhost-user features
 * @protocol_features:	Vhost-user protocol features
 */
struct vu_dev {
	struct ctx *context;
	uint32_t nregions;
	struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS];
	struct vu_virtq vq[VHOST_USER_MAX_QUEUES];
	uint64_t features;
	uint64_t protocol_features;
};

/**
 * struct vu_virtq_element - Virtqueue element
 * @index:	Descriptor ring index
 * @out_num:	Number of outgoing iovec buffers
 * @in_num:	Number of incoming iovec buffers
 * @in_sg:	Incoming iovec buffers
 * @out_sg:	Outgoing iovec buffers
 */
struct vu_virtq_element {
	unsigned int index;
	unsigned int out_num;
	unsigned int in_num;
	struct iovec *in_sg;
	struct iovec *out_sg;
};

/**
 * has_feature() - Check a feature bit in a features set
 * @features:	Features set
 * @fbit:	Feature bit to check
 *
 * Return: True if the feature bit is set
 */
static inline bool has_feature(uint64_t features, unsigned int fbit)
{
	return !!(features & (1ULL << fbit));
}

/**
 * vu_has_feature() - Check if a virtio-net feature is available
 * @vdev:	Vhost-user device
 * @fbit:	Feature to check
 *
 * Return: True if the feature is available
 */
static inline bool vu_has_feature(const struct vu_dev *vdev,
				  unsigned int fbit)
{
	return has_feature(vdev->features, fbit);
}

/**
 * vu_has_protocol_feature() - Check if a vhost-user feature is available
 * @vdev:	Vhost-user device
 * @fbit:	Feature to check
 *
 * Return: True if the feature is available
 */
/* cppcheck-suppress unusedFunction */
static inline bool vu_has_protocol_feature(const struct vu_dev *vdev,
					   unsigned int fbit)
{
	return has_feature(vdev->protocol_features, fbit);
}

bool vu_queue_empty(struct vu_virtq *vq);
void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq);
int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq,
		 struct vu_virtq_element *elem);
void vu_queue_detach_element(struct vu_virtq *vq);
void vu_queue_unpop(struct vu_virtq *vq);
bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num);
void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
			    unsigned int len, unsigned int idx);
void vu_queue_fill(struct vu_virtq *vq,
		   const struct vu_virtq_element *elem, unsigned int len,
		   unsigned int idx);
void vu_queue_flush(struct vu_virtq *vq, unsigned int count);
#endif /* VIRTIO_H */
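One way to read struct vu_ring is against the split-ring layout of the virtio 1.x spec: for a queue of num entries, the descriptor table takes 16 * num bytes, and the available and used rings each end with the extra event-index slot that virtq_used_event()/virtq_avail_event() in virtio.c reach for. A hedged sketch of the sizes (alignment between the rings omitted):

#include <stdint.h>
#include <stdio.h>
#include <linux/virtio_ring.h>

int main(void)
{
	unsigned int num = 256;	/* value set by VHOST_USER_SET_VRING_NUM */

	printf("desc:  %zu bytes\n", num * sizeof(struct vring_desc));
	printf("avail: %zu bytes\n", sizeof(struct vring_avail) +
	       num * sizeof(uint16_t) + sizeof(uint16_t) /* used_event */);
	printf("used:  %zu bytes\n", sizeof(struct vring_used) +
	       num * sizeof(struct vring_used_elem) +
	       sizeof(uint16_t) /* avail_event */);
	return 0;
}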
285
vu_common.c
Normal file

@ -0,0 +1,285 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 *
 * vu_common.c - vhost-user common UDP and TCP functions
 */

#include <unistd.h>
#include <sys/uio.h>
#include <sys/eventfd.h>
#include <linux/virtio_net.h>

#include "util.h"
#include "passt.h"
#include "tap.h"
#include "vhost_user.h"
#include "pcap.h"
#include "vu_common.h"

/**
 * vu_packet_check_range() - Check if a given memory zone is contained in
 *			     a mapped guest memory region
 * @buf:	Array of the available memory regions
 * @offset:	Offset of data range in packet descriptor
 * @len:	Length of desired data range
 * @start:	Start of the packet descriptor
 *
 * Return: 0 if the zone is in a mapped memory region, -1 otherwise
 */
int vu_packet_check_range(void *buf, size_t offset, size_t len,
			  const char *start)
{
	struct vu_dev_region *dev_region;

	for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
		char *m = (char *)dev_region->mmap_addr;

		if (m <= start &&
		    start + offset + len <= m + dev_region->mmap_offset +
					    dev_region->size)
			return 0;
	}

	return -1;
}

/**
 * vu_init_elem() - Initialize an array of virtqueue elements with 1 iov each
 * @elem:	Array of virtqueue elements to initialize
 * @iov:	Array of iovec to assign to the virtqueue elements
 * @elem_cnt:	Number of virtqueue elements
 */
void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt)
{
	int i;

	for (i = 0; i < elem_cnt; i++)
		vu_set_element(&elem[i], NULL, &iov[i]);
}

/**
 * vu_collect() - Collect virtio buffers from a given virtqueue
 * @vdev:	vhost-user device
 * @vq:		virtqueue to collect from
 * @elem:	Array of virtqueue elements;
 *		each element must be initialized with one iovec entry
 *		in the in_sg array
 * @max_elem:	Number of virtqueue elements in the array
 * @size:	Maximum size of the data in the frame
 * @frame_size:	The total size of the buffers (output)
 *
 * Return: number of elements used to contain the frame
 */
int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
	       struct vu_virtq_element *elem, int max_elem,
	       size_t size, size_t *frame_size)
{
	size_t current_size = 0;
	int elem_cnt = 0;

	while (current_size < size && elem_cnt < max_elem) {
		struct iovec *iov;
		int ret;

		ret = vu_queue_pop(vdev, vq, &elem[elem_cnt]);
		if (ret < 0)
			break;

		if (elem[elem_cnt].in_num < 1) {
			warn("virtio-net receive queue contains no in buffers");
			vu_queue_detach_element(vq);
			break;
		}

		iov = &elem[elem_cnt].in_sg[0];

		if (iov->iov_len > size - current_size)
			iov->iov_len = size - current_size;

		current_size += iov->iov_len;
		elem_cnt++;

		if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
			break;
	}

	if (frame_size)
		*frame_size = current_size;

	return elem_cnt;
}

/**
 * vu_set_vnethdr() - Set virtio-net headers
 * @vdev:	vhost-user device
 * @vnethdr:	Address of the header to set
 * @num_buffers: Number of guest buffers of the frame
 */
void vu_set_vnethdr(const struct vu_dev *vdev,
		    struct virtio_net_hdr_mrg_rxbuf *vnethdr,
		    int num_buffers)
{
	vnethdr->hdr = VU_HEADER;
	if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vnethdr->num_buffers = htole16(num_buffers);
}

/**
 * vu_flush() - Flush all the collected buffers to the vhost-user interface
 * @vdev:	vhost-user device
 * @vq:		vhost-user virtqueue
 * @elem:	Virtqueue element array to send back to the virtqueue
 * @elem_cnt:	Length of the array
 */
void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
	      struct vu_virtq_element *elem, int elem_cnt)
{
	int i;

	for (i = 0; i < elem_cnt; i++)
		vu_queue_fill(vq, &elem[i], elem[i].in_sg[0].iov_len, i);

	vu_queue_flush(vq, elem_cnt);
	vu_queue_notify(vdev, vq);
}

/**
 * vu_handle_tx() - Receive data from the TX virtqueue
 * @vdev:	vhost-user device
 * @index:	index of the virtqueue
 * @now:	Current timestamp
 */
static void vu_handle_tx(struct vu_dev *vdev, int index,
			 const struct timespec *now)
{
	struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
	struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
	struct vu_virtq *vq = &vdev->vq[index];
	int hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	int out_sg_count;
	int count;

	if (!VHOST_USER_IS_QUEUE_TX(index)) {
		debug("vhost-user: index %d is not a TX queue", index);
		return;
	}

	tap_flush_pools();

	count = 0;
	out_sg_count = 0;
	while (count < VIRTQUEUE_MAX_SIZE) {
		int ret;

		vu_set_element(&elem[count], &out_sg[out_sg_count], NULL);
		ret = vu_queue_pop(vdev, vq, &elem[count]);
		if (ret < 0)
			break;
		out_sg_count += elem[count].out_num;

		if (elem[count].out_num < 1) {
			warn("virtio-net transmit queue contains no out buffers");
			break;
		}
		ASSERT(elem[count].out_num == 1);

		tap_add_packet(vdev->context,
			       elem[count].out_sg[0].iov_len - hdrlen,
			       (char *)elem[count].out_sg[0].iov_base + hdrlen);
		count++;
	}
	tap_handler(vdev->context, now);

	if (count) {
		int i;

		for (i = 0; i < count; i++)
			vu_queue_fill(vq, &elem[i], 0, i);
		vu_queue_flush(vq, count);
		vu_queue_notify(vdev, vq);
	}
}

/**
 * vu_kick_cb() - Called on a kick event to start to receive data
 * @vdev:	vhost-user device
 * @ref:	epoll reference information
 * @now:	Current timestamp
 */
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
		const struct timespec *now)
{
	eventfd_t kick_data;
	ssize_t rc;

	rc = eventfd_read(ref.fd, &kick_data);
	if (rc == -1)
		die_perror("vhost-user kick eventfd_read()");

	debug("vhost-user: got kick_data: %016"PRIx64" idx: %d",
	      kick_data, ref.queue);
	if (VHOST_USER_IS_QUEUE_TX(ref.queue))
		vu_handle_tx(vdev, ref.queue, now);
}

/**
 * vu_send_single() - Send a buffer to the front-end using the RX virtqueue
 * @c:		execution context
 * @buf:	address of the buffer
 * @size:	size of the buffer
 *
 * Return: number of bytes sent, -1 if there is an error
 */
int vu_send_single(const struct ctx *c, const void *buf, size_t size)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
	struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
	size_t total;
	int elem_cnt;
	int i;

	debug("vu_send_single size %zu", size);

	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
		err("Got packet, but RX virtqueue not usable yet");
		return 0;
	}

	vu_init_elem(elem, in_sg, VIRTQUEUE_MAX_SIZE);

	size += sizeof(struct virtio_net_hdr_mrg_rxbuf);
	elem_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, size, &total);
	if (total < size) {
		debug("vu_send_single: no space to send the data "
		      "elem_cnt %d size %zd", elem_cnt, total);
		goto err;
	}

	vu_set_vnethdr(vdev, in_sg[0].iov_base, elem_cnt);

	total -= sizeof(struct virtio_net_hdr_mrg_rxbuf);

	/* copy data from the buffer to the iovec */
	iov_from_buf(in_sg, elem_cnt, sizeof(struct virtio_net_hdr_mrg_rxbuf),
		     buf, total);

	if (*c->pcap) {
		pcap_iov(in_sg, elem_cnt,
			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
	}

	vu_flush(vdev, vq, elem, elem_cnt);

	debug("vhost-user sent %zu", total);

	return total;
err:
	for (i = 0; i < elem_cnt; i++)
		vu_queue_detach_element(vq);

	return -1;
}
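The kick path in vu_kick_cb() is plain eventfd plumbing: the front-end bumps a counter with eventfd_write(), epoll wakes the back-end, and eventfd_read() drains the counter. A self-contained sketch of that pairing, outside any vhost-user context:

#include <err.h>
#include <stdio.h>
#include <sys/eventfd.h>

int main(void)
{
	eventfd_t kick_data;
	int fd = eventfd(0, 0);

	if (fd == -1)
		err(1, "eventfd");

	eventfd_write(fd, 1);		/* "kick": the producer signals */
	eventfd_read(fd, &kick_data);	/* consumer drains the counter */
	printf("drained %llu kick(s)\n", (unsigned long long)kick_data);
	return 0;
}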
60
vu_common.h
Normal file

@ -0,0 +1,60 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
 * Copyright Red Hat
 * Author: Laurent Vivier <lvivier@redhat.com>
 *
 * vhost-user common UDP and TCP functions
 */

#ifndef VU_COMMON_H
#define VU_COMMON_H
#include <linux/virtio_net.h>

static inline void *vu_eth(void *base)
{
	return ((char *)base + sizeof(struct virtio_net_hdr_mrg_rxbuf));
}

static inline void *vu_ip(void *base)
{
	return (struct ethhdr *)vu_eth(base) + 1;
}

static inline void *vu_payloadv4(void *base)
{
	return (struct iphdr *)vu_ip(base) + 1;
}

static inline void *vu_payloadv6(void *base)
{
	return (struct ipv6hdr *)vu_ip(base) + 1;
}

/**
 * vu_set_element() - Initialize a vu_virtq_element
 * @elem:	Element to initialize
 * @out_sg:	One out iovec entry to set in elem
 * @in_sg:	One in iovec entry to set in elem
 */
static inline void vu_set_element(struct vu_virtq_element *elem,
				  struct iovec *out_sg, struct iovec *in_sg)
{
	elem->out_num = !!out_sg;
	elem->out_sg = out_sg;
	elem->in_num = !!in_sg;
	elem->in_sg = in_sg;
}

void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov,
		  int elem_cnt);
int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
	       struct vu_virtq_element *elem, int max_elem, size_t size,
	       size_t *frame_size);
void vu_set_vnethdr(const struct vu_dev *vdev,
		    struct virtio_net_hdr_mrg_rxbuf *vnethdr,
		    int num_buffers);
void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
	      struct vu_virtq_element *elem, int elem_cnt);
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
		const struct timespec *now);
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
#endif /* VU_COMMON_H */
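The vu_eth()/vu_ip()/vu_payloadv4() helpers above are cumulative pointer bumps over the frame layout used throughout the series: virtio-net header, then Ethernet, then IP, then payload. A standalone illustration of the same arithmetic, with the offsets printed rather than assumed:

#include <stdio.h>
#include <linux/if_ether.h>
#include <linux/virtio_net.h>
#include <netinet/ip.h>

int main(void)
{
	char frame[2048];
	char *eth = frame + sizeof(struct virtio_net_hdr_mrg_rxbuf);
	char *ip = eth + sizeof(struct ethhdr);
	char *payload = ip + sizeof(struct iphdr);

	/* typically 12, 26 and 46 with these kernel headers */
	printf("eth at %td, ip at %td, payload at %td\n",
	       eth - frame, ip - frame, payload - frame);
	return 0;
}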