From b2229bd24ffc75b2538073958221e1ae82ac0767 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Tue, 8 Nov 2022 21:16:46 +0100 Subject: [PATCH] vhost-user: introduce vhost-user API Add vhost_user.c and vhost_user.h that define the functions needed to implement vhost-user backend. Signed-off-by: Laurent Vivier --- Makefile | 5 +- passt.c | 2 + passt.h | 8 + tap.c | 2 +- tap.h | 3 + vhost_user.c | 1050 ++++++++++++++++++++++++++++++++++++++++++++++++++ vhost_user.h | 139 +++++++ 7 files changed, 1206 insertions(+), 3 deletions(-) create mode 100644 vhost_user.c create mode 100644 vhost_user.h diff --git a/Makefile b/Makefile index 4157329..d853264 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c virtio.c + tcp_buf.c tcp_splice.c udp.c util.c vhost_user.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h inany.h iov.h ip.h isolation.h lineread.h log.h \ ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h siphash.h tap.h \ - tcp.h tcp_buf.h tcp_conn.h tcp_splice.h udp.h util.h virtio.h + tcp.h tcp_buf.h tcp_conn.h tcp_splice.h udp.h util.h vhost_user.h \ + virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/passt.c b/passt.c index f3e9dae..7da05c7 100644 --- a/passt.c +++ b/passt.c @@ -73,6 +73,8 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device", [EPOLL_TYPE_TAP_PASST] = "connected qemu socket", [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", + [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", + 
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); diff --git a/passt.h b/passt.h index e6d4358..521fb1b 100644 --- a/passt.h +++ b/passt.h @@ -42,6 +42,7 @@ union epoll_ref; #include "fwd.h" #include "tcp.h" #include "udp.h" +#include "vhost_user.h" /** * enum epoll_type - Different types of fds we poll over @@ -73,6 +74,10 @@ enum epoll_type { EPOLL_TYPE_TAP_PASST, /* socket listening for qemu socket connections */ EPOLL_TYPE_TAP_LISTEN, + /* vhost-user command socket */ + EPOLL_TYPE_VHOST_CMD, + /* vhost-user kick event socket */ + EPOLL_TYPE_VHOST_KICK, EPOLL_NUM_TYPES, }; @@ -307,6 +312,9 @@ struct ctx { int low_wmem; int low_rmem; + + /* vhost-user */ + struct VuDev vdev; }; void proto_update_l2_buf(const unsigned char *eth_d, diff --git a/tap.c b/tap.c index 65445cb..d7d1c3e 100644 --- a/tap.c +++ b/tap.c @@ -1054,7 +1054,7 @@ void packet_add_all_do(struct ctx *c, ssize_t len, char *p, * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket * @c: Execution context */ -static void tap_sock_reset(struct ctx *c) +void tap_sock_reset(struct ctx *c) { if (c->one_off) { info("Client closed connection, exiting"); diff --git a/tap.h b/tap.h index c5606a2..6b2b180 100644 --- a/tap.h +++ b/tap.h @@ -97,12 +97,15 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, const struct timespec *now); void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now); +void tap_sock_reset(struct ctx *c); void tap_sock_init(struct ctx *c); void pool_flush_all(void); void tap_handler_all(struct ctx *c, const struct timespec *now); void packet_add_do(struct pool *p, size_t len, const char *start, const char *func, int line); +void packet_add_all_do(struct ctx *c, ssize_t len, char *p, + const char *func, int line); #define packet_add_all(p, len, start) \ packet_add_all_do(p, len, start, __func__, __LINE__) diff --git 
a/vhost_user.c b/vhost_user.c new file mode 100644 index 0000000..2acd723 --- /dev/null +++ b/vhost_user.c @@ -0,0 +1,1050 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* some parts from QEMU subprojects/libvhost-user/libvhost-user.c */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "passt.h" +#include "tap.h" +#include "vhost_user.h" + +#define VHOST_USER_VERSION 1 + +/* + * Scratch space passed to vu_queue_pop(), one 64 KiB area per virtqueue: + * the queue index has to be the first dimension, so that buffer[queue] is a + * whole area and not a VHOST_USER_MAX_QUEUES-byte slice of a shared one. + */ +static unsigned char buffer[VHOST_USER_MAX_QUEUES][65536]; + +void vu_print_capabilities(void) +{ + printf("{\n"); + printf(" \"type\": \"net\"\n"); + printf("}\n"); + exit(EXIT_SUCCESS); +} + +static const char * +vu_request_to_string(unsigned int req) +{ +#define REQ(req) [req] = #req + static const char *vu_request_str[] = { + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_SET_OWNER), + REQ(VHOST_USER_RESET_OWNER), + REQ(VHOST_USER_SET_MEM_TABLE), + REQ(VHOST_USER_SET_LOG_BASE), + REQ(VHOST_USER_SET_LOG_FD), + REQ(VHOST_USER_SET_VRING_NUM), + REQ(VHOST_USER_SET_VRING_ADDR), + REQ(VHOST_USER_SET_VRING_BASE), + REQ(VHOST_USER_GET_VRING_BASE), + REQ(VHOST_USER_SET_VRING_KICK), + REQ(VHOST_USER_SET_VRING_CALL), + REQ(VHOST_USER_SET_VRING_ERR), + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), + REQ(VHOST_USER_GET_QUEUE_NUM), + REQ(VHOST_USER_SET_VRING_ENABLE), + REQ(VHOST_USER_SEND_RARP), + REQ(VHOST_USER_NET_SET_MTU), + REQ(VHOST_USER_SET_BACKEND_REQ_FD), + REQ(VHOST_USER_IOTLB_MSG), + REQ(VHOST_USER_SET_VRING_ENDIAN), + REQ(VHOST_USER_GET_CONFIG), + REQ(VHOST_USER_SET_CONFIG), + REQ(VHOST_USER_POSTCOPY_ADVISE), + REQ(VHOST_USER_POSTCOPY_LISTEN), + REQ(VHOST_USER_POSTCOPY_END), + REQ(VHOST_USER_GET_INFLIGHT_FD), + REQ(VHOST_USER_SET_INFLIGHT_FD), + REQ(VHOST_USER_GPU_SET_SOCKET), + REQ(VHOST_USER_VRING_KICK), +
REQ(VHOST_USER_GET_MAX_MEM_SLOTS), + REQ(VHOST_USER_ADD_MEM_REG), + REQ(VHOST_USER_REM_MEM_REG), + REQ(VHOST_USER_MAX), + }; +#undef REQ + + /* + * The designated initialisers above leave NULL entries for request + * numbers that are not listed (e.g. the crypto session requests): + * report those as unknown too, instead of returning NULL. + */ + if (req < VHOST_USER_MAX && vu_request_str[req]) { + return vu_request_str[req]; + } else { + return "unknown"; + } +} + +/* Translate qemu virtual address to our virtual address. */ +static void *qva_to_va(VuDev *dev, uint64_t qemu_addr) +{ + unsigned int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + return (void *)(uintptr_t) + (qemu_addr - r->qva + r->mmap_addr + r->mmap_offset); + } + } + + return NULL; +} + +static void +vmsg_close_fds(VhostUserMsg *vmsg) +{ + int i; + + for (i = 0; i < vmsg->fd_num; i++) + close(vmsg->fds[i]); +} + +static void vu_remove_watch(VuDev *vdev, int fd) +{ + struct ctx *c = (struct ctx *) ((char *)vdev - offsetof(struct ctx, vdev)); + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); +} + +/* Set reply payload.u64 and clear request flags and fd_num */ +static void vmsg_set_reply_u64(struct VhostUserMsg *vmsg, uint64_t val) +{ + vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */ + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->payload.u64 = val; + vmsg->fd_num = 0; +} + +static ssize_t vu_message_read_default(VuDev *dev, int conn_fd, struct VhostUserMsg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * + sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + size_t fd_size; + struct cmsghdr *cmsg; + ssize_t ret, sz_payload; + + ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + vu_panic(dev, "Error while recvmsg: %s", strerror(errno)); + goto out; + } +
vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + sz_payload = vmsg->hdr.size; + if ((size_t)sz_payload > sizeof(vmsg->payload)) { + vu_panic(dev, + "Error: too big message request: %d, size: vmsg->size: %zd, " + "while sizeof(vmsg->payload) = %zu", + vmsg->hdr.request, sz_payload, sizeof(vmsg->payload)); + goto out; + } + + if (sz_payload) { + do { + ret = recv(conn_fd, &vmsg->payload, sz_payload, 0); + } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); + + if (ret < sz_payload) { + vu_panic(dev, "Error while reading: %s", strerror(errno)); + goto out; + } + } + + return 1; +out: + vmsg_close_fds(vmsg); + + return -ECONNRESET; +} + +/** + * vu_message_write() - Send a vhost-user message: header and fds, then payload + * @dev: vhost-user device, passed to vu_panic() on failure + * @conn_fd: Socket to send the message on + * @vmsg: Message to send; the payload is taken from @vmsg->data if set, + * from the message itself otherwise + * + * Return: true on success, false on failure + */ +static int vu_message_write(VuDev *dev, int conn_fd, struct VhostUserMsg *vmsg) +{ + int rc; + uint8_t *p = (uint8_t *)vmsg; + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + }; + struct cmsghdr *cmsg; + + memset(control, 0, sizeof(control)); + assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); + if (vmsg->fd_num > 0) { + size_t fdsize = vmsg->fd_num * sizeof(int); + msg.msg_controllen = CMSG_SPACE(fdsize); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); + } else { + msg.msg_controllen = 0; + } + + do { + rc = sendmsg(conn_fd, &msg, 0); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + /* The payload write below overwrites rc: check the header send now */ + if (rc <= 0) { + vu_panic(dev, "Error while writing: %s", strerror(errno)); + return false; + } + + if (vmsg->hdr.size) { + do { + if (vmsg->data) { + rc = write(conn_fd, vmsg->data, vmsg->hdr.size); + } else { + rc =
write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->hdr.size); + } + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + } + + if (rc <= 0) { + vu_panic(dev, "Error while writing: %s", strerror(errno)); + return false; + } + + return true; +} + +static int vu_send_reply(VuDev *dev, int conn_fd, struct VhostUserMsg *msg) +{ + msg->hdr.flags &= ~VHOST_USER_VERSION_MASK; + msg->hdr.flags |= VHOST_USER_VERSION; + msg->hdr.flags |= VHOST_USER_REPLY_MASK; + + return vu_message_write(dev, conn_fd, msg); +} + +static bool vu_get_features_exec(struct VhostUserMsg *msg) +{ + uint64_t features = + 1ULL << VIRTIO_F_VERSION_1 | + 1ULL << VIRTIO_NET_F_MRG_RXBUF | + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + vmsg_set_reply_u64(msg, features); + + debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64); + + return true; +} + +static void +vu_set_enable_all_rings(VuDev *vdev, bool enabled) +{ + uint16_t i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + vdev->vq[i].enable = enabled; + } +} + +static bool +vu_set_features_exec(VuDev *vdev, struct VhostUserMsg *msg) +{ + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vdev->features = msg->payload.u64; + if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1)) { + /* + * We only support devices conforming to VIRTIO 1.0 or + * later + */ + vu_panic(vdev, "virtio legacy devices aren't supported by passt"); + return false; + } + + if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) { + vu_set_enable_all_rings(vdev, true); + } + + /* virtio-net features */ + + if (vu_has_feature(vdev, VIRTIO_F_VERSION_1) || + vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + vdev->hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + vdev->hdrlen = sizeof(struct virtio_net_hdr); + } + + return false; +} + +static bool +vu_set_owner_exec(void) +{ + return false; +} + +static bool map_ring(VuDev *vdev, VuVirtq *vq) +{ + vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(vdev, 
vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr); + + debug("Setting virtq addresses:"); + debug(" vring_desc at %p", (void *)vq->vring.desc); + debug(" vring_used at %p", (void *)vq->vring.used); + debug(" vring_avail at %p", (void *)vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + +/* + * #syscalls:passt mmap munmap + */ + +/** + * vu_set_mem_table_exec() - Replace the mappings of guest memory regions + * @vdev: vhost-user device + * @msg: VHOST_USER_SET_MEM_TABLE message, with one file descriptor per + * region in @msg->fds + * + * Return: false, as this handler doesn't prepare any reply payload + */ +static bool vu_set_mem_table_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int i; + struct VhostUserMemory m = msg->payload.memory, *memory = &m; + + /* + * The region count comes from the peer and indexes memory->regions[] + * and msg->fds[] below: reject it before touching existing mappings. + */ + if (memory->nregions > VHOST_MEMORY_BASELINE_NREGIONS) { + vmsg_close_fds(msg); + vu_panic(vdev, "Too many memory regions: %u", memory->nregions); + return false; + } + + for (i = 0; i < vdev->nregions; i++) { + VuDevRegion *r = &vdev->regions[i]; + void *m = (void *) (uintptr_t) r->mmap_addr; + + if (m) + munmap(m, r->size + r->mmap_offset); + } + vdev->nregions = memory->nregions; + + debug("Nregions: %u", memory->nregions); + for (i = 0; i < vdev->nregions; i++) { + void *mmap_addr; + VhostUserMemory_region *msg_region = &memory->regions[i]; + VuDevRegion *dev_region = &vdev->regions[i]; + + debug("Region %d", i); + debug(" guest_phys_addr: 0x%016"PRIx64, + msg_region->guest_phys_addr); + debug(" memory_size: 0x%016"PRIx64, + msg_region->memory_size); + debug(" userspace_addr 0x%016"PRIx64, + msg_region->userspace_addr); + debug(" mmap_offset 0x%016"PRIx64, + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages.
*/ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, + msg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) { + vu_panic(vdev, "region mmap error: %s", strerror(errno)); + /* Don't keep a stale address: it would be passed to munmap() later */ + dev_region->mmap_addr = 0; + } else { + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + debug(" mmap_addr: 0x%016"PRIx64, + dev_region->mmap_addr); + } + + close(msg->fds[i]); + } + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + if (vdev->vq[i].vring.desc) { + if (map_ring(vdev, &vdev->vq[i])) { + vu_panic(vdev, "remapping queue %d during setmemtable", i); + } + } + } + + return false; +} + +/** + * vu_set_vring_num_exec() - Set the size of a virtqueue ring + * @vdev: vhost-user device + * @msg: Message with queue index and ring size in payload.state + * + * Return: false, as this handler doesn't prepare any reply payload + */ +static bool vu_set_vring_num_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int index = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", index); + debug("State.num: %u", num); + + /* The index comes from the peer: don't write outside vdev->vq[] */ + if (index >= VHOST_USER_MAX_QUEUES) { + vu_panic(vdev, "Invalid vring_num index: %u", index); + return false; + } + + vdev->vq[index].vring.num = num; + + return false; +} + +/** + * vu_set_vring_addr_exec() - Set and map the ring addresses of a virtqueue + * @vdev: vhost-user device + * @msg: Message with a struct vhost_vring_addr in payload.addr + * + * Return: false, as this handler doesn't prepare any reply payload + */ +static bool vu_set_vring_addr_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + struct vhost_vring_addr addr = msg->payload.addr, *vra = &addr; + unsigned int index = vra->index; + VuVirtq *vq; + + debug("vhost_vring_addr:"); + debug(" index: %d", vra->index); + debug(" flags: %d", vra->flags); + debug(" desc_user_addr: 0x%016" PRIx64, (uint64_t)vra->desc_user_addr); + debug(" used_user_addr: 0x%016" PRIx64, (uint64_t)vra->used_user_addr); + debug(" avail_user_addr: 0x%016" PRIx64, (uint64_t)vra->avail_user_addr); + debug(" log_guest_addr: 0x%016" PRIx64, (uint64_t)vra->log_guest_addr); + + /* The index comes from the peer: don't access outside vdev->vq[] */ + if (index >= VHOST_USER_MAX_QUEUES) { + vu_panic(vdev, "Invalid vring_addr index: %u", index); + return false; + } + + vq = &vdev->vq[index]; + vq->vra = *vra; + vq->vring.flags = vra->flags; + vq->vring.log_guest_addr = vra->log_guest_addr; + + if (map_ring(vdev, vq)) { + vu_panic(vdev, "Invalid vring_addr message"); + return false; + } + + vq->used_idx = le16toh(vq->vring.used->idx); + + if (vq->last_avail_idx != vq->used_idx) { + debug("Last avail index != used index: %u != %u", + vq->last_avail_idx, vq->used_idx); + } + + return false; +} + +static bool vu_set_vring_base_exec(VuDev *vdev, + struct
VhostUserMsg *msg) +{ + unsigned int index = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", index); + debug("State.num: %u", num); + + /* The index comes from the peer: don't write outside vdev->vq[] */ + if (index >= VHOST_USER_MAX_QUEUES) { + vu_panic(vdev, "Invalid vring_base index: %u", index); + return false; + } + + vdev->vq[index].shadow_avail_idx = vdev->vq[index].last_avail_idx = num; + + return false; +} + +/** + * vu_get_vring_base_exec() - Report last available index of a queue, stop it + * @vdev: vhost-user device + * @msg: Request with the queue index in payload.state, updated in place + * with the reply + * + * Return: true if @msg holds a reply to send, false on invalid queue index + */ +static bool vu_get_vring_base_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int index = msg->payload.state.index; + + debug("State.index: %u", index); + + /* The index comes from the peer: don't access outside vdev->vq[] */ + if (index >= VHOST_USER_MAX_QUEUES) { + vu_panic(vdev, "Invalid vring_base index: %u", index); + return false; + } + + msg->payload.state.num = vdev->vq[index].last_avail_idx; + msg->hdr.size = sizeof(msg->payload.state); + + vdev->vq[index].started = false; + + if (vdev->vq[index].call_fd != -1) { + close(vdev->vq[index].call_fd); + vdev->vq[index].call_fd = -1; + } + if (vdev->vq[index].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[index].kick_fd); + close(vdev->vq[index].kick_fd); + vdev->vq[index].kick_fd = -1; + } + + return true; +} + +static void vu_set_watch(VuDev *vdev, int fd) +{ + struct ctx *c = (struct ctx *) ((char *)vdev - offsetof(struct ctx, vdev)); + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_KICK, .fd = fd }; + struct epoll_event ev = { 0 }; + + ev.data.u64 = ref.u64; + ev.events = EPOLLIN; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev); +} + +int vu_send(const struct ctx *c, const void *buf, size_t size) +{ + VuDev *vdev = (VuDev *)&c->vdev; + size_t hdrlen = vdev->hdrlen; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + unsigned int indexes[VIRTQUEUE_MAX_SIZE]; + size_t lens[VIRTQUEUE_MAX_SIZE]; + size_t offset; + int i, j; + __virtio16 *num_buffers_ptr; + + debug("vu_send size %zu hdrlen %zu", size, hdrlen); + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + err("Got packet, but no available descriptors on RX virtq."); + return 0; + } + + offset = 0; + i = 0; + num_buffers_ptr = NULL; + while (offset < size) { + VuVirtqElement *elem; + size_t len; + int total; + + total = 0; + + if (i == VIRTQUEUE_MAX_SIZE) { + err("virtio-net unexpected long buffer chain"); + goto err; + } + + elem =
vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), + buffer[VHOST_USER_RX_QUEUE]); + if (!elem) { + if (!vdev->broken) { + eventfd_t kick_data; + ssize_t rc; + int status; + + /* wait the kernel to put new entries in the queue */ + + status = fcntl(vq->kick_fd, F_GETFL); + if (status != -1) { + fcntl(vq->kick_fd, F_SETFL, status & ~O_NONBLOCK); + rc = eventfd_read(vq->kick_fd, &kick_data); + fcntl(vq->kick_fd, F_SETFL, status); + if (rc != -1) + continue; + } + } + if (i) { + err("virtio-net unexpected empty queue: " + "i %d mergeable %d offset %zd, size %zd, " + "features 0x%" PRIx64, + i, vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF), + offset, size, vdev->features); + } + offset = -1; + goto err; + } + + if (elem->in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_detach_element(vdev, vq, elem->index, 0); + offset = -1; + goto err; + } + + if (i == 0) { + struct virtio_net_hdr hdr = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, + }; + + ASSERT(offset == 0); + ASSERT(elem->in_sg[0].iov_len >= hdrlen); + + len = iov_from_buf(elem->in_sg, elem->in_num, 0, &hdr, sizeof hdr); + + num_buffers_ptr = (__virtio16 *)((char *)elem->in_sg[0].iov_base + + len); + + total += hdrlen; + } + + len = iov_from_buf(elem->in_sg, elem->in_num, total, (char *)buf + offset, + size - offset); + + total += len; + offset += len; + + /* If buffers can't be merged, at this point we + * must have consumed the complete packet. + * Otherwise, drop it. 
+ */ + if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) && offset < size) { + vu_queue_unpop(vdev, vq, elem->index, total); + goto err; + } + + indexes[i] = elem->index; + lens[i] = total; + i++; + } + + if (num_buffers_ptr && vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + *num_buffers_ptr = htole16(i); + } + + for (j = 0; j < i; j++) { + debug("filling total %zd idx %d", lens[j], j); + vu_queue_fill_by_index(vdev, vq, indexes[j], lens[j], j); + } + + vu_queue_flush(vdev, vq, i); + vu_queue_notify(vdev, vq); + + debug("sent %zu", offset); + + return offset; +err: + for (j = 0; j < i; j++) { + vu_queue_detach_element(vdev, vq, indexes[j], lens[j]); + } + + return offset; +} + +size_t tap_send_frames_vu(const struct ctx *c, const struct iovec *iov, size_t n) +{ + size_t i; + int ret; + + debug("tap_send_frames_vu n %zd", n); + + for (i = 0; i < n; i++) { + ret = vu_send(c, iov[i].iov_base, iov[i].iov_len); + if (ret < 0) + break; + } + debug("count %zd", i); + return i; +} + +static void vu_handle_tx(VuDev *vdev, int index) +{ + struct ctx *c = (struct ctx *) ((char *)vdev - offsetof(struct ctx, vdev)); + VuVirtq *vq = &vdev->vq[index]; + int hdrlen = vdev->hdrlen; + struct timespec now; + char *p; + size_t n; + + if (index % 2 != VHOST_USER_TX_QUEUE) { + debug("index %d is not an TX queue", index); + return; + } + + clock_gettime(CLOCK_MONOTONIC, &now); + + p = pkt_buf; + + pool_flush_all(); + + while (1) { + VuVirtqElement *elem; + unsigned int out_num; + struct iovec sg[VIRTQUEUE_MAX_SIZE], *out_sg; + + ASSERT(index == VHOST_USER_TX_QUEUE); + elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer[index]); + if (!elem) { + break; + } + + out_num = elem->out_num; + out_sg = elem->out_sg; + if (out_num < 1) { + debug("virtio-net header not in first element"); + break; + } + + if (hdrlen) { + unsigned sg_num; + + sg_num = iov_copy(sg, ARRAY_SIZE(sg), out_sg, out_num, + hdrlen, -1); + out_num = sg_num; + out_sg = sg; + } + + n = iov_to_buf(out_sg, out_num, 0, 
p, TAP_BUF_FILL); + + packet_add_all(c, n, p); + + p += n; + + vu_queue_push(vdev, vq, elem, 0); + vu_queue_notify(vdev, vq); + } + tap_handler_all(c, &now); +} + +void vu_kick_cb(struct ctx *c, union epoll_ref ref) +{ + VuDev *vdev = &c->vdev; + eventfd_t kick_data; + ssize_t rc; + int index; + + for (index = 0; index < VHOST_USER_MAX_QUEUES; index++) + if (c->vdev.vq[index].kick_fd == ref.fd) + break; + + if (index == VHOST_USER_MAX_QUEUES) + return; + + rc = eventfd_read(ref.fd, &kick_data); + if (rc == -1) { + vu_panic(vdev, "kick eventfd_read(): %s", strerror(errno)); + vu_remove_watch(vdev, ref.fd); + } else { + debug("Got kick_data: %016"PRIx64" idx:%d", + kick_data, index); + if (index % 2 == VHOST_USER_TX_QUEUE) + vu_handle_tx(vdev, index); + } +} + +static bool vu_check_queue_msg_file(VuDev *vdev, struct VhostUserMsg *msg) +{ + int index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + if (index >= VHOST_USER_MAX_QUEUES) { + vmsg_close_fds(msg); + vu_panic(vdev, "Invalid queue index: %u", index); + return false; + } + + if (nofd) { + vmsg_close_fds(msg); + return true; + } + + if (msg->fd_num != 1) { + vmsg_close_fds(msg); + vu_panic(vdev, "Invalid fds in request: %d", msg->hdr.request); + return false; + } + + return true; +} + +static bool vu_set_vring_kick_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + int index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + if (!vu_check_queue_msg_file(vdev, msg)) + return false; + + if (vdev->vq[index].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[index].kick_fd); + close(vdev->vq[index].kick_fd); + vdev->vq[index].kick_fd = -1; + } + + vdev->vq[index].kick_fd = nofd ? 
-1 : msg->fds[0]; + debug("Got kick_fd: %d for vq: %d", vdev->vq[index].kick_fd, index); + + vdev->vq[index].started = true; + + if (vdev->vq[index].kick_fd != -1 && index % 2 == VHOST_USER_TX_QUEUE) { + vu_set_watch(vdev, vdev->vq[index].kick_fd); + debug("Waiting for kicks on fd: %d for vq: %d", + vdev->vq[index].kick_fd, index); + } + + return false; +} + +static bool vu_set_vring_call_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + int index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + if (!vu_check_queue_msg_file(vdev, msg)) + return false; + + if (vdev->vq[index].call_fd != -1) { + close(vdev->vq[index].call_fd); + vdev->vq[index].call_fd = -1; + } + + vdev->vq[index].call_fd = nofd ? -1 : msg->fds[0]; + + /* in case of I/O hang after reconnecting */ + if (vdev->vq[index].call_fd != -1) { + eventfd_write(msg->fds[0], 1); + } + + debug("Got call_fd: %d for vq: %d", vdev->vq[index].call_fd, index); + + return false; +} + +static bool vu_set_vring_err_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + int index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + if (!vu_check_queue_msg_file(vdev, msg)) + return false; + + if (vdev->vq[index].err_fd != -1) { + close(vdev->vq[index].err_fd); + vdev->vq[index].err_fd = -1; + } + + vdev->vq[index].err_fd = nofd ? 
-1 : msg->fds[0]; + + return false; +} + +static bool vu_get_protocol_features_exec(struct VhostUserMsg *msg) +{ + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; + + vmsg_set_reply_u64(msg, features); + + return true; +} + +static bool vu_set_protocol_features_exec(VuDev *vdev, struct VhostUserMsg *msg) +{ + uint64_t features = msg->payload.u64; + + debug("u64: 0x%016"PRIx64, features); + + vdev->protocol_features = msg->payload.u64; + + if (vu_has_protocol_feature(vdev, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + (!vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_BACKEND_REQ) || + !vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { + /* + * The use case for using messages for kick/call is simulation, to make + * the kick and call synchronous. To actually get that behaviour, both + * of the other features are required. + * Theoretically, one could use only kick messages, or do them without + * having F_REPLY_ACK, but too many (possibly pending) messages on the + * socket will eventually cause the master to hang, to avoid this in + * scenarios where not desired enforce that the settings are in a way + * that actually enables the simulation case. 
+ */ + vu_panic(vdev, + "F_IN_BAND_NOTIFICATIONS requires F_BACKEND_REQ && F_REPLY_ACK"); + return false; + } + + return false; +} + + +static bool vu_get_queue_num_exec(struct VhostUserMsg *msg) +{ + vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES); + return true; +} + +static bool vu_set_vring_enable_exec(VuDev *vdev, struct VhostUserMsg *msg) +{ + unsigned int index = msg->payload.state.index; + unsigned int enable = msg->payload.state.num; + + debug("State.index: %u", index); + debug("State.enable: %u", enable); + + if (index >= VHOST_USER_MAX_QUEUES) { + vu_panic(vdev, "Invalid vring_enable index: %u", index); + return false; + } + + vdev->vq[index].enable = enable; + return false; +} + +void vu_init(struct ctx *c) +{ + int i; + + c->vdev.hdrlen = 0; + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + c->vdev.vq[i] = (VuVirtq){ + .call_fd = -1, + .kick_fd = -1, + .err_fd = -1, + .notification = true, + }; +} + +static void vu_cleanup(VuDev *vdev) +{ + unsigned int i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + VuVirtq *vq = &vdev->vq[i]; + + vq->started = false; + vq->notification = true; + + if (vq->call_fd != -1) { + close(vq->call_fd); + vq->call_fd = -1; + } + if (vq->err_fd != -1) { + close(vq->err_fd); + vq->err_fd = -1; + } + if (vq->kick_fd != -1) { + vu_remove_watch(vdev, vq->kick_fd); + close(vq->kick_fd); + vq->kick_fd = -1; + } + + vq->vring.desc = 0; + vq->vring.used = 0; + vq->vring.avail = 0; + } + vdev->hdrlen = 0; + + for (i = 0; i < vdev->nregions; i++) { + VuDevRegion *r = &vdev->regions[i]; + void *m = (void *) (uintptr_t) r->mmap_addr; + + if (m) + munmap(m, r->size + r->mmap_offset); + } + vdev->nregions = 0; +} + +/** + * tap_handler_vu() - Packet handler for vhost-user + * @c: Execution context + * @events: epoll events + */ +void tap_handler_vu(struct ctx *c, uint32_t events) +{ + VuDev *dev = &c->vdev; + struct VhostUserMsg msg = { 0 }; + bool need_reply, reply_requested; + int ret; + + if (events & (EPOLLRDHUP | EPOLLHUP | 
EPOLLERR)) { + tap_sock_reset(c); + return; + } + + + ret = vu_message_read_default(dev, c->fd_tap, &msg); + /* + * Zero means there's nothing to read for now, a negative value is a + * hard error. Rely on the return value: errno might be stale, or + * overwritten by the vu_panic() and close() calls on the error path. + */ + if (ret < 0) + tap_sock_reset(c); + if (ret <= 0) + return; + debug("================ Vhost user message ================"); + debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request), + msg.hdr.request); + debug("Flags: 0x%x", msg.hdr.flags); + debug("Size: %u", msg.hdr.size); + + need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; + switch (msg.hdr.request) { + case VHOST_USER_GET_FEATURES: + reply_requested = vu_get_features_exec(&msg); + break; + case VHOST_USER_SET_FEATURES: + reply_requested = vu_set_features_exec(dev, &msg); + break; + case VHOST_USER_GET_PROTOCOL_FEATURES: + reply_requested = vu_get_protocol_features_exec(&msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + reply_requested = vu_set_protocol_features_exec(dev, &msg); + break; + case VHOST_USER_GET_QUEUE_NUM: + reply_requested = vu_get_queue_num_exec(&msg); + break; + case VHOST_USER_SET_OWNER: + reply_requested = vu_set_owner_exec(); + break; + case VHOST_USER_SET_MEM_TABLE: + reply_requested = vu_set_mem_table_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_NUM: + reply_requested = vu_set_vring_num_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + reply_requested = vu_set_vring_addr_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + reply_requested = vu_set_vring_base_exec(dev, &msg); + break; + case VHOST_USER_GET_VRING_BASE: + reply_requested = vu_get_vring_base_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_KICK: + reply_requested = vu_set_vring_kick_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + reply_requested = vu_set_vring_call_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_ERR: + reply_requested = vu_set_vring_err_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_ENABLE: + reply_requested = vu_set_vring_enable_exec(dev, &msg); + break; + case
VHOST_USER_NONE: + vu_cleanup(dev); + return; + default: + vu_panic(dev, "Unhandled request: %d", msg.hdr.request); + return; + } + + if (!reply_requested && need_reply) { + msg.payload.u64 = 0; + msg.hdr.flags = 0; + msg.hdr.size = sizeof(msg.payload.u64); + msg.fd_num = 0; + reply_requested = true; + } + + if (reply_requested) + ret = vu_send_reply(dev, c->fd_tap, &msg); + free(msg.data); +} diff --git a/vhost_user.h b/vhost_user.h new file mode 100644 index 0000000..25f0b61 --- /dev/null +++ b/vhost_user.h @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* some parts from subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VHOST_USER_H +#define VHOST_USER_H + +#include "virtio.h" +#include "iov.h" + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +#define VHOST_MEMORY_BASELINE_NREGIONS 8 + +enum vhost_user_protocol_feature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, + VHOST_USER_PROTOCOL_F_NET_MTU = 4, + VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, + VHOST_USER_PROTOCOL_F_CONFIG = 9, + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + + VHOST_USER_PROTOCOL_F_MAX +}; + +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + 
VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_BACKEND_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_SET_VRING_ENDIAN = 23, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_CREATE_CRYPTO_SESSION = 26, + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, + VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_VRING_KICK = 35, + VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, + VHOST_USER_MAX +}; + +typedef struct { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ +} __attribute__ ((__packed__)) vhost_user_header; + +typedef struct VhostUserMemory_region { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemory_region; + +struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + struct VhostUserMemory_region regions[VHOST_MEMORY_BASELINE_NREGIONS]; +}; + +typedef union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct VhostUserMemory memory; +} vhost_user_payload; + +typedef struct VhostUserMsg { + vhost_user_header hdr; + vhost_user_payload payload; + + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; + int fd_num; + uint8_t *data; +} __attribute__ ((__packed__)) VhostUserMsg; +#define VHOST_USER_HDR_SIZE sizeof(vhost_user_header) + +#define 
VHOST_USER_RX_QUEUE 0 +#define VHOST_USER_TX_QUEUE 1 + +/** + * vu_queue_enabled() - Check whether a virtqueue is enabled + * @vq: Virtqueue to check + * + * Return: value of the enable flag of @vq + */ +static inline bool vu_queue_enabled(const VuVirtq *vq) +{ + return vq->enable; +} + +/** + * vu_queue_started() - Check whether a virtqueue is started + * @vq: Virtqueue to check + * + * Return: value of the started flag of @vq + */ +static inline bool vu_queue_started(const VuVirtq *vq) +{ + return vq->started; +} + +size_t tap_send_frames_vu(const struct ctx *c, const struct iovec *iov, + size_t n); +int vu_send(const struct ctx *c, const void *data, size_t len); +void vu_print_capabilities(void); +void vu_init(struct ctx *c); +void vu_kick_cb(struct ctx *c, union epoll_ref ref); +void tap_handler_vu(struct ctx *c, uint32_t events); +#endif /* VHOST_USER_H */