Compare commits
16 commits: master...vhost-user

60e35ab2bd
95aebad0a4
2d5528c9be
1bf4abe402
37f457a76c
b2229bd24f
45b1403f42
bb3877dde3
27a713947c
0938100596
72cadf34ad
4d7ca742ef
9cc20cbdb1
c38f260820
576c1cca2c
a66fceb280
28 changed files with 3631 additions and 865 deletions

Makefile (5 changed lines)

@@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
 	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
-	tcp_splice.c udp.c util.c
+	tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_vu.c util.c vhost_user.c virtio.c
 QRAP_SRCS = qrap.c
 SRCS = $(PASST_SRCS) $(QRAP_SRCS)
 
@@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h inany.h iov.h ip.h isolation.h lineread.h log.h \
 	ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h siphash.h tap.h \
-	tcp.h tcp_conn.h tcp_splice.h udp.h util.h
+	tcp.h tcp_buf.h tcp_conn.h tcp_splice.h tcp_vu.h udp.h udp_internal.h \
+	udp_vu.h util.h vhost_user.h virtio.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };

conf.c (32 changed lines)

@@ -44,6 +44,7 @@
 #include "lineread.h"
 #include "isolation.h"
 #include "log.h"
+#include "vhost_user.h"
 
 /**
  * next_chunk - Return the next piece of a string delimited by a character
@@ -146,7 +147,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
 	if (fwd->mode)
 		goto mode_conflict;
 
-	if (c->mode != MODE_PASST)
+	if (c->mode == MODE_PASTA)
 		die("'all' port forwarding is only allowed for passt");
 
 	fwd->mode = FWD_ALL;
@@ -721,9 +722,12 @@ static void print_usage(const char *name, int status)
 		info(   "  -I, --ns-ifname NAME	namespace interface name");
 		info(   "    default: same interface name as external one");
 	} else {
-		info(   "  -s, --socket PATH	UNIX domain socket path");
+		info(   "  -s, --socket, --socket-path PATH	UNIX domain socket path");
 		info(   "    default: probe free path starting from "
 			UNIX_SOCK_PATH, 1);
+		info(   "  --vhost-user		Enable vhost-user mode");
+		info(   "    UNIX domain socket is provided by -s option");
+		info(   "  --print-capabilities	print back-end capabilities in JSON format");
 	}
 
 	info(   "  -F, --fd FD		Use FD as pre-opened connected socket");
@@ -1109,6 +1113,7 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"help",		no_argument,		NULL,	'h' },
 		{"socket",		required_argument,	NULL,	's' },
 		{"fd",			required_argument,	NULL,	'F' },
+		{"socket-path",		required_argument,	NULL,	's' }, /* vhost-user mandatory */
 		{"ns-ifname",		required_argument,	NULL,	'I' },
 		{"pcap",		required_argument,	NULL,	'p' },
 		{"pid",			required_argument,	NULL,	'P' },
@@ -1155,6 +1160,8 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"config-net",		no_argument,		NULL,	17 },
 		{"no-copy-routes",	no_argument,		NULL,	18 },
 		{"no-copy-addrs",	no_argument,		NULL,	19 },
+		{"vhost-user",		no_argument,		NULL,	20 },
+		{"print-capabilities",	no_argument,		NULL,	21 }, /* vhost-user mandatory */
 		{ 0 },
 	};
 	char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
@@ -1226,7 +1233,7 @@ void conf(struct ctx *c, int argc, char **argv)
 			c->no_dhcp_dns = 0;
 			break;
 		case 6:
-			if (c->mode != MODE_PASST)
+			if (c->mode == MODE_PASTA)
 				die("--no-dhcp-dns is for passt mode only");
 
 			c->no_dhcp_dns = 1;
@@ -1238,7 +1245,7 @@ void conf(struct ctx *c, int argc, char **argv)
 			c->no_dhcp_dns_search = 0;
 			break;
 		case 8:
-			if (c->mode != MODE_PASST)
+			if (c->mode == MODE_PASTA)
 				die("--no-dhcp-search is for passt mode only");
 
 			c->no_dhcp_dns_search = 1;
@@ -1293,7 +1300,7 @@ void conf(struct ctx *c, int argc, char **argv)
 			break;
 		case 14:
 			fprintf(stdout,
-				c->mode == MODE_PASST ? "passt " : "pasta ");
+				c->mode == MODE_PASTA ? "pasta " : "passt ");
 			fprintf(stdout, VERSION_BLOB);
 			exit(EXIT_SUCCESS);
 		case 15:
@@ -1314,7 +1321,6 @@ void conf(struct ctx *c, int argc, char **argv)
 				 sizeof(c->ip6.ifname_out), "%s", optarg);
 			if (ret <= 0 || ret >= (int)sizeof(c->ip6.ifname_out))
 				die("Invalid interface name: %s", optarg);
-
 			break;
 		case 17:
 			if (c->mode != MODE_PASTA)
@@ -1336,6 +1342,16 @@ void conf(struct ctx *c, int argc, char **argv)
 			warn("--no-copy-addrs will be dropped soon");
 			c->no_copy_addrs = copy_addrs_opt = true;
 			break;
+		case 20:
+			if (c->mode == MODE_PASTA) {
+				err("--vhost-user is for passt mode only");
+				usage(argv[0]);
+			}
+			c->mode = MODE_VU;
+			break;
+		case 21:
+			vu_print_capabilities();
+			break;
 		case 'd':
 			if (c->debug)
 				die("Multiple --debug options given");
@@ -1596,7 +1612,7 @@ void conf(struct ctx *c, int argc, char **argv)
 			v6_only = true;
 			break;
 		case '1':
-			if (c->mode != MODE_PASST)
+			if (c->mode == MODE_PASTA)
 				die("--one-off is for passt mode only");
 
 			if (c->one_off)
@@ -1643,7 +1659,7 @@ void conf(struct ctx *c, int argc, char **argv)
 	conf_ugid(runas, &uid, &gid);
 
 	if (logfile) {
-		logfile_init(c->mode == MODE_PASST ? "passt" : "pasta",
+		logfile_init(c->mode == MODE_PASTA ? "pasta" : "passt",
 			     logfile, logsize);
 	}
 

iov.c (39 changed lines)

@@ -156,42 +156,3 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)
 
 	return len;
 }
-
-/**
- * iov_copy - Copy data from one scatter/gather I/O vector (struct iovec) to
- *            another.
- *
- * @dst_iov:     Pointer to the destination array of struct iovec describing
- *               the scatter/gather I/O vector to copy to.
- * @dst_iov_cnt: Number of elements in the destination iov array.
- * @iov:         Pointer to the source array of struct iovec describing
- *               the scatter/gather I/O vector to copy from.
- * @iov_cnt:     Number of elements in the source iov array.
- * @offset:      Offset within the source iov from where copying should start.
- * @bytes:       Total number of bytes to copy from iov to dst_iov.
- *
- * Returns:      The number of elements successfully copied to the destination
- *               iov array.
- */
-/* cppcheck-suppress unusedFunction */
-unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt,
-		  const struct iovec *iov, size_t iov_cnt,
-		  size_t offset, size_t bytes)
-{
-	unsigned int i, j;
-
-	i = iov_skip_bytes(iov, iov_cnt, offset, &offset);
-
-	/* copying data */
-	for (j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) {
-		size_t len = MIN(bytes, iov[i].iov_len - offset);
-
-		dst_iov[j].iov_base = (char *)iov[i].iov_base + offset;
-		dst_iov[j].iov_len = len;
-		j++;
-		bytes -= len;
-		offset = 0;
-	}
-
-	return j;
-}

iov.h (3 changed lines)

@@ -25,7 +25,4 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
 size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
 		  size_t offset, void *buf, size_t bytes);
 size_t iov_size(const struct iovec *iov, size_t iov_cnt);
-unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt,
-		  const struct iovec *iov, size_t iov_cnt,
-		  size_t offset, size_t bytes);
 #endif /* IOVEC_H */

isolation.c (10 changed lines)

@@ -312,7 +312,7 @@ int isolate_prefork(const struct ctx *c)
 	 * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
 	 * ever gets around seccomp profiles -- there's no harm in passing it.
 	 */
-	if (!c->foreground || c->mode == MODE_PASST)
+	if (!c->foreground || c->mode != MODE_PASTA)
 		flags |= CLONE_NEWPID;
 
 	if (unshare(flags)) {
@@ -379,12 +379,12 @@ void isolate_postfork(const struct ctx *c)
 
 	prctl(PR_SET_DUMPABLE, 0);
 
-	if (c->mode == MODE_PASST) {
-		prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
-		prog.filter = filter_passt;
-	} else {
+	if (c->mode == MODE_PASTA) {
 		prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
 		prog.filter = filter_pasta;
+	} else {
+		prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
+		prog.filter = filter_passt;
 	}
 
 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||

packet.c (85 changed lines)

@@ -22,6 +22,42 @@
 #include "util.h"
 #include "log.h"
 
+static int packet_check_range(const struct pool *p, size_t offset, size_t len,
+			      const char *start, const char *func, int line)
+{
+	ASSERT(p->buf);
+
+	if (p->buf_size == 0)
+		return vu_packet_check_range((void *)p->buf, offset, len, start,
+					     func, line);
+
+	if (start < p->buf) {
+		if (func) {
+			trace("add packet start %p before buffer start %p, "
+			      "%s:%i", (void *)start, (void *)p->buf, func, line);
+		}
+		return -1;
+	}
+
+	if (start + len + offset > p->buf + p->buf_size) {
+		if (func) {
+			trace("packet offset plus length %lu from size %lu, "
+			      "%s:%i", start - p->buf + len + offset,
+			      p->buf_size, func, line);
+		}
+		return -1;
+	}
+
+#if UINTPTR_MAX == UINT64_MAX
+	if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
+		trace("add packet start %p, buffer start %p, %s:%i",
+		      (void *)start, (void *)p->buf, func, line);
+		return -1;
+	}
+#endif
+
+	return 0;
+}
 /**
  * packet_add_do() - Add data as packet descriptor to given pool
  * @p:		Existing pool
@@ -41,34 +77,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 		return;
 	}
 
-	if (start < p->buf) {
-		trace("add packet start %p before buffer start %p, %s:%i",
-		      (void *)start, (void *)p->buf, func, line);
+	if (packet_check_range(p, 0, len, start, func, line))
 		return;
-	}
-
-	if (start + len > p->buf + p->buf_size) {
-		trace("add packet start %p, length: %zu, buffer end %p, %s:%i",
-		      (void *)start, len, (void *)(p->buf + p->buf_size),
-		      func, line);
-		return;
-	}
 
 	if (len > UINT16_MAX) {
 		trace("add packet length %zu, %s:%i", len, func, line);
 		return;
 	}
 
-#if UINTPTR_MAX == UINT64_MAX
-	if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
-		trace("add packet start %p, buffer start %p, %s:%i",
-		      (void *)start, (void *)p->buf, func, line);
-		return;
-	}
-#endif
-
-	p->pkt[idx].offset = start - p->buf;
-	p->pkt[idx].len = len;
+	p->pkt[idx].iov_base = (void *)start;
+	p->pkt[idx].iov_len = len;
 
 	p->count++;
 }
@@ -104,28 +122,23 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (p->pkt[idx].offset + len + offset > p->buf_size) {
+	if (len + offset > p->pkt[idx].iov_len) {
 		if (func) {
-			trace("packet offset plus length %zu from size %zu, "
-			      "%s:%i", p->pkt[idx].offset + len + offset,
-			      p->buf_size, func, line);
-		}
-		return NULL;
-	}
-
-	if (len + offset > p->pkt[idx].len) {
-		if (func) {
-			trace("data length %zu, offset %zu from length %u, "
-			      "%s:%i", len, offset, p->pkt[idx].len,
+			trace("data length %zu, offset %zu from length %zu, "
+			      "%s:%i", len, offset, p->pkt[idx].iov_len,
 			      func, line);
 		}
 		return NULL;
 	}
 
-	if (left)
-		*left = p->pkt[idx].len - offset - len;
+	if (packet_check_range(p, offset, len, p->pkt[idx].iov_base,
+			       func, line))
+		return NULL;
 
-	return p->buf + p->pkt[idx].offset + offset;
+	if (left)
+		*left = p->pkt[idx].iov_len - offset - len;
+
+	return (char *)p->pkt[idx].iov_base + offset;
 }
 
 /**

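A note on the new layout: a pool whose buf_size is zero is treated as a vhost-user pool, and packet_check_range() defers to vu_packet_check_range() to validate addresses against guest-mapped memory; a non-zero buf_size keeps the old semantics of checking against the pool's own buffer. A minimal usage sketch of the pool API as it stands after this change (buffer and lengths are illustrative, not from the series):

	#include "packet.h"

	static char buf[512];

	void demo(void)
	{
		/* One-slot pool over a contiguous local buffer: buf_size != 0,
		 * so packet_check_range() validates against buf directly.
		 */
		PACKET_POOL_P(pkt, 1, buf, sizeof(buf));
		char *data;

		packet_add(pkt, 42, buf);		/* describe 42 bytes at buf */
		data = packet_get(pkt, 0, 0, 42, NULL);	/* range-checked retrieval */
		(void)data;
	}
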
packet.h (16 changed lines)

@@ -6,16 +6,6 @@
 #ifndef PACKET_H
 #define PACKET_H
 
-/**
- * struct desc - Generic offset-based descriptor within buffer
- * @offset:	Offset of descriptor relative to buffer start, 32-bit limit
- * @len:	Length of descriptor, host order, 16-bit limit
- */
-struct desc {
-	uint32_t offset;
-	uint16_t len;
-};
-
 /**
  * struct pool - Generic pool of packets stored in a buffer
  * @buf:	Buffer storing packet descriptors
@@ -29,9 +19,11 @@ struct pool {
 	size_t buf_size;
 	size_t size;
 	size_t count;
-	struct desc pkt[1];
+	struct iovec pkt[1];
 };
 
+int vu_packet_check_range(void *buf, size_t offset, size_t len,
+			  const char *start, const char *func, int line);
 void packet_add_do(struct pool *p, size_t len, const char *start,
 		   const char *func, int line);
 void *packet_get_do(const struct pool *p, const size_t idx,
@@ -54,7 +46,7 @@ struct _name ## _t {						\
 	size_t buf_size;					\
 	size_t size;						\
 	size_t count;						\
-	struct desc pkt[_size];					\
+	struct iovec pkt[_size];				\
 }
 
 #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size)		\

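For reference, this is roughly what a pool declared through PACKET_POOL_DECL() looks like after the change, with full struct iovec descriptors in place of the old 32-bit offset/16-bit length pairs (a sketch of the macro expansion, not literal code from the series):

	#include <sys/uio.h>

	/* PACKET_POOL_DECL(pool_x, 2, buf) now expands to roughly: */
	struct pool_x_t {
		char *buf;		/* backing buffer (NULL for vhost-user pools) */
		size_t buf_size;	/* 0 flags a vhost-user pool                  */
		size_t size;		/* number of descriptor slots                 */
		size_t count;		/* descriptors currently in use               */
		struct iovec pkt[2];	/* base pointer and length per packet         */
	};
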
passt.c (18 changed lines)

@@ -73,6 +73,8 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TAP_PASTA]		= "/dev/net/tun device",
 	[EPOLL_TYPE_TAP_PASST]		= "connected qemu socket",
 	[EPOLL_TYPE_TAP_LISTEN]		= "listening qemu socket",
+	[EPOLL_TYPE_VHOST_CMD]		= "vhost-user command socket",
+	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");
@@ -165,7 +167,7 @@ static void timer_init(struct ctx *c, const struct timespec *now)
  */
 void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
 {
-	tcp_update_l2_buf(eth_d, eth_s);
+	tcp_buf_update_l2(eth_d, eth_s);
 	udp_update_l2_buf(eth_d, eth_s);
 }
 
@@ -278,6 +280,7 @@ int main(int argc, char **argv)
 	pasta_netns_quit_init(&c);
 
 	tap_sock_init(&c);
+	vu_init(&c);
 
 	secret_init(&c);
 
@@ -348,7 +351,7 @@ loop:
 		uint32_t eventmask = events[i].events;
 
 		trace("%s: epoll event on %s %i (events: 0x%08x)",
-		      c.mode == MODE_PASST ? "passt" : "pasta",
+		      c.mode == MODE_PASTA ? "pasta" : "passt",
 		      EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
 
 		switch (ref.type) {
@@ -380,7 +383,10 @@ loop:
 			tcp_timer_handler(&c, ref);
 			break;
 		case EPOLL_TYPE_UDP:
-			udp_sock_handler(&c, ref, eventmask, &now);
+			if (c.mode == MODE_VU)
+				udp_vu_sock_handler(&c, ref, eventmask, &now);
+			else
+				udp_buf_sock_handler(&c, ref, eventmask, &now);
 			break;
 		case EPOLL_TYPE_ICMP:
 			icmp_sock_handler(&c, AF_INET, ref);
@@ -388,6 +394,12 @@ loop:
 		case EPOLL_TYPE_ICMPV6:
 			icmp_sock_handler(&c, AF_INET6, ref);
 			break;
+		case EPOLL_TYPE_VHOST_CMD:
+			tap_handler_vu(&c, eventmask);
+			break;
+		case EPOLL_TYPE_VHOST_KICK:
+			vu_kick_cb(&c, ref);
+			break;
 		default:
 			/* Can't happen */
 			ASSERT(0);

passt.h (10 changed lines)

@@ -42,6 +42,8 @@ union epoll_ref;
 #include "fwd.h"
 #include "tcp.h"
 #include "udp.h"
+#include "udp_vu.h"
+#include "vhost_user.h"
 
 /**
  * enum epoll_type - Different types of fds we poll over
@@ -73,6 +75,10 @@ enum epoll_type {
 	EPOLL_TYPE_TAP_PASST,
 	/* socket listening for qemu socket connections */
 	EPOLL_TYPE_TAP_LISTEN,
+	/* vhost-user command socket */
+	EPOLL_TYPE_VHOST_CMD,
+	/* vhost-user kick event socket */
+	EPOLL_TYPE_VHOST_KICK,
 
 	EPOLL_NUM_TYPES,
 };
@@ -140,6 +146,7 @@ struct fqdn {
 enum passt_modes {
 	MODE_PASST,
 	MODE_PASTA,
+	MODE_VU,
 };
 
 /**
@@ -307,6 +314,9 @@ struct ctx {
 
 	int low_wmem;
 	int low_rmem;
+
+	/* vhost-user */
+	struct VuDev vdev;
 };
 
 void proto_update_l2_buf(const unsigned char *eth_d,

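Since MODE_VU joins MODE_PASST on the passt-like side of every branch, the series systematically rewrites two-way mode tests so that pasta is singled out instead. A before/after sketch of the idiom, taken from the pattern repeated throughout conf.c and isolation.c above:

	/* Before: anything other than MODE_PASST was assumed to be pasta. */
	if (c->mode != MODE_PASST)
		die("--one-off is for passt mode only");

	/* After: MODE_VU behaves like passt, so only MODE_PASTA is rejected. */
	if (c->mode == MODE_PASTA)
		die("--one-off is for passt mode only");
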
tap.c (323 changed lines)

@@ -58,6 +58,7 @@
 #include "packet.h"
 #include "tap.h"
 #include "log.h"
+#include "vhost_user.h"
 
 /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
 static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
@@ -76,19 +77,22 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
  */
 int tap_send(const struct ctx *c, const void *data, size_t len)
 {
-	int flags = MSG_NOSIGNAL | MSG_DONTWAIT;
-	uint32_t vnet_len = htonl(len);
-
 	pcap(data, len);
 
-	if (c->mode == MODE_PASST) {
+	int flags = MSG_NOSIGNAL | MSG_DONTWAIT;
+	uint32_t vnet_len = htonl(len);
+
+	switch (c->mode) {
+	case MODE_PASST:
 		if (send(c->fd_tap, &vnet_len, 4, flags) < 0)
 			return -1;
 
 		return send(c->fd_tap, data, len, flags);
+	case MODE_PASTA:
+		return write(c->fd_tap, (char *)data, len);
+	case MODE_VU:
+		return vu_send(c, data, len);
 	}
 
-	return write(c->fd_tap, (char *)data, len);
+	return 0;
 }
@@ -350,6 +354,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c,
 	return i;
 }
 
+/**
+ * tap_send_iov_pasta() - Send out multiple prepared frames
+ * @c:		Execution context
+ * @iov:	Array of frames, each frame divided into an array of iovecs.
+ *		The first entry of the iovec is ignored
+ * @n:		Number of frames in @iov
+ *
+ * Return: number of frames actually sent
+ */
+static size_t tap_send_iov_pasta(const struct ctx *c,
+				 struct iovec iov[][TCP_IOV_NUM], size_t n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++) {
+		if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH],
+					   TCP_IOV_NUM - TCP_IOV_ETH))
+			break;
+	}
+
+	return i;
+
+}
+
 /**
  * tap_send_frames_passt() - Send multiple frames to the passt tap
  * @c:		Execution context
@@ -390,6 +418,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
 	return i;
 }
 
+/**
+ * tap_send_iov_passt() - Send out multiple prepared frames
+ * @c:		Execution context
+ * @iov:	Array of frames, each frame divided into an array of iovecs.
+ *		The first entry of the iovec is updated to point to a
+ *		uint32_t storing the frame length.
+ * @n:		Number of frames in @iov
+ *
+ * Return: number of frames actually sent
+ */
+static size_t tap_send_iov_passt(const struct ctx *c,
+				 struct iovec iov[][TCP_IOV_NUM],
+				 size_t n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++) {
+		uint32_t vnet_len;
+		int j;
+
+		vnet_len = 0;
+		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
+			vnet_len += iov[i][j].iov_len;
+
+		vnet_len = htonl(vnet_len);
+		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
+		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
+
+		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))
+			break;
+	}
+
+	return i;
+
+}
+
 /**
  * tap_send_frames() - Send out multiple prepared frames
  * @c:		Execution context
@@ -405,10 +469,19 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n)
 	if (!n)
 		return 0;
 
-	if (c->mode == MODE_PASST)
-		m = tap_send_frames_passt(c, iov, n);
-	else
+	switch (c->mode) {
+	case MODE_PASTA:
 		m = tap_send_frames_pasta(c, iov, n);
+		break;
+	case MODE_PASST:
+		m = tap_send_frames_passt(c, iov, n);
+		break;
+	case MODE_VU:
+		ASSERT(0);
+	default:
+		m = 0;
+		break;
+	}
 
 	if (m < n)
 		debug("tap: failed to send %zu frames of %zu", n - m, n);
@@ -418,6 +491,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n)
 	return m;
 }
 
+/**
+ * tap_send_iov() - Send out multiple prepared frames
+ * @c:		Execution context
+ * @iov:	Array of frames, each frame divided into an array of iovecs.
+ *		The iovec array is:
+ *		TCP_IOV_VNET (0)	vnet length
+ *		TCP_IOV_ETH (1)		ethernet header
+ *		TCP_IOV_IP (2)		IP (v4/v6) header
+ *		TCP_IOV_PAYLOAD (3)	IP payload (TCP header + data)
+ *		TCP_IOV_NUM (4)		is the number of entries in the iovec array
+ *		The TCP_IOV_VNET entry is updated with passt, ignored with pasta.
+ * @n:		Number of frames in @iov
+ *
+ * Return: number of frames actually sent
+ */
+size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
+		    size_t n)
+{
+	size_t m;
+	unsigned int i;
+
+	if (!n)
+		return 0;
+
+	switch (c->mode) {
+	case MODE_PASST:
+		m = tap_send_iov_passt(c, iov, n);
+		break;
+	case MODE_PASTA:
+		m = tap_send_iov_pasta(c, iov, n);
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	if (m < n)
+		debug("tap: failed to send %zu frames of %zu", n - m, n);
+
+	for (i = 0; i < m; i++)
+		pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH);
+
+	return m;
+}
+
 /**
  * eth_update_mac() - Update tap L2 header with new Ethernet addresses
  * @eh:		Ethernet headers to update
@@ -589,7 +706,7 @@ resume:
 		if (!eh)
 			continue;
 		if (ntohs(eh->h_proto) == ETH_P_ARP) {
-			PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
+			PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
 
 			packet_add(pkt, l2_len, (char *)eh);
 			arp(c, pkt);
@@ -629,7 +746,7 @@ resume:
 			continue;
 
 		if (iph->protocol == IPPROTO_ICMP) {
-			PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
+			PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
 
 			if (c->no_icmp)
 				continue;
@@ -648,7 +765,7 @@ resume:
 			continue;
 
 		if (iph->protocol == IPPROTO_UDP) {
-			PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
+			PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
 
 			packet_add(pkt, l2_len, (char *)eh);
 			if (dhcp(c, pkt))
@@ -797,7 +914,7 @@ resume:
 		}
 
 		if (proto == IPPROTO_ICMPV6) {
-			PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
+			PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
 
 			if (c->no_icmp)
 				continue;
@@ -821,7 +938,7 @@ resume:
 		uh = (struct udphdr *)l4h;
 
 		if (proto == IPPROTO_UDP) {
-			PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
+			PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
 
 			packet_add(pkt, l4_len, l4h);
 
@@ -907,11 +1024,50 @@ append:
 	return in->count;
 }
 
+void pool_flush_all(void)
+{
+	pool_flush(pool_tap4);
+	pool_flush(pool_tap6);
+}
+
+void tap_handler_all(struct ctx *c, const struct timespec *now)
+{
+	tap4_handler(c, pool_tap4, now);
+	tap6_handler(c, pool_tap6, now);
+}
+
+void packet_add_all_do(struct ctx *c, ssize_t len, char *p,
+		       const char *func, int line)
+{
+	const struct ethhdr *eh;
+
+	pcap(p, len);
+
+	eh = (struct ethhdr *)p;
+
+	if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
+		memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
+		proto_update_l2_buf(c->mac_guest, NULL);
+	}
+
+	switch (ntohs(eh->h_proto)) {
+	case ETH_P_ARP:
+	case ETH_P_IP:
+		packet_add_do(pool_tap4, len, p, func, line);
+		break;
+	case ETH_P_IPV6:
+		packet_add_do(pool_tap6, len, p, func, line);
+		break;
+	default:
+		break;
+	}
+}
+
 /**
  * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket
  * @c:		Execution context
  */
-static void tap_sock_reset(struct ctx *c)
+void tap_sock_reset(struct ctx *c)
 {
 	if (c->one_off) {
 		info("Client closed connection, exiting");
@@ -933,7 +1089,6 @@
 void tap_handler_passt(struct ctx *c, uint32_t events,
 		       const struct timespec *now)
 {
-	const struct ethhdr *eh;
 	ssize_t n, rem;
 	char *p;
 
@@ -946,8 +1101,7 @@ redo:
 	p = pkt_buf;
 	rem = 0;
 
-	pool_flush(pool_tap4);
-	pool_flush(pool_tap6);
+	pool_flush_all();
 
 	n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT);
 	if (n < 0) {
@@ -974,37 +1128,18 @@ redo:
 		/* Complete the partial read above before discarding a malformed
 		 * frame, otherwise the stream will be inconsistent.
 		 */
-		if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU)
+		if (len < (ssize_t)sizeof(struct ethhdr) ||
+		    len > (ssize_t)ETH_MAX_MTU)
 			goto next;
 
-		pcap(p, len);
-
-		eh = (struct ethhdr *)p;
-
-		if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
-			memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
-			proto_update_l2_buf(c->mac_guest, NULL);
-		}
-
-		switch (ntohs(eh->h_proto)) {
-		case ETH_P_ARP:
-		case ETH_P_IP:
-			packet_add(pool_tap4, len, p);
-			break;
-		case ETH_P_IPV6:
-			packet_add(pool_tap6, len, p);
-			break;
-		default:
-			break;
-		}
+		packet_add_all(c, len, p);
 
 next:
 		p += len;
 		n -= len;
 	}
 
-	tap4_handler(c, pool_tap4, now);
-	tap6_handler(c, pool_tap6, now);
+	tap_handler_all(c, now);
 
 	/* We can't use EPOLLET otherwise. */
 	if (rem)
@@ -1029,35 +1164,18 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
 redo:
 	n = 0;
 
-	pool_flush(pool_tap4);
-	pool_flush(pool_tap6);
+	pool_flush_all();
 restart:
 	while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
-		const struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n);
 
-		if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU) {
+		if (len < (ssize_t)sizeof(struct ethhdr) ||
+		    len > (ssize_t)ETH_MAX_MTU) {
 			n += len;
 			continue;
 		}
 
-		pcap(pkt_buf + n, len);
-
-		if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
-			memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
-			proto_update_l2_buf(c->mac_guest, NULL);
-		}
-
-		switch (ntohs(eh->h_proto)) {
-		case ETH_P_ARP:
-		case ETH_P_IP:
-			packet_add(pool_tap4, len, pkt_buf + n);
-			break;
-		case ETH_P_IPV6:
-			packet_add(pool_tap6, len, pkt_buf + n);
-			break;
-		default:
-			break;
-		}
+		packet_add_all(c, len, pkt_buf + n);
 
 		if ((n += len) == TAP_BUF_BYTES)
 			break;
@@ -1068,8 +1186,7 @@ restart:
 
 	ret = errno;
 
-	tap4_handler(c, pool_tap4, now);
-	tap6_handler(c, pool_tap6, now);
+	tap_handler_all(c, now);
 
 	if (len > 0 || ret == EAGAIN)
 		return;
@@ -1145,11 +1262,17 @@ static void tap_sock_unix_init(struct ctx *c)
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
 
-	info("You can now start qemu (>= 7.2, with commit 13c6be96618c):");
-	info("    kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
-	     addr.sun_path);
-	info("or qrap, for earlier qemu versions:");
-	info("    ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
+	if (c->mode == MODE_VU) {
+		info("You can start qemu with:");
+		info("    kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n",
+		     addr.sun_path);
+	} else {
+		info("You can now start qemu (>= 7.2, with commit 13c6be96618c):");
+		info("    kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
+		     addr.sun_path);
+		info("or qrap, for earlier qemu versions:");
+		info("    ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
	}
 }
 
 /**
@@ -1159,7 +1282,7 @@ static void tap_sock_unix_init(struct ctx *c)
  */
 void tap_listen_handler(struct ctx *c, uint32_t events)
 {
-	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST };
+	union epoll_ref ref;
 	struct epoll_event ev = { 0 };
 	int v = INT_MAX / 2;
 	struct ucred ucred;
@@ -1200,7 +1323,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
 		trace("tap: failed to set SO_SNDBUF to %i", v);
 
 	ref.fd = c->fd_tap;
-	ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+	if (c->mode == MODE_VU) {
+		ref.type = EPOLL_TYPE_VHOST_CMD;
+		ev.events = EPOLLIN | EPOLLRDHUP;
+	} else {
+		ref.type = EPOLL_TYPE_TAP_PASST;
+		ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
+	}
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
@@ -1261,6 +1390,23 @@ static void tap_sock_tun_init(struct ctx *c)
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
 
+void tap_sock_update_buf(void *base, size_t size)
+{
+	int i;
+
+	pool_tap4_storage.buf = base;
+	pool_tap4_storage.buf_size = size;
+	pool_tap6_storage.buf = base;
+	pool_tap6_storage.buf_size = size;
+
+	for (i = 0; i < TAP_SEQS; i++) {
+		tap4_l4[i].p.buf = base;
+		tap4_l4[i].p.buf_size = size;
+		tap6_l4[i].p.buf = base;
+		tap6_l4[i].p.buf_size = size;
+	}
+}
+
 /**
  * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor
  * @c:		Execution context
@@ -1272,10 +1418,22 @@ void tap_sock_init(struct ctx *c)
 
 	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz);
 	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz);
+	if (c->mode == MODE_VU) {
+		pool_tap4_storage.buf = NULL;
+		pool_tap4_storage.buf_size = 0;
+		pool_tap6_storage.buf = NULL;
+		pool_tap6_storage.buf_size = 0;
+	}
 
 	for (i = 0; i < TAP_SEQS; i++) {
 		tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
 		tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
+		if (c->mode == MODE_VU) {
+			tap4_l4[i].p.buf = NULL;
+			tap4_l4[i].p.buf_size = 0;
+			tap6_l4[i].p.buf = NULL;
+			tap6_l4[i].p.buf_size = 0;
+		}
 	}
 
 	if (c->fd_tap != -1) { /* Passed as --fd */
@@ -1284,21 +1442,30 @@ void tap_sock_init(struct ctx *c)
 
 		ASSERT(c->one_off);
 		ref.fd = c->fd_tap;
-		if (c->mode == MODE_PASST)
+		switch (c->mode) {
+		case MODE_PASST:
 			ref.type = EPOLL_TYPE_TAP_PASST;
-		else
+			ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+			break;
+		case MODE_PASTA:
 			ref.type = EPOLL_TYPE_TAP_PASTA;
+			ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+			break;
+		case MODE_VU:
+			ref.type = EPOLL_TYPE_VHOST_CMD;
+			ev.events = EPOLLIN | EPOLLRDHUP;
+			break;
+		}
 
-		ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
 		ev.data.u64 = ref.u64;
 		epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 		return;
 	}
 
-	if (c->mode == MODE_PASST) {
+	if (c->mode == MODE_PASTA) {
+		tap_sock_tun_init(c);
+	} else {
 		if (c->fd_tap_listen == -1)
 			tap_sock_unix_init(c);
-	} else {
-		tap_sock_tun_init(c);
 	}
 }

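The qemu stream-transport framing implemented by tap_send_iov_passt() above is worth calling out: each frame is preceded by its length as a 32-bit big-endian value, carried in the TCP_IOV_VNET slot. A self-contained sketch of that per-frame step (the helper name is hypothetical; the layout assumptions come from tap.h below):

	#include <stddef.h>
	#include <stdint.h>
	#include <arpa/inet.h>
	#include <sys/uio.h>

	/* Sum the payload slots and point the leading placeholder slot at a
	 * 4-byte, network-order frame length, as tap_send_iov_passt() does
	 * before handing the frame to tap_send_frames_passt().
	 */
	void fill_vnet_len(struct iovec *frame, size_t n, uint32_t *vnet_len)
	{
		uint32_t len = 0;
		size_t i;

		for (i = 1; i < n; i++)	/* slot 0 is the length placeholder */
			len += frame[i].iov_len;

		*vnet_len = htonl(len);
		frame[0].iov_base = vnet_len;
		frame[0].iov_len = sizeof(*vnet_len);
	}
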
tap.h (27 changed lines)

@@ -6,6 +6,20 @@
 #ifndef TAP_H
 #define TAP_H
 
+/*
+ * TCP frame iovec array:
+ * TCP_IOV_VNET		vnet length
+ * TCP_IOV_ETH		ethernet header
+ * TCP_IOV_IP		IP (v4/v6) header
+ * TCP_IOV_PAYLOAD	IP payload (TCP header + data)
+ * TCP_IOV_NUM		is the number of entries in the iovec array
+ */
+#define TCP_IOV_VNET	0
+#define TCP_IOV_ETH	1
+#define TCP_IOV_IP	2
+#define TCP_IOV_PAYLOAD	3
+#define TCP_IOV_NUM	4
+
 /**
  * struct tap_hdr - L2 and tap specific headers
  * @vnet_len:	Frame length (for qemu socket transport)
@@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c,
 		    const void *in, size_t len);
 int tap_send(const struct ctx *c, const void *data, size_t len);
 size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n);
+size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
+		    size_t n);
 void eth_update_mac(struct ethhdr *eh,
 		    const unsigned char *eth_d, const unsigned char *eth_s);
 void tap_listen_handler(struct ctx *c, uint32_t events);
@@ -81,6 +97,17 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
 		       const struct timespec *now);
 void tap_handler_passt(struct ctx *c, uint32_t events,
 		       const struct timespec *now);
+void tap_sock_reset(struct ctx *c);
+void tap_sock_update_buf(void *base, size_t size);
 void tap_sock_init(struct ctx *c);
+void pool_flush_all(void);
+void tap_handler_all(struct ctx *c, const struct timespec *now);
+
+void packet_add_do(struct pool *p, size_t len, const char *start,
+		   const char *func, int line);
+void packet_add_all_do(struct ctx *c, ssize_t len, char *p,
+		       const char *func, int line);
+#define packet_add_all(p, len, start)					\
+	packet_add_all_do(p, len, start, __func__, __LINE__)
 
 #endif /* TAP_H */

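Each TCP frame now travels as a fixed four-slot scatter/gather array indexed by the TCP_IOV_* constants above. A hypothetical assembly helper, to make the layout concrete (the function is illustrative, not part of the series; slot contents are the caller's prepared headers):

	#include <stddef.h>
	#include <sys/uio.h>

	#include "tap.h"

	/* Point each slot at the matching region of a prepared frame, then
	 * pass an array of such frames to tap_send_iov().  The vnet slot is
	 * left empty here: passt fills it, pasta ignores it.
	 */
	void frame_to_iov(struct iovec f[TCP_IOV_NUM],
			  void *eth, size_t eth_len,
			  void *ip, size_t ip_len,
			  void *payload, size_t payload_len)
	{
		f[TCP_IOV_VNET].iov_base = NULL;
		f[TCP_IOV_VNET].iov_len = 0;
		f[TCP_IOV_ETH].iov_base = eth;
		f[TCP_IOV_ETH].iov_len = eth_len;
		f[TCP_IOV_IP].iov_base = ip;
		f[TCP_IOV_IP].iov_len = ip_len;
		f[TCP_IOV_PAYLOAD].iov_base = payload;
		f[TCP_IOV_PAYLOAD].iov_len = payload_len;
	}
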
688
tcp.c
688
tcp.c
|
@ -301,57 +301,20 @@
|
|||
#include "flow.h"
|
||||
|
||||
#include "flow_table.h"
|
||||
#include "tcp_internal.h"
|
||||
#include "tcp_buf.h"
|
||||
#include "tcp_vu.h"
|
||||
|
||||
/* Sides of a flow as we use them in "tap" connections */
|
||||
#define SOCKSIDE 0
|
||||
#define TAPSIDE 1
|
||||
|
||||
#define TCP_FRAMES_MEM 128
|
||||
#define TCP_FRAMES \
|
||||
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
|
||||
|
||||
#define TCP_HASH_TABLE_LOAD 70 /* % */
|
||||
#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
|
||||
|
||||
#define MAX_WS 8
|
||||
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
|
||||
|
||||
/* MSS rounding: see SET_MSS() */
|
||||
#define MSS_DEFAULT 536
|
||||
|
||||
struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
|
||||
#ifdef __AVX2__
|
||||
uint8_t pad[26];
|
||||
#else
|
||||
uint8_t pad[2];
|
||||
#endif
|
||||
struct tap_hdr taph;
|
||||
struct iphdr iph;
|
||||
struct tcphdr th;
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32)));
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
|
||||
#ifdef __AVX2__
|
||||
uint8_t pad[14];
|
||||
#else
|
||||
uint8_t pad[2];
|
||||
#endif
|
||||
struct tap_hdr taph;
|
||||
struct ipv6hdr ip6h;
|
||||
struct tcphdr th;
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32)));
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
|
||||
#endif
|
||||
|
||||
#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
|
||||
#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
|
||||
|
||||
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
|
||||
#ifdef HAS_SND_WND
|
||||
# define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd)
|
||||
|
@ -373,31 +336,9 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
|
|||
*/
|
||||
#define SOL_TCP IPPROTO_TCP
|
||||
|
||||
#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
|
||||
#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
|
||||
#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
|
||||
#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
|
||||
#define ACK_IF_NEEDED 0 /* See tcp_buf_send_flag() */
|
||||
|
||||
#define FIN (1 << 0)
|
||||
#define SYN (1 << 1)
|
||||
#define RST (1 << 2)
|
||||
#define ACK (1 << 4)
|
||||
/* Flags for internal usage */
|
||||
#define DUP_ACK (1 << 5)
|
||||
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
|
||||
|
||||
#define OPT_EOL 0
|
||||
#define OPT_NOP 1
|
||||
#define OPT_MSS 2
|
||||
#define OPT_MSS_LEN 4
|
||||
#define OPT_WS 3
|
||||
#define OPT_WS_LEN 3
|
||||
#define OPT_SACKP 4
|
||||
#define OPT_SACK 5
|
||||
#define OPT_TS 8
|
||||
|
||||
#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
|
||||
#define CONN_V6(conn) (!CONN_V4(conn))
|
||||
#define CONN_IS_CLOSING(conn) \
|
||||
((conn->events & ESTABLISHED) && \
|
||||
(conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
|
||||
|
@ -434,144 +375,11 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
|
|||
*/
|
||||
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
|
||||
|
||||
/**
|
||||
* tcp_buf_seq_update - Sequences to update with length of frames once sent
|
||||
* @seq: Pointer to sequence number sent to tap-side, to be updated
|
||||
* @len: TCP payload length
|
||||
*/
|
||||
struct tcp_buf_seq_update {
|
||||
uint32_t *seq;
|
||||
uint16_t len;
|
||||
};
|
||||
|
||||
/* Static buffers */
|
||||
|
||||
/**
|
||||
* tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
|
||||
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
|
||||
* @taph: Tap-level headers (partially pre-filled)
|
||||
* @iph: Pre-filled IP header (except for tot_len and saddr)
|
||||
* @uh: Headroom for TCP header
|
||||
* @data: Storage for TCP payload
|
||||
*/
|
||||
static struct tcp4_l2_buf_t {
|
||||
#ifdef __AVX2__
|
||||
uint8_t pad[26]; /* 0, align th to 32 bytes */
|
||||
#else
|
||||
uint8_t pad[2]; /* align iph to 4 bytes 0 */
|
||||
#endif
|
||||
struct tap_hdr taph; /* 26 2 */
|
||||
struct iphdr iph; /* 44 20 */
|
||||
struct tcphdr th; /* 64 40 */
|
||||
uint8_t data[MSS4]; /* 84 60 */
|
||||
/* 65536 65532 */
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32)))
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
|
||||
#endif
|
||||
tcp4_l2_buf[TCP_FRAMES_MEM];
|
||||
|
||||
static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
|
||||
|
||||
static unsigned int tcp4_l2_buf_used;
|
||||
|
||||
/**
|
||||
* tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
|
||||
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
|
||||
* @taph: Tap-level headers (partially pre-filled)
|
||||
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
|
||||
* @th: Headroom for TCP header
|
||||
* @data: Storage for TCP payload
|
||||
*/
|
||||
struct tcp6_l2_buf_t {
|
||||
#ifdef __AVX2__
|
||||
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
|
||||
#else
|
||||
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
|
||||
#endif
|
||||
struct tap_hdr taph; /* 14 2 */
|
||||
struct ipv6hdr ip6h; /* 32 20 */
|
||||
struct tcphdr th; /* 72 60 */
|
||||
uint8_t data[MSS6]; /* 92 80 */
|
||||
/* 65536 65532 */
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32)))
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
|
||||
#endif
|
||||
tcp6_l2_buf[TCP_FRAMES_MEM];
|
||||
|
||||
static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
|
||||
|
||||
static unsigned int tcp6_l2_buf_used;
|
||||
|
||||
/* recvmsg()/sendmsg() data for tap */
|
||||
static char tcp_buf_discard [MAX_WINDOW];
|
||||
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
|
||||
|
||||
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM];
|
||||
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM];
|
||||
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM];
|
||||
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM];
|
||||
char tcp_buf_discard [MAX_WINDOW];
|
||||
|
||||
/* sendmsg() to socket */
|
||||
static struct iovec tcp_iov [UIO_MAXIOV];
|
||||
|
||||
/**
|
||||
* tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
|
||||
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
|
||||
* @taph: Tap-level headers (partially pre-filled)
|
||||
* @iph: Pre-filled IP header (except for tot_len and saddr)
|
||||
* @th: Headroom for TCP header
|
||||
* @opts: Headroom for TCP options
|
||||
*/
|
||||
static struct tcp4_l2_flags_buf_t {
|
||||
#ifdef __AVX2__
|
||||
uint8_t pad[26]; /* 0, align th to 32 bytes */
|
||||
#else
|
||||
uint8_t pad[2]; /* align iph to 4 bytes 0 */
|
||||
#endif
|
||||
struct tap_hdr taph; /* 26 2 */
|
||||
struct iphdr iph; /* 44 20 */
|
||||
struct tcphdr th; /* 64 40 */
|
||||
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32)))
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
|
||||
#endif
|
||||
tcp4_l2_flags_buf[TCP_FRAMES_MEM];
|
||||
|
||||
static unsigned int tcp4_l2_flags_buf_used;
|
||||
|
||||
/**
|
||||
* tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
|
||||
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
|
||||
* @taph: Tap-level headers (partially pre-filled)
|
||||
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
|
||||
* @th: Headroom for TCP header
|
||||
* @opts: Headroom for TCP options
|
||||
*/
|
||||
static struct tcp6_l2_flags_buf_t {
|
||||
#ifdef __AVX2__
|
||||
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
|
||||
#else
|
||||
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
|
||||
#endif
|
||||
struct tap_hdr taph; /* 14 2 */
|
||||
struct ipv6hdr ip6h; /* 32 20 */
|
||||
struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */
|
||||
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
|
||||
#ifdef __AVX2__
|
||||
} __attribute__ ((packed, aligned(32)))
|
||||
#else
|
||||
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
|
||||
#endif
|
||||
tcp6_l2_flags_buf[TCP_FRAMES_MEM];
|
||||
|
||||
static unsigned int tcp6_l2_flags_buf_used;
|
||||
|
||||
#define CONN(idx) (&(FLOW(idx)->tcp))
|
||||
|
||||
/* Table for lookup from remote address, local port, remote port */
|
||||
|
@ -612,14 +420,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
|
|||
return EPOLLRDHUP;
|
||||
}
|
||||
|
||||
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
unsigned long flag);
|
||||
#define conn_flag(c, conn, flag) \
|
||||
do { \
|
||||
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
|
||||
conn_flag_do(c, conn, flag); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
|
||||
* @c: Execution context
|
||||
|
@ -731,8 +531,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||
* @conn: Connection pointer
|
||||
* @flag: Flag to set, or ~flag to unset
|
||||
*/
|
||||
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
unsigned long flag)
|
||||
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
unsigned long flag)
|
||||
{
|
||||
if (flag & (flag - 1)) {
|
||||
int flag_index = fls(~flag);
|
||||
|
@ -782,8 +582,8 @@ static void tcp_hash_remove(const struct ctx *c,
|
|||
* @conn: Connection pointer
|
||||
* @event: Connection event
|
||||
*/
|
||||
static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
unsigned long event)
|
||||
void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
unsigned long event)
|
||||
{
|
||||
int prev, new, num = fls(event);
|
||||
|
||||
|
@ -831,12 +631,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
|||
tcp_timer_ctl(c, conn);
|
||||
}
|
||||
|
||||
#define conn_event(c, conn, event) \
|
||||
do { \
|
||||
flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
|
||||
conn_event_do(c, conn, event); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
|
||||
* @conn: Connection pointer
|
||||
|
@ -966,91 +760,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
|
|||
th->check = csum(th, payload_len, sum);
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
|
||||
* @eth_d: Ethernet destination address, NULL if unchanged
|
||||
* @eth_s: Ethernet source address, NULL if unchanged
|
||||
*/
|
||||
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
||||
struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
|
||||
struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
|
||||
struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
|
||||
struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
|
||||
|
||||
eth_update_mac(&b4->taph.eh, eth_d, eth_s);
|
||||
eth_update_mac(&b6->taph.eh, eth_d, eth_s);
|
||||
eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
|
||||
eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
|
||||
* @c: Execution context
|
||||
*/
|
||||
static void tcp_sock4_iov_init(const struct ctx *c)
|
||||
{
|
||||
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
|
||||
struct iovec *iov;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
|
||||
tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
|
||||
.taph = TAP_HDR_INIT(ETH_P_IP),
|
||||
.iph = iph,
|
||||
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
|
||||
};
|
||||
}
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
|
||||
tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
|
||||
.taph = TAP_HDR_INIT(ETH_P_IP),
|
||||
.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
|
||||
};
|
||||
}
|
||||
|
||||
for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
|
||||
iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph);
|
||||
|
||||
for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
|
||||
iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph);
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
|
||||
* @c: Execution context
|
||||
*/
|
||||
static void tcp_sock6_iov_init(const struct ctx *c)
|
||||
{
|
||||
struct iovec *iov;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
|
||||
tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
|
||||
.taph = TAP_HDR_INIT(ETH_P_IPV6),
|
||||
.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
|
||||
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
|
||||
};
|
||||
}
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
|
||||
tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
|
||||
.taph = TAP_HDR_INIT(ETH_P_IPV6),
|
||||
.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
|
||||
};
|
||||
}
|
||||
|
||||
for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
|
||||
iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph);
|
||||
|
||||
for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
|
||||
iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph);
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_opt_get() - Get option, and value if any, from TCP header
|
||||
* @opts: Pointer to start of TCP options in header
|
||||
|
@ -1276,46 +985,6 @@ bool tcp_flow_defer(union flow *flow)
|
|||
return true;
|
||||
}
|
||||
|
||||
static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
|
||||
#define tcp_rst(c, conn) \
|
||||
do { \
|
||||
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
|
||||
tcp_rst_do(c, conn); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags)
|
||||
* @c: Execution context
|
||||
*/
|
||||
static void tcp_l2_flags_buf_flush(const struct ctx *c)
|
||||
{
|
||||
tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
|
||||
tcp6_l2_flags_buf_used = 0;
|
||||
|
||||
tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
|
||||
tcp4_l2_flags_buf_used = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_l2_data_buf_flush() - Send out buffers for segments with data
|
||||
* @c: Execution context
|
||||
*/
|
||||
static void tcp_l2_data_buf_flush(const struct ctx *c)
|
||||
{
|
||||
unsigned i;
|
||||
size_t m;
|
||||
|
||||
m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
|
||||
for (i = 0; i < m; i++)
|
||||
*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
|
||||
tcp6_l2_buf_used = 0;
|
||||
|
||||
m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
|
||||
for (i = 0; i < m; i++)
|
||||
*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
|
||||
tcp4_l2_buf_used = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_defer_handler() - Handler for TCP deferred tasks
|
||||
* @c: Execution context
|
||||
|
@ -1323,8 +992,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
|
|||
/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
|
||||
void tcp_defer_handler(struct ctx *c)
|
||||
{
|
||||
tcp_l2_flags_buf_flush(c);
|
||||
tcp_l2_data_buf_flush(c);
|
||||
tcp_buf_l2_flags_flush(c);
|
||||
tcp_buf_l2_data_flush(c);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1362,7 +1031,7 @@ static void tcp_fill_header(struct tcphdr *th,
|
|||
*
|
||||
* Return: The total length of the IPv4 packet, host order
|
||||
*/
|
||||
static size_t tcp_fill_headers4(const struct ctx *c,
|
||||
size_t tcp_fill_headers4(const struct ctx *c,
|
||||
const struct tcp_tap_conn *conn,
|
||||
struct iphdr *iph, struct tcphdr *th,
|
||||
size_t plen, const uint16_t *check,
|
||||
|
@ -1383,7 +1052,8 @@ static size_t tcp_fill_headers4(const struct ctx *c,
|
|||
|
||||
tcp_fill_header(th, conn, seq);
|
||||
|
||||
tcp_update_check_tcp4(iph, th);
|
||||
if (c->mode != MODE_VU)
|
||||
tcp_update_check_tcp4(iph, th);
|
||||
|
||||
return ip_len;
|
||||
}
|
||||
|
@ -1400,10 +1070,10 @@ static size_t tcp_fill_headers4(const struct ctx *c,
|
|||
*
|
||||
* Return: The total length of the IPv6 packet, host order
|
||||
*/
|
||||
static size_t tcp_fill_headers6(const struct ctx *c,
|
||||
const struct tcp_tap_conn *conn,
|
||||
struct ipv6hdr *ip6h, struct tcphdr *th,
|
||||
size_t plen, uint32_t seq)
|
||||
size_t tcp_fill_headers6(const struct ctx *c,
|
||||
const struct tcp_tap_conn *conn,
|
||||
struct ipv6hdr *ip6h, struct tcphdr *th,
|
||||
size_t plen, uint32_t seq)
|
||||
{
|
||||
size_t ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
|
||||
|
||||
|
@ -1424,49 +1094,12 @@ static size_t tcp_fill_headers6(const struct ctx *c,
|
|||
|
||||
tcp_fill_header(th, conn, seq);
|
||||
|
||||
tcp_update_check_tcp6(ip6h, th);
|
||||
if (c->mode != MODE_VU)
|
||||
tcp_update_check_tcp6(ip6h, th);
|
||||
|
||||
return ip_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
* @p: Pointer to any type of TCP pre-cooked buffer
|
||||
* @plen: Payload length (including TCP header options)
|
||||
* @check: Checksum, if already known
|
||||
* @seq: Sequence number for this segment
|
||||
*
|
||||
* Return: frame length including L2 headers, host order
|
||||
*/
|
||||
static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
|
||||
const struct tcp_tap_conn *conn,
|
||||
void *p, size_t plen,
|
||||
const uint16_t *check, uint32_t seq)
|
||||
{
|
||||
const struct in_addr *a4 = inany_v4(&conn->faddr);
|
||||
size_t ip_len, tlen;
|
||||
|
||||
if (a4) {
|
||||
struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
|
||||
|
||||
ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
|
||||
check, seq);
|
||||
|
||||
tlen = tap_iov_len(c, &b->taph, ip_len);
|
||||
} else {
|
||||
struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
|
||||
|
||||
ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
|
||||
seq);
|
||||
|
||||
tlen = tap_iov_len(c, &b->taph, ip_len);
|
||||
}
|
||||
|
||||
return tlen;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
|
||||
* @c: Execution context
|
||||
|
@ -1476,8 +1109,8 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
|
|||
*
|
||||
* Return: 1 if sequence or window were updated, 0 otherwise
|
||||
*/
|
||||
static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
int force_seq, struct tcp_info *tinfo)
|
||||
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
int force_seq, struct tcp_info *tinfo)
|
||||
{
|
||||
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
|
||||
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
|
||||
|
@@ -1584,27 +1217,27 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
}

/**
- * tcp_send_flag() - Send segment with flags to tap (no payload)
+ * tcp_fill_flag_header() - Prepare header for flags-only segment (no payload)
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flags:	TCP flags: if not set, send segment only if ACK is due
+ * @th:		TCP header to update
+ * @opts:	buffer to store TCP option
+ * @optlen:	size of the TCP option buffer
 *
- * Return: negative error code on connection reset, 0 otherwise
+ * Return: < 0 error code on connection reset,
+ *	     0 if there is no flag to send
+ *	     1 otherwise
 */
-static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
+			 int flags, struct tcphdr *th, char *opts,
+			 size_t *optlen)
{
	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
	uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
-	struct tcp4_l2_flags_buf_t *b4 = NULL;
-	struct tcp6_l2_flags_buf_t *b6 = NULL;
	struct tcp_info tinfo = { 0 };
	socklen_t sl = sizeof(tinfo);
	int s = conn->sock;
-	size_t optlen = 0;
-	struct iovec *iov;
-	struct tcphdr *th;
-	char *data;
-	void *p;

	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
	    !flags && conn->wnd_to_tap)

@@ -1626,37 +1259,19 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
	if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
		return 0;

-	if (CONN_V4(conn)) {
-		iov = tcp4_l2_flags_iov    + tcp4_l2_flags_buf_used;
-		p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
-		th = &b4->th;
-
-		/* gcc 11.2 would complain on data = (char *)(th + 1); */
-		data = b4->opts;
-	} else {
-		iov = tcp6_l2_flags_iov    + tcp6_l2_flags_buf_used;
-		p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
-		th = &b6->th;
-		data = b6->opts;
-	}
-
	if (flags & SYN) {
		int mss;

		/* Options: MSS, NOP and window scale (8 bytes) */
-		optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
+		*optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;

-		*data++ = OPT_MSS;
-		*data++ = OPT_MSS_LEN;
+		*opts++ = OPT_MSS;
+		*opts++ = OPT_MSS_LEN;

		if (c->mtu == -1) {
			mss = tinfo.tcpi_snd_mss;
		} else {
			mss = c->mtu - sizeof(struct tcphdr);
			if (CONN_V4(conn))
				mss -= sizeof(struct iphdr);
			else
				mss -= sizeof(struct ipv6hdr);

			if (c->low_wmem &&
			    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn))

@@ -1664,16 +1279,16 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
			else if (mss > PAGE_SIZE)
				mss = ROUND_DOWN(mss, PAGE_SIZE);
		}
-		*(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
+		*(uint16_t *)opts = htons(MIN(USHRT_MAX, mss));

-		data += OPT_MSS_LEN - 2;
+		opts += OPT_MSS_LEN - 2;

		conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);

-		*data++ = OPT_NOP;
-		*data++ = OPT_WS;
-		*data++ = OPT_WS_LEN;
-		*data++ = conn->ws_to_tap;
+		*opts++ = OPT_NOP;
+		*opts++ = OPT_WS;
+		*opts++ = OPT_WS_LEN;
+		*opts++ = conn->ws_to_tap;

		th->ack = !!(flags & ACK);
	} else {

@@ -1682,15 +1297,12 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
			   !prev_wnd_to_tap;
	}

-	th->doff = (sizeof(*th) + optlen) / 4;
+	th->doff = (sizeof(*th) + *optlen) / 4;

	th->rst = !!(flags & RST);
	th->syn = !!(flags & SYN);
	th->fin = !!(flags & FIN);

-	iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
-					       NULL, conn->seq_to_tap);
-
	if (th->ack) {
		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
			conn_flag(c, conn, ~ACK_TO_TAP_DUE);

@@ -1705,27 +1317,14 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
	if (th->fin || th->syn)
		conn->seq_to_tap++;

-	if (CONN_V4(conn)) {
-		if (flags & DUP_ACK) {
-			memcpy(b4 + 1, b4, sizeof(*b4));
-			(iov + 1)->iov_len = iov->iov_len;
-			tcp4_l2_flags_buf_used++;
-		}
+	return 1;
+}

-		if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
-			tcp_l2_flags_buf_flush(c);
-	} else {
-		if (flags & DUP_ACK) {
-			memcpy(b6 + 1, b6, sizeof(*b6));
-			(iov + 1)->iov_len = iov->iov_len;
-			tcp6_l2_flags_buf_used++;
-		}
-
-		if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
-			tcp_l2_flags_buf_flush(c);
-	}
-
-	return 0;
+int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+	if (c->mode == MODE_VU)
+		return tcp_vu_send_flag(c, conn, flags);
+	return tcp_buf_send_flag(c, conn, flags);
}

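The public entry points are now thin dispatchers: tcp_send_flag() (and, further down, tcp_data_from_sock()) pick a backend from c->mode, so the protocol core no longer cares whether frames land in static L2 buffers or directly in vhost-user guest memory. A minimal, self-contained sketch of that pattern, with illustrative names that are not from the tree:

#include <stdio.h>

enum mode { MODE_PASST, MODE_PASTA, MODE_VU };

static int send_flag_buf(int flags) { printf("buf backend: 0x%x\n", flags); return 0; }
static int send_flag_vu(int flags)  { printf("vu backend:  0x%x\n", flags); return 0; }

/* One entry point, two interchangeable backends selected at runtime */
static int send_flag(enum mode m, int flags)
{
	if (m == MODE_VU)
		return send_flag_vu(flags);
	return send_flag_buf(flags);
}

int main(void)
{
	send_flag(MODE_PASST, 0x10);	/* ACK via static buffers */
	send_flag(MODE_VU, 0x10);	/* ACK via vhost-user queues */
	return 0;
}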
/**

@@ -1733,7 +1332,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 * @c:		Execution context
 * @conn:	Connection pointer
 */
-static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
{
	if (conn->events == CLOSED)
		return;

@@ -1881,21 +1480,22 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
 *
 * Return: clamped MSS value
 */
-static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
+static uint16_t tcp_conn_tap_mss(const struct ctx *c,
+				 const struct tcp_tap_conn *conn,
				 const char *opts, size_t optlen)
{
	unsigned int mss;
	int ret;

+	(void)c; /* unused */
+	(void)conn; /* unused */
+
	if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
		mss = MSS_DEFAULT;
	else
		mss = ret;

-	if (CONN_V4(conn))
-		mss = MIN(MSS4, mss);
-	else
-		mss = MIN(MSS6, mss);
+	mss = MIN(MSS, mss);

	return MIN(mss, USHRT_MAX);
}
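With the per-family L2 buffer limits gone from tcp.c, the advertised MSS is clamped against a single ceiling, MSS = USHRT_MAX - sizeof(struct tcphdr), instead of MSS4/MSS6. A hedged arithmetic sketch of the clamping, with the constants inlined for illustration only:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	const unsigned int tcphdr_len = 20;	/* sizeof(struct tcphdr) */
	const unsigned int mss_max = USHRT_MAX - tcphdr_len;	/* 65515 */
	unsigned int peer_mss = 1460;		/* from the peer's MSS option */

	/* mss = MIN(MSS, mss), then MIN(mss, USHRT_MAX) */
	unsigned int mss = peer_mss < mss_max ? peer_mss : mss_max;
	if (mss > USHRT_MAX)
		mss = USHRT_MAX;

	printf("clamped MSS: %u\n", mss);	/* 1460 */
	return 0;
}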
@@ -2051,7 +1651,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,

	conn->wnd_to_tap = WINDOW_DEFAULT;

-	mss = tcp_conn_tap_mss(conn, opts, optlen);
+	mss = tcp_conn_tap_mss(c, conn, opts, optlen);
	if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)))
		flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s);
	MSS_SET(conn, mss);

@@ -2156,174 +1756,12 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
	return 0;
}

-/**
- * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
- * @c:		Execution context
- * @conn:	Connection pointer
- * @plen:	Payload length at L4
- * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
- * @seq:	Sequence number to be sent
- */
-static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
-			    ssize_t plen, int no_csum, uint32_t seq)
-{
-	uint32_t *seq_update = &conn->seq_to_tap;
-	struct iovec *iov;
-
-	if (CONN_V4(conn)) {
-		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
-		const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
-
-		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
-		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
-
-		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
-		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
-						       check, seq);
-		if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
-			tcp_l2_data_buf_flush(c);
-	} else if (CONN_V6(conn)) {
-		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
-
-		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
-		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
-
-		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
-		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
-						       NULL, seq);
-		if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
-			tcp_l2_data_buf_flush(c);
-	}
-}
-
/**
 * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: negative on connection reset, 0 otherwise
 *
 * #syscalls recvmsg
 */
static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
-	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
-	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
-	int sendlen, len, plen, v4 = CONN_V4(conn);
-	int s = conn->sock, i, ret = 0;
-	struct msghdr mh_sock = { 0 };
-	uint16_t mss = MSS_GET(conn);
-	uint32_t already_sent, seq;
-	struct iovec *iov;
+	if (c->mode == MODE_VU)
+		return tcp_vu_data_from_sock(c, conn);

-	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
-
-	if (SEQ_LT(already_sent, 0)) {
-		/* RFC 761, section 2.1. */
-		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
-			   conn->seq_ack_from_tap, conn->seq_to_tap);
-		conn->seq_to_tap = conn->seq_ack_from_tap;
-		already_sent = 0;
-	}
-
-	if (!wnd_scaled || already_sent >= wnd_scaled) {
-		conn_flag(c, conn, STALLED);
-		conn_flag(c, conn, ACK_FROM_TAP_DUE);
-		return 0;
-	}
-
-	/* Set up buffer descriptors we'll fill completely and partially. */
-	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
-	if (fill_bufs > TCP_FRAMES) {
-		fill_bufs = TCP_FRAMES;
-		iov_rem = 0;
-	} else {
-		iov_rem = (wnd_scaled - already_sent) % mss;
-	}
-
-	mh_sock.msg_iov	= iov_sock;
-	mh_sock.msg_iovlen = fill_bufs + 1;
-
-	iov_sock[0].iov_base = tcp_buf_discard;
-	iov_sock[0].iov_len = already_sent;
-
-	if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
-	    (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
-		tcp_l2_data_buf_flush(c);
-
-		/* Silence Coverity CWE-125 false positive */
-		tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
-	}
-
-	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
-		if (v4)
-			iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
-		else
-			iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
-		iov->iov_len = mss;
-	}
-	if (iov_rem)
-		iov_sock[fill_bufs].iov_len = iov_rem;
-
-	/* Receive into buffers, don't dequeue until acknowledged by guest. */
-	do
-		len = recvmsg(s, &mh_sock, MSG_PEEK);
-	while (len < 0 && errno == EINTR);
-
-	if (len < 0)
-		goto err;
-
-	if (!len) {
-		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
-			if ((ret = tcp_send_flag(c, conn, FIN | ACK))) {
-				tcp_rst(c, conn);
-				return ret;
-			}
-
-			conn_event(c, conn, TAP_FIN_SENT);
-		}
-
-		return 0;
-	}
-
-	sendlen = len - already_sent;
-	if (sendlen <= 0) {
-		conn_flag(c, conn, STALLED);
-		return 0;
-	}
-
-	conn_flag(c, conn, ~STALLED);
-
-	send_bufs = DIV_ROUND_UP(sendlen, mss);
-	last_len = sendlen - (send_bufs - 1) * mss;
-
-	/* Likely, some new data was acked too. */
-	tcp_update_seqack_wnd(c, conn, 0, NULL);
-
-	/* Finally, queue to tap */
-	plen = mss;
-	seq = conn->seq_to_tap;
-	for (i = 0; i < send_bufs; i++) {
-		int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
-
-		if (i == send_bufs - 1)
-			plen = last_len;
-
-		tcp_data_to_tap(c, conn, plen, no_csum, seq);
-		seq += plen;
-	}
-
-	conn_flag(c, conn, ACK_FROM_TAP_DUE);
-
-	return 0;
-
-err:
-	if (errno != EAGAIN && errno != EWOULDBLOCK) {
-		ret = -errno;
-		tcp_rst(c, conn);
-	}
-
-	return ret;
+	return tcp_buf_data_from_sock(c, conn);
}

/**

@@ -2542,7 +1980,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
	if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
		conn->wnd_from_tap = 1;

-	MSS_SET(conn, tcp_conn_tap_mss(conn, opts, optlen));
+	MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));

	conn->seq_init_from_tap = ntohl(th->seq) + 1;
	conn->seq_from_tap = conn->seq_init_from_tap;

@@ -3179,10 +2617,10 @@ int tcp_init(struct ctx *c)
		tc_hash[b] = FLOW_SIDX_NONE;

	if (c->ifi4)
-		tcp_sock4_iov_init(c);
+		tcp_buf_sock4_iov_init(c);

	if (c->ifi6)
-		tcp_sock6_iov_init(c);
+		tcp_buf_sock6_iov_init(c);

	memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
	memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
tcp.h (2 changes)

@@ -23,7 +23,7 @@ int tcp_init(struct ctx *c);
void tcp_timer(struct ctx *c, const struct timespec *now);
void tcp_defer_handler(struct ctx *c);

-void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
+void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s);

/**
 * union tcp_epoll_ref - epoll reference portion for TCP connections
tcp_buf.c (new file, 494 lines)

@@ -0,0 +1,494 @@
// SPDX-License-Identifier: GPL-2.0-or-later

/* PASST - Plug A Simple Socket Transport
 *  for qemu/UNIX domain socket mode
 *
 * PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * tcp_buf.c - TCP L2-L4 translation state machine
 *
 * Copyright (c) 2020-2022 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

#include <stddef.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <errno.h>

#include <netinet/ip.h>

#include <linux/tcp.h>

#include "util.h"
#include "ip.h"
#include "passt.h"
#include "tap.h"
#include "siphash.h"
#include "inany.h"
#include "tcp_conn.h"
#include "tcp_internal.h"
#include "tcp_buf.h"

#define TCP_FRAMES_MEM			128
#define TCP_FRAMES						\
	(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)

/**
 * struct tcp_buf_seq_update - Sequences to update with length of frames once sent
 * @seq:	Pointer to sequence number sent to tap-side, to be updated
 * @len:	TCP payload length
 */
struct tcp_buf_seq_update {
	uint32_t *seq;
	uint16_t len;
};
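The point of tcp_buf_seq_update is that conn->seq_to_tap only advances for frames the tap actually accepted: the flush path walks the first m entries and adds each payload length to the sequence counter it points at. A self-contained sketch of that bookkeeping (m and the lengths here are made up for illustration):

#include <stdio.h>
#include <stdint.h>

struct seq_update { uint32_t *seq; uint16_t len; };

int main(void)
{
	uint32_t seq_to_tap = 1000;
	struct seq_update pending[3] = {
		{ &seq_to_tap, 1460 }, { &seq_to_tap, 1460 }, { &seq_to_tap, 512 },
	};
	size_t m = 2;	/* pretend the tap accepted only two of three frames */

	for (size_t i = 0; i < m; i++)
		*pending[i].seq += pending[i].len;

	/* 1000 + 1460 + 1460 = 3920; the unsent 512 bytes are retried later */
	printf("seq_to_tap = %u\n", seq_to_tap);
	return 0;
}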

/* Static buffers */

/**
 * struct tcp_l2_flags_t - TCP header and data to send option flags
 * @th:		TCP header
 * @opts:	TCP option flags
 */
struct tcp_l2_flags_t {
	struct tcphdr th;
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
};

/**
 * struct tcp_l2_payload_t - TCP header and data to send data
 *		32 bytes aligned to be able to use AVX2 checksum
 * @th:		TCP header
 * @data:	TCP data
 */
struct tcp_l2_payload_t {
	struct tcphdr th;	/* 20 bytes */
	uint8_t data[MSS];	/* 65516 bytes */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
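The 32-byte alignment matters because the AVX2 checksum path loads payload data with aligned vector instructions; without AVX2 the structure falls back to plain unsigned int alignment. A small sketch of how the attribute changes what the compiler reports (the struct below is a stand-in, not the real tcp_l2_payload_t):

#include <stdio.h>
#include <stdint.h>

struct payload_avx2 {
	uint8_t th[20];		/* stand-in for struct tcphdr */
	uint8_t data[64];
} __attribute__ ((packed, aligned(32)));

struct payload_plain {
	uint8_t th[20];
	uint8_t data[64];
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));

int main(void)
{
	printf("avx2 variant:  align %zu\n", (size_t)__alignof__(struct payload_avx2));
	printf("plain variant: align %zu\n", (size_t)__alignof__(struct payload_plain));
	return 0;
}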

/* Ethernet header for IPv4 frames */
static struct ethhdr		tcp4_eth_src;

/* IPv4 headers */
static struct iphdr		tcp4_l2_ip[TCP_FRAMES_MEM];
/* TCP headers and data for IPv4 frames */
static struct tcp_l2_payload_t	tcp4_l2_payload[TCP_FRAMES_MEM];

static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp4_l2_buf_used;

/* IPv4 headers for TCP option flags frames */
static struct iphdr		tcp4_l2_flags_ip[TCP_FRAMES_MEM];
/* TCP headers and option flags for IPv4 frames */
static struct tcp_l2_flags_t	tcp4_l2_flags[TCP_FRAMES_MEM];

static unsigned int tcp4_l2_flags_buf_used;

/* Ethernet header for IPv6 frames */
static struct ethhdr		tcp6_eth_src;

/* IPv6 headers */
static struct ipv6hdr		tcp6_l2_ip[TCP_FRAMES_MEM];
/* TCP headers and data for IPv6 frames */
static struct tcp_l2_payload_t	tcp6_l2_payload[TCP_FRAMES_MEM];

static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_buf_used;

/* IPv6 headers for TCP option flags frames */
static struct ipv6hdr		tcp6_l2_flags_ip[TCP_FRAMES_MEM];
/* TCP headers and option flags for IPv6 frames */
static struct tcp_l2_flags_t	tcp6_l2_flags[TCP_FRAMES_MEM];

static unsigned int tcp6_l2_flags_buf_used;

/* recvmsg()/sendmsg() data for tap */
static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];

static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];
static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];

/**
 * tcp_buf_update_l2() - Update L2 buffers with Ethernet and IPv4 addresses
 * @eth_d:	Ethernet destination address, NULL if unchanged
 * @eth_s:	Ethernet source address, NULL if unchanged
 */
void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s)
{
	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
}

/**
 * tcp_buf_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
 * @c:		Execution context
 */
void tcp_buf_sock4_iov_init(const struct ctx *c)
{
	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
	int i;

	(void)c;

	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
	for (i = 0; i < TCP_FRAMES_MEM; i++) {
		struct iovec *iov;

		/* headers */
		tcp4_l2_ip[i] = iph;
		tcp4_l2_payload[i].th = (struct tcphdr){
			.doff = sizeof(struct tcphdr) / 4,
			.ack = 1
		};

		tcp4_l2_flags_ip[i] = iph;
		tcp4_l2_flags[i].th = (struct tcphdr){
			.doff = sizeof(struct tcphdr) / 4,
			.ack = 1
		};

		/* iovecs */
		iov = tcp4_l2_iov[i];
		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
		iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i];
		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i];

		iov = tcp4_l2_flags_iov[i];
		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
		iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i];
		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i];
	}
}

/**
 * tcp_buf_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
 * @c:		Execution context
 */
void tcp_buf_sock6_iov_init(const struct ctx *c)
{
	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
	int i;

	(void)c;

	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
	for (i = 0; i < TCP_FRAMES_MEM; i++) {
		struct iovec *iov;

		/* headers */
		tcp6_l2_ip[i] = ip6;
		tcp6_l2_payload[i].th = (struct tcphdr){
			.doff = sizeof(struct tcphdr) / 4,
			.ack = 1
		};

		tcp6_l2_flags_ip[i] = ip6;
		tcp6_l2_flags[i].th = (struct tcphdr){
			.doff = sizeof(struct tcphdr) / 4,
			.ack = 1
		};

		/* iovecs */
		iov = tcp6_l2_iov[i];
		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
		iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i];
		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i];

		iov = tcp6_l2_flags_iov[i];
		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
		iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i];
		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i];
	}
}

/**
 * tcp_buf_l2_flags_flush() - Send out buffers for segments with no data (flags)
 * @c:		Execution context
 */
void tcp_buf_l2_flags_flush(const struct ctx *c)
{
	tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
	tcp6_l2_flags_buf_used = 0;

	tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
	tcp4_l2_flags_buf_used = 0;
}

/**
 * tcp_buf_l2_data_flush() - Send out buffers for segments with data
 * @c:		Execution context
 */
void tcp_buf_l2_data_flush(const struct ctx *c)
{
	unsigned i;
	size_t m;

	m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used);
	for (i = 0; i < m; i++)
		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
	tcp6_l2_buf_used = 0;

	m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used);
	for (i = 0; i < m; i++)
		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
	tcp4_l2_buf_used = 0;
}

int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
	struct tcp_l2_flags_t *payload;
	struct iovec *dup_iov;
	struct iovec *iov;
	struct tcphdr *th;
	size_t optlen = 0;
	size_t ip_len;
	char *data;
	int ret;

	if (CONN_V4(conn)) {
		iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
		dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used];
	} else {
		iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];
		dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used];
	}
	payload = iov[TCP_IOV_PAYLOAD].iov_base;
	th = &payload->th;
	data = payload->opts;

	ret = tcp_fill_flag_header(c, conn, flags, th, data, &optlen);
	if (ret <= 0)
		return ret;

	if (CONN_V4(conn)) {
		struct iphdr *iph = iov[TCP_IOV_IP].iov_base;

		ip_len = tcp_fill_headers4(c, conn, iph, th, optlen, NULL,
					   conn->seq_to_tap);
	} else {
		struct ipv6hdr *ip6h = iov[TCP_IOV_IP].iov_base;

		ip_len = tcp_fill_headers6(c, conn, ip6h, th, optlen,
					   conn->seq_to_tap);
	}
	iov[TCP_IOV_PAYLOAD].iov_len = ip_len;

	if (flags & DUP_ACK) {
		int i;
		for (i = 0; i < TCP_IOV_NUM; i++) {
			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
			       iov[i].iov_len);
			dup_iov[i].iov_len = iov[i].iov_len;
		}
	}

	if (CONN_V4(conn)) {
		if (flags & DUP_ACK)
			tcp4_l2_flags_buf_used++;

		if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
			tcp_buf_l2_flags_flush(c);
	} else {
		if (flags & DUP_ACK)
			tcp6_l2_flags_buf_used++;

		if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
			tcp_buf_l2_flags_flush(c);
	}

	return 0;
}
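For DUP_ACK the frame is duplicated into the next iovec slot segment by segment, since each TCP_IOV_* element points at distinct storage; both slots are then counted against the used counter. A standalone sketch of duplicating one iovec-described frame into another (the buffers and contents are illustrative):

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

int main(void)
{
	char eth[14] = "ETHERNET-HDR", ip[20] = "IP-HEADER", pay[8] = "ACK";
	char eth2[14], ip2[20], pay2[8];
	struct iovec iov[3] = {
		{ eth, sizeof(eth) }, { ip, sizeof(ip) }, { pay, sizeof(pay) },
	};
	struct iovec dup[3] = {
		{ eth2, sizeof(eth2) }, { ip2, sizeof(ip2) }, { pay2, sizeof(pay2) },
	};

	for (int i = 0; i < 3; i++) {	/* copy contents and lengths, keep bases */
		memcpy(dup[i].iov_base, iov[i].iov_base, iov[i].iov_len);
		dup[i].iov_len = iov[i].iov_len;
	}
	printf("duplicated: %s / %s / %s\n", eth2, ip2, pay2);
	return 0;
}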

/**
 * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
 * @c:		Execution context
 * @conn:	Connection pointer
 * @plen:	Payload length at L4
 * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
 * @seq:	Sequence number to be sent
 */
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
			    ssize_t plen, int no_csum, uint32_t seq)
{
	uint32_t *seq_update = &conn->seq_to_tap;
	struct iovec *iov;

	if (CONN_V4(conn)) {
		struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1];
		const uint16_t *check = NULL;

		if (no_csum) {
			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
			check = &iph->check;
		}

		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;

		iov = tcp4_l2_iov[tcp4_l2_buf_used++];
		iov[TCP_IOV_PAYLOAD].iov_len = tcp_fill_headers4(c, conn,
						iov[TCP_IOV_IP].iov_base,
						iov[TCP_IOV_PAYLOAD].iov_base,
						plen, check, seq);

		if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1)
			tcp_buf_l2_data_flush(c);
	} else if (CONN_V6(conn)) {
		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;

		iov = tcp6_l2_iov[tcp6_l2_buf_used++];
		iov[TCP_IOV_PAYLOAD].iov_len = tcp_fill_headers6(c, conn,
						iov[TCP_IOV_IP].iov_base,
						iov[TCP_IOV_PAYLOAD].iov_base,
						plen, seq);

		if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1)
			tcp_buf_l2_data_flush(c);
	}
}

/**
 * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: negative on connection reset, 0 otherwise
 *
 * #syscalls recvmsg
 */
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
	int sendlen, len, plen, v4 = CONN_V4(conn);
	int s = conn->sock, i, ret = 0;
	struct msghdr mh_sock = { 0 };
	uint16_t mss = MSS_GET(conn);
	uint32_t already_sent, seq;
	struct iovec *iov;

	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;

	if (SEQ_LT(already_sent, 0)) {
		/* RFC 761, section 2.1. */
		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
			   conn->seq_ack_from_tap, conn->seq_to_tap);
		conn->seq_to_tap = conn->seq_ack_from_tap;
		already_sent = 0;
	}

	if (!wnd_scaled || already_sent >= wnd_scaled) {
		conn_flag(c, conn, STALLED);
		conn_flag(c, conn, ACK_FROM_TAP_DUE);
		return 0;
	}

	/* Set up buffer descriptors we'll fill completely and partially. */
	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
	if (fill_bufs > TCP_FRAMES) {
		fill_bufs = TCP_FRAMES;
		iov_rem = 0;
	} else {
		iov_rem = (wnd_scaled - already_sent) % mss;
	}

	mh_sock.msg_iov	= iov_sock;
	mh_sock.msg_iovlen = fill_bufs + 1;

	iov_sock[0].iov_base = tcp_buf_discard;
	iov_sock[0].iov_len = already_sent;

	if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) ||
	    (!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) {
		tcp_buf_l2_data_flush(c);

		/* Silence Coverity CWE-125 false positive */
		tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
	}

	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
		if (v4)
			iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data;
		else
			iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data;
		iov->iov_len = mss;
	}
	if (iov_rem)
		iov_sock[fill_bufs].iov_len = iov_rem;

	/* Receive into buffers, don't dequeue until acknowledged by guest. */
	do
		len = recvmsg(s, &mh_sock, MSG_PEEK);
	while (len < 0 && errno == EINTR);

	if (len < 0)
		goto err;

	if (!len) {
		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
			if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
				tcp_rst(c, conn);
				return ret;
			}

			conn_event(c, conn, TAP_FIN_SENT);
		}

		return 0;
	}

	sendlen = len - already_sent;
	if (sendlen <= 0) {
		conn_flag(c, conn, STALLED);
		return 0;
	}

	conn_flag(c, conn, ~STALLED);

	send_bufs = DIV_ROUND_UP(sendlen, mss);
	last_len = sendlen - (send_bufs - 1) * mss;

	/* Likely, some new data was acked too. */
	tcp_update_seqack_wnd(c, conn, 0, NULL);

	/* Finally, queue to tap */
	plen = mss;
	seq = conn->seq_to_tap;
	for (i = 0; i < send_bufs; i++) {
		int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;

		if (i == send_bufs - 1)
			plen = last_len;

		tcp_data_to_tap(c, conn, plen, no_csum, seq);
		seq += plen;
	}

	conn_flag(c, conn, ACK_FROM_TAP_DUE);

	return 0;

err:
	if (errno != EAGAIN && errno != EWOULDBLOCK) {
		ret = -errno;
		tcp_rst(c, conn);
	}

	return ret;
}
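tcp_buf_data_from_sock() leans on recvmsg(..., MSG_PEEK): data stays queued in the kernel until the guest acknowledges it, and on the next pass the already-acknowledged prefix is peeked into a discard buffer. A self-contained sketch of the peek-then-consume pattern on a socketpair (the sizes are arbitrary):

#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>

int main(void)
{
	int sv[2];
	char discard[4], fresh[16], buf[16];
	struct iovec iov[2] = { { discard, 4 }, { fresh, sizeof(fresh) } };
	struct msghdr mh = { .msg_iov = iov, .msg_iovlen = 2 };

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
		return 1;
	send(sv[0], "0123456789", 10, 0);

	/* Peek: the first 4 bytes were "already sent", discard them */
	ssize_t n = recvmsg(sv[1], &mh, MSG_PEEK);
	printf("peeked %zd bytes, new data: %.6s\n", n, fresh);

	/* Data is still queued: dequeue only what has been "acknowledged" */
	n = recv(sv[1], buf, 4, 0);
	printf("consumed %zd acked bytes\n", n);
	return 0;
}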

tcp_buf.h (new file, 17 lines)

@@ -0,0 +1,17 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
 * Copyright (c) 2021 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

#ifndef TCP_BUF_H
#define TCP_BUF_H

void tcp_buf_sock4_iov_init(const struct ctx *c);
void tcp_buf_sock6_iov_init(const struct ctx *c);
void tcp_buf_l2_flags_flush(const struct ctx *c);
void tcp_buf_l2_data_flush(const struct ctx *c);
uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn);
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);

#endif /* TCP_BUF_H */

tcp_internal.h (new file, 81 lines)

@@ -0,0 +1,81 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
 * Copyright (c) 2021 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

#ifndef TCP_INTERNAL_H
#define TCP_INTERNAL_H

#define MAX_WS				8
#define MAX_WINDOW			(1 << (16 + (MAX_WS)))
#define MSS				(USHRT_MAX - sizeof(struct tcphdr))

#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)
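The SEQ_* macros compare 32-bit sequence numbers in modular arithmetic: a is "before" b whenever the unsigned difference b - a is smaller than MAX_WINDOW, which keeps the ordering correct across the 2^32 wrap. A quick self-contained check:

#include <stdio.h>
#include <stdint.h>

#define MAX_WS		8
#define MAX_WINDOW	(1 << (16 + (MAX_WS)))
#define SEQ_LT(a, b)	((b) - (a) - 1 < MAX_WINDOW)

int main(void)
{
	uint32_t a = 0xffffff00, b = 0x00000010;	/* b lies past the wrap */

	printf("SEQ_LT(a, b) = %d\n", SEQ_LT(a, b));	/* 1: a is before b */
	printf("SEQ_LT(b, a) = %d\n", SEQ_LT(b, a));	/* 0 */
	return 0;
}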
#define FIN		(1 << 0)
#define SYN		(1 << 1)
#define RST		(1 << 2)
#define ACK		(1 << 4)
/* Flags for internal usage */
#define DUP_ACK		(1 << 5)
#define OPT_EOL		0
#define OPT_NOP		1
#define OPT_MSS		2
#define OPT_MSS_LEN	4
#define OPT_WS		3
#define OPT_WS_LEN	3
#define OPT_SACKP	4
#define OPT_SACK	5
#define OPT_TS		8

#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
#define CONN_V6(conn)		(!CONN_V4(conn))

extern char tcp_buf_discard[MAX_WINDOW];

void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
		  unsigned long flag);
#define conn_flag(c, conn, flag)					\
	do {								\
		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
		conn_flag_do(c, conn, flag);				\
	} while (0)


void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
		   unsigned long event);
#define conn_event(c, conn, event)					\
	do {								\
		flow_trace(conn, "event at %s:%i", __func__, __LINE__);	\
		conn_event_do(c, conn, event);				\
	} while (0)

void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
#define tcp_rst(c, conn)						\
	do {								\
		flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
		tcp_rst_do(c, conn);					\
	} while (0)


size_t tcp_fill_headers4(const struct ctx *c,
			 const struct tcp_tap_conn *conn,
			 struct iphdr *iph, struct tcphdr *th,
			 size_t plen, const uint16_t *check,
			 uint32_t seq);
size_t tcp_fill_headers6(const struct ctx *c,
			 const struct tcp_tap_conn *conn,
			 struct ipv6hdr *ip6h, struct tcphdr *th,
			 size_t plen, uint32_t seq);

int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
			  int force_seq, struct tcp_info *tinfo);
int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags,
			 struct tcphdr *th, char *opts, size_t *optlen);

#endif /* TCP_INTERNAL_H */

tcp_vu.c (new file, 460 lines)

@@ -0,0 +1,460 @@
// SPDX-License-Identifier: GPL-2.0-or-later

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

#include <netinet/ip.h>

#include <sys/socket.h>

#include <linux/tcp.h>
#include <linux/virtio_net.h>

#include "util.h"
#include "ip.h"
#include "passt.h"
#include "siphash.h"
#include "inany.h"
#include "vhost_user.h"
#include "tcp.h"
#include "pcap.h"
#include "flow.h"
#include "tcp_conn.h"
#include "flow_table.h"
#include "tcp_vu.h"
#include "tcp_internal.h"
#include "checksum.h"

#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
#define CONN_V6(conn)		(!CONN_V4(conn))

/* vhost-user */
static const struct virtio_net_hdr vu_header = {
	.flags = VIRTIO_NET_HDR_F_DATA_VALID,
	.gso_type = VIRTIO_NET_HDR_GSO_NONE,
};

static unsigned char buffer[65536];
static struct iovec	iov_vu  [VIRTQUEUE_MAX_SIZE];
static unsigned int	indexes [VIRTQUEUE_MAX_SIZE];

uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn)
{
	(void)conn;
	return USHRT_MAX;
}

int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
	VuDev *vdev = (VuDev *)&c->vdev;
	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	size_t tlen, vnet_hdrlen, ip_len, optlen = 0;
	struct virtio_net_hdr_mrg_rxbuf *vh;
	VuVirtqElement *elem;
	struct ethhdr *eh;
	int nb_ack;
	int ret;

	elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
	if (!elem)
		return 0;

	if (elem->in_num < 1) {
		err("virtio-net receive queue contains no in buffers");
		vu_queue_rewind(vdev, vq, 1);
		return 0;
	}

	vh = elem->in_sg[0].iov_base;

	vh->hdr = vu_header;
	if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
		vh->num_buffers = htole16(1);
	} else {
		vnet_hdrlen = sizeof(struct virtio_net_hdr);
	}
	eh = (struct ethhdr *)((char *)elem->in_sg[0].iov_base + vnet_hdrlen);

	memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
	memcpy(eh->h_source, c->mac, sizeof(eh->h_source));

	if (CONN_V4(conn)) {
		struct iphdr *iph = (struct iphdr *)(eh + 1);
		struct tcphdr *th = (struct tcphdr *)(iph + 1);
		char *data = (char *)(th + 1);

		eh->h_proto = htons(ETH_P_IP);

		*th = (struct tcphdr){
			.doff = sizeof(struct tcphdr) / 4,
			.ack = 1
		};

		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);

		ret = tcp_fill_flag_header(c, conn, flags, th, data, &optlen);
		if (ret <= 0) {
			vu_queue_rewind(vdev, vq, 1);
			return ret;
		}

		ip_len = tcp_fill_headers4(c, conn, iph,
					   (struct tcphdr *)(iph + 1), optlen,
					   NULL, conn->seq_to_tap);

		tlen = ip_len + sizeof(struct ethhdr);

		if (*c->pcap) {
			uint32_t sum = proto_ipv4_header_psum(iph->tot_len,
							      IPPROTO_TCP,
				(struct in_addr){ .s_addr = iph->saddr },
				(struct in_addr){ .s_addr = iph->daddr });

			th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
		}
	} else {
		struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
		struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
		char *data = (char *)(th + 1);

		eh->h_proto = htons(ETH_P_IPV6);

		*th = (struct tcphdr){
			.doff = sizeof(struct tcphdr) / 4,
			.ack = 1
		};

		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);

		ret = tcp_fill_flag_header(c, conn, flags, th, data, &optlen);
		if (ret <= 0) {
			vu_queue_rewind(vdev, vq, 1);
			return ret;
		}

		ip_len = tcp_fill_headers6(c, conn, ip6h,
					   (struct tcphdr *)(ip6h + 1),
					   optlen, conn->seq_to_tap);

		tlen = ip_len + sizeof(struct ethhdr);

		if (*c->pcap) {
			uint32_t sum = proto_ipv6_header_psum(ip6h->payload_len,
							      IPPROTO_TCP,
							      &ip6h->saddr,
							      &ip6h->daddr);

			th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
		}
	}

	pcap((void *)eh, tlen);

	tlen += vnet_hdrlen;
	vu_queue_fill(vdev, vq, elem, tlen, 0);
	nb_ack = 1;

	if (flags & DUP_ACK) {
		elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
		if (elem) {
			if (elem->in_num < 1 || elem->in_sg[0].iov_len < tlen) {
				vu_queue_rewind(vdev, vq, 1);
			} else {
				memcpy(elem->in_sg[0].iov_base, vh, tlen);
				nb_ack++;
			}
		}
	}

	vu_queue_flush(vdev, vq, nb_ack);
	vu_queue_notify(vdev, vq);

	return 0;
}
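Whether the guest negotiated VIRTIO_NET_F_MRG_RXBUF decides how many bytes of virtio-net header precede the Ethernet frame: 12 with the mergeable-buffers layout (it appends a num_buffers field), 10 without. A small check against the kernel UAPI headers (Linux-only, purely illustrative):

#include <stdio.h>
#include <linux/virtio_net.h>

int main(void)
{
	/* 10 bytes: flags, gso_type, hdr_len, gso_size, csum_start, csum_offset */
	printf("virtio_net_hdr:           %zu\n", sizeof(struct virtio_net_hdr));
	/* 12 bytes: the above plus the 16-bit num_buffers count */
	printf("virtio_net_hdr_mrg_rxbuf: %zu\n",
	       sizeof(struct virtio_net_hdr_mrg_rxbuf));
	return 0;
}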

int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
	uint32_t already_sent;
	VuDev *vdev = (VuDev *)&c->vdev;
	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	int s = conn->sock, v4 = CONN_V4(conn);
	int i, ret = 0, iov_count, iov_used;
	struct msghdr mh_sock = { 0 };
	size_t l2_hdrlen, vnet_hdrlen, fillsize;
	ssize_t len;
	uint16_t *check;
	uint16_t mss = MSS_GET(conn);
	int num_buffers;
	int segment_size;
	struct iovec *first;
	bool has_mrg_rxbuf;

	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
		err("Got packet, but no available descriptors on RX virtq.");
		return 0;
	}

	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;

	if (SEQ_LT(already_sent, 0)) {
		/* RFC 761, section 2.1. */
		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
			   conn->seq_ack_from_tap, conn->seq_to_tap);
		conn->seq_to_tap = conn->seq_ack_from_tap;
		already_sent = 0;
	}

	if (!wnd_scaled || already_sent >= wnd_scaled) {
		conn_flag(c, conn, STALLED);
		conn_flag(c, conn, ACK_FROM_TAP_DUE);
		return 0;
	}

	/* Set up buffer descriptors we'll fill completely and partially. */

	fillsize = wnd_scaled;

	iov_vu[0].iov_base = tcp_buf_discard;
	iov_vu[0].iov_len = already_sent;
	fillsize -= already_sent;

	has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
	if (has_mrg_rxbuf) {
		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	} else {
		vnet_hdrlen = sizeof(struct virtio_net_hdr);
	}
	l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr);
	if (v4) {
		l2_hdrlen += sizeof(struct iphdr);
	} else {
		l2_hdrlen += sizeof(struct ipv6hdr);
	}

	iov_count = 0;
	segment_size = 0;
	while (fillsize > 0 && iov_count < VIRTQUEUE_MAX_SIZE - 1) {
		VuVirtqElement *elem;

		elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
		if (!elem)
			break;

		if (elem->in_num < 1) {
			err("virtio-net receive queue contains no in buffers");
			goto err;
		}

		ASSERT(elem->in_num == 1);
		ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen);

		indexes[iov_count] = elem->index;

		if (segment_size == 0) {
			iov_vu[iov_count + 1].iov_base =
				(char *)elem->in_sg[0].iov_base + l2_hdrlen;
			iov_vu[iov_count + 1].iov_len =
				elem->in_sg[0].iov_len - l2_hdrlen;
		} else {
			iov_vu[iov_count + 1].iov_base = elem->in_sg[0].iov_base;
			iov_vu[iov_count + 1].iov_len = elem->in_sg[0].iov_len;
		}

		if (iov_vu[iov_count + 1].iov_len > fillsize)
			iov_vu[iov_count + 1].iov_len = fillsize;

		segment_size += iov_vu[iov_count + 1].iov_len;
		if (!has_mrg_rxbuf) {
			segment_size = 0;
		} else if (segment_size >= mss) {
			iov_vu[iov_count + 1].iov_len -= segment_size - mss;
			segment_size = 0;
		}
		fillsize -= iov_vu[iov_count + 1].iov_len;

		iov_count++;
	}
	if (iov_count == 0)
		return 0;

	mh_sock.msg_iov = iov_vu;
	mh_sock.msg_iovlen = iov_count + 1;

	do
		len = recvmsg(s, &mh_sock, MSG_PEEK);
	while (len < 0 && errno == EINTR);

	if (len < 0)
		goto err;

	if (!len) {
		vu_queue_rewind(vdev, vq, iov_count);
		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
			if ((ret = tcp_vu_send_flag(c, conn, FIN | ACK))) {
				tcp_rst(c, conn);
				return ret;
			}

			conn_event(c, conn, TAP_FIN_SENT);
		}

		return 0;
	}

	len -= already_sent;
	if (len <= 0) {
		conn_flag(c, conn, STALLED);
		vu_queue_rewind(vdev, vq, iov_count);
		return 0;
	}

	conn_flag(c, conn, ~STALLED);

	/* Likely, some new data was acked too. */
	tcp_update_seqack_wnd(c, conn, 0, NULL);

	/* initialize headers */
	iov_used = 0;
	num_buffers = 0;
	check = NULL;
	segment_size = 0;
	for (i = 0; i < iov_count && len; i++) {

		if (segment_size == 0)
			first = &iov_vu[i + 1];

		if (iov_vu[i + 1].iov_len > (size_t)len)
			iov_vu[i + 1].iov_len = len;

		len -= iov_vu[i + 1].iov_len;
		iov_used++;

		segment_size += iov_vu[i + 1].iov_len;
		num_buffers++;

		if (segment_size >= mss || len == 0 ||
		    i + 1 == iov_count || !has_mrg_rxbuf) {

			struct ethhdr *eh;
			struct virtio_net_hdr_mrg_rxbuf *vh;
			char *base = (char *)first->iov_base - l2_hdrlen;
			size_t size = first->iov_len + l2_hdrlen;

			vh = (struct virtio_net_hdr_mrg_rxbuf *)base;

			vh->hdr = vu_header;
			if (has_mrg_rxbuf)
				vh->num_buffers = htole16(num_buffers);

			eh = (struct ethhdr *)((char *)base + vnet_hdrlen);

			memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
			memcpy(eh->h_source, c->mac, sizeof(eh->h_source));

			/* initialize header */
			if (v4) {
				struct iphdr *iph = (struct iphdr *)(eh + 1);
				struct tcphdr *th = (struct tcphdr *)(iph + 1);

				eh->h_proto = htons(ETH_P_IP);

				*th = (struct tcphdr){
					.doff = sizeof(struct tcphdr) / 4,
					.ack = 1
				};

				*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);

				tcp_fill_headers4(c, conn, iph,
						  (struct tcphdr *)(iph + 1),
						  segment_size, len ? check : NULL,
						  conn->seq_to_tap);

				if (*c->pcap) {
					uint32_t sum = proto_ipv4_header_psum(iph->tot_len,
									      IPPROTO_TCP,
						(struct in_addr){ .s_addr = iph->saddr },
						(struct in_addr){ .s_addr = iph->daddr });

					first->iov_base = th;
					first->iov_len = size - l2_hdrlen + sizeof(*th);

					th->check = csum_iov(first, num_buffers, sum);
				}

				check = &iph->check;
			} else {
				struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
				struct tcphdr *th = (struct tcphdr *)(ip6h + 1);

				eh->h_proto = htons(ETH_P_IPV6);

				*th = (struct tcphdr){
					.doff = sizeof(struct tcphdr) / 4,
					.ack = 1
				};

				*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);

				tcp_fill_headers6(c, conn, ip6h,
						  (struct tcphdr *)(ip6h + 1),
						  segment_size, conn->seq_to_tap);
				if (*c->pcap) {
					uint32_t sum = proto_ipv6_header_psum(ip6h->payload_len,
									      IPPROTO_TCP,
									      &ip6h->saddr,
									      &ip6h->daddr);

					first->iov_base = th;
					first->iov_len = size - l2_hdrlen + sizeof(*th);

					th->check = csum_iov(first, num_buffers, sum);
				}
			}

			/* set iov for pcap logging */
			first->iov_base = eh;
			first->iov_len = size - vnet_hdrlen;

			pcap_iov(first, num_buffers);

			/* set iov_len for vu_queue_fill_by_index(); */

			first->iov_base = base;
			first->iov_len = size;

			conn->seq_to_tap += segment_size;

			segment_size = 0;
			num_buffers = 0;
		}
	}

	/* release unused buffers */
	vu_queue_rewind(vdev, vq, iov_count - iov_used);

	/* send packets */
	for (i = 0; i < iov_used; i++) {
		vu_queue_fill_by_index(vdev, vq, indexes[i],
				       iov_vu[i + 1].iov_len, i);
	}

	vu_queue_flush(vdev, vq, iov_used);
	vu_queue_notify(vdev, vq);

	conn_flag(c, conn, ACK_FROM_TAP_DUE);

	return 0;
err:
	vu_queue_rewind(vdev, vq, iov_count);

	if (errno != EAGAIN && errno != EWOULDBLOCK) {
		ret = -errno;
		tcp_rst(c, conn);
	}

	return ret;
}
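When pcap capture is on, the TCP checksum is folded over payload scattered across guest buffers (csum_iov() above), so the frame never has to be linearised. A minimal standalone version of the same idea, summing the Internet checksum across an iovec array while carrying byte-position parity over segment boundaries (unoptimised, for illustration only):

#include <stdio.h>
#include <stdint.h>
#include <sys/uio.h>

static uint16_t csum_iov_demo(const struct iovec *iov, int n, uint32_t init)
{
	uint32_t sum = init;
	int odd = 0;	/* parity of the byte position across segments */

	for (int i = 0; i < n; i++) {
		const uint8_t *p = iov[i].iov_base;

		for (size_t j = 0; j < iov[i].iov_len; j++, odd ^= 1)
			sum += odd ? p[j] : (uint32_t)p[j] << 8;
	}
	while (sum >> 16)	/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t a[3] = { 0x45, 0x00, 0x00 }, b[1] = { 0x54 };
	struct iovec split[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

	/* Same bytes in one buffer: the checksums must match */
	uint8_t flat[4] = { 0x45, 0x00, 0x00, 0x54 };
	struct iovec one = { flat, sizeof(flat) };

	printf("split: 0x%04x\n", csum_iov_demo(split, 2, 0));
	printf("flat:  0x%04x\n", csum_iov_demo(&one, 1, 0));
	return 0;
}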

tcp_vu.h (new file, 9 lines)

@@ -0,0 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-or-later

#ifndef TCP_VU_H
#define TCP_VU_H

int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);

#endif /* TCP_VU_H */

udp.c (125 changes)

@@ -120,9 +120,7 @@
#include "tap.h"
#include "pcap.h"
#include "log.h"
-
-#define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */
-#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
+#include "udp_internal.h"

/**
 * struct udp_tap_port - Port tracking based on tap-facing source port

@@ -230,11 +228,11 @@ static struct mmsghdr	udp6_l2_mh_sock		[UDP_MAX_FRAMES];
static struct iovec	udp4_iov_splice		[UDP_MAX_FRAMES];
static struct iovec	udp6_iov_splice		[UDP_MAX_FRAMES];

-static struct sockaddr_in udp4_localname = {
+struct sockaddr_in udp4_localname = {
	.sin_family = AF_INET,
	.sin_addr = IN4ADDR_LOOPBACK_INIT,
};
-static struct sockaddr_in6 udp6_localname = {
+struct sockaddr_in6 udp6_localname = {
	.sin6_family = AF_INET6,
	.sin6_addr = IN6ADDR_LOOPBACK_INIT,
};

@@ -567,21 +565,22 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
 *
 * Return: size of tap frame with headers
 */
-static size_t udp_update_hdr4(const struct ctx *c, int n, in_port_t dstport,
-			      const struct timespec *now)
+size_t udp_update_hdr4(const struct ctx *c, struct iphdr *iph,
+		       size_t data_len, struct sockaddr_in *s_in,
+		       in_port_t dstport, const struct timespec *now)
{
-	struct udp4_l2_buf_t *b = &udp4_l2_buf[n];
+	struct udphdr *uh = (struct udphdr *)(iph + 1);
	const struct in_addr *src;
	in_port_t src_port;
	size_t ip_len;

-	ip_len = udp4_l2_mh_sock[n].msg_len + sizeof(b->iph) + sizeof(b->uh);
+	ip_len = data_len + sizeof(struct iphdr) + sizeof(struct udphdr);

-	b->iph.tot_len = htons(ip_len);
-	b->iph.daddr = c->ip4.addr_seen.s_addr;
+	iph->tot_len = htons(ip_len);
+	iph->daddr = c->ip4.addr_seen.s_addr;

-	src = &b->s_in.sin_addr;
-	src_port = ntohs(b->s_in.sin_port);
+	src = &s_in->sin_addr;
+	src_port = ntohs(s_in->sin_port);

	if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) &&
	    IN4_ARE_ADDR_EQUAL(src, &c->ip4.dns_host) && src_port == 53) {

@@ -600,15 +599,16 @@ static size_t udp_update_hdr4(const struct ctx *c, int n, in_port_t dstport,

		src = &c->ip4.gw;
	}
-	b->iph.saddr = src->s_addr;
+	iph->saddr = src->s_addr;

-	b->iph.check = csum_ip4_header(b->iph.tot_len, IPPROTO_UDP,
-				       *src, c->ip4.addr_seen);
-	b->uh.source = b->s_in.sin_port;
-	b->uh.dest = htons(dstport);
-	b->uh.len = htons(udp4_l2_mh_sock[n].msg_len + sizeof(b->uh));
+	iph->check = csum_ip4_header(iph->tot_len, IPPROTO_UDP,
+				     *src, c->ip4.addr_seen);
+	uh->source = s_in->sin_port;
+	uh->dest = htons(dstport);
+	uh->len = htons(data_len + sizeof(struct udphdr));
+	uh->check = 0;

-	return tap_iov_len(c, &b->taph, ip_len);
+	return ip_len;
}
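udp_update_hdr4() recomputes the IPv4 header checksum after patching tot_len and the addresses; csum_ip4_header() amounts to the usual one's-complement sum over the 20-byte header with the checksum field taken as zero. A self-contained illustration on a fixed sample header (the classic RFC 1071-style example):

#include <stdio.h>
#include <stdint.h>

static uint16_t ip4_header_csum(const uint8_t *h, size_t len)
{
	uint32_t sum = 0;

	for (size_t i = 0; i < len; i += 2)	/* 16-bit big-endian words */
		sum += (uint32_t)h[i] << 8 | h[i + 1];
	while (sum >> 16)			/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t h[20] = {
		0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40, 0x00,
		0x40, 0x11, 0x00, 0x00,			/* checksum field = 0 */
		0xc0, 0xa8, 0x00, 0x01, 0xc0, 0xa8, 0x00, 0xc7,
	};

	printf("checksum: 0x%04x\n", ip4_header_csum(h, sizeof(h)));	/* 0xb861 */
	return 0;
}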

/**

@@ -620,23 +620,24 @@ static size_t udp_update_hdr4(const struct ctx *c, int n, in_port_t dstport,
 *
 * Return: size of tap frame with headers
 */
-static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
-			      const struct timespec *now)
+size_t udp_update_hdr6(const struct ctx *c, struct ipv6hdr *ip6h,
+		       size_t data_len, struct sockaddr_in6 *s_in6,
+		       in_port_t dstport, const struct timespec *now)
{
-	struct udp6_l2_buf_t *b = &udp6_l2_buf[n];
+	struct udphdr *uh = (struct udphdr *)(ip6h + 1);
	const struct in6_addr *src, *dst;
	uint16_t payload_len;
	in_port_t src_port;
	size_t ip_len;

	dst = &c->ip6.addr_seen;
-	src = &b->s_in6.sin6_addr;
-	src_port = ntohs(b->s_in6.sin6_port);
+	src = &s_in6->sin6_addr;
+	src_port = ntohs(s_in6->sin6_port);

-	ip_len = udp6_l2_mh_sock[n].msg_len + sizeof(b->ip6h) + sizeof(b->uh);
+	ip_len = data_len + sizeof(struct ipv6hdr) + sizeof(struct udphdr);

-	payload_len = udp6_l2_mh_sock[n].msg_len + sizeof(b->uh);
-	b->ip6h.payload_len = htons(payload_len);
+	payload_len = data_len + sizeof(struct udphdr);
+	ip6h->payload_len = htons(payload_len);

	if (IN6_IS_ADDR_LINKLOCAL(src)) {
		dst = &c->ip6.addr_ll_seen;

@@ -668,23 +669,25 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
			src = &c->ip6.gw;
		else
			src = &c->ip6.addr_ll;

	}
-	b->ip6h.daddr = *dst;
-	b->ip6h.saddr = *src;
-	b->ip6h.version = 6;
-	b->ip6h.nexthdr = IPPROTO_UDP;
-	b->ip6h.hop_limit = 255;
+	ip6h->daddr = *dst;
+	ip6h->saddr = *src;
+	ip6h->version = 6;
+	ip6h->nexthdr = IPPROTO_UDP;
+	ip6h->hop_limit = 255;

-	b->uh.source = b->s_in6.sin6_port;
-	b->uh.dest = htons(dstport);
-	b->uh.len = b->ip6h.payload_len;
-	b->uh.check = 0;
-	b->uh.check = csum(&b->uh, payload_len,
-			   proto_ipv6_header_psum(payload_len, IPPROTO_UDP,
-						  src, dst));
+	uh->source = s_in6->sin6_port;
+	uh->dest = htons(dstport);
+	uh->len = ip6h->payload_len;
+	uh->check = 0;
+	if (c->mode != MODE_VU)
+		uh->check = csum(uh, payload_len,
+				 proto_ipv6_header_psum(payload_len, IPPROTO_UDP,
+							src, dst));
+	else
+		uh->check = 0xffff; /* zero checksum is invalid with IPv6 */

-	return tap_iov_len(c, &b->taph, ip_len);
+	return ip_len;
}
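The vhost-user path skips the software UDP checksum (the virtio-net header already flags the data as valid), but over IPv6 a literal zero in the checksum field means "no checksum", which UDP over IPv6 forbids; hence the code stores 0xffff, the transmitted encoding of a computed zero (RFC 768 semantics). A short illustration of that rule:

#include <stdio.h>
#include <stdint.h>

/* RFC 768: a computed checksum of zero is transmitted as all ones */
static uint16_t udp_check_field(uint16_t computed)
{
	return computed == 0 ? 0xffff : computed;
}

int main(void)
{
	printf("computed 0x1234 -> 0x%04x\n", udp_check_field(0x1234));
	printf("computed 0x0000 -> 0x%04x\n", udp_check_field(0x0000));
	return 0;
}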
|
||||
|
||||
/**
|
||||
|
@ -698,6 +701,11 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
|
|||
*
|
||||
* Return: size of tap frame with headers
|
||||
*/
|
||||
#pragma GCC diagnostic push
|
||||
/* ignore unaligned pointer value warning for &udp6_l2_buf[i].ip6h and
|
||||
* &udp4_l2_buf[i].iph
|
||||
*/
|
||||
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
|
||||
static void udp_tap_send(const struct ctx *c,
|
||||
unsigned int start, unsigned int n,
|
||||
in_port_t dstport, bool v6, const struct timespec *now)
|
||||
|
@ -711,21 +719,34 @@ static void udp_tap_send(const struct ctx *c,
|
|||
tap_iov = udp4_l2_iov_tap;
|
||||
|
||||
for (i = start; i < start + n; i++) {
|
||||
size_t buf_len;
|
||||
size_t ip_len;
|
||||
|
||||
if (v6)
|
||||
buf_len = udp_update_hdr6(c, i, dstport, now);
|
||||
else
|
||||
buf_len = udp_update_hdr4(c, i, dstport, now);
|
||||
if (v6) {
|
||||
ip_len = udp_update_hdr6(c, &udp6_l2_buf[i].ip6h,
|
||||
udp6_l2_mh_sock[i].msg_len,
|
||||
&udp6_l2_buf[i].s_in6, dstport,
|
||||
now);
|
||||
tap_iov[i].iov_len = tap_iov_len(c,
|
||||
&udp6_l2_buf[i].taph,
|
||||
ip_len);
|
||||
} else {
|
||||
ip_len = udp_update_hdr4(c, &udp4_l2_buf[i].iph,
|
||||
udp4_l2_mh_sock[i].msg_len,
|
||||
&udp4_l2_buf[i].s_in,
|
||||
dstport, now);
|
||||
|
||||
tap_iov[i].iov_len = buf_len;
|
||||
tap_iov[i].iov_len = tap_iov_len(c,
|
||||
&udp4_l2_buf[i].taph,
|
||||
ip_len);
|
||||
}
|
||||
}
|
||||
|
||||
tap_send_frames(c, tap_iov + start, n);
|
||||
}
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
/**
|
||||
* udp_sock_handler() - Handle new data from socket
|
||||
* udp_buf_sock_handler() - Handle new data from socket
|
||||
* @c: Execution context
|
||||
* @ref: epoll reference
|
||||
* @events: epoll events bitmap
|
||||
|
@ -733,8 +754,8 @@ static void udp_tap_send(const struct ctx *c,
|
|||
*
|
||||
* #syscalls recvmmsg
|
||||
*/
|
||||
void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
|
||||
const struct timespec *now)
|
||||
void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
|
||||
const struct timespec *now)
|
||||
{
|
||||
/* For not entirely clear reasons (data locality?) pasta gets
|
||||
* better throughput if we receive tap datagrams one at a
|
||||
|
@ -744,7 +765,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
|
|||
* whether we'll use tap or splice, always go one at a time
|
||||
* for pasta mode.
|
||||
*/
|
||||
ssize_t n = (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1);
|
||||
ssize_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
|
||||
in_port_t dstport = ref.udp.port;
|
||||
bool v6 = ref.udp.v6;
|
||||
struct mmsghdr *mmh_recv;
|
||||
|
|
2
udp.h
2
udp.h
|
@ -9,7 +9,7 @@
|
|||
#define UDP_TIMER_INTERVAL 1000 /* ms */
|
||||
|
||||
void udp_portmap_clear(void);
|
||||
void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
|
||||
void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
|
||||
const struct timespec *now);
|
||||
int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
const void *saddr, const void *daddr,
|
||||
|
|
21
udp_internal.h
Normal file
21
udp_internal.h
Normal file
|
@ -0,0 +1,21 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later
|
||||
* Copyright (c) 2021 Red Hat GmbH
|
||||
* Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
*/
|
||||
|
||||
#ifndef UDP_INTERNAL_H
|
||||
#define UDP_INTERNAL_H
|
||||
|
||||
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
|
||||
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
|
||||
|
||||
extern struct sockaddr_in udp4_localname;
|
||||
extern struct sockaddr_in6 udp6_localname;
|
||||
|
||||
size_t udp_update_hdr4(const struct ctx *c, struct iphdr *iph,
|
||||
size_t data_len, struct sockaddr_in *s_in,
|
||||
in_port_t dstport, const struct timespec *now);
|
||||
size_t udp_update_hdr6(const struct ctx *c, struct ipv6hdr *ip6h,
|
||||
size_t data_len, struct sockaddr_in6 *s_in6,
|
||||
in_port_t dstport, const struct timespec *now);
|
||||
#endif /* UDP_INTERNAL_H */
|
218
udp_vu.c
Normal file
218
udp_vu.c
Normal file
|
@ -0,0 +1,218 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later

#include <unistd.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
#include <stdint.h>
#include <stddef.h>
#include <sys/uio.h>
#include <linux/virtio_net.h>

#include "checksum.h"
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "pcap.h"
#include "log.h"
#include "vhost_user.h"
#include "udp_internal.h"
#include "udp_vu.h"

/* vhost-user */
static const struct virtio_net_hdr vu_header = {
	.flags = VIRTIO_NET_HDR_F_DATA_VALID,
	.gso_type = VIRTIO_NET_HDR_GSO_NONE,
};

static unsigned char buffer[65536];
static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE];
static unsigned int indexes[VIRTQUEUE_MAX_SIZE];

void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref,
			 uint32_t events, const struct timespec *now)
{
	VuDev *vdev = (VuDev *)&c->vdev;
	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	size_t l2_hdrlen, vnet_hdrlen, fillsize;
	ssize_t data_len;
	in_port_t dstport = ref.udp.port;
	bool has_mrg_rxbuf, v6 = ref.udp.v6;
	struct msghdr msg;
	int i, j, iov_count, iov_used, virtqueue_max;

	if (c->no_udp || !(events & EPOLLIN))
		return;

	has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
	if (has_mrg_rxbuf) {
		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
		virtqueue_max = VIRTQUEUE_MAX_SIZE;
	} else {
		vnet_hdrlen = sizeof(struct virtio_net_hdr);
		virtqueue_max = 1;
	}
	l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct udphdr);

	if (v6) {
		l2_hdrlen += sizeof(struct ipv6hdr);

		udp6_localname.sin6_port = htons(dstport);
		msg.msg_name = &udp6_localname;
		msg.msg_namelen = sizeof(udp6_localname);
	} else {
		l2_hdrlen += sizeof(struct iphdr);

		udp4_localname.sin_port = htons(dstport);
		msg.msg_name = &udp4_localname;
		msg.msg_namelen = sizeof(udp4_localname);
	}

	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_flags = 0;

	for (i = 0; i < UDP_MAX_FRAMES; i++) {
		struct virtio_net_hdr_mrg_rxbuf *vh;
		struct ethhdr *eh;
		char *base;
		size_t size;

		fillsize = USHRT_MAX;
		iov_count = 0;
		while (fillsize && iov_count < virtqueue_max) {
			VuVirtqElement *elem;

			elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
			if (!elem)
				break;

			if (elem->in_num < 1) {
				err("virtio-net receive queue contains no in buffers");
				vu_queue_rewind(vdev, vq, iov_count);
				return;
			}
			ASSERT(elem->in_num == 1);
			ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen);

			indexes[iov_count] = elem->index;
			if (iov_count == 0) {
				/* reserve space for vnet/L2/L3/L4 headers in
				 * the first buffer of the chain
				 */
				iov_vu[0].iov_base = (char *)elem->in_sg[0].iov_base + l2_hdrlen;
				iov_vu[0].iov_len = elem->in_sg[0].iov_len - l2_hdrlen;
			} else {
				iov_vu[iov_count].iov_base = elem->in_sg[0].iov_base;
				iov_vu[iov_count].iov_len = elem->in_sg[0].iov_len;
			}

			if (iov_vu[iov_count].iov_len > fillsize)
				iov_vu[iov_count].iov_len = fillsize;

			fillsize -= iov_vu[iov_count].iov_len;

			iov_count++;
		}
		if (iov_count == 0)
			break;

		msg.msg_iov = iov_vu;
		msg.msg_iovlen = iov_count;

		data_len = recvmsg(ref.fd, &msg, 0);
		if (data_len < 0) {
			vu_queue_rewind(vdev, vq, iov_count);
			return;
		}

		/* trim the iovec chain to the bytes actually received */
		iov_used = 0;
		size = data_len;
		while (size) {
			if (iov_vu[iov_used].iov_len > size)
				iov_vu[iov_used].iov_len = size;

			size -= iov_vu[iov_used].iov_len;
			iov_used++;
		}

		base = (char *)iov_vu[0].iov_base - l2_hdrlen;
		size = iov_vu[0].iov_len + l2_hdrlen;

		/* release unused buffers */
		vu_queue_rewind(vdev, vq, iov_count - iov_used);

		/* vnet header */
		vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
		vh->hdr = vu_header;
		if (has_mrg_rxbuf)
			vh->num_buffers = htole16(iov_used);

		/* ethernet header */
		eh = (struct ethhdr *)(base + vnet_hdrlen);

		memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
		memcpy(eh->h_source, c->mac, sizeof(eh->h_source));

		/* initialize header */
		if (v6) {
			struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
			struct udphdr *uh = (struct udphdr *)(ip6h + 1);
			uint32_t sum;

			eh->h_proto = htons(ETH_P_IPV6);

			*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP);

			udp_update_hdr6(c, ip6h, data_len, &udp6_localname,
					dstport, now);
			if (*c->pcap) {
				sum = proto_ipv6_header_psum(ip6h->payload_len,
							     IPPROTO_UDP,
							     &ip6h->saddr,
							     &ip6h->daddr);

				iov_vu[0].iov_base = uh;
				iov_vu[0].iov_len = size - l2_hdrlen + sizeof(*uh);
				uh->check = csum_iov(iov_vu, iov_used, sum);
			}
		} else {
			struct iphdr *iph = (struct iphdr *)(eh + 1);
			struct udphdr *uh = (struct udphdr *)(iph + 1);
			uint32_t sum;

			eh->h_proto = htons(ETH_P_IP);

			*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP);

			udp_update_hdr4(c, iph, data_len, &udp4_localname,
					dstport, now);
			if (*c->pcap) {
				sum = proto_ipv4_header_psum(iph->tot_len,
							     IPPROTO_UDP,
							     (struct in_addr){ .s_addr = iph->saddr },
							     (struct in_addr){ .s_addr = iph->daddr });

				iov_vu[0].iov_base = uh;
				iov_vu[0].iov_len = size - l2_hdrlen + sizeof(*uh);
				uh->check = csum_iov(iov_vu, iov_used, sum);
			}
		}

		/* set iov for pcap logging */
		iov_vu[0].iov_base = base + vnet_hdrlen;
		iov_vu[0].iov_len = size - vnet_hdrlen;
		pcap_iov(iov_vu, iov_used);

		/* set iov_len for vu_queue_fill_by_index() */
		iov_vu[0].iov_base = base;
		iov_vu[0].iov_len = size;

		/* send packets */
		for (j = 0; j < iov_used; j++)
			vu_queue_fill_by_index(vdev, vq, indexes[j],
					       iov_vu[j].iov_len, j);

		vu_queue_flush(vdev, vq, iov_used);
		vu_queue_notify(vdev, vq);
	}
}
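Reviewer note: the key step above is shortening the popped buffer chain to what recvmsg() actually returned, then rewinding the unused tail back onto the queue. A self-contained sketch of that trimming step, with made-up sizes:

/* sketch: trim an iovec chain to the byte count a receive returned */
#include <stdio.h>
#include <sys/uio.h>

int main(void)
{
	char a[4], b[4], c[4];
	struct iovec iov[] = { { a, sizeof(a) }, { b, sizeof(b) },
			       { c, sizeof(c) } };
	size_t size = 6;	/* pretend recvmsg() returned 6 bytes */
	int used = 0;

	while (size) {
		if (iov[used].iov_len > size)
			iov[used].iov_len = size;	/* shorten last entry */
		size -= iov[used].iov_len;
		used++;
	}
	/* used == 2: one full 4-byte buffer plus 2 bytes of the second;
	 * the third buffer would go back to the queue via rewind
	 */
	printf("used %d iovecs, last len %zu\n", used, iov[used - 1].iov_len);
	return 0;
}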
8
udp_vu.h
Normal file
@ -0,0 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-or-later

#ifndef UDP_VU_H
#define UDP_VU_H

void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref,
			 uint32_t events, const struct timespec *now);
#endif /* UDP_VU_H */
11
util.h
@ -43,6 +43,9 @@
#define ROUND_DOWN(x, y)	((x) & ~((y) - 1))
#define ROUND_UP(x, y)		(((x) + (y) - 1) & ~((y) - 1))

#define ALIGN_DOWN(n, m)	((n) / (m) * (m))
#define ALIGN_UP(n, m)		ALIGN_DOWN((n) + (m) - 1, (m))

#define MAX_FROM_BITS(n)	(((1U << (n)) - 1))

#define BIT(n)			(1UL << (n))
@ -110,6 +113,14 @@
#define htonl_constant(x)	(__bswap_constant_32(x))
#endif

#define barrier()		do { __asm__ __volatile__("" ::: "memory"); } while (0)
#define smp_mb()		do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0)
#define smp_mb_release()	do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0)
#define smp_mb_acquire()	do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0)

#define smp_wmb()		smp_mb_release()
#define smp_rmb()		smp_mb_acquire()

#define NS_FN_STACK_SIZE	(RLIMIT_STACK_VAL * 1024 / 8)
int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
	     void *arg);
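Reviewer note: these barriers give the split virtqueue its ordering guarantees: the side that fills a ring publishes the contents with smp_wmb() before bumping the index, and the reader pairs that with smp_rmb() after loading the index. A minimal sketch of that pairing, where 'data' and 'ready' are illustrative symbols, not project names:

/* sketch: release/acquire pairing as used around ring index updates */
#include <pthread.h>
#include <stdio.h>

#define barrier()	do { __asm__ __volatile__("" ::: "memory"); } while (0)
#define smp_wmb()	do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0)
#define smp_rmb()	do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0)

static int data;
static volatile int ready;

static void *producer(void *arg)
{
	(void)arg;
	data = 42;	/* fill the buffer... */
	smp_wmb();	/* ...then publish: update the flag/index last */
	ready = 1;
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);
	while (!ready)
		;	/* spin until published */
	smp_rmb();	/* pairs with smp_wmb() before reading the buffer */
	printf("%d\n", data);
	pthread_join(t, NULL);
	return 0;
}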
1050
vhost_user.c
Normal file
File diff suppressed because it is too large
137
vhost_user.h
Normal file
@ -0,0 +1,137 @@
// SPDX-License-Identifier: GPL-2.0-or-later

/* some parts from subprojects/libvhost-user/libvhost-user.h */

#ifndef VHOST_USER_H
#define VHOST_USER_H

#include "virtio.h"
#include "iov.h"

#define VHOST_USER_F_PROTOCOL_FEATURES 30

#define VHOST_MEMORY_BASELINE_NREGIONS 8

enum vhost_user_protocol_feature {
	VHOST_USER_PROTOCOL_F_MQ = 0,
	VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
	VHOST_USER_PROTOCOL_F_RARP = 2,
	VHOST_USER_PROTOCOL_F_REPLY_ACK = 3,
	VHOST_USER_PROTOCOL_F_NET_MTU = 4,
	VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5,
	VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6,
	VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
	VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
	VHOST_USER_PROTOCOL_F_CONFIG = 9,
	VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
	VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
	VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
	VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
	VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,

	VHOST_USER_PROTOCOL_F_MAX
};

enum vhost_user_request {
	VHOST_USER_NONE = 0,
	VHOST_USER_GET_FEATURES = 1,
	VHOST_USER_SET_FEATURES = 2,
	VHOST_USER_SET_OWNER = 3,
	VHOST_USER_RESET_OWNER = 4,
	VHOST_USER_SET_MEM_TABLE = 5,
	VHOST_USER_SET_LOG_BASE = 6,
	VHOST_USER_SET_LOG_FD = 7,
	VHOST_USER_SET_VRING_NUM = 8,
	VHOST_USER_SET_VRING_ADDR = 9,
	VHOST_USER_SET_VRING_BASE = 10,
	VHOST_USER_GET_VRING_BASE = 11,
	VHOST_USER_SET_VRING_KICK = 12,
	VHOST_USER_SET_VRING_CALL = 13,
	VHOST_USER_SET_VRING_ERR = 14,
	VHOST_USER_GET_PROTOCOL_FEATURES = 15,
	VHOST_USER_SET_PROTOCOL_FEATURES = 16,
	VHOST_USER_GET_QUEUE_NUM = 17,
	VHOST_USER_SET_VRING_ENABLE = 18,
	VHOST_USER_SEND_RARP = 19,
	VHOST_USER_NET_SET_MTU = 20,
	VHOST_USER_SET_BACKEND_REQ_FD = 21,
	VHOST_USER_IOTLB_MSG = 22,
	VHOST_USER_SET_VRING_ENDIAN = 23,
	VHOST_USER_GET_CONFIG = 24,
	VHOST_USER_SET_CONFIG = 25,
	VHOST_USER_CREATE_CRYPTO_SESSION = 26,
	VHOST_USER_CLOSE_CRYPTO_SESSION = 27,
	VHOST_USER_POSTCOPY_ADVISE = 28,
	VHOST_USER_POSTCOPY_LISTEN = 29,
	VHOST_USER_POSTCOPY_END = 30,
	VHOST_USER_GET_INFLIGHT_FD = 31,
	VHOST_USER_SET_INFLIGHT_FD = 32,
	VHOST_USER_GPU_SET_SOCKET = 33,
	VHOST_USER_VRING_KICK = 35,
	VHOST_USER_GET_MAX_MEM_SLOTS = 36,
	VHOST_USER_ADD_MEM_REG = 37,
	VHOST_USER_REM_MEM_REG = 38,
	VHOST_USER_MAX
};

typedef struct {
	enum vhost_user_request request;

#define VHOST_USER_VERSION_MASK		0x3
#define VHOST_USER_REPLY_MASK		(0x1 << 2)
#define VHOST_USER_NEED_REPLY_MASK	(0x1 << 3)
	uint32_t flags;
	uint32_t size; /* the following payload size */
} __attribute__ ((__packed__)) vhost_user_header;

typedef struct VhostUserMemory_region {
	uint64_t guest_phys_addr;
	uint64_t memory_size;
	uint64_t userspace_addr;
	uint64_t mmap_offset;
} VhostUserMemory_region;

struct VhostUserMemory {
	uint32_t nregions;
	uint32_t padding;
	struct VhostUserMemory_region regions[VHOST_MEMORY_BASELINE_NREGIONS];
};

typedef union {
#define VHOST_USER_VRING_IDX_MASK	0xff
#define VHOST_USER_VRING_NOFD_MASK	(0x1 << 8)
	uint64_t u64;
	struct vhost_vring_state state;
	struct vhost_vring_addr addr;
	struct VhostUserMemory memory;
} vhost_user_payload;

typedef struct VhostUserMsg {
	vhost_user_header hdr;
	vhost_user_payload payload;

	int fds[VHOST_MEMORY_BASELINE_NREGIONS];
	int fd_num;
	uint8_t *data;
} __attribute__ ((__packed__)) VhostUserMsg;
#define VHOST_USER_HDR_SIZE sizeof(vhost_user_header)

#define VHOST_USER_RX_QUEUE 0
#define VHOST_USER_TX_QUEUE 1

static inline bool vu_queue_enabled(VuVirtq *vq)
{
	return vq->enable;
}

static inline bool vu_queue_started(const VuVirtq *vq)
{
	return vq->started;
}

int vu_send(const struct ctx *c, const void *data, size_t len);
void vu_print_capabilities(void);
void vu_init(struct ctx *c);
void vu_kick_cb(struct ctx *c, union epoll_ref ref);
void tap_handler_vu(struct ctx *c, uint32_t events);
#endif /* VHOST_USER_H */
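Reviewer note: in the vhost-user header above, bits 0-1 of flags carry the protocol version, bit 2 marks a reply, and bit 3 asks the peer for an acknowledgement. A self-contained sketch of that encoding, using the masks defined in the header:

/* sketch: encode and check vhost-user header flags */
#include <stdint.h>
#include <stdio.h>

#define VHOST_USER_VERSION_MASK		0x3
#define VHOST_USER_REPLY_MASK		(0x1 << 2)
#define VHOST_USER_NEED_REPLY_MASK	(0x1 << 3)

int main(void)
{
	uint32_t flags = 0x1;			/* protocol version 1 */

	flags |= VHOST_USER_NEED_REPLY_MASK;	/* front-end asks for an ack */

	printf("version %u, needs reply: %s\n",
	       flags & VHOST_USER_VERSION_MASK,
	       (flags & VHOST_USER_NEED_REPLY_MASK) ? "yes" : "no");

	/* a back-end answering would set the reply bit */
	flags |= VHOST_USER_REPLY_MASK;
	printf("is reply: %s\n", (flags & VHOST_USER_REPLY_MASK) ? "yes" : "no");
	return 0;
}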
484
virtio.c
Normal file
@ -0,0 +1,484 @@
// SPDX-License-Identifier: GPL-2.0-or-later

/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */

#include <stddef.h>
#include <endian.h>
#include <string.h>
#include <errno.h>
#include <sys/eventfd.h>
#include <sys/socket.h>

#include "util.h"
#include "virtio.h"

#define VIRTQUEUE_MAX_SIZE 1024

/* Translate guest physical address to our virtual address. */
static void *vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
{
	unsigned int i;

	if (*plen == 0) {
		return NULL;
	}

	/* Find matching memory region. */
	for (i = 0; i < dev->nregions; i++) {
		VuDevRegion *r = &dev->regions[i];

		if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
			if ((guest_addr + *plen) > (r->gpa + r->size)) {
				*plen = r->gpa + r->size - guest_addr;
			}
			return (void *)(guest_addr - (uintptr_t)r->gpa +
					(uintptr_t)r->mmap_addr + r->mmap_offset);
		}
	}

	return NULL;
}

static inline uint16_t vring_avail_flags(VuVirtq *vq)
{
	return le16toh(vq->vring.avail->flags);
}

static inline uint16_t vring_avail_idx(VuVirtq *vq)
{
	vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);

	return vq->shadow_avail_idx;
}

static inline uint16_t vring_avail_ring(VuVirtq *vq, int i)
{
	return le16toh(vq->vring.avail->ring[i]);
}

static inline uint16_t vring_get_used_event(VuVirtq *vq)
{
	return vring_avail_ring(vq, vq->vring.num);
}

static bool virtqueue_get_head(VuDev *dev, VuVirtq *vq,
			       unsigned int idx, unsigned int *head)
{
	/* Grab the next descriptor number they're advertising, and increment
	 * the index we've seen. */
	*head = vring_avail_ring(vq, idx % vq->vring.num);

	/* If their number is silly, that's a fatal mistake. */
	if (*head >= vq->vring.num) {
		vu_panic(dev, "Guest says index %u is available", *head);
		return false;
	}

	return true;
}

static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
			     uint64_t addr, size_t len)
{
	struct vring_desc *ori_desc;
	uint64_t read_len;

	if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
		return -1;
	}

	if (len == 0) {
		return -1;
	}

	while (len) {
		read_len = len;
		ori_desc = vu_gpa_to_va(dev, &read_len, addr);
		if (!ori_desc) {
			return -1;
		}

		memcpy(desc, ori_desc, read_len);
		len -= read_len;
		addr += read_len;
		desc += read_len;
	}

	return 0;
}

enum {
	VIRTQUEUE_READ_DESC_ERROR = -1,
	VIRTQUEUE_READ_DESC_DONE = 0,	/* end of chain */
	VIRTQUEUE_READ_DESC_MORE = 1,	/* more buffers in chain */
};

static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
			 int i, unsigned int max, unsigned int *next)
{
	/* If this descriptor says it doesn't chain, we're done. */
	if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
		return VIRTQUEUE_READ_DESC_DONE;
	}

	/* Check they're not leading us off end of descriptors. */
	*next = le16toh(desc[i].next);
	/* Make sure compiler knows to grab that: we don't want it changing! */
	smp_wmb();

	if (*next >= max) {
		vu_panic(dev, "Desc next is %u", *next);
		return VIRTQUEUE_READ_DESC_ERROR;
	}

	return VIRTQUEUE_READ_DESC_MORE;
}

bool vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
	if (dev->broken ||
	    !vq->vring.avail) {
		return true;
	}

	if (vq->shadow_avail_idx != vq->last_avail_idx) {
		return false;
	}

	return vring_avail_idx(vq) == vq->last_avail_idx;
}

static bool vring_notify(VuDev *dev, VuVirtq *vq)
{
	uint16_t old, new;
	bool v;

	/* We need to expose used array entries before checking used event. */
	smp_mb();

	/* Always notify when queue is empty (when feature acknowledge) */
	if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
	    !vq->inuse && vu_queue_empty(dev, vq)) {
		return true;
	}

	if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
		return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
	}

	v = vq->signalled_used_valid;
	vq->signalled_used_valid = true;
	old = vq->signalled_used;
	new = vq->signalled_used = vq->used_idx;
	return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

void vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
	if (dev->broken || !vq->vring.avail) {
		return;
	}

	if (!vring_notify(dev, vq)) {
		debug("skipped notify...");
		return;
	}

	if (eventfd_write(vq->call_fd, 1) < 0) {
		vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
	}
}

static inline void vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
	uint16_t val_le = htole16(val);

	if (!vq->notification) {
		return;
	}

	memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t));
}

static bool virtqueue_map_desc(VuDev *dev,
			       unsigned int *p_num_sg, struct iovec *iov,
			       unsigned int max_num_sg,
			       uint64_t pa, size_t sz)
{
	unsigned num_sg = *p_num_sg;

	ASSERT(num_sg <= max_num_sg);

	if (!sz) {
		vu_panic(dev, "virtio: zero sized buffers are not allowed");
		return false;
	}

	while (sz) {
		uint64_t len = sz;

		if (num_sg == max_num_sg) {
			vu_panic(dev, "virtio: too many descriptors in indirect table");
			return false;
		}

		iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
		if (iov[num_sg].iov_base == NULL) {
			vu_panic(dev, "virtio: invalid address for buffers");
			return false;
		}
		iov[num_sg].iov_len = len;
		num_sg++;
		sz -= len;
		pa += len;
	}

	*p_num_sg = num_sg;
	return true;
}

static void *virtqueue_alloc_element(size_t sz, unsigned out_num,
				     unsigned in_num, unsigned char *buffer)
{
	VuVirtqElement *elem;
	size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
	size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
	size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

	if (out_sg_end > 65536)
		return NULL;

	elem = (void *)buffer;
	elem->out_num = out_num;
	elem->in_num = in_num;
	elem->in_sg = (struct iovec *)((uintptr_t)elem + in_sg_ofs);
	elem->out_sg = (struct iovec *)((uintptr_t)elem + out_sg_ofs);
	return elem;
}

static void *
vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz,
		  unsigned char *buffer)
{
	struct vring_desc *desc = vq->vring.desc;
	uint64_t desc_addr, read_len;
	unsigned int desc_len;
	unsigned int max = vq->vring.num;
	unsigned int i = idx;
	VuVirtqElement *elem;
	unsigned int out_num = 0, in_num = 0;
	struct iovec iov[VIRTQUEUE_MAX_SIZE];
	struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
	int rc;

	if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
		if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
			vu_panic(dev, "Invalid size for indirect buffer table");
			return NULL;
		}

		/* loop over the indirect descriptor table */
		desc_addr = le64toh(desc[i].addr);
		desc_len = le32toh(desc[i].len);
		max = desc_len / sizeof(struct vring_desc);
		read_len = desc_len;
		desc = vu_gpa_to_va(dev, &read_len, desc_addr);
		if (desc && read_len != desc_len) {
			/* Failed to use zero copy */
			desc = NULL;
			if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) {
				desc = desc_buf;
			}
		}
		if (!desc) {
			vu_panic(dev, "Invalid indirect buffer table");
			return NULL;
		}
		i = 0;
	}

	/* Collect all the descriptors */
	do {
		if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
			if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
						VIRTQUEUE_MAX_SIZE - out_num,
						le64toh(desc[i].addr),
						le32toh(desc[i].len))) {
				return NULL;
			}
		} else {
			if (in_num) {
				vu_panic(dev, "Incorrect order for descriptors");
				return NULL;
			}
			if (!virtqueue_map_desc(dev, &out_num, iov,
						VIRTQUEUE_MAX_SIZE,
						le64toh(desc[i].addr),
						le32toh(desc[i].len))) {
				return NULL;
			}
		}

		/* If we've got too many, that implies a descriptor loop. */
		if ((in_num + out_num) > max) {
			vu_panic(dev, "Looped descriptor");
			return NULL;
		}
		rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
	} while (rc == VIRTQUEUE_READ_DESC_MORE);

	if (rc == VIRTQUEUE_READ_DESC_ERROR) {
		vu_panic(dev, "read descriptor error");
		return NULL;
	}

	/* Now copy what we have collected and mapped */
	elem = virtqueue_alloc_element(sz, out_num, in_num, buffer);
	if (!elem) {
		return NULL;
	}
	elem->index = idx;
	for (i = 0; i < out_num; i++) {
		elem->out_sg[i] = iov[i];
	}
	for (i = 0; i < in_num; i++) {
		elem->in_sg[i] = iov[out_num + i];
	}

	return elem;
}

void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz, unsigned char *buffer)
{
	unsigned int head;
	VuVirtqElement *elem;

	if (dev->broken || !vq->vring.avail) {
		return NULL;
	}

	if (vu_queue_empty(dev, vq)) {
		return NULL;
	}
	/*
	 * Needed after virtio_queue_empty(), see comment in
	 * virtqueue_num_heads().
	 */
	smp_rmb();

	if (vq->inuse >= vq->vring.num) {
		vu_panic(dev, "Virtqueue size exceeded");
		return NULL;
	}

	if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
		return NULL;
	}

	if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
		vring_set_avail_event(vq, vq->last_avail_idx);
	}

	elem = vu_queue_map_desc(dev, vq, head, sz, buffer);

	if (!elem) {
		return NULL;
	}

	vq->inuse++;

	return elem;
}

void vu_queue_detach_element(VuDev *dev, VuVirtq *vq,
			     unsigned int index, size_t len)
{
	(void)dev;
	(void)index;
	(void)len;

	vq->inuse--;
	/* unmap, when DMA support is added */
}

void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len)
{
	vq->last_avail_idx--;
	vu_queue_detach_element(dev, vq, index, len);
}

bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
	(void)dev;
	if (num > vq->inuse) {
		return false;
	}
	vq->last_avail_idx -= num;
	vq->inuse -= num;
	return true;
}

static inline void vring_used_write(VuVirtq *vq,
				    struct vring_used_elem *uelem, int i)
{
	struct vring_used *used = vq->vring.used;

	used->ring[i] = *uelem;
}

void vu_queue_fill_by_index(VuDev *dev, VuVirtq *vq, unsigned int index,
			    unsigned int len, unsigned int idx)
{
	struct vring_used_elem uelem;

	if (dev->broken || !vq->vring.avail)
		return;

	idx = (idx + vq->used_idx) % vq->vring.num;

	uelem.id = htole32(index);
	uelem.len = htole32(len);
	vring_used_write(vq, &uelem, idx);
}

void vu_queue_fill(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
		   unsigned int len, unsigned int idx)
{
	vu_queue_fill_by_index(dev, vq, elem->index, len, idx);
}

static inline void vring_used_idx_set(VuVirtq *vq, uint16_t val)
{
	vq->vring.used->idx = htole16(val);

	vq->used_idx = val;
}

void vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
	uint16_t old, new;

	if (dev->broken ||
	    !vq->vring.avail) {
		return;
	}

	/* Make sure buffer is written before we update index. */
	smp_wmb();

	old = vq->used_idx;
	new = old + count;
	vring_used_idx_set(vq, new);
	vq->inuse -= count;
	if ((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) {
		vq->signalled_used_valid = false;
	}
}

void vu_queue_push(VuDev *dev, VuVirtq *vq,
		   VuVirtqElement *elem, unsigned int len)
{
	vu_queue_fill(dev, vq, elem, len, 0);
	vu_queue_flush(dev, vq, 1);
}
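Reviewer note: the suppression logic in vring_notify() hinges on the event-idx check: only notify when the used index has just crossed the event index the driver asked for, with all arithmetic done modulo 2^16. vring_need_event() comes from <linux/virtio_ring.h>; a self-contained illustration:

/* sketch: when does the event-idx check fire? */
#include <stdio.h>
#include <linux/virtio_ring.h>

int main(void)
{
	/* guest asked to be notified once index 5 is used */
	uint16_t event_idx = 5;

	/* used_idx moved 3..4: event not crossed, notification skipped */
	printf("%d\n", vring_need_event(event_idx, 4, 3));		/* 0 */

	/* used_idx moved 3..6: event 5 crossed, notify */
	printf("%d\n", vring_need_event(event_idx, 6, 3));		/* 1 */

	/* still correct across 16-bit wraparound */
	printf("%d\n", vring_need_event(0x0001, 0x0002, 0xfffe));	/* 1 */
	return 0;
}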
121
virtio.h
Normal file
@ -0,0 +1,121 @@
// SPDX-License-Identifier: GPL-2.0-or-later
//
/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.h */

#ifndef VIRTIO_H
#define VIRTIO_H

#include <stdbool.h>
#include <linux/vhost_types.h>

#define VIRTQUEUE_MAX_SIZE 1024

#define vu_panic(vdev, ...)		\
	do {				\
		(vdev)->broken = true;	\
		err( __VA_ARGS__ );	\
	} while (0)

typedef struct VuRing {
	unsigned int num;
	struct vring_desc *desc;
	struct vring_avail *avail;
	struct vring_used *used;
	uint64_t log_guest_addr;
	uint32_t flags;
} VuRing;

typedef struct VuVirtq {
	VuRing vring;

	/* Next head to pop */
	uint16_t last_avail_idx;

	/* Last avail_idx read from VQ. */
	uint16_t shadow_avail_idx;

	uint16_t used_idx;

	/* Last used index value we have signalled on */
	uint16_t signalled_used;

	/* Whether signalled_used is valid */
	bool signalled_used_valid;

	bool notification;

	unsigned int inuse;

	int call_fd;
	int kick_fd;
	int err_fd;
	unsigned int enable;
	bool started;

	/* Guest addresses of our ring */
	struct vhost_vring_addr vra;
} VuVirtq;

typedef struct VuDevRegion {
	uint64_t gpa;
	uint64_t size;
	uint64_t qva;
	uint64_t mmap_offset;
	uint64_t mmap_addr;
} VuDevRegion;

#define VHOST_USER_MAX_QUEUES 2

/*
 * Set a reasonable maximum number of ram slots, which will be supported by
 * any architecture.
 */
#define VHOST_USER_MAX_RAM_SLOTS 32

typedef struct VuDev {
	uint32_t nregions;
	VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS];
	VuVirtq vq[VHOST_USER_MAX_QUEUES];
	uint64_t features;
	uint64_t protocol_features;
	bool broken;
	int hdrlen;
} VuDev;

typedef struct VuVirtqElement {
	unsigned int index;
	unsigned int out_num;
	unsigned int in_num;
	struct iovec *in_sg;
	struct iovec *out_sg;
} VuVirtqElement;

static inline bool has_feature(uint64_t features, unsigned int fbit)
{
	return !!(features & (1ULL << fbit));
}

static inline bool vu_has_feature(VuDev *vdev, unsigned int fbit)
{
	return has_feature(vdev->features, fbit);
}

static inline bool vu_has_protocol_feature(VuDev *vdev, unsigned int fbit)
{
	return has_feature(vdev->protocol_features, fbit);
}

bool vu_queue_empty(VuDev *dev, VuVirtq *vq);
void vu_queue_notify(VuDev *dev, VuVirtq *vq);
void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz, unsigned char *buffer);
void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len);
void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len);
bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num);

void vu_queue_fill_by_index(VuDev *dev, VuVirtq *vq, unsigned int index,
			    unsigned int len, unsigned int idx);
void vu_queue_fill(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem, unsigned int len,
		   unsigned int idx);
void vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count);
void vu_queue_push(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem, unsigned int len);
#endif /* VIRTIO_H */
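Reviewer note: VuVirtqElement carries no storage of its own; virtqueue_alloc_element() in virtio.c carves the header plus the in_sg[] and out_sg[] arrays out of a single caller-supplied buffer, which is why vu_queue_pop() takes a buffer argument. A sketch of that layout arithmetic, with simplified types for illustration:

/* sketch: one flat buffer holding header, in_sg[], then out_sg[] */
#include <stdio.h>
#include <sys/uio.h>

#define ALIGN_DOWN(n, m)	((n) / (m) * (m))
#define ALIGN_UP(n, m)		ALIGN_DOWN((n) + (m) - 1, (m))

struct element {
	unsigned int in_num, out_num;
	struct iovec *in_sg, *out_sg;
};

int main(void)
{
	static unsigned char buffer[65536];
	unsigned in_num = 2, out_num = 3;
	struct element *elem = (void *)buffer;
	size_t in_ofs = ALIGN_UP(sizeof(*elem), __alignof__(struct iovec));
	size_t out_ofs = in_ofs + in_num * sizeof(struct iovec);

	elem->in_num = in_num;
	elem->out_num = out_num;
	elem->in_sg = (struct iovec *)(buffer + in_ofs);	/* after header */
	elem->out_sg = (struct iovec *)(buffer + out_ofs);	/* after in_sg[] */

	printf("in_sg at +%zu, out_sg at +%zu, total %zu bytes\n",
	       in_ofs, out_ofs, out_ofs + out_num * sizeof(struct iovec));
	return 0;
}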