treewide: Packet abstraction with mandatory boundary checks

Implement a packet abstraction providing boundary and size checks
based on packet descriptors: packets stored in a buffer can be queued
into a pool (without storage of its own), and data can be retrieved
referring to an index in the pool, specifying offset and length.

Checks ensure data is not read outside the boundaries of buffer and
descriptors, and that packets added to a pool are within the buffer
range with valid offset and indices.

This implies a wider rework: usage of the "queueing" part of the
abstraction mostly affects tap_handler_{passt,pasta}() functions and
their callees, while the "fetching" part affects all the guest or tap
facing implementations: TCP, UDP, ICMP, ARP, NDP, DHCP and DHCPv6
handlers.

Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2022-03-25 13:02:47 +01:00
parent 3e4c2d1098
commit bb70811183
23 changed files with 1002 additions and 703 deletions

View file

@ -291,7 +291,7 @@ speeding up local connections, and usually requiring NAT. _pasta_:
* ✅ restrictive seccomp profiles (25 syscalls allowed for _passt_, 37 for * ✅ restrictive seccomp profiles (25 syscalls allowed for _passt_, 37 for
_pasta_ on x86_64) _pasta_ on x86_64)
* ✅ static checkers in continuous integration (clang-tidy, cppcheck) * ✅ static checkers in continuous integration (clang-tidy, cppcheck)
* 🛠️ clearly defined packet abstraction * ✅️ clearly defined boundary-checked packet abstraction
* 🛠️ ~5 000 LoC target * 🛠️ ~5 000 LoC target
* ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests * ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests
* ⌚ stricter [synflood protection](https://bugs.passt.top/show_bug.cgi?id=10) * ⌚ stricter [synflood protection](https://bugs.passt.top/show_bug.cgi?id=10)

45
arp.c
View file

@ -30,53 +30,56 @@
#include "tap.h" #include "tap.h"
/** /**
* arp() - Check if this is an ARP message, reply as needed * arp() - Check if this is a supported ARP message, reply as needed
* @c: Execution context * @c: Execution context
* @len: Total L2 packet length * @p: Packet pool, single packet with Ethernet buffer
* @eh: Packet buffer, Ethernet header
* *
* Return: 0 if it's not an ARP message, 1 if handled, -1 on failure * Return: 1 if handled, -1 on failure
*/ */
int arp(struct ctx *c, struct ethhdr *eh, size_t len) int arp(struct ctx *c, struct pool *p)
{ {
struct arphdr *ah = (struct arphdr *)(eh + 1);
struct arpmsg *am = (struct arpmsg *)(ah + 1);
unsigned char swap[4]; unsigned char swap[4];
struct ethhdr *eh;
struct arphdr *ah;
struct arpmsg *am;
size_t len;
if (eh->h_proto != htons(ETH_P_ARP)) eh = packet_get(p, 0, 0, sizeof(*eh), NULL);
return 0; ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL);
am = packet_get(p, 0, sizeof(*eh) + sizeof(*ah), sizeof(*am), NULL);
if (len < sizeof(*eh) + sizeof(*ah) + sizeof(*am)) if (!eh || !ah || !am)
return -1; return -1;
if (ah->ar_hrd != htons(ARPHRD_ETHER) || if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
ah->ar_pro != htons(ETH_P_IP) || ah->ar_pro != htons(ETH_P_IP) ||
ah->ar_hln != ETH_ALEN || ah->ar_pln != 4 || ah->ar_hln != ETH_ALEN ||
ah->ar_pln != 4 ||
ah->ar_op != htons(ARPOP_REQUEST)) ah->ar_op != htons(ARPOP_REQUEST))
return 1; return 1;
/* Discard announcements (but not 0.0.0.0 "probes"): we might have the /* Discard announcements (but not 0.0.0.0 "probes"): we might have the
* same IP address, hide that. * same IP address, hide that.
*/ */
if (memcmp(am->sip, (unsigned char[4]){ 0, 0, 0, 0 }, 4) && if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
!memcmp(am->sip, am->tip, 4)) !memcmp(am->sip, am->tip, sizeof(am->sip)))
return 1; return 1;
/* Don't resolve our own address, either. */ /* Don't resolve our own address, either. */
if (!memcmp(am->tip, &c->addr4, 4)) if (!memcmp(am->tip, &c->addr4, sizeof(am->tip)))
return 1; return 1;
ah->ar_op = htons(ARPOP_REPLY); ah->ar_op = htons(ARPOP_REPLY);
memcpy(am->tha, am->sha, ETH_ALEN); memcpy(am->tha, am->sha, sizeof(am->tha));
memcpy(am->sha, c->mac, ETH_ALEN); memcpy(am->sha, c->mac, sizeof(am->sha));
memcpy(swap, am->tip, 4); memcpy(swap, am->tip, sizeof(am->tip));
memcpy(am->tip, am->sip, 4); memcpy(am->tip, am->sip, sizeof(am->tip));
memcpy(am->sip, swap, 4); memcpy(am->sip, swap, sizeof(am->sip));
len = sizeof(*eh) + sizeof(*ah) + sizeof(*am); len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
memcpy(eh->h_dest, eh->h_source, ETH_ALEN); memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest));
memcpy(eh->h_source, c->mac, ETH_ALEN); memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
if (tap_send(c, eh, len, 0) < 0) if (tap_send(c, eh, len, 0) < 0)
perror("ARP: send"); perror("ARP: send");

2
arp.h
View file

@ -17,4 +17,4 @@ struct arpmsg {
unsigned char tip[4]; unsigned char tip[4];
} __attribute__((__packed__)); } __attribute__((__packed__));
int arp(struct ctx *c, struct ethhdr *eh, size_t len); int arp(struct ctx *c, struct pool *p);

52
dhcp.c
View file

@ -22,9 +22,11 @@
#include <stdint.h> #include <stdint.h>
#include <unistd.h> #include <unistd.h>
#include <string.h> #include <string.h>
#include <limits.h>
#include "util.h" #include "util.h"
#include "checksum.h" #include "checksum.h"
#include "packet.h"
#include "passt.h" #include "passt.h"
#include "tap.h" #include "tap.h"
#include "dhcp.h" #include "dhcp.h"
@ -257,27 +259,32 @@ static void opt_set_dns_search(struct ctx *c, size_t max_len)
/** /**
* dhcp() - Check if this is a DHCP message, reply as needed * dhcp() - Check if this is a DHCP message, reply as needed
* @c: Execution context * @c: Execution context
* @len: Total L2 packet length * @p: Packet pool, single packet with Ethernet buffer
* @eh: Packet buffer, Ethernet header
* *
* Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure * Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure
*/ */
int dhcp(struct ctx *c, struct ethhdr *eh, size_t len) int dhcp(struct ctx *c, struct pool *p)
{ {
struct iphdr *iph = (struct iphdr *)(eh + 1); size_t mlen, len, offset = 0, opt_len, opt_off = 0;
size_t mlen, olen; struct ethhdr *eh;
struct iphdr *iph;
struct udphdr *uh; struct udphdr *uh;
unsigned int i; unsigned int i;
struct msg *m; struct msg *m;
if (len < sizeof(*eh) + sizeof(*iph)) eh = packet_get(p, 0, offset, sizeof(*eh), NULL);
return 0; offset += sizeof(*eh);
if (len < sizeof(*eh) + (long)iph->ihl * 4 + sizeof(*uh)) iph = packet_get(p, 0, offset, sizeof(*iph), NULL);
return 0; if (!eh || !iph)
return -1;
uh = (struct udphdr *)((char *)iph + (long)(iph->ihl * 4)); offset += iph->ihl * 4UL;
m = (struct msg *)(uh + 1); uh = packet_get(p, 0, offset, sizeof(*uh), &mlen);
offset += sizeof(*uh);
if (!uh)
return -1;
if (uh->dest != htons(67)) if (uh->dest != htons(67))
return 0; return 0;
@ -285,18 +292,29 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
if (c->no_dhcp) if (c->no_dhcp)
return 1; return 1;
mlen = len - sizeof(*eh) - (long)iph->ihl * 4 - sizeof(*uh); m = packet_get(p, 0, offset, offsetof(struct msg, o), &opt_len);
if (mlen != ntohs(uh->len) - sizeof(*uh) || if (!m ||
mlen != ntohs(uh->len) - sizeof(*uh) ||
mlen < offsetof(struct msg, o) || mlen < offsetof(struct msg, o) ||
m->op != BOOTREQUEST) m->op != BOOTREQUEST)
return -1; return -1;
olen = mlen - offsetof(struct msg, o); offset += offsetof(struct msg, o);
for (i = 0; i + 2 < olen; i += m->o[i + 1] + 2) {
if (m->o[i + 1] + i + 2 >= olen) while (opt_off + 2 < opt_len) {
uint8_t *olen, *type, *val;
type = packet_get(p, 0, offset + opt_off, 1, NULL);
olen = packet_get(p, 0, offset + opt_off + 1, 1, NULL);
if (!type || !olen)
return -1; return -1;
memcpy(&opts[m->o[i]].c, &m->o[i + 2], m->o[i + 1]); val = packet_get(p, 0, offset + opt_off + 2, *olen, NULL);
if (!val)
return -1;
memcpy(&opts[*type].c, val, *olen);
opt_off += *olen + 2;
} }
if (opts[53].c[0] == DHCPDISCOVER) { if (opts[53].c[0] == DHCPDISCOVER) {

2
dhcp.h
View file

@ -3,5 +3,5 @@
* Author: Stefano Brivio <sbrivio@redhat.com> * Author: Stefano Brivio <sbrivio@redhat.com>
*/ */
int dhcp(struct ctx *c, struct ethhdr *eh, size_t len); int dhcp(struct ctx *c, struct pool *p);
void dhcp_init(void); void dhcp_init(void);

137
dhcpv6.c
View file

@ -24,7 +24,9 @@
#include <unistd.h> #include <unistd.h>
#include <string.h> #include <string.h>
#include <time.h> #include <time.h>
#include <limits.h>
#include "packet.h"
#include "util.h" #include "util.h"
#include "passt.h" #include "passt.h"
#include "tap.h" #include "tap.h"
@ -69,6 +71,8 @@ struct opt_hdr {
#endif #endif
#define OPT_SIZE(x) OPT_SIZE_CONV(sizeof(struct opt_##x) - \ #define OPT_SIZE(x) OPT_SIZE_CONV(sizeof(struct opt_##x) - \
sizeof(struct opt_hdr)) sizeof(struct opt_hdr))
#define OPT_VSIZE(x) (sizeof(struct opt_##x) - \
sizeof(struct opt_hdr))
/** /**
* struct opt_client_id - DHCPv6 Client Identifier option * struct opt_client_id - DHCPv6 Client Identifier option
@ -287,26 +291,30 @@ static struct resp_not_on_link_t {
/** /**
* dhcpv6_opt() - Get option from DHCPv6 message * dhcpv6_opt() - Get option from DHCPv6 message
* @o: First option header to check * @p: Packet pool, single packet with UDP header
* @offset: Offset to look at, 0: end of header, set to option start
* @type: Option type to look up, network order * @type: Option type to look up, network order
* @len: Remaining length, host order, modified on return
* *
* Return: pointer to option header, or NULL on malformed or missing option * Return: pointer to option header, or NULL on malformed or missing option
*/ */
static struct opt_hdr *dhcpv6_opt(struct opt_hdr *o, uint16_t type, size_t *len) static struct opt_hdr *dhcpv6_opt(struct pool *p, size_t *offset, uint16_t type)
{ {
while (*len >= sizeof(struct opt_hdr)) { struct opt_hdr *o;
unsigned int opt_len = ntohs(o->l) + sizeof(struct opt_hdr); size_t left;
if (opt_len > *len) if (!*offset)
*offset = sizeof(struct udphdr) + sizeof(struct msg_hdr);
while ((o = packet_get_try(p, 0, *offset, sizeof(*o), &left))) {
unsigned int opt_len = ntohs(o->l) + sizeof(*o);
if (ntohs(o->l) > left)
return NULL; return NULL;
*len -= opt_len;
if (o->t == type) if (o->t == type)
return o; return o;
o = (struct opt_hdr *)((uint8_t *)o + opt_len); *offset += opt_len;
} }
return NULL; return NULL;
@ -320,55 +328,39 @@ static struct opt_hdr *dhcpv6_opt(struct opt_hdr *o, uint16_t type, size_t *len)
* *
* Return: pointer to non-appropriate IA_NA or IA_TA, if any, NULL otherwise * Return: pointer to non-appropriate IA_NA or IA_TA, if any, NULL otherwise
*/ */
static struct opt_hdr *dhcpv6_ia_notonlink(struct opt_hdr *o, size_t rem_len, static struct opt_hdr *dhcpv6_ia_notonlink(struct pool *p, struct in6_addr *la)
struct in6_addr *addr)
{ {
struct opt_hdr *ia, *ia_addr;
char buf[INET6_ADDRSTRLEN]; char buf[INET6_ADDRSTRLEN];
struct in6_addr *req_addr; struct in6_addr *req_addr;
size_t len; struct opt_hdr *ia, *h;
size_t offset;
int ia_type; int ia_type;
ia_type = OPT_IA_NA; ia_type = OPT_IA_NA;
ia_ta: ia_ta:
len = rem_len; offset = 0;
ia = o; while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
if (ntohs(ia->l) < OPT_VSIZE(ia_na))
return NULL;
while ((ia = dhcpv6_opt(ia, ia_type, &len))) { offset += sizeof(struct opt_ia_na);
size_t ia_len = ntohs(ia->l);
if (ia_type == OPT_IA_NA) { while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
struct opt_ia_na *subopt = (struct opt_ia_na *)ia + 1; struct opt_ia_addr *opt_addr = (struct opt_ia_addr *)h;
ia_addr = (struct opt_hdr *)subopt; if (ntohs(h->l) != OPT_VSIZE(ia_addr))
} else if (ia_type == OPT_IA_TA) { return NULL;
struct opt_ia_ta *subopt = (struct opt_ia_ta *)ia + 1;
ia_addr = (struct opt_hdr *)subopt; req_addr = &opt_addr->addr;
} if (!IN6_ARE_ADDR_EQUAL(la, req_addr)) {
ia_len -= sizeof(struct opt_ia_na) - sizeof(struct opt_hdr);
while ((ia_addr = dhcpv6_opt(ia_addr, OPT_IAAADR, &ia_len))) {
struct opt_ia_addr *next;
req_addr = (struct in6_addr *)(ia_addr + 1);
if (!IN6_ARE_ADDR_EQUAL(addr, req_addr)) {
info("DHCPv6: requested address %s not on link", info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, req_addr, inet_ntop(AF_INET6, req_addr,
buf, sizeof(buf))); buf, sizeof(buf)));
return ia; return ia;
} }
next = (struct opt_ia_addr *)ia_addr + 1; offset += sizeof(struct opt_ia_addr);
ia_addr = (struct opt_hdr *)next;
} }
if (!ia_addr)
break;
ia = ia_addr;
} }
if (ia_type == OPT_IA_NA) { if (ia_type == OPT_IA_NA) {
@ -449,59 +441,58 @@ search:
/** /**
* dhcpv6() - Check if this is a DHCPv6 message, reply as needed * dhcpv6() - Check if this is a DHCPv6 message, reply as needed
* @c: Execution context * @c: Execution context
* @eh: Packet buffer, Ethernet header * @p: Packet pool, single packet starting from UDP header
* @len: Total L2 packet length * @saddr: Source IPv6 address of original message
* @daddr: Destination IPv6 address of original message
* *
* Return: 0 if it's not a DHCPv6 message, 1 if handled, -1 on failure * Return: 0 if it's not a DHCPv6 message, 1 if handled, -1 on failure
*/ */
int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len) int dhcpv6(struct ctx *c, struct pool *p,
const struct in6_addr *saddr, const struct in6_addr *daddr)
{ {
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
struct opt_hdr *ia, *bad_ia, *client_id, *server_id; struct opt_hdr *ia, *bad_ia, *client_id, *server_id;
struct in6_addr *src; struct in6_addr *src;
struct msg_hdr *mh; struct msg_hdr *mh;
struct udphdr *uh; struct udphdr *uh;
uint8_t proto; size_t mlen, n;
size_t mlen;
size_t n;
uh = (struct udphdr *)ipv6_l4hdr(ip6h, &proto); uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
if (!uh || proto != IPPROTO_UDP || uh->dest != htons(547)) if (!uh)
return -1;
if (uh->dest != htons(547))
return 0; return 0;
if (c->no_dhcpv6) if (c->no_dhcpv6)
return 1; return 1;
if (!IN6_IS_ADDR_MULTICAST(&ip6h->daddr)) if (!IN6_IS_ADDR_MULTICAST(daddr))
return -1; return -1;
mlen = len - ((intptr_t)uh - (intptr_t)eh) - sizeof(*uh); if (mlen + sizeof(*uh) != ntohs(uh->len) || mlen < sizeof(*mh))
if (mlen != ntohs(uh->len) - sizeof(*uh) ||
mlen < sizeof(struct msg_hdr))
return -1; return -1;
c->addr6_ll_seen = ip6h->saddr; c->addr6_ll_seen = *saddr;
if (IN6_IS_ADDR_LINKLOCAL(&c->gw6)) if (IN6_IS_ADDR_LINKLOCAL(&c->gw6))
src = &c->gw6; src = &c->gw6;
else else
src = &c->addr6_ll; src = &c->addr6_ll;
mh = (struct msg_hdr *)(uh + 1); mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL);
mlen -= sizeof(struct msg_hdr); if (!mh)
n = mlen;
client_id = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_CLIENTID, &n);
if (!client_id || ntohs(client_id->l) > ntohs(OPT_SIZE(client_id)))
return -1; return -1;
n = mlen; client_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_CLIENTID);
server_id = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_SERVERID, &n); if (!client_id || ntohs(client_id->l) > OPT_VSIZE(client_id))
return -1;
n = mlen; server_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_SERVERID);
ia = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_IA_NA, &n); if (server_id && ntohs(server_id->l) != OPT_VSIZE(server_id))
if (ia && ntohs(ia->l) < ntohs(OPT_SIZE(ia_na))) return -1;
ia = dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_NA);
if (ia && ntohs(ia->l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta)))
return -1; return -1;
resp.hdr.type = TYPE_REPLY; resp.hdr.type = TYPE_REPLY;
@ -516,18 +507,17 @@ int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len)
if (mh->type == TYPE_CONFIRM && server_id) if (mh->type == TYPE_CONFIRM && server_id)
return -1; return -1;
if ((bad_ia = dhcpv6_ia_notonlink((struct opt_hdr *)(mh + 1), if ((bad_ia = dhcpv6_ia_notonlink(p, &c->addr6))) {
mlen, &c->addr6))) {
info("DHCPv6: received CONFIRM with inappropriate IA," info("DHCPv6: received CONFIRM with inappropriate IA,"
" sending NotOnLink status in REPLY"); " sending NotOnLink status in REPLY");
n = ntohs(bad_ia->l) + sizeof(struct opt_hdr); bad_ia->l = htons(OPT_VSIZE(ia_na) +
bad_ia->l = htons(n - sizeof(struct opt_hdr) +
sizeof(sc_not_on_link)); sizeof(sc_not_on_link));
n = sizeof(struct opt_ia_na);
memcpy(resp_not_on_link.var, bad_ia, n); memcpy(resp_not_on_link.var, bad_ia, n);
memcpy(resp_not_on_link.var + n, &sc_not_on_link, memcpy(resp_not_on_link.var + n,
sizeof(sc_not_on_link)); &sc_not_on_link, sizeof(sc_not_on_link));
n += sizeof(sc_not_on_link); n += sizeof(sc_not_on_link);
memcpy(resp_not_on_link.var + n, client_id, memcpy(resp_not_on_link.var + n, client_id,
@ -552,8 +542,7 @@ int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len)
memcmp(&resp.server_id, server_id, sizeof(resp.server_id))) memcmp(&resp.server_id, server_id, sizeof(resp.server_id)))
return -1; return -1;
n = mlen; if (ia || dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_TA))
if (ia || dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_IA_TA, &n))
return -1; return -1;
info("DHCPv6: received INFORMATION_REQUEST, sending REPLY"); info("DHCPv6: received INFORMATION_REQUEST, sending REPLY");

View file

@ -3,5 +3,6 @@
* Author: Stefano Brivio <sbrivio@redhat.com> * Author: Stefano Brivio <sbrivio@redhat.com>
*/ */
int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len); int dhcpv6(struct ctx *c, struct pool *p,
struct in6_addr *saddr, struct in6_addr *daddr);
void dhcpv6_init(struct ctx *c); void dhcpv6_init(struct ctx *c);

28
icmp.c
View file

@ -31,9 +31,11 @@
#include <linux/icmpv6.h> #include <linux/icmpv6.h>
#include "packet.h"
#include "util.h" #include "util.h"
#include "passt.h" #include "passt.h"
#include "tap.h" #include "tap.h"
#include "packet.h"
#include "icmp.h" #include "icmp.h"
#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ #define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */
@ -134,17 +136,15 @@ void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* icmp_tap_handler() - Handle packets from tap * icmp_tap_handler() - Handle packets from tap
* @c: Execution context * @c: Execution context
* @af: Address family, AF_INET or AF_INET6 * @af: Address family, AF_INET or AF_INET6
* @ * @p: Packet pool, single packet with ICMP/ICMPv6 header
* @msg: Input message
* @count: Message count (always 1 for ICMP)
* @now: Current timestamp * @now: Current timestamp
* *
* Return: count of consumed packets (always 1, even if malformed) * Return: count of consumed packets (always 1, even if malformed)
*/ */
int icmp_tap_handler(struct ctx *c, int af, void *addr, int icmp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
struct tap_l4_msg *msg, int count, struct timespec *now) struct timespec *now)
{ {
(void)count; size_t plen;
if (af == AF_INET) { if (af == AF_INET) {
union icmp_epoll_ref iref = { .icmp.v6 = 0 }; union icmp_epoll_ref iref = { .icmp.v6 = 0 };
@ -155,9 +155,8 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct icmphdr *ih; struct icmphdr *ih;
int id, s; int id, s;
ih = (struct icmphdr *)(pkt_buf + msg[0].pkt_buf_offset); ih = packet_get(p, 0, 0, sizeof(*ih), &plen);
if (!ih)
if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO)
return 1; return 1;
sa.sin_port = ih->un.echo.id; sa.sin_port = ih->un.echo.id;
@ -175,7 +174,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
bitmap_set(icmp_act[V4], id); bitmap_set(icmp_act[V4], id);
sa.sin_addr = *(struct in_addr *)addr; sa.sin_addr = *(struct in_addr *)addr;
sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL, sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa)); (struct sockaddr *)&sa, sizeof(sa));
} else if (af == AF_INET6) { } else if (af == AF_INET6) {
union icmp_epoll_ref iref = { .icmp.v6 = 1 }; union icmp_epoll_ref iref = { .icmp.v6 = 1 };
@ -186,10 +185,11 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct icmp6hdr *ih; struct icmp6hdr *ih;
int id, s; int id, s;
ih = (struct icmp6hdr *)(pkt_buf + msg[0].pkt_buf_offset); ih = packet_get(p, 0, 0, sizeof(struct icmp6hdr), &plen);
if (!ih)
return 1;
if (msg[0].l4_len < sizeof(*ih) || if (ih->icmp6_type != 128 && ih->icmp6_type != 129)
(ih->icmp6_type != 128 && ih->icmp6_type != 129))
return 1; return 1;
sa.sin6_port = ih->icmp6_identifier; sa.sin6_port = ih->icmp6_identifier;
@ -207,7 +207,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
bitmap_set(icmp_act[V6], id); bitmap_set(icmp_act[V6], id);
sa.sin6_addr = *(struct in6_addr *)addr; sa.sin6_addr = *(struct in6_addr *)addr;
sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL, sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa)); (struct sockaddr *)&sa, sizeof(sa));
} }

4
icmp.h
View file

@ -12,8 +12,8 @@ struct ctx;
void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now); struct timespec *now);
int icmp_tap_handler(struct ctx *c, int af, void *addr, int icmp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
struct tap_l4_msg *msg, int count, struct timespec *now); struct timespec *now);
void icmp_timer(struct ctx *c, struct timespec *ts); void icmp_timer(struct ctx *c, struct timespec *ts);
/** /**

45
ndp.c
View file

@ -39,28 +39,23 @@
/** /**
* ndp() - Check for NDP solicitations, reply as needed * ndp() - Check for NDP solicitations, reply as needed
* @c: Execution context * @c: Execution context
* @len: Total L2 packet length * @ih: ICMPv6 header
* @eh: Packet buffer, Ethernet header * @eh_source: Source Ethernet address
* @saddr Source IPv6 address
* *
* Return: 0 if not handled here, 1 if handled, -1 on failure * Return: 0 if not handled here, 1 if handled, -1 on failure
*/ */
int ndp(struct ctx *c, struct ethhdr *eh, size_t len) int ndp(struct ctx *c, struct icmp6hdr *ih, unsigned char *eh_source,
struct in6_addr *saddr)
{ {
struct ethhdr *ehr;
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1), *ip6hr;
struct icmp6hdr *ih, *ihr;
char buf[BUFSIZ] = { 0 }; char buf[BUFSIZ] = { 0 };
uint8_t proto, *p; struct ipv6hdr *ip6hr;
struct icmp6hdr *ihr;
struct ethhdr *ehr;
unsigned char *p;
size_t len;
if (len < sizeof(*ehr) + sizeof(*ip6h) + sizeof(*ih)) if (ih->icmp6_type < RS || ih->icmp6_type > NA)
return 0;
ih = (struct icmp6hdr *)ipv6_l4hdr(ip6h, &proto);
if (!ih)
return -1;
if (proto != IPPROTO_ICMPV6 ||
ih->icmp6_type < RS || ih->icmp6_type > NA)
return 0; return 0;
if (c->no_ndp) if (c->no_ndp)
@ -71,11 +66,7 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
ihr = (struct icmp6hdr *)(ip6hr + 1); ihr = (struct icmp6hdr *)(ip6hr + 1);
if (ih->icmp6_type == NS) { if (ih->icmp6_type == NS) {
if (len < sizeof(*ehr) + sizeof(*ip6h) + sizeof(*ih) + if (IN6_IS_ADDR_UNSPECIFIED(saddr))
sizeof(struct in6_addr))
return -1;
if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->saddr))
return 1; return 1;
info("NDP: received NS, sending NA"); info("NDP: received NS, sending NA");
@ -149,7 +140,7 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
if (!c->no_dhcp_dns_search && dns_s_len) { if (!c->no_dhcp_dns_search && dns_s_len) {
*p++ = 31; /* DNSSL */ *p++ = 31; /* DNSSL */
*p++ = (len + 8 - 1) / 8 + 1; /* length */ *p++ = (dns_s_len + 8 - 1) / 8 + 1; /* length */
p += 2; /* reserved */ p += 2; /* reserved */
*(uint32_t *)p = htonl(60); /* lifetime */ *(uint32_t *)p = htonl(60); /* lifetime */
p += 4; p += 4;
@ -185,12 +176,12 @@ dns_done:
len = (uintptr_t)p - (uintptr_t)ihr - sizeof(*ihr); len = (uintptr_t)p - (uintptr_t)ihr - sizeof(*ihr);
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) if (IN6_IS_ADDR_LINKLOCAL(saddr))
c->addr6_ll_seen = ip6h->saddr; c->addr6_ll_seen = *saddr;
else else
c->addr6_seen = ip6h->saddr; c->addr6_seen = *saddr;
ip6hr->daddr = ip6h->saddr; ip6hr->daddr = *saddr;
if (IN6_IS_ADDR_LINKLOCAL(&c->gw6)) if (IN6_IS_ADDR_LINKLOCAL(&c->gw6))
ip6hr->saddr = c->gw6; ip6hr->saddr = c->gw6;
else else
@ -207,7 +198,7 @@ dns_done:
ip6hr->hop_limit = 255; ip6hr->hop_limit = 255;
len += sizeof(*ehr) + sizeof(*ip6hr) + sizeof(*ihr); len += sizeof(*ehr) + sizeof(*ip6hr) + sizeof(*ihr);
memcpy(ehr->h_dest, eh->h_source, ETH_ALEN); memcpy(ehr->h_dest, eh_source, ETH_ALEN);
memcpy(ehr->h_source, c->mac, ETH_ALEN); memcpy(ehr->h_source, c->mac, ETH_ALEN);
ehr->h_proto = htons(ETH_P_IPV6); ehr->h_proto = htons(ETH_P_IPV6);

3
ndp.h
View file

@ -3,4 +3,5 @@
* Author: Stefano Brivio <sbrivio@redhat.com> * Author: Stefano Brivio <sbrivio@redhat.com>
*/ */
int ndp(struct ctx *c, struct ethhdr *eh, size_t len); int ndp(struct ctx *c, struct icmp6hdr *ih, unsigned char *eh_source,
struct in6_addr *saddr);

134
packet.c Normal file
View file

@ -0,0 +1,134 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* packet.c - Packet abstraction: add packets to pool, flush, get packet data
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <netinet/ip6.h>
#include "packet.h"
#include "util.h"
/**
* packet_add_do() - Add data as packet descriptor to given pool
* @p: Existing pool
* @len: Length of new descriptor
* @start: Start of data
* @func: For tracing: name of calling function, NULL means no trace()
* @line: For tracing: caller line of function call
*/
void packet_add_do(struct pool *p, size_t len, const char *start,
const char *func, const int line)
{
size_t index = p->count;
if (index >= p->size) {
trace("add packet index %lu to pool with size %lu, %s:%i",
index, p->size, func, line);
return;
}
if (start < p->buf) {
trace("add packet start %p before buffer start %p, %s:%i",
start, p->buf, func, line);
return;
}
if (start + len > p->buf + p->buf_size) {
trace("add packet start %p, length: %lu, buffer end %p, %s:%i",
start, len, p->buf + p->buf_size, func, line);
return;
}
if (len > UINT16_MAX) {
trace("add packet length %lu, %s:%i", func, line);
return;
}
if ((unsigned int)((intptr_t)start - (intptr_t)p->buf) > UINT32_MAX) {
trace("add packet start %p, buffer start %lu, %s:%i",
start, p->buf, func, line);
return;
}
p->pkt[index].offset = start - p->buf;
p->pkt[index].len = len;
p->count++;
}
/**
* packet_get_do() - Get data range from packet descriptor from given pool
* @p: Packet pool
* @index: Index of packet descriptor in pool
* @offset: Offset of data range in packet descriptor
* @len: Length of desired data range
* @left: Length of available data after range, set on return, can be NULL
* @func: For tracing: name of calling function, NULL means no trace()
* @line: For tracing: caller line of function call
*
* Return: pointer to start of data range, NULL on invalid range or descriptor
*/
void *packet_get_do(struct pool *p, size_t index, size_t offset, size_t len,
size_t *left, const char *func, const int line)
{
if (index > p->size || index > p->count) {
if (func) {
trace("packet %lu from pool size: %lu, count: %lu, "
"%s:%i", index, p->size, p->count, func, line);
}
return NULL;
}
if (len > UINT16_MAX || len + offset > UINT32_MAX) {
if (func) {
trace("packet data length %lu, offset %lu, %s:%i",
len, offset, func, line);
}
return NULL;
}
if (p->pkt[index].offset + len + offset > p->buf_size) {
if (func) {
trace("packet offset plus length %lu from size %lu, "
"%s:%i", p->pkt[index].offset + len + offset,
p->buf_size, func, line);
}
return NULL;
}
if (len + offset > p->pkt[index].len) {
if (func) {
trace("data length %lu, offset %lu from length %lu, "
"%s:%i", len, offset, p->pkt[index].len,
func, line);
}
return NULL;
}
if (left)
*left = p->pkt[index].len - offset - len;
return p->buf + p->pkt[index].offset + offset;
}
/**
* pool_flush() - Flush a packet pool
* @p: Pointer to packet pool
*/
void pool_flush(struct pool *p)
{
p->count = 0;
}

81
packet.h Normal file
View file

@ -0,0 +1,81 @@
/* SPDX-License-Identifier: AGPL-3.0-or-later
* Copyright (c) 2022 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#ifndef PACKET_H
#define PACKET_H
/**
* struct desc - Generic offset-based descriptor within buffer
* @offset: Offset of descriptor relative to buffer start, 32-bit limit
* @len: Length of descriptor, host order, 16-bit limit
*/
struct desc {
uint32_t offset;
uint16_t len;
};
/**
* struct pool - Generic pool of packets stored in a buffer
* @buf: Buffer storing packet descriptors
* @buf_size: Total size of buffer
* @size: Number of usable descriptors for the pool
* @count: Number of used descriptors for the pool
* @pkt: Descriptors: see macros below
*/
struct pool {
char *buf;
size_t buf_size;
size_t size;
size_t count;
struct desc pkt[1];
};
void packet_add_do(struct pool *p, size_t len, const char *start,
const char *func, const int line);
void *packet_get_do(struct pool *p, size_t index, size_t offset, size_t len,
size_t *left, const char *func, const int line);
void pool_flush(struct pool *p);
#define packet_add(p, len, start) \
packet_add_do(p, len, start, __func__, __LINE__);
#define packet_get(p, index, offset, len, left) \
packet_get_do(p, index, offset, len, left, __func__, __LINE__);
#define packet_get_try(p, index, offset, len, left) \
packet_get_do(p, index, offset, len, left, NULL, 0)
#define PACKET_POOL_DECL(_name, _size, _buf) \
struct _name ## _t { \
char *buf; \
size_t buf_size; \
size_t size; \
size_t count; \
struct desc pkt[_size]; \
}
#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \
{ \
.buf_size = _buf_size, \
.buf = _buf, \
.size = _size, \
}
#define PACKET_POOL(name, size, buf, buf_size) \
PACKET_POOL_DECL(name, size, buf) name = \
PACKET_POOL_INIT_NOCAST(size, buf, buf_size)
#define PACKET_INIT(name, size, buf, buf_size) \
(struct name ## _t) PACKET_POOL_INIT_NOCAST(size, buf, buf_size)
#define PACKET_POOL_NOINIT(name, size, buf) \
PACKET_POOL_DECL(name, size, buf) name ## _storage; \
static struct pool *name = (struct pool *)&name ## _storage
#define PACKET_POOL_P(name, size, buf, buf_size) \
PACKET_POOL(name ## _storage, size, buf, buf_size); \
struct pool *name = (struct pool *)&name ## _storage
#endif /* PACKET_H */

View file

@ -28,6 +28,7 @@ struct tap_l4_msg {
union epoll_ref; union epoll_ref;
#include "packet.h"
#include "icmp.h" #include "icmp.h"
#include "tcp.h" #include "tcp.h"
#include "udp.h" #include "udp.h"

329
tap.c
View file

@ -51,10 +51,11 @@
#include "pcap.h" #include "pcap.h"
#include "netlink.h" #include "netlink.h"
#include "pasta.h" #include "pasta.h"
#include "packet.h"
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
static struct tap_msg seq4[TAP_MSGS]; static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
static struct tap_msg seq6[TAP_MSGS]; static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
/** /**
* tap_send() - Send frame, with qemu socket header if needed * tap_send() - Send frame, with qemu socket header if needed
@ -202,6 +203,8 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
} }
} }
PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
/** /**
* struct l4_seq4_t - Message sequence for one protocol handler call, IPv4 * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
* @msgs: Count of messages in sequence * @msgs: Count of messages in sequence
@ -212,8 +215,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
* @daddr: Destination address * @daddr: Destination address
* @msg: Array of messages that can be handled in a single call * @msg: Array of messages that can be handled in a single call
*/ */
static struct tap_l4_seq4 { static struct tap4_l4_t {
uint16_t msgs;
uint8_t protocol; uint8_t protocol;
uint16_t source; uint16_t source;
@ -222,8 +224,8 @@ static struct tap_l4_seq4 {
uint32_t saddr; uint32_t saddr;
uint32_t daddr; uint32_t daddr;
struct tap_l4_msg msg[UIO_MAXIOV]; struct pool_l4_t p;
} l4_seq4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; } tap4_l4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
/** /**
* struct l4_seq6_t - Message sequence for one protocol handler call, IPv6 * struct l4_seq6_t - Message sequence for one protocol handler call, IPv6
@ -235,8 +237,7 @@ static struct tap_l4_seq4 {
* @daddr: Destination address * @daddr: Destination address
* @msg: Array of messages that can be handled in a single call * @msg: Array of messages that can be handled in a single call
*/ */
static struct tap_l4_seq6 { static struct tap6_l4_t {
uint16_t msgs;
uint8_t protocol; uint8_t protocol;
uint16_t source; uint16_t source;
@ -245,8 +246,8 @@ static struct tap_l4_seq6 {
struct in6_addr saddr; struct in6_addr saddr;
struct in6_addr daddr; struct in6_addr daddr;
struct tap_l4_msg msg[UIO_MAXIOV]; struct pool_l4_t p;
} l4_seq6[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; } tap6_l4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
/** /**
* tap_packet_debug() - Print debug message for packet(s) from guest/tap * tap_packet_debug() - Print debug message for packet(s) from guest/tap
@ -258,8 +259,8 @@ static struct tap_l4_seq6 {
* @count: Count of packets in this sequence * @count: Count of packets in this sequence
*/ */
static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h, static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h,
struct tap_l4_seq4 *seq4, uint8_t proto6, struct tap4_l4_t *seq4, uint8_t proto6,
struct tap_l4_seq6 *seq6, int count) struct tap6_l4_t *seq6, int count)
{ {
char buf6s[INET6_ADDRSTRLEN], buf6d[INET6_ADDRSTRLEN]; char buf6s[INET6_ADDRSTRLEN], buf6d[INET6_ADDRSTRLEN];
char buf4s[INET_ADDRSTRLEN], buf4d[INET_ADDRSTRLEN]; char buf4s[INET_ADDRSTRLEN], buf4d[INET_ADDRSTRLEN];
@ -283,14 +284,15 @@ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h,
} }
if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) { if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) {
trace("protocol %i from tap: %s:%i -> %s:%i (%i packet%s)", trace("tap: protocol %i, %s%s%s:%i -> %s%s%s:%i (%i packet%s)",
proto, seq4 ? buf4s : buf6s, proto,
seq4 ? "" : "[", seq4 ? buf4s : buf6s, seq4 ? "" : "]",
ntohs(seq4 ? seq4->source : seq6->source), ntohs(seq4 ? seq4->source : seq6->source),
seq4 ? buf4d : buf6d, seq4 ? "" : "[", seq4 ? buf4d : buf6d, seq4 ? "" : "]",
ntohs(seq4 ? seq4->dest : seq6->dest), ntohs(seq4 ? seq4->dest : seq6->dest),
count, count == 1 ? "" : "s"); count, count == 1 ? "" : "s");
} else { } else {
trace("protocol %i from tap: %s -> %s (%i packet%s)", trace("tap: protocol %i, %s -> %s (%i packet%s)",
proto, iph ? buf4s : buf6s, iph ? buf4d : buf6d, proto, iph ? buf4s : buf6s, iph ? buf4d : buf6d,
count, count == 1 ? "" : "s"); count, count == 1 ? "" : "s");
} }
@ -299,78 +301,83 @@ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h,
/** /**
* tap4_handler() - IPv4 and ARP packet handler for tap file descriptor * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
* @c: Execution context * @c: Execution context
* @msg: Array of messages with IPv4 or ARP protocol * @in: Ingress packet pool, packets with Ethernet headers
* @count: Count of messages
* @now: Current timestamp * @now: Current timestamp
* *
* Return: count of packets consumed by handlers * Return: count of packets consumed by handlers
*/ */
static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count, static int tap4_handler(struct ctx *c, struct pool *in, struct timespec *now)
struct timespec *now)
{ {
unsigned int i, j, seq_count; unsigned int i, j, seq_count;
struct tap_l4_msg *l4_msg; struct tap4_l4_t *seq;
struct tap_l4_seq4 *seq;
size_t len, l4_len; if (!c->v4 || !in->count)
return in->count;
i = 0;
resume:
for (seq_count = 0, seq = NULL; i < in->count; i++) {
size_t l2_len, l3_len, hlen, l4_len;
struct ethhdr *eh; struct ethhdr *eh;
struct iphdr *iph; struct iphdr *iph;
struct udphdr *uh; struct udphdr *uh;
char *l4h; char *l4h;
if (!c->v4) packet_get(in, i, 0, 0, &l2_len);
return count;
i = 0; eh = packet_get(in, i, 0, sizeof(*eh), &l3_len);
resume: if (!eh)
for (seq_count = 0, seq = NULL; i < count; i++) { continue;
eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset); if (ntohs(eh->h_proto) == ETH_P_ARP) {
len = msg[i].len; PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
if (len < sizeof(*eh)) packet_add(pkt, l2_len, (char *)eh);
arp(c, pkt);
continue;
}
iph = packet_get(in, i, sizeof(*eh), sizeof(*iph), NULL);
if (!iph)
continue; continue;
if (ntohs(eh->h_proto) == ETH_P_ARP && arp(c, eh, len)) hlen = iph->ihl * 4UL;
if (hlen < sizeof(*iph) || htons(iph->tot_len) != l3_len ||
hlen > l3_len)
continue; continue;
if (len < sizeof(*eh) + sizeof(*iph)) l4_len = l3_len - hlen;
continue;
iph = (struct iphdr *)(eh + 1);
if ((size_t)iph->ihl * 4 + sizeof(*eh) > len)
continue;
if ((size_t)iph->ihl * 4 < (int)sizeof(*iph))
continue;
if (iph->saddr && c->addr4_seen != iph->saddr) { if (iph->saddr && c->addr4_seen != iph->saddr) {
c->addr4_seen = iph->saddr; c->addr4_seen = iph->saddr;
proto_update_l2_buf(NULL, NULL, &c->addr4_seen); proto_update_l2_buf(NULL, NULL, &c->addr4_seen);
} }
l4h = (char *)iph + (size_t)iph->ihl * 4; l4h = packet_get(in, i, sizeof(*eh) + hlen, l4_len, NULL);
l4_len = len - ((intptr_t)l4h - (intptr_t)eh); if (!l4h)
continue;
if (iph->protocol == IPPROTO_ICMP) { if (iph->protocol == IPPROTO_ICMP) {
struct tap_l4_msg icmp_msg = { l4h - pkt_buf, PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
l4_len };
if (l4_len < sizeof(struct icmphdr)) if (c->no_icmp)
continue; continue;
tap_packet_debug(iph, NULL, NULL, 0, NULL, 1); packet_add(pkt, l4_len, l4h);
if (!c->no_icmp) { icmp_tap_handler(c, AF_INET, &iph->daddr, pkt, now);
icmp_tap_handler(c, AF_INET, &iph->daddr,
&icmp_msg, 1, now);
}
continue; continue;
} }
if (l4_len < sizeof(*uh)) uh = packet_get(in, i, sizeof(*eh) + hlen, sizeof(*uh), NULL);
if (!uh)
continue; continue;
uh = (struct udphdr *)l4h; if (iph->protocol == IPPROTO_UDP) {
PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
if (iph->protocol == IPPROTO_UDP && dhcp(c, eh, len)) packet_add(pkt, l2_len, (char *)eh);
if (dhcp(c, pkt))
continue; continue;
}
if (iph->protocol != IPPROTO_TCP && if (iph->protocol != IPPROTO_TCP &&
iph->protocol != IPPROTO_UDP) { iph->protocol != IPPROTO_UDP) {
@ -392,147 +399,145 @@ resume:
seq->daddr = iph->daddr; \ seq->daddr = iph->daddr; \
} while (0) } while (0)
if (seq && L4_MATCH(iph, uh, seq) && seq->msgs < UIO_MAXIOV) if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
goto append; goto append;
for (seq = l4_seq4 + seq_count - 1; seq >= l4_seq4; seq--) { for (seq = tap4_l4 + seq_count - 1; seq >= tap4_l4; seq--) {
if (L4_MATCH(iph, uh, seq)) { if (L4_MATCH(iph, uh, seq)) {
if (seq->msgs >= UIO_MAXIOV) if (seq->p.count >= UIO_MAXIOV)
seq = NULL; seq = NULL;
break; break;
} }
} }
if (!seq || seq < l4_seq4) { if (!seq || seq < tap4_l4) {
seq = l4_seq4 + seq_count++; seq = tap4_l4 + seq_count++;
L4_SET(iph, uh, seq); L4_SET(iph, uh, seq);
seq->msgs = 0; pool_flush((struct pool *)&seq->p);
} }
#undef L4_MATCH #undef L4_MATCH
#undef L4_SET #undef L4_SET
append: append:
l4_msg = &seq->msg[seq->msgs++]; packet_add((struct pool *)&seq->p, l4_len, l4h);
l4_msg->pkt_buf_offset = l4h - pkt_buf;
l4_msg->l4_len = l4_len;
if (seq_count == UIO_MAXIOV) if (seq_count == UIO_MAXIOV)
break; /* Resume after flushing if i < count */ break; /* Resume after flushing if i < count */
} }
for (j = 0, seq = l4_seq4; j < seq_count; j++, seq++) { for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) {
int n = seq->msgs; struct pool *p = (struct pool *)&seq->p;
uint32_t *da = &seq->daddr;
l4_msg = seq->msg; size_t n = p->count;
tap_packet_debug(NULL, NULL, seq, 0, NULL, n); tap_packet_debug(NULL, NULL, seq, 0, NULL, n);
if (seq->protocol == IPPROTO_TCP) { if (seq->protocol == IPPROTO_TCP) {
if (c->no_tcp) if (c->no_tcp)
continue; continue;
while ((n -= tcp_tap_handler(c, AF_INET, &seq->daddr, while ((n -= tcp_tap_handler(c, AF_INET, da, p, now)));
l4_msg, n, now)));
} else if (seq->protocol == IPPROTO_UDP) { } else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp) if (c->no_udp)
continue; continue;
while ((n -= udp_tap_handler(c, AF_INET, &seq->daddr, while ((n -= udp_tap_handler(c, AF_INET, da, p, now)));
l4_msg, n, now)));
} }
} }
if (i < count) if (i < in->count)
goto resume; goto resume;
return count; return in->count;
} }
/** /**
* tap6_handler() - IPv6 packet handler for tap file descriptor * tap6_handler() - IPv6 packet handler for tap file descriptor
* @c: Execution context * @c: Execution context
* @msg: Array of messages with IPv6 protocol * @in: Ingress packet pool, packets with Ethernet headers
* @count: Count of messages
* @now: Current timestamp * @now: Current timestamp
* *
* Return: count of packets consumed by handlers * Return: count of packets consumed by handlers
*/ */
static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count, static int tap6_handler(struct ctx *c, struct pool *in, struct timespec *now)
struct timespec *now)
{ {
unsigned int i, j, seq_count = 0; unsigned int i, j, seq_count = 0;
struct tap_l4_msg *l4_msg; struct tap6_l4_t *seq;
struct tap_l4_seq6 *seq;
if (!c->v6 || !in->count)
return in->count;
i = 0;
resume:
for (seq_count = 0, seq = NULL; i < in->count; i++) {
size_t l4_len, plen, check;
struct in6_addr *saddr, *daddr;
struct ipv6hdr *ip6h; struct ipv6hdr *ip6h;
size_t len, l4_len;
struct ethhdr *eh; struct ethhdr *eh;
struct udphdr *uh; struct udphdr *uh;
uint8_t proto; uint8_t proto;
char *l4h; char *l4h;
if (!c->v6) eh = packet_get(in, i, 0, sizeof(*eh), NULL);
return count; if (!eh)
i = 0;
resume:
for (seq_count = 0, seq = NULL; i < count; i++) {
eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset);
len = msg[i].len;
if (len < sizeof(*eh))
continue; continue;
if (len < sizeof(*eh) + sizeof(*ip6h)) ip6h = packet_get(in, i, sizeof(*eh), sizeof(*ip6h), &check);
return 1; if (!ip6h)
continue;
ip6h = (struct ipv6hdr *)(eh + 1); saddr = &ip6h->saddr;
daddr = &ip6h->daddr;
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) { plen = ntohs(ip6h->payload_len);
c->addr6_ll_seen = ip6h->saddr; if (plen != check)
continue;
if (!(l4h = ipv6_l4hdr(in, i, sizeof(*eh), &proto, &l4_len)))
continue;
if (IN6_IS_ADDR_LINKLOCAL(saddr)) {
c->addr6_ll_seen = *saddr;
if (IN6_IS_ADDR_UNSPECIFIED(&c->addr6_seen)) { if (IN6_IS_ADDR_UNSPECIFIED(&c->addr6_seen)) {
c->addr6_seen = ip6h->saddr; c->addr6_seen = *saddr;
} }
} else { } else {
c->addr6_seen = ip6h->saddr; c->addr6_seen = *saddr;
} }
if (ntohs(ip6h->payload_len) >
len - sizeof(*eh) - sizeof(*ip6h))
continue;
if (!(l4h = ipv6_l4hdr(ip6h, &proto)))
continue;
l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
if (proto == IPPROTO_ICMPV6) { if (proto == IPPROTO_ICMPV6) {
struct tap_l4_msg icmpv6_msg = { l4h - pkt_buf, PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
l4_len };
if (c->no_icmp)
continue;
if (l4_len < sizeof(struct icmp6hdr)) if (l4_len < sizeof(struct icmp6hdr))
continue; continue;
if (ndp(c, eh, len)) if (ndp(c, (struct icmp6hdr *)l4h, eh->h_source, saddr))
continue; continue;
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
if (!c->no_icmp) {
icmp_tap_handler(c, AF_INET6, &ip6h->daddr, packet_add(pkt, l4_len, l4h);
&icmpv6_msg, 1, now); icmp_tap_handler(c, AF_INET6, daddr, pkt, now);
}
continue; continue;
} }
if (l4_len < sizeof(*uh)) if (l4_len < sizeof(*uh))
continue; continue;
uh = (struct udphdr *)l4h; uh = (struct udphdr *)l4h;
if (proto == IPPROTO_UDP && dhcpv6(c, eh, len)) if (proto == IPPROTO_UDP) {
continue; PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
ip6h->saddr = c->addr6; packet_add(pkt, l4_len, l4h);
if (dhcpv6(c, pkt, saddr, daddr))
continue;
}
*saddr = c->addr6;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
@ -542,73 +547,68 @@ resume:
#define L4_MATCH(ip6h, proto, uh, seq) \ #define L4_MATCH(ip6h, proto, uh, seq) \
(seq->protocol == proto && \ (seq->protocol == proto && \
seq->source == uh->source && seq->dest == uh->dest && \ seq->source == uh->source && seq->dest == uh->dest && \
IN6_ARE_ADDR_EQUAL(&seq->saddr, &ip6h->saddr) && \ IN6_ARE_ADDR_EQUAL(&seq->saddr, saddr) && \
IN6_ARE_ADDR_EQUAL(&seq->daddr, &ip6h->daddr)) IN6_ARE_ADDR_EQUAL(&seq->daddr, daddr))
#define L4_SET(ip6h, proto, uh, seq) \ #define L4_SET(ip6h, proto, uh, seq) \
do { \ do { \
seq->protocol = proto; \ seq->protocol = proto; \
seq->source = uh->source; \ seq->source = uh->source; \
seq->dest = uh->dest; \ seq->dest = uh->dest; \
seq->saddr = ip6h->saddr; \ seq->saddr = *saddr; \
seq->daddr = ip6h->daddr; \ seq->daddr = *daddr; \
} while (0) } while (0)
if (seq && L4_MATCH(ip6h, proto, uh, seq) && if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
seq->msgs < UIO_MAXIOV) seq->p.count < UIO_MAXIOV)
goto append; goto append;
for (seq = l4_seq6 + seq_count - 1; seq >= l4_seq6; seq--) { for (seq = tap6_l4 + seq_count - 1; seq >= tap6_l4; seq--) {
if (L4_MATCH(ip6h, proto, uh, seq)) { if (L4_MATCH(ip6h, proto, uh, seq)) {
if (seq->msgs >= UIO_MAXIOV) if (seq->p.count >= UIO_MAXIOV)
seq = NULL; seq = NULL;
break; break;
} }
} }
if (!seq || seq < l4_seq6) { if (!seq || seq < tap6_l4) {
seq = l4_seq6 + seq_count++; seq = tap6_l4 + seq_count++;
L4_SET(ip6h, proto, uh, seq); L4_SET(ip6h, proto, uh, seq);
seq->msgs = 0; pool_flush((struct pool *)&seq->p);
} }
#undef L4_MATCH #undef L4_MATCH
#undef L4_SET #undef L4_SET
append: append:
l4_msg = &seq->msg[seq->msgs++]; packet_add((struct pool *)&seq->p, l4_len, l4h);
l4_msg->pkt_buf_offset = l4h - pkt_buf;
l4_msg->l4_len = l4_len;
if (seq_count == UIO_MAXIOV) if (seq_count == UIO_MAXIOV)
break; /* Resume after flushing if i < count */ break; /* Resume after flushing if i < count */
} }
for (j = 0, seq = l4_seq6; j < seq_count; j++, seq++) { for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) {
int n = seq->msgs; struct pool *p = (struct pool *)&seq->p;
struct in6_addr *da = &seq->daddr;
l4_msg = seq->msg; size_t n = p->count;
tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n); tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n);
if (seq->protocol == IPPROTO_TCP) { if (seq->protocol == IPPROTO_TCP) {
if (c->no_tcp) if (c->no_tcp)
continue; continue;
while ((n -= tcp_tap_handler(c, AF_INET6, &seq->daddr, while ((n -= tcp_tap_handler(c, AF_INET6, da, p, now)));
l4_msg, n, now)));
} else if (seq->protocol == IPPROTO_UDP) { } else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp) if (c->no_udp)
continue; continue;
while ((n -= udp_tap_handler(c, AF_INET6, &seq->daddr, while ((n -= udp_tap_handler(c, AF_INET6, da, p, now)));
l4_msg, n, now)));
} }
} }
if (i < count) if (i < in->count)
goto resume; goto resume;
return count; return in->count;
} }
/** /**
@ -620,14 +620,16 @@ append:
*/ */
static int tap_handler_passt(struct ctx *c, struct timespec *now) static int tap_handler_passt(struct ctx *c, struct timespec *now)
{ {
int seq4_i, seq6_i;
struct ethhdr *eh; struct ethhdr *eh;
ssize_t n, rem; ssize_t n, rem;
char *p; char *p;
redo: redo:
p = pkt_buf; p = pkt_buf;
seq4_i = seq6_i = rem = 0; rem = 0;
pool_flush(pool_tap4);
pool_flush(pool_tap6);
n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT); n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT);
if (n < 0) { if (n < 0) {
@ -673,12 +675,10 @@ redo:
switch (ntohs(eh->h_proto)) { switch (ntohs(eh->h_proto)) {
case ETH_P_ARP: case ETH_P_ARP:
case ETH_P_IP: case ETH_P_IP:
seq4[seq4_i].pkt_buf_offset = p - pkt_buf; packet_add(pool_tap4, len, p);
seq4[seq4_i++].len = len;
break; break;
case ETH_P_IPV6: case ETH_P_IPV6:
seq6[seq6_i].pkt_buf_offset = p - pkt_buf; packet_add(pool_tap6, len, p);
seq6[seq6_i++].len = len;
break; break;
default: default:
break; break;
@ -689,11 +689,8 @@ next:
n -= len; n -= len;
} }
if (seq4_i) tap4_handler(c, pool_tap4, now);
tap4_handler(c, seq4, seq4_i, now); tap6_handler(c, pool_tap6, now);
if (seq6_i)
tap6_handler(c, seq6, seq6_i, now);
/* We can't use EPOLLET otherwise. */ /* We can't use EPOLLET otherwise. */
if (rem) if (rem)
@ -712,8 +709,10 @@ next:
static int tap_handler_pasta(struct ctx *c, struct timespec *now) static int tap_handler_pasta(struct ctx *c, struct timespec *now)
{ {
ssize_t n = 0, len; ssize_t n = 0, len;
int ret, seq4_i = 0, seq6_i = 0; int ret;
pool_flush(pool_tap4);
pool_flush(pool_tap6);
restart: restart:
while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) { while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n); struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n);
@ -733,12 +732,10 @@ restart:
switch (ntohs(eh->h_proto)) { switch (ntohs(eh->h_proto)) {
case ETH_P_ARP: case ETH_P_ARP:
case ETH_P_IP: case ETH_P_IP:
seq4[seq4_i].pkt_buf_offset = n; packet_add(pool_tap4, len, pkt_buf + n);
seq4[seq4_i++].len = len;
break; break;
case ETH_P_IPV6: case ETH_P_IPV6:
seq6[seq6_i].pkt_buf_offset = n; packet_add(pool_tap6, len, pkt_buf + n);
seq6[seq6_i++].len = len;
break; break;
default: default:
break; break;
@ -752,11 +749,8 @@ restart:
ret = errno; ret = errno;
if (seq4_i) tap4_handler(c, pool_tap4, now);
tap4_handler(c, seq4, seq4_i, now); tap6_handler(c, pool_tap6, now);
if (seq6_i)
tap6_handler(c, seq6, seq6_i, now);
if (len > 0 || ret == EAGAIN) if (len > 0 || ret == EAGAIN)
return 0; return 0;
@ -920,6 +914,17 @@ static void tap_sock_tun_init(struct ctx *c)
*/ */
void tap_sock_init(struct ctx *c) void tap_sock_init(struct ctx *c)
{ {
size_t sz = sizeof(pkt_buf);
int i;
pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz);
pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz);
for (i = 0; i < UIO_MAXIOV; i++) {
tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
}
if (c->fd_tap != -1) { if (c->fd_tap != -1) {
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
close(c->fd_tap); close(c->fd_tap);

368
tcp.c
View file

@ -303,6 +303,9 @@
#define TCP_FRAMES \ #define TCP_FRAMES \
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
#define TCP_FILE_PRESSURE 30 /* % of c->nofile */
#define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */
#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1) #define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1)
#define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \ #define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \
@ -440,6 +443,7 @@ struct tcp_conn {
#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) #define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1)
#define TCP_WS_BITS 4 /* RFC 7323 */ #define TCP_WS_BITS 4 /* RFC 7323 */
#define TCP_WS_MAX 14
unsigned int ws_from_tap :TCP_WS_BITS; unsigned int ws_from_tap :TCP_WS_BITS;
unsigned int ws_to_tap :TCP_WS_BITS; unsigned int ws_to_tap :TCP_WS_BITS;
@ -476,7 +480,6 @@ struct tcp_conn {
uint32_t seq_init_from_tap; uint32_t seq_init_from_tap;
}; };
#define CONN_IS_CLOSED(conn) (conn->events == CLOSED)
#define CONN_IS_CLOSING(conn) \ #define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \ ((conn->events & ESTABLISHED) && \
(conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@ -699,7 +702,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
return EPOLLET; return EPOLLET;
if (conn_flags & STALLED) if (conn_flags & STALLED)
return EPOLLIN | EPOLLRDHUP | EPOLLET; return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
return EPOLLIN | EPOLLRDHUP; return EPOLLIN | EPOLLRDHUP;
} }
@ -733,8 +736,11 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
.r.p.tcp.tcp.v6 = CONN_V6(conn) }; .r.p.tcp.tcp.v6 = CONN_V6(conn) };
struct epoll_event ev = { .data.u64 = ref.u64 }; struct epoll_event ev = { .data.u64 = ref.u64 };
if (CONN_IS_CLOSED(conn)) { if (conn->events == CLOSED) {
if (conn->flags & IN_EPOLL)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
if (conn->timer != -1)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
return 0; return 0;
} }
@ -745,6 +751,18 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
conn->flags |= IN_EPOLL; /* No need to log this */ conn->flags |= IN_EPOLL; /* No need to log this */
if (conn->timer != -1) {
union epoll_ref ref_t = { .r.proto = IPPROTO_TCP,
.r.s = conn->sock,
.r.p.tcp.tcp.timer = 1,
.r.p.tcp.tcp.index = conn - tc };
struct epoll_event ev_t = { .data.u64 = ref_t.u64,
.events = EPOLLIN | EPOLLET };
if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
return -errno;
}
return 0; return 0;
} }
@ -759,6 +777,9 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn)
{ {
struct itimerspec it = { { 0 }, { 0 } }; struct itimerspec it = { { 0 }, { 0 } };
if (conn->events == CLOSED)
return;
if (conn->timer == -1) { if (conn->timer == -1) {
union epoll_ref ref = { .r.proto = IPPROTO_TCP, union epoll_ref ref = { .r.proto = IPPROTO_TCP,
.r.s = conn->sock, .r.s = conn->sock,
@ -783,15 +804,11 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn)
} }
} }
if (conn->events == CLOSED) { if (conn->flags & ACK_TO_TAP_DUE) {
it.it_value.tv_nsec = 1;
} else if (conn->flags & ACK_TO_TAP_DUE) {
it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
} else if (conn->flags & ACK_FROM_TAP_DUE) { } else if (conn->flags & ACK_FROM_TAP_DUE) {
if (!(conn->events & ESTABLISHED)) if (!(conn->events & ESTABLISHED))
it.it_value.tv_sec = SYN_TIMEOUT; it.it_value.tv_sec = SYN_TIMEOUT;
else if (conn->events & TAP_FIN_SENT)
it.it_value.tv_sec = FIN_TIMEOUT;
else else
it.it_value.tv_sec = ACK_TIMEOUT; it.it_value.tv_sec = ACK_TIMEOUT;
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
@ -834,7 +851,9 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn *conn,
if (flag == STALLED || flag == ~STALLED) if (flag == STALLED || flag == ~STALLED)
tcp_epoll_ctl(c, conn); tcp_epoll_ctl(c, conn);
if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE) if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
(flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
(flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE)))
tcp_timer_ctl(c, conn); tcp_timer_ctl(c, conn);
} }
@ -888,7 +907,7 @@ static void conn_event_do(struct ctx *c, struct tcp_conn *conn,
else else
tcp_epoll_ctl(c, conn); tcp_epoll_ctl(c, conn);
if (event == CLOSED || CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
tcp_timer_ctl(c, conn); tcp_timer_ctl(c, conn);
} }
@ -1182,36 +1201,32 @@ static void tcp_sock6_iov_init(void)
/** /**
* tcp_opt_get() - Get option, and value if any, from TCP header * tcp_opt_get() - Get option, and value if any, from TCP header
* @th: Pointer to TCP header * @opts: Pointer to start of TCP options in header
* @len: Length of buffer, including TCP header * @len: Length of buffer, excluding TCP header -- NOT checked here!
* @type_find: Option type to look for * @type_find: Option type to look for
* @optlen_set: Optional, filled with option length if passed * @optlen_set: Optional, filled with option length if passed
* @value_set: Optional, set to start of option value if passed * @value_set: Optional, set to start of option value if passed
* *
* Return: option value, meaningful for up to 4 bytes, -1 if not found * Return: option value, meaningful for up to 4 bytes, -1 if not found
*/ */
static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find, static int tcp_opt_get(char *opts, size_t len, uint8_t type_find,
uint8_t *optlen_set, char **value_set) uint8_t *optlen_set, char **value_set)
{ {
uint8_t type, optlen; uint8_t type, optlen;
char *p;
if (len > (size_t)th->doff * 4) if (!len)
len = (size_t)th->doff * 4; return -1;
len -= sizeof(*th); for (; len >= 2; opts += optlen, len -= optlen) {
p = (char *)(th + 1); switch (*opts) {
for (; len >= 2; p += optlen, len -= optlen) {
switch (*p) {
case OPT_EOL: case OPT_EOL:
return -1; return -1;
case OPT_NOP: case OPT_NOP:
optlen = 1; optlen = 1;
break; break;
default: default:
type = *(p++); type = *(opts++);
optlen = *(p++) - 2; optlen = *(opts++) - 2;
len -= 2; len -= 2;
if (type != type_find) if (type != type_find)
@ -1220,17 +1235,17 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find,
if (optlen_set) if (optlen_set)
*optlen_set = optlen; *optlen_set = optlen;
if (value_set) if (value_set)
*value_set = p; *value_set = opts;
switch (optlen) { switch (optlen) {
case 0: case 0:
return 0; return 0;
case 1: case 1:
return *p; return *opts;
case 2: case 2:
return ntohs(*(uint16_t *)p); return ntohs(*(uint16_t *)opts);
default: default:
return ntohl(*(uint32_t *)p); return ntohl(*(uint32_t *)opts);
} }
} }
} }
@ -1415,12 +1430,12 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
if ((hole - tc) == --c->tcp.conn_count) { if ((hole - tc) == --c->tcp.conn_count) {
debug("TCP: hash table compaction: index %i (%p) was max index", debug("TCP: hash table compaction: index %i (%p) was max index",
hole - tc, hole); hole - tc, hole);
memset(hole, 0, sizeof(*hole));
return; return;
} }
from = CONN(c->tcp.conn_count); from = CONN(c->tcp.conn_count);
memcpy(hole, from, sizeof(*hole)); memcpy(hole, from, sizeof(*hole));
from->flags = from->events = 0;
to = hole; to = hole;
tcp_hash_update(from, to); tcp_hash_update(from, to);
@ -1430,25 +1445,23 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
debug("TCP: hash table compaction: old index %i, new index %i, " debug("TCP: hash table compaction: old index %i, new index %i, "
"sock %i, from: %p, to: %p", "sock %i, from: %p, to: %p",
from - tc, to - tc, from->sock, from, to); from - tc, to - tc, from->sock, from, to);
memset(from, 0, sizeof(*from));
} }
/** /**
* tcp_conn_destroy() - Close connection, drop from epoll file descriptor * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
*/ */
static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn) static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn)
{ {
if (CONN_IS_CLOSED(conn))
return;
conn_event(c, conn, CLOSED);
conn->flags = 0;
close(conn->sock); close(conn->sock);
if (conn->timer != -1)
close(conn->timer);
/* Removal from hash table and connection table compaction deferred to tcp_hash_remove(conn);
* timer. tcp_table_compact(c, conn);
*/
} }
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn); static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn);
@ -1582,9 +1595,23 @@ static void tcp_l2_data_buf_flush(struct ctx *c)
*/ */
void tcp_defer_handler(struct ctx *c) void tcp_defer_handler(struct ctx *c)
{ {
int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE;
int max_files = c->nofile / 100 * TCP_FILE_PRESSURE;
struct tcp_conn *conn;
tcp_l2_flags_buf_flush(c); tcp_l2_flags_buf_flush(c);
tcp_l2_data_buf_flush(c); tcp_l2_data_buf_flush(c);
tcp_splice_defer_handler(c); tcp_splice_defer_handler(c);
if (c->tcp.conn_count < MIN(max_files, max_conns))
return;
for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
if (conn->events == CLOSED)
tcp_conn_destroy(c, conn);
}
} }
/** /**
@ -1610,7 +1637,13 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn,
b->th.dest = htons(conn->tap_port); \ b->th.dest = htons(conn->tap_port); \
b->th.seq = htonl(seq); \ b->th.seq = htonl(seq); \
b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
b->th.window = htons(MIN(conn->wnd_to_tap, USHRT_MAX)); \ if (conn->events & ESTABLISHED) { \
b->th.window = htons(conn->wnd_to_tap); \
} else { \
unsigned wnd = conn->wnd_to_tap << conn->ws_to_tap; \
\
b->th.window = htons(MIN(wnd, USHRT_MAX)); \
} \
} while (0) } while (0)
if (CONN_V6(conn)) { if (CONN_V6(conn)) {
@ -1692,7 +1725,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
conn->seq_ack_to_tap = prev_ack_to_tap; conn->seq_ack_to_tap = prev_ack_to_tap;
#else #else
if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn) if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
|| CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) { || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
conn->seq_ack_to_tap = conn->seq_from_tap; conn->seq_ack_to_tap = conn->seq_from_tap;
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) { } else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
if (!tinfo) { if (!tinfo) {
@ -1712,18 +1745,20 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
if (!KERNEL_REPORTS_SND_WND(c)) { if (!KERNEL_REPORTS_SND_WND(c)) {
tcp_get_sndbuf(conn); tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
conn->wnd_to_tap = new_wnd_to_tap >> conn->ws_to_tap; conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
USHRT_MAX);
goto out; goto out;
} }
if (!tinfo) { if (!tinfo) {
if (prev_wnd_to_tap > WINDOW_DEFAULT) if (prev_wnd_to_tap > WINDOW_DEFAULT) {
goto out; goto out;
}
tinfo = &tinfo_new; tinfo = &tinfo_new;
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
goto out; goto out;
} }
}
#ifdef HAS_SND_WND #ifdef HAS_SND_WND
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
@ -1735,10 +1770,15 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
} }
#endif #endif
conn->wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW) >> conn->ws_to_tap; new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
if (!(conn->events & ESTABLISHED))
new_wnd_to_tap = MAX(new_wnd_to_tap, WINDOW_DEFAULT);
conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, USHRT_MAX);
if (!conn->wnd_to_tap) if (!conn->wnd_to_tap)
conn_flag(c, conn, ACK_TO_TAP_DUE); conn_flag(c, conn, ACK_TO_TAP_DUE);
out: out:
return new_wnd_to_tap != prev_wnd_to_tap || return new_wnd_to_tap != prev_wnd_to_tap ||
conn->seq_ack_to_tap != prev_ack_to_tap; conn->seq_ack_to_tap != prev_ack_to_tap;
@ -1772,10 +1812,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
return 0; return 0;
if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
tcp_conn_destroy(c, conn); conn_event(c, conn, CLOSED);
return -ECONNRESET; return -ECONNRESET;
} }
#ifdef HAS_SND_WND
if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
c->tcp.kernel_snd_wnd = 1;
#endif
if (!(conn->flags & LOCAL)) if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo); tcp_rtt_dst_check(conn, &tinfo);
@ -1825,11 +1870,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
data += OPT_MSS_LEN - 2; data += OPT_MSS_LEN - 2;
th->doff += OPT_MSS_LEN / 4; th->doff += OPT_MSS_LEN / 4;
#ifdef HAS_SND_WND
if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
c->tcp.kernel_snd_wnd = 1;
#endif
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale); conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
*data++ = OPT_NOP; *data++ = OPT_NOP;
@ -1854,10 +1894,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
NULL, conn->seq_to_tap); NULL, conn->seq_to_tap);
iov->iov_len = eth_len + sizeof(uint32_t); iov->iov_len = eth_len + sizeof(uint32_t);
/* First value is not scaled: scale now */
if (flags & SYN)
conn->wnd_to_tap >>= conn->ws_to_tap;
if (CONN_V4(conn)) if (CONN_V4(conn))
tcp4_l2_flags_buf_bytes += iov->iov_len; tcp4_l2_flags_buf_bytes += iov->iov_len;
else else
@ -1905,69 +1941,56 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
*/ */
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn) static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn)
{ {
if (CONN_IS_CLOSED(conn)) if (conn->events == CLOSED)
return; return;
if (!tcp_send_flag(c, conn, RST)) if (!tcp_send_flag(c, conn, RST))
tcp_conn_destroy(c, conn); conn_event(c, conn, CLOSED);
} }
/** /**
* tcp_clamp_window() - Set window and scaling from option, clamp on socket * tcp_get_tap_ws() - Get Window Scaling option for connection from tap/guest
* @conn: Connection pointer * @conn: Connection pointer
* @th: TCP header, from tap, can be NULL if window is passed * @opts: Pointer to start of TCP options
* @len: Buffer length, at L4, can be 0 if no header is passed * @optlen: Bytes in options: caller MUST ensure available length
* @window: Window value, host order, unscaled, if no header is passed
* @init: Set if this is the very first segment from tap
*/ */
static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, static void tcp_get_tap_ws(struct tcp_conn *conn, char *opts, size_t optlen)
struct tcphdr *th, int len, unsigned int window,
int init)
{ {
if (init && th) { int ws = tcp_opt_get(opts, optlen, OPT_WS, NULL, NULL);
int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
conn->ws_from_tap = ws & 0xf; if (ws >= 0 && ws <= TCP_WS_MAX)
conn->ws_from_tap = ws;
else
conn->ws_from_tap = 0;
}
/* RFC 7323, 2.2: first value is not scaled. Also, don't clamp /**
* yet, to avoid getting a zero scale just because we set a * tcp_clamp_window() - Set new window for connection, clamp on socket
* small window now. * @c: Execution context
* @conn: Connection pointer
* @window: Window value, host order, unscaled
*/ */
conn->wnd_from_tap = ntohs(th->window); static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, unsigned wnd)
} else { {
uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap; uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap;
if (th) wnd <<= conn->ws_from_tap;
window = ntohs(th->window) << conn->ws_from_tap; wnd = MIN(MAX_WINDOW, wnd);
else
window <<= conn->ws_from_tap;
window = MIN(MAX_WINDOW, window);
if (conn->flags & WND_CLAMPED) { if (conn->flags & WND_CLAMPED) {
if (prev_scaled == window) if (prev_scaled == wnd)
return; return;
/* Discard +/- 1% updates to spare some syscalls. */ /* Discard +/- 1% updates to spare some syscalls. */
if ((window > prev_scaled && if ((wnd > prev_scaled && wnd * 99 / 100 < prev_scaled) ||
window * 99 / 100 < prev_scaled) || (wnd < prev_scaled && wnd * 101 / 100 > prev_scaled))
(window < prev_scaled &&
window * 101 / 100 > prev_scaled)) {
conn->wnd_from_tap = window >>
conn->ws_from_tap;
return; return;
} }
}
if (window < 256) conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
window = 256; setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &wnd, sizeof(wnd));
conn->wnd_from_tap = window >> conn->ws_from_tap;
setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP,
&window, sizeof(window));
conn_flag(c, conn, WND_CLAMPED); conn_flag(c, conn, WND_CLAMPED);
} }
}
/** /**
* tcp_seq_init() - Calculate initial sequence number according to RFC 6528 * tcp_seq_init() - Calculate initial sequence number according to RFC 6528
@ -2059,18 +2082,18 @@ static int tcp_conn_new_sock(struct ctx *c, sa_family_t af)
* tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest * tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
* @th: TCP header send by tap/guest * @opts: Pointer to start of TCP options
* @len: L4 packet length, host order * @optlen: Bytes in options: caller MUST ensure available length
* *
* Return: clamped MSS value * Return: clamped MSS value
*/ */
static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn, static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
struct tcphdr *th, size_t len) char *opts, size_t optlen)
{ {
unsigned int mss; unsigned int mss;
int ret; int ret;
if ((ret = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0) if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
mss = MSS_DEFAULT; mss = MSS_DEFAULT;
else else
mss = ret; mss = ret;
@ -2091,12 +2114,13 @@ static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
* @c: Execution context * @c: Execution context
* @af: Address family, AF_INET or AF_INET6 * @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to sin_addr or sin6_addr * @addr: Remote address, pointer to sin_addr or sin6_addr
* @th: TCP header from tap * @th: TCP header from tap: caller MUST ensure it's there
* @len: Packet length at L4 * @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp * @now: Current timestamp
*/ */
static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
struct tcphdr *th, size_t len, struct tcphdr *th, char *opts, size_t optlen,
struct timespec *now) struct timespec *now)
{ {
struct sockaddr_in addr4 = { struct sockaddr_in addr4 = {
@ -2142,16 +2166,21 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
conn = CONN(c->tcp.conn_count++); conn = CONN(c->tcp.conn_count++);
conn->sock = s; conn->sock = s;
conn->timer = -1; conn->timer = -1;
conn->ws_to_tap = conn->ws_from_tap = 0;
conn_event(c, conn, TAP_SYN_RCVD); conn_event(c, conn, TAP_SYN_RCVD);
conn->wnd_to_tap = WINDOW_DEFAULT; conn->wnd_to_tap = WINDOW_DEFAULT;
mss = tcp_conn_tap_mss(c, conn, th, len); mss = tcp_conn_tap_mss(c, conn, opts, optlen);
setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)); setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss));
MSS_SET(conn, mss); MSS_SET(conn, mss);
tcp_clamp_window(c, conn, th, len, 0, 1); tcp_get_tap_ws(conn, opts, optlen);
/* RFC 7323, 2.2: first value is not scaled. Also, don't clamp yet, to
* avoid getting a zero scale just because we set a small window now.
*/
if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
conn->wnd_from_tap = 1;
if (af == AF_INET) { if (af == AF_INET) {
sa = (struct sockaddr *)&addr4; sa = (struct sockaddr *)&addr4;
@ -2395,53 +2424,52 @@ zero_len:
} }
/** /**
* tcp_data_from_tap() - tap data for established connection * tcp_data_from_tap() - tap/guest data for established connection
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
* @msg: Array of messages from tap * @p: Pool of TCP packets, with TCP headers
* @count: Count of messages
* *
* #syscalls sendmsg * #syscalls sendmsg
*/ */
static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
struct tap_l4_msg *msg, int count) struct pool *p)
{ {
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1; int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
uint16_t max_ack_seq_wnd = conn->wnd_from_tap; uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
uint32_t max_ack_seq = conn->seq_ack_from_tap; uint32_t max_ack_seq = conn->seq_ack_from_tap;
uint32_t seq_from_tap = conn->seq_from_tap; uint32_t seq_from_tap = conn->seq_from_tap;
struct msghdr mh = { .msg_iov = tcp_iov }; struct msghdr mh = { .msg_iov = tcp_iov };
int partial_send = 0; size_t len;
uint16_t len;
ssize_t n; ssize_t n;
for (i = 0, iov_i = 0; i < count; i++) { for (i = 0, iov_i = 0; i < (int)p->count; i++) {
uint32_t seq, seq_offset, ack_seq; uint32_t seq, seq_offset, ack_seq;
struct tcphdr *th; struct tcphdr *th;
char *data; char *data;
size_t off; size_t off;
th = (struct tcphdr *)(pkt_buf + msg[i].pkt_buf_offset); packet_get(p, i, 0, 0, &len);
len = msg[i].l4_len; th = packet_get(p, i, 0, sizeof(*th), NULL);
if (!th) {
if (len < sizeof(*th)) {
tcp_rst(c, conn); tcp_rst(c, conn);
return; return;
} }
off = (size_t)th->doff * 4; off = th->doff * 4UL;
if (off < sizeof(*th) || off > len) { if (off < sizeof(*th) || off > len) {
tcp_rst(c, conn); tcp_rst(c, conn);
return; return;
} }
if (th->rst) { if (th->rst) {
tcp_conn_destroy(c, conn); conn_event(c, conn, CLOSED);
return; return;
} }
len -= off; len -= off;
data = (char *)th + off; data = packet_get(p, i, off, len, NULL);
if (!data)
continue;
seq = ntohl(th->seq); seq = ntohl(th->seq);
ack_seq = ntohl(th->ack_seq); ack_seq = ntohl(th->ack_seq);
@ -2511,7 +2539,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
i = keep - 1; i = keep - 1;
} }
tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0); tcp_clamp_window(c, conn, max_ack_seq_wnd);
if (ack) { if (ack) {
if (max_ack_seq == conn->seq_to_tap) { if (max_ack_seq == conn->seq_to_tap) {
@ -2595,14 +2623,22 @@ out:
* tcp_conn_from_sock_finish() - Complete connection setup after connect() * tcp_conn_from_sock_finish() - Complete connection setup after connect()
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
* @th: TCP header of SYN, ACK segment from tap/guest * @th: TCP header of SYN, ACK segment: caller MUST ensure it's there
* @len: Packet length of SYN, ACK segment at L4, host order * @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
*/ */
static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
struct tcphdr *th, size_t len) struct tcphdr *th,
char *opts, size_t optlen)
{ {
tcp_clamp_window(c, conn, th, len, 0, 1); tcp_clamp_window(c, conn, ntohs(th->window));
MSS_SET(conn, tcp_conn_tap_mss(c, conn, th, len)); tcp_get_tap_ws(conn, opts, optlen);
/* First value is not scaled */
if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
conn->wnd_from_tap = 1;
MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
conn->seq_init_from_tap = ntohl(th->seq) + 1; conn->seq_init_from_tap = ntohl(th->seq) + 1;
conn->seq_from_tap = conn->seq_init_from_tap; conn->seq_from_tap = conn->seq_init_from_tap;
@ -2622,32 +2658,42 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
* @c: Execution context * @c: Execution context
* @af: Address family, AF_INET or AF_INET6 * @af: Address family, AF_INET or AF_INET6
* @addr: Destination address * @addr: Destination address
* @msg: Input messages * @p: Pool of TCP packets, with TCP headers
* @count: Message count
* @now: Current timestamp * @now: Current timestamp
* *
* Return: count of consumed packets * Return: count of consumed packets
*/ */
int tcp_tap_handler(struct ctx *c, int af, void *addr, int tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
struct tap_l4_msg *msg, int count, struct timespec *now) struct timespec *now)
{ {
struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset);
uint16_t len = msg[0].l4_len;
struct tcp_conn *conn; struct tcp_conn *conn;
size_t optlen, len;
struct tcphdr *th;
int ack_due = 0; int ack_due = 0;
char *opts;
packet_get(p, 0, 0, 0, &len);
th = packet_get(p, 0, 0, sizeof(*th), NULL);
if (!th)
return 1;
optlen = th->doff * 4UL - sizeof(*th);
opts = packet_get(p, 0, sizeof(*th), optlen, NULL);
conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest)); conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
/* New connection from tap */ /* New connection from tap */
if (!conn) { if (!conn) {
if (th->syn && !th->ack) if (th->syn && !th->ack)
tcp_conn_from_tap(c, af, addr, th, len, now); tcp_conn_from_tap(c, af, addr, th, opts, optlen, now);
return 1; return 1;
} }
trace("TCP: packet length %lu from tap for index %lu", len, conn - tc);
if (th->rst) { if (th->rst) {
tcp_conn_destroy(c, conn); conn_event(c, conn, CLOSED);
return count; return p->count;
} }
if (th->ack) { if (th->ack) {
@ -2660,7 +2706,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
/* Establishing connection from socket */ /* Establishing connection from socket */
if (conn->events & SOCK_ACCEPTED) { if (conn->events & SOCK_ACCEPTED) {
if (th->syn && th->ack && !th->fin) if (th->syn && th->ack && !th->fin)
tcp_conn_from_sock_finish(c, conn, th, len); tcp_conn_from_sock_finish(c, conn, th, opts, optlen);
else else
tcp_rst(c, conn); tcp_rst(c, conn);
@ -2671,7 +2717,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->events & TAP_SYN_RCVD) { if (conn->events & TAP_SYN_RCVD) {
if (!(conn->events & TAP_SYN_ACK_SENT)) { if (!(conn->events & TAP_SYN_ACK_SENT)) {
tcp_rst(c, conn); tcp_rst(c, conn);
return count; return p->count;
} }
conn_event(c, conn, ESTABLISHED); conn_event(c, conn, ESTABLISHED);
@ -2683,17 +2729,19 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
tcp_send_flag(c, conn, ACK); tcp_send_flag(c, conn, ACK);
conn_event(c, conn, SOCK_FIN_SENT); conn_event(c, conn, SOCK_FIN_SENT);
return count; return p->count;
} }
if (!th->ack) { if (!th->ack) {
tcp_rst(c, conn); tcp_rst(c, conn);
return count; return p->count;
} }
tcp_clamp_window(c, conn, th, len, 0, 0); tcp_clamp_window(c, conn, ntohs(th->window));
if (count == 1) tcp_data_from_sock(c, conn);
if (p->count == 1)
return 1; return 1;
} }
@ -2701,13 +2749,13 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->events & TAP_FIN_RCVD) { if (conn->events & TAP_FIN_RCVD) {
if (conn->events & SOCK_FIN_RCVD && if (conn->events & SOCK_FIN_RCVD &&
conn->seq_ack_from_tap == conn->seq_to_tap) conn->seq_ack_from_tap == conn->seq_to_tap)
tcp_conn_destroy(c, conn); conn_event(c, conn, CLOSED);
return 1; return 1;
} }
/* Established connections accepting data from tap */ /* Established connections accepting data from tap */
tcp_data_from_tap(c, conn, msg, count); tcp_data_from_tap(c, conn, p);
if (conn->seq_ack_to_tap != conn->seq_from_tap) if (conn->seq_ack_to_tap != conn->seq_from_tap)
ack_due = 1; ack_due = 1;
@ -2721,7 +2769,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (ack_due) if (ack_due)
conn_flag(c, conn, ACK_TO_TAP_DUE); conn_flag(c, conn, ACK_TO_TAP_DUE);
return count; return p->count;
} }
/** /**
@ -2872,7 +2920,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
if (!(conn->events & ESTABLISHED)) { if (!(conn->events & ESTABLISHED)) {
debug("TCP: index %i, handshake timeout", conn - tc); debug("TCP: index %i, handshake timeout", conn - tc);
tcp_rst(c, conn); tcp_rst(c, conn);
} else if (conn->events & TAP_FIN_SENT) { } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
debug("TCP: index %i, FIN timeout", conn - tc); debug("TCP: index %i, FIN timeout", conn - tc);
tcp_rst(c, conn); tcp_rst(c, conn);
} else if (conn->retrans == TCP_MAX_RETRANS) { } else if (conn->retrans == TCP_MAX_RETRANS) {
@ -2884,6 +2932,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
conn->retrans++; conn->retrans++;
conn->seq_to_tap = conn->seq_ack_from_tap; conn->seq_to_tap = conn->seq_ack_from_tap;
tcp_data_from_sock(c, conn); tcp_data_from_sock(c, conn);
tcp_timer_ctl(c, conn);
} }
} else { } else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@ -2933,19 +2982,22 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
if (!(conn = CONN_OR_NULL(ref.r.p.tcp.tcp.index))) if (!(conn = CONN_OR_NULL(ref.r.p.tcp.tcp.index)))
return; return;
if (conn->events == CLOSED)
return;
if (events & EPOLLERR) { if (events & EPOLLERR) {
tcp_rst(c, conn); tcp_rst(c, conn);
return; return;
} }
if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) { if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
tcp_conn_destroy(c, conn); conn_event(c, conn, CLOSED);
return; return;
} }
if (conn->events & ESTABLISHED) { if (conn->events & ESTABLISHED) {
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
tcp_conn_destroy(c, conn); conn_event(c, conn, CLOSED);
if (events & (EPOLLRDHUP | EPOLLHUP)) if (events & (EPOLLRDHUP | EPOLLHUP))
conn_event(c, conn, SOCK_FIN_RCVD); conn_event(c, conn, SOCK_FIN_RCVD);
@ -3159,7 +3211,7 @@ static int tcp_sock_refill(void *arg)
* *
* Return: 0 on success, -1 on failure * Return: 0 on success, -1 on failure
*/ */
int tcp_sock_init(struct ctx *c, struct timespec *now) int tcp_sock_init(struct ctx *c)
{ {
struct tcp_sock_refill_arg refill_arg = { c, 0 }; struct tcp_sock_refill_arg refill_arg = { c, 0 };
int i, port; int i, port;
@ -3215,7 +3267,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext)); memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns)); memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
c->tcp.refill_ts = *now;
tcp_sock_refill(&refill_arg); tcp_sock_refill(&refill_arg);
if (c->mode == MODE_PASTA) { if (c->mode == MODE_PASTA) {
@ -3226,7 +3277,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
refill_arg.ns = 1; refill_arg.ns = 1;
NS_CALL(tcp_sock_refill, &refill_arg); NS_CALL(tcp_sock_refill, &refill_arg);
c->tcp.port_detect_ts = *now; tcp_splice_timer(c);
} }
return 0; return 0;
@ -3349,17 +3400,18 @@ static int tcp_port_rebind(void *arg)
} }
/** /**
* tcp_timer() - Scan activity bitmap for sockets waiting for timed events * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill
* @c: Execution context * @c: Execution context
* @now: Timestamp from caller * @ts: Unused
*/ */
void tcp_timer(struct ctx *c, struct timespec *now) void tcp_timer(struct ctx *c, struct timespec *ts)
{ {
struct tcp_sock_refill_arg refill_arg = { c, 0 }; struct tcp_sock_refill_arg refill_arg = { c, 0 };
struct tcp_conn *conn;
(void)ts;
if (c->mode == MODE_PASTA) { if (c->mode == MODE_PASTA) {
if (timespec_diff_ms(now, &c->tcp.port_detect_ts) >
PORT_DETECT_INTERVAL) {
struct tcp_port_detect_arg detect_arg = { c, 0 }; struct tcp_port_detect_arg detect_arg = { c, 0 };
struct tcp_port_rebind_arg rebind_arg = { c, 0 }; struct tcp_port_rebind_arg rebind_arg = { c, 0 };
@ -3376,20 +3428,20 @@ void tcp_timer(struct ctx *c, struct timespec *now)
rebind_arg.bind_in_ns = 0; rebind_arg.bind_in_ns = 0;
tcp_port_rebind(&rebind_arg); tcp_port_rebind(&rebind_arg);
} }
c->tcp.port_detect_ts = *now;
} }
tcp_splice_timer(c, now); for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
if (conn->events == CLOSED)
tcp_conn_destroy(c, conn);
} }
if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) {
tcp_sock_refill(&refill_arg); tcp_sock_refill(&refill_arg);
if (c->mode == MODE_PASTA) { if (c->mode == MODE_PASTA) {
refill_arg.ns = 1; refill_arg.ns = 1;
if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) || if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
(c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0)) (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
NS_CALL(tcp_sock_refill, &refill_arg); NS_CALL(tcp_sock_refill, &refill_arg);
}
tcp_splice_timer(c);
} }
} }

16
tcp.h
View file

@ -6,9 +6,7 @@
#ifndef TCP_H #ifndef TCP_H
#define TCP_H #define TCP_H
#define REFILL_INTERVAL 1000 /* ms */ #define TCP_TIMER_INTERVAL 1000 /* ms */
#define PORT_DETECT_INTERVAL 1000
#define TCP_TIMER_INTERVAL MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL)
#define TCP_CONN_INDEX_BITS 17 /* 128k */ #define TCP_CONN_INDEX_BITS 17 /* 128k */
#define TCP_MAX_CONNS (1 << TCP_CONN_INDEX_BITS) #define TCP_MAX_CONNS (1 << TCP_CONN_INDEX_BITS)
@ -20,10 +18,10 @@ struct ctx;
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now); struct timespec *now);
int tcp_tap_handler(struct ctx *c, int af, void *addr, int tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
struct tap_l4_msg *msg, int count, struct timespec *now); struct timespec *now);
int tcp_sock_init(struct ctx *c, struct timespec *now); int tcp_sock_init(struct ctx *c);
void tcp_timer(struct ctx *c, struct timespec *now); void tcp_timer(struct ctx *c, struct timespec *ts);
void tcp_defer_handler(struct ctx *c); void tcp_defer_handler(struct ctx *c);
void tcp_sock_set_bufsize(struct ctx *c, int s); void tcp_sock_set_bufsize(struct ctx *c, int s);
@ -64,8 +62,6 @@ union tcp_epoll_ref {
* @timer_run: Timestamp of most recent timer run * @timer_run: Timestamp of most recent timer run
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035) * @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
* @pipe_size: Size of pipes for spliced connections * @pipe_size: Size of pipes for spliced connections
* @refill_ts: Time of last refill operation for pools of sockets/pipes
* @port_detect_ts: Time of last TCP port detection/rebind, if enabled
*/ */
struct tcp_ctx { struct tcp_ctx {
uint64_t hash_secret[2]; uint64_t hash_secret[2];
@ -80,8 +76,6 @@ struct tcp_ctx {
int kernel_snd_wnd; int kernel_snd_wnd;
#endif #endif
size_t pipe_size; size_t pipe_size;
struct timespec refill_ts;
struct timespec port_detect_ts;
}; };
#endif /* TCP_H */ #endif /* TCP_H */

View file

@ -51,7 +51,7 @@
#define MAX_PIPE_SIZE (2UL * 1024 * 1024) #define MAX_PIPE_SIZE (2UL * 1024 * 1024)
#define TCP_SPLICE_MAX_CONNS (128 * 1024) #define TCP_SPLICE_MAX_CONNS (128 * 1024)
#define TCP_SPLICE_PIPE_POOL_SIZE 16 #define TCP_SPLICE_PIPE_POOL_SIZE 16
#define REFILL_INTERVAL 1000 /* ms, refill pool of pipes */ #define TCP_SPLICE_CONN_PRESSURE 30 /* % of splice_conn_count */
#define TCP_SPLICE_FILE_PRESSURE 30 /* % of c->nofile */ #define TCP_SPLICE_FILE_PRESSURE 30 /* % of c->nofile */
/* From tcp.c */ /* From tcp.c */
@ -83,24 +83,24 @@ struct tcp_splice_conn {
int pipe_b_a[2]; int pipe_b_a[2];
uint8_t events; uint8_t events;
#define SPLICE_CLOSED 0 #define CLOSED 0
#define SPLICE_CONNECT BIT(0) #define CONNECT BIT(0)
#define SPLICE_ESTABLISHED BIT(1) #define ESTABLISHED BIT(1)
#define SPLICE_A_OUT_WAIT BIT(2) #define A_OUT_WAIT BIT(2)
#define SPLICE_B_OUT_WAIT BIT(3) #define B_OUT_WAIT BIT(3)
#define SPLICE_A_FIN_RCVD BIT(4) #define A_FIN_RCVD BIT(4)
#define SPLICE_B_FIN_RCVD BIT(5) #define B_FIN_RCVD BIT(5)
#define SPLICE_A_FIN_SENT BIT(6) #define A_FIN_SENT BIT(6)
#define SPLICE_B_FIN_SENT BIT(7) #define B_FIN_SENT BIT(7)
uint8_t flags; uint8_t flags;
#define SPLICE_V6 BIT(0) #define SOCK_V6 BIT(0)
#define SPLICE_IN_EPOLL BIT(1) #define IN_EPOLL BIT(1)
#define SPLICE_RCVLOWAT_SET_A BIT(2) #define RCVLOWAT_SET_A BIT(2)
#define SPLICE_RCVLOWAT_SET_B BIT(3) #define RCVLOWAT_SET_B BIT(3)
#define SPLICE_RCVLOWAT_ACT_A BIT(4) #define RCVLOWAT_ACT_A BIT(4)
#define SPLICE_RCVLOWAT_ACT_B BIT(5) #define RCVLOWAT_ACT_B BIT(5)
#define SPLICE_CLOSING BIT(6) #define CLOSING BIT(6)
uint64_t a_read; uint64_t a_read;
uint64_t a_written; uint64_t a_written;
@ -108,7 +108,7 @@ struct tcp_splice_conn {
uint64_t b_written; uint64_t b_written;
}; };
#define CONN_V6(x) (x->flags & SPLICE_V6) #define CONN_V6(x) (x->flags & SOCK_V6)
#define CONN_V4(x) (!CONN_V6(x)) #define CONN_V4(x) (!CONN_V6(x))
#define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
#define CONN(index) (tc + (index)) #define CONN(index) (tc + (index))
@ -118,15 +118,13 @@ static struct tcp_splice_conn tc[TCP_SPLICE_MAX_CONNS];
/* Display strings for connection events */ /* Display strings for connection events */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = { static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
"SPLICE_CONNECT", "SPLICE_ESTABLISHED", "CONNECT", "ESTABLISHED", "A_OUT_WAIT", "B_OUT_WAIT",
"SPLICE_A_OUT_WAIT", "SPLICE_B_OUT_WAIT", "A_FIN_RCVD", "B_FIN_RCVD", "A_FIN_SENT", "B_FIN_SENT",
"SPLICE_A_FIN_RCVD", "SPLICE_B_FIN_RCVD",
"SPLICE_A_FIN_SENT", "SPLICE_B_FIN_SENT",
}; };
/* Display strings for connection flags */ /* Display strings for connection flags */
static const char *tcp_splice_flag_str[] __attribute((__unused__)) = { static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
"V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B", "SOCK_V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B",
"RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING", "RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING",
}; };
@ -141,23 +139,27 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
{ {
*a = *b = 0; *a = *b = 0;
if (events & SPLICE_CLOSED) if (events & CLOSED)
return; return;
if (events & SPLICE_ESTABLISHED) if (events & ESTABLISHED) {
*a = *b = EPOLLIN | EPOLLRDHUP; if (!(events & B_FIN_SENT))
else if (events & SPLICE_CONNECT) *a = EPOLLIN | EPOLLRDHUP;
if (!(events & A_FIN_SENT))
*b = EPOLLIN | EPOLLRDHUP;
} else if (events & CONNECT) {
*b = EPOLLOUT; *b = EPOLLOUT;
}
*a |= (events & SPLICE_A_OUT_WAIT) ? EPOLLOUT : 0; *a |= (events & A_OUT_WAIT) ? EPOLLOUT : 0;
*b |= (events & SPLICE_B_OUT_WAIT) ? EPOLLOUT : 0; *b |= (events & B_OUT_WAIT) ? EPOLLOUT : 0;
} }
static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn);
static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn); static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn);
/** /**
* conn_flag_do() - Set/unset given flag, log, update epoll on SPLICE_CLOSING * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset * @flag: Flag to set, or ~flag to unset
@ -181,7 +183,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn,
tcp_splice_flag_str[fls(flag)]); tcp_splice_flag_str[fls(flag)]);
} }
if (flag == SPLICE_CLOSING) if (flag == CLOSING)
tcp_splice_epoll_ctl(c, conn); tcp_splice_epoll_ctl(c, conn);
} }
@ -201,7 +203,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn,
*/ */
static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn) static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn)
{ {
int m = (conn->flags & SPLICE_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a, union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
.r.p.tcp.tcp.splice = 1, .r.p.tcp.tcp.splice = 1,
.r.p.tcp.tcp.index = conn - tc, .r.p.tcp.tcp.index = conn - tc,
@ -214,15 +216,8 @@ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn)
struct epoll_event ev_b = { .data.u64 = ref_b.u64 }; struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
uint32_t events_a, events_b; uint32_t events_a, events_b;
if (conn->flags & SPLICE_CLOSING) { if (conn->flags & CLOSING)
if (conn->flags & SPLICE_IN_EPOLL) goto delete;
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a);
if (conn->events & SPLICE_CONNECT)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b);
return 0;
}
tcp_splice_conn_epoll_events(conn->events, &events_a, &events_b); tcp_splice_conn_epoll_events(conn->events, &events_a, &events_b);
ev_a.events = events_a; ev_a.events = events_a;
@ -230,13 +225,13 @@ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn)
if (epoll_ctl(c->epollfd, m, conn->a, &ev_a) || if (epoll_ctl(c->epollfd, m, conn->a, &ev_a) ||
epoll_ctl(c->epollfd, m, conn->b, &ev_b)) epoll_ctl(c->epollfd, m, conn->b, &ev_b))
goto err; goto delete;
conn->flags |= SPLICE_IN_EPOLL; /* No need to log this */ conn->flags |= IN_EPOLL; /* No need to log this */
return 0; return 0;
err: delete:
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a);
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b);
return -errno; return -errno;
@ -251,12 +246,6 @@ err:
static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn, static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn,
unsigned long event) unsigned long event)
{ {
if (event == SPLICE_CLOSED) {
conn->events = SPLICE_CLOSED;
debug("TCP (spliced): index %i, CLOSED", conn - tc);
return;
}
if (event & (event - 1)) { if (event & (event - 1)) {
if (!(conn->events & ~event)) if (!(conn->events & ~event))
return; return;
@ -274,7 +263,7 @@ static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn,
} }
if (tcp_splice_epoll_ctl(c, conn)) if (tcp_splice_epoll_ctl(c, conn))
conn_flag(c, conn, SPLICE_CLOSING); conn_flag(c, conn, CLOSING);
} }
#define conn_event(c, conn, event) \ #define conn_event(c, conn, event) \
@ -304,22 +293,25 @@ static void tcp_table_splice_compact(struct ctx *c,
memcpy(hole, move, sizeof(*hole)); memcpy(hole, move, sizeof(*hole));
move->a = move->b = -1; move->a = move->b = -1;
move->flags = move->events = 0;
move->a_read = move->a_written = move->b_read = move->b_written = 0; move->a_read = move->a_written = move->b_read = move->b_written = 0;
move->pipe_a_b[0] = move->pipe_a_b[1] = -1;
move->pipe_b_a[0] = move->pipe_b_a[1] = -1;
move->flags = move->events = 0;
debug("TCP (spliced): index %i moved to %i", move - tc, hole - tc); debug("TCP (spliced): index %i moved to %i", move - tc, hole - tc);
tcp_splice_epoll_ctl(c, hole);
if (tcp_splice_epoll_ctl(c, hole)) if (tcp_splice_epoll_ctl(c, hole))
conn_flag(c, hole, SPLICE_CLOSING); conn_flag(c, hole, CLOSING);
} }
/** /**
* tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll * tcp_splice_destroy() - Close spliced connection and pipes, clear
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
*/ */
static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
{ {
if (conn->events & SPLICE_ESTABLISHED) { if (conn->events & ESTABLISHED) {
/* Flushing might need to block: don't recycle them. */ /* Flushing might need to block: don't recycle them. */
if (conn->pipe_a_b[0] != -1) { if (conn->pipe_a_b[0] != -1) {
close(conn->pipe_a_b[0]); close(conn->pipe_a_b[0]);
@ -333,18 +325,19 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
} }
} }
if (conn->events & SPLICE_CONNECT) { if (conn->events & CONNECT) {
close(conn->b); close(conn->b);
conn->b = -1; conn->b = -1;
} }
conn_event(c, conn, SPLICE_CLOSED);
close(conn->a); close(conn->a);
conn->a = -1; conn->a = -1;
conn->flags = 0;
conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0; conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0;
conn->events = CLOSED;
conn->flags = 0;
debug("TCP (spliced): index %i, CLOSED", conn - tc);
tcp_table_splice_compact(c, conn); tcp_table_splice_compact(c, conn);
} }
@ -364,7 +357,7 @@ static int tcp_splice_connect_finish(struct ctx *c,
conn->pipe_a_b[1] = conn->pipe_b_a[1] = -1; conn->pipe_a_b[1] = conn->pipe_b_a[1] = -1;
for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
if (splice_pipe_pool[i][0][0] > 0) { if (splice_pipe_pool[i][0][0] >= 0) {
SWAP(conn->pipe_a_b[0], splice_pipe_pool[i][0][0]); SWAP(conn->pipe_a_b[0], splice_pipe_pool[i][0][0]);
SWAP(conn->pipe_a_b[1], splice_pipe_pool[i][0][1]); SWAP(conn->pipe_a_b[1], splice_pipe_pool[i][0][1]);
@ -377,7 +370,7 @@ static int tcp_splice_connect_finish(struct ctx *c,
if (conn->pipe_a_b[0] < 0) { if (conn->pipe_a_b[0] < 0) {
if (pipe2(conn->pipe_a_b, O_NONBLOCK) || if (pipe2(conn->pipe_a_b, O_NONBLOCK) ||
pipe2(conn->pipe_b_a, O_NONBLOCK)) { pipe2(conn->pipe_b_a, O_NONBLOCK)) {
conn_flag(c, conn, SPLICE_CLOSING); conn_flag(c, conn, CLOSING);
return -EIO; return -EIO;
} }
@ -385,8 +378,8 @@ static int tcp_splice_connect_finish(struct ctx *c,
fcntl(conn->pipe_b_a[0], F_SETPIPE_SZ, c->tcp.pipe_size); fcntl(conn->pipe_b_a[0], F_SETPIPE_SZ, c->tcp.pipe_size);
} }
if (!(conn->events & SPLICE_ESTABLISHED)) if (!(conn->events & ESTABLISHED))
conn_event(c, conn, SPLICE_ESTABLISHED); conn_event(c, conn, ESTABLISHED);
return 0; return 0;
} }
@ -450,9 +443,9 @@ static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn,
close(sock_conn); close(sock_conn);
return ret; return ret;
} }
conn_event(c, conn, SPLICE_CONNECT); conn_event(c, conn, CONNECT);
} else { } else {
conn_event(c, conn, SPLICE_ESTABLISHED); conn_event(c, conn, ESTABLISHED);
return tcp_splice_connect_finish(c, conn); return tcp_splice_connect_finish(c, conn);
} }
@ -575,20 +568,23 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
conn = CONN(c->tcp.splice_conn_count++); conn = CONN(c->tcp.splice_conn_count++);
conn->a = s; conn->a = s;
conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0; conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0;
if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index)) if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index))
conn_flag(c, conn, SPLICE_CLOSING); conn_flag(c, conn, CLOSING);
return; return;
} }
conn = CONN(ref.r.p.tcp.tcp.index); conn = CONN(ref.r.p.tcp.tcp.index);
if (events & EPOLLERR || events & EPOLLHUP) if (conn->events == CLOSED)
return;
if (events & EPOLLERR)
goto close; goto close;
if (conn->events == SPLICE_CONNECT) { if (conn->events == CONNECT) {
if (!(events & EPOLLOUT)) if (!(events & EPOLLOUT))
goto close; goto close;
if (tcp_splice_connect_finish(c, conn)) if (tcp_splice_connect_finish(c, conn))
@ -597,9 +593,9 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
if (events & EPOLLOUT) { if (events & EPOLLOUT) {
if (ref.r.s == conn->a) if (ref.r.s == conn->a)
conn_event(c, conn, ~SPLICE_A_OUT_WAIT); conn_event(c, conn, ~A_OUT_WAIT);
else else
conn_event(c, conn, ~SPLICE_B_OUT_WAIT); conn_event(c, conn, ~B_OUT_WAIT);
tcp_splice_dir(conn, ref.r.s, 1, &from, &to, &pipes); tcp_splice_dir(conn, ref.r.s, 1, &from, &to, &pipes);
} else { } else {
@ -608,9 +604,16 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
if (events & EPOLLRDHUP) { if (events & EPOLLRDHUP) {
if (ref.r.s == conn->a) if (ref.r.s == conn->a)
conn_event(c, conn, SPLICE_A_FIN_RCVD); conn_event(c, conn, A_FIN_RCVD);
else else
conn_event(c, conn, SPLICE_B_FIN_RCVD); conn_event(c, conn, B_FIN_RCVD);
}
if (events & EPOLLHUP) {
if (ref.r.s == conn->a)
conn_event(c, conn, A_FIN_SENT); /* Fake, but implied */
else
conn_event(c, conn, B_FIN_SENT);
} }
swap: swap:
@ -620,13 +623,13 @@ swap:
if (from == conn->a) { if (from == conn->a) {
seq_read = &conn->a_read; seq_read = &conn->a_read;
seq_write = &conn->a_written; seq_write = &conn->a_written;
lowat_set_flag = SPLICE_RCVLOWAT_SET_A; lowat_set_flag = RCVLOWAT_SET_A;
lowat_act_flag = SPLICE_RCVLOWAT_ACT_A; lowat_act_flag = RCVLOWAT_ACT_A;
} else { } else {
seq_read = &conn->b_read; seq_read = &conn->b_read;
seq_write = &conn->b_written; seq_write = &conn->b_written;
lowat_set_flag = SPLICE_RCVLOWAT_SET_B; lowat_set_flag = RCVLOWAT_SET_B;
lowat_act_flag = SPLICE_RCVLOWAT_ACT_B; lowat_act_flag = RCVLOWAT_ACT_B;
} }
while (1) { while (1) {
@ -636,6 +639,7 @@ swap:
retry: retry:
readlen = splice(from, NULL, pipes[1], NULL, c->tcp.pipe_size, readlen = splice(from, NULL, pipes[1], NULL, c->tcp.pipe_size,
SPLICE_F_MOVE | SPLICE_F_NONBLOCK); SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
trace("TCP (spliced): %li from read-side call", readlen);
if (readlen < 0) { if (readlen < 0) {
if (errno == EINTR) if (errno == EINTR)
goto retry; goto retry;
@ -660,6 +664,8 @@ retry:
eintr: eintr:
written = splice(pipes[0], NULL, to, NULL, to_write, written = splice(pipes[0], NULL, to, NULL, to_write,
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
trace("TCP (spliced): %li from write-side call (passed %lu)",
written, to_write);
/* Most common case: skip updating counters. */ /* Most common case: skip updating counters. */
if (readlen > 0 && readlen == written) { if (readlen > 0 && readlen == written) {
@ -697,9 +703,9 @@ eintr:
goto retry; goto retry;
if (to == conn->a) if (to == conn->a)
conn_event(c, conn, SPLICE_A_OUT_WAIT); conn_event(c, conn, A_OUT_WAIT);
else else
conn_event(c, conn, SPLICE_B_OUT_WAIT); conn_event(c, conn, B_OUT_WAIT);
break; break;
} }
@ -715,23 +721,21 @@ eintr:
break; break;
} }
if ( (conn->events & SPLICE_A_FIN_RCVD) && if ((conn->events & A_FIN_RCVD) && !(conn->events & B_FIN_SENT)) {
!(conn->events & SPLICE_B_FIN_SENT)) { if (*seq_read == *seq_write && eof) {
if (*seq_read == *seq_write) {
shutdown(conn->b, SHUT_WR); shutdown(conn->b, SHUT_WR);
conn_event(c, conn, SPLICE_B_FIN_SENT); conn_event(c, conn, B_FIN_SENT);
} }
} }
if ( (conn->events & SPLICE_B_FIN_RCVD) && if ((conn->events & B_FIN_RCVD) && !(conn->events & A_FIN_SENT)) {
!(conn->events & SPLICE_A_FIN_SENT)) { if (*seq_read == *seq_write && eof) {
if (*seq_read == *seq_write) {
shutdown(conn->a, SHUT_WR); shutdown(conn->a, SHUT_WR);
conn_event(c, conn, SPLICE_A_FIN_SENT); conn_event(c, conn, A_FIN_SENT);
} }
} }
if (CONN_HAS(conn, SPLICE_A_FIN_SENT | SPLICE_B_FIN_SENT)) if (CONN_HAS(conn, A_FIN_SENT | B_FIN_SENT))
goto close; goto close;
if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) { if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
@ -746,10 +750,13 @@ eintr:
goto swap; goto swap;
} }
if (events & EPOLLHUP)
goto close;
return; return;
close: close:
conn_flag(c, conn, SPLICE_CLOSING); conn_flag(c, conn, CLOSING);
} }
/** /**
@ -829,37 +836,35 @@ void tcp_splice_init(struct ctx *c)
/** /**
* tcp_splice_timer() - Timer for spliced connections * tcp_splice_timer() - Timer for spliced connections
* @c: Execution context * @c: Execution context
* @now: Current timestamp
*/ */
void tcp_splice_timer(struct ctx *c, struct timespec *now) void tcp_splice_timer(struct ctx *c)
{ {
struct tcp_splice_conn *conn; struct tcp_splice_conn *conn;
for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) { for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) {
if (conn->flags & SPLICE_CLOSING) { if (conn->flags & CLOSING) {
tcp_splice_destroy(c, conn); tcp_splice_destroy(c, conn);
continue; return;
} }
if ( (conn->flags & SPLICE_RCVLOWAT_SET_A) && if ( (conn->flags & RCVLOWAT_SET_A) &&
!(conn->flags & SPLICE_RCVLOWAT_ACT_A)) { !(conn->flags & RCVLOWAT_ACT_A)) {
setsockopt(conn->a, SOL_SOCKET, SO_RCVLOWAT, setsockopt(conn->a, SOL_SOCKET, SO_RCVLOWAT,
&((int){ 1 }), sizeof(int)); &((int){ 1 }), sizeof(int));
conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_A); conn_flag(c, conn, ~RCVLOWAT_SET_A);
} }
if ( (conn->flags & SPLICE_RCVLOWAT_SET_B) && if ( (conn->flags & RCVLOWAT_SET_B) &&
!(conn->flags & SPLICE_RCVLOWAT_ACT_B)) { !(conn->flags & RCVLOWAT_ACT_B)) {
setsockopt(conn->b, SOL_SOCKET, SO_RCVLOWAT, setsockopt(conn->b, SOL_SOCKET, SO_RCVLOWAT,
&((int){ 1 }), sizeof(int)); &((int){ 1 }), sizeof(int));
conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_B); conn_flag(c, conn, ~RCVLOWAT_SET_B);
} }
conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_A); conn_flag(c, conn, ~RCVLOWAT_ACT_A);
conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_B); conn_flag(c, conn, ~RCVLOWAT_ACT_B);
} }
if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL)
tcp_splice_pipe_refill(c); tcp_splice_pipe_refill(c);
} }
@ -869,14 +874,15 @@ void tcp_splice_timer(struct ctx *c, struct timespec *now)
*/ */
void tcp_splice_defer_handler(struct ctx *c) void tcp_splice_defer_handler(struct ctx *c)
{ {
int max_conns = c->tcp.conn_count / 100 * TCP_SPLICE_CONN_PRESSURE;
int max_files = c->nofile / 100 * TCP_SPLICE_FILE_PRESSURE; int max_files = c->nofile / 100 * TCP_SPLICE_FILE_PRESSURE;
struct tcp_splice_conn *conn; struct tcp_splice_conn *conn;
if (c->tcp.splice_conn_count * 6 < max_files) if (c->tcp.splice_conn_count < MIN(max_files / 6, max_conns))
return; return;
for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) { for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) {
if (conn->flags & SPLICE_CLOSING) if (conn->flags & CLOSING)
tcp_splice_destroy(c, conn); tcp_splice_destroy(c, conn);
} }
} }

View file

@ -11,5 +11,5 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
uint32_t events); uint32_t events);
void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn);
void tcp_splice_init(struct ctx *c); void tcp_splice_init(struct ctx *c);
void tcp_splice_timer(struct ctx *c, struct timespec *now); void tcp_splice_timer(struct ctx *c);
void tcp_splice_defer_handler(struct ctx *c); void tcp_splice_defer_handler(struct ctx *c);

46
udp.c
View file

@ -951,35 +951,35 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* @c: Execution context * @c: Execution context
* @af: Address family, AF_INET or AF_INET6 * @af: Address family, AF_INET or AF_INET6
* @addr: Destination address * @addr: Destination address
* @msg: Input messages * @p: Pool of UDP packets, with UDP headers
* @count: Message count
* @now: Current timestamp * @now: Current timestamp
* *
* Return: count of consumed packets * Return: count of consumed packets
* *
* #syscalls sendmmsg * #syscalls sendmmsg
*/ */
int udp_tap_handler(struct ctx *c, int af, void *addr, int udp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
struct tap_l4_msg *msg, int count, struct timespec *now) struct timespec *now)
{ {
/* The caller already checks that all the messages have the same source
* and destination, so we can just take those from the first message.
*/
struct udphdr *uh = (struct udphdr *)(pkt_buf + msg[0].pkt_buf_offset);
struct mmsghdr mm[UIO_MAXIOV] = { 0 }; struct mmsghdr mm[UIO_MAXIOV] = { 0 };
struct iovec m[UIO_MAXIOV]; struct iovec m[UIO_MAXIOV];
struct sockaddr_in6 s_in6; struct sockaddr_in6 s_in6;
struct sockaddr_in s_in; struct sockaddr_in s_in;
struct sockaddr *sa; struct sockaddr *sa;
int i, s, count = 0;
in_port_t src, dst; in_port_t src, dst;
struct udphdr *uh;
socklen_t sl; socklen_t sl;
int i, s;
(void)c; (void)c;
if (msg[0].l4_len < sizeof(*uh)) uh = packet_get(p, 0, 0, sizeof(*uh), NULL);
if (!uh)
return 1; return 1;
/* The caller already checks that all the messages have the same source
* and destination, so we can just take those from the first message.
*/
src = ntohs(uh->source); src = ntohs(uh->source);
dst = ntohs(uh->dest); dst = ntohs(uh->dest);
@ -998,8 +998,8 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
.udp.port = src }; .udp.port = src };
s = sock_l4(c, AF_INET, IPPROTO_UDP, src, 0, uref.u32); s = sock_l4(c, AF_INET, IPPROTO_UDP, src, 0, uref.u32);
if (s <= 0) if (s < 0)
return count; return p->count;
udp_tap_map[V4][src].sock = s; udp_tap_map[V4][src].sock = s;
bitmap_set(udp_act[V4][UDP_ACT_TAP], src); bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
@ -1050,8 +1050,8 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
s = sock_l4(c, AF_INET6, IPPROTO_UDP, src, bind_to, s = sock_l4(c, AF_INET6, IPPROTO_UDP, src, bind_to,
uref.u32); uref.u32);
if (s <= 0) if (s < 0)
return count; return p->count;
udp_tap_map[V6][src].sock = s; udp_tap_map[V6][src].sock = s;
bitmap_set(udp_act[V6][UDP_ACT_TAP], src); bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
@ -1060,18 +1060,26 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
udp_tap_map[V6][src].ts = now->tv_sec; udp_tap_map[V6][src].ts = now->tv_sec;
} }
for (i = 0; i < count; i++) { for (i = 0; i < (int)p->count; i++) {
struct udphdr *uh_send; struct udphdr *uh_send;
size_t len;
uh_send = packet_get(p, i, 0, sizeof(*uh), &len);
if (!uh_send)
return p->count;
if (!len)
continue;
uh_send = (struct udphdr *)(msg[i].pkt_buf_offset + pkt_buf);
m[i].iov_base = (char *)(uh_send + 1); m[i].iov_base = (char *)(uh_send + 1);
m[i].iov_len = msg[i].l4_len - sizeof(*uh_send); m[i].iov_len = len;
mm[i].msg_hdr.msg_name = sa; mm[i].msg_hdr.msg_name = sa;
mm[i].msg_hdr.msg_namelen = sl; mm[i].msg_hdr.msg_namelen = sl;
mm[i].msg_hdr.msg_iov = m + i; mm[i].msg_hdr.msg_iov = m + i;
mm[i].msg_hdr.msg_iovlen = 1; mm[i].msg_hdr.msg_iovlen = 1;
count++;
} }
count = sendmmsg(s, mm, count, MSG_NOSIGNAL); count = sendmmsg(s, mm, count, MSG_NOSIGNAL);
@ -1172,13 +1180,11 @@ static void udp_splice_iov_init(void)
* *
* Return: 0 on success, -1 on failure * Return: 0 on success, -1 on failure
*/ */
int udp_sock_init(struct ctx *c, struct timespec *now) int udp_sock_init(struct ctx *c)
{ {
union udp_epoll_ref uref = { .udp.bound = 1 }; union udp_epoll_ref uref = { .udp.bound = 1 };
int dst, s; int dst, s;
(void)now;
for (dst = 0; dst < USHRT_MAX; dst++) { for (dst = 0; dst < USHRT_MAX; dst++) {
if (!bitmap_isset(c->udp.port_to_tap, dst)) if (!bitmap_isset(c->udp.port_to_tap, dst))
continue; continue;

6
udp.h
View file

@ -10,9 +10,9 @@
void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now); struct timespec *now);
int udp_tap_handler(struct ctx *c, int af, void *addr, int udp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
struct tap_l4_msg *msg, int count, struct timespec *now); struct timespec *now);
int udp_sock_init(struct ctx *c, struct timespec *now); int udp_sock_init(struct ctx *c);
void udp_timer(struct ctx *c, struct timespec *ts); void udp_timer(struct ctx *c, struct timespec *ts);
void udp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s, void udp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
const uint32_t *ip_da); const uint32_t *ip_da);

54
util.c
View file

@ -38,6 +38,7 @@
#include "util.h" #include "util.h"
#include "passt.h" #include "passt.h"
#include "packet.h"
/* For __openlog() and __setlogmask() wrappers, and passt_vsyslog() */ /* For __openlog() and __setlogmask() wrappers, and passt_vsyslog() */
static int log_mask; static int log_mask;
@ -156,46 +157,59 @@ void passt_vsyslog(int pri, const char *format, va_list ap)
send(log_sock, buf, n, 0); send(log_sock, buf, n, 0);
} }
#define IPV6_NH_OPT(nh) \
((nh) == 0 || (nh) == 43 || (nh) == 44 || (nh) == 50 || \
(nh) == 51 || (nh) == 60 || (nh) == 135 || (nh) == 139 || \
(nh) == 140 || (nh) == 253 || (nh) == 254)
/** /**
* ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol * ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol
* @ip6h: IPv6 header * @p: Packet pool, packet number @index has IPv6 header at @offset
* @index: Index of packet in pool
* @offset: Pre-calculated IPv6 header offset
* @proto: Filled with L4 protocol number * @proto: Filled with L4 protocol number
* @dlen: Data length (payload excluding header extensions), set on return
* *
* Return: pointer to L4 header, NULL if not found * Return: pointer to L4 header, NULL if not found
*/ */
char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto) char *ipv6_l4hdr(struct pool *p, int index, size_t offset, uint8_t *proto,
size_t *dlen)
{ {
int offset, len, hdrlen;
struct ipv6_opt_hdr *o; struct ipv6_opt_hdr *o;
struct ipv6hdr *ip6h;
char *base;
int hdrlen;
uint8_t nh; uint8_t nh;
len = ntohs(ip6h->payload_len); base = packet_get(p, index, 0, 0, NULL);
offset = 0; ip6h = packet_get(p, index, offset, sizeof(*ip6h), dlen);
if (!ip6h)
return NULL;
offset += sizeof(*ip6h);
while (offset < len) {
if (!offset) {
nh = ip6h->nexthdr; nh = ip6h->nexthdr;
hdrlen = sizeof(struct ipv6hdr); if (!IPV6_NH_OPT(nh))
} else { goto found;
o = (struct ipv6_opt_hdr *)(((char *)ip6h) + offset);
while ((o = packet_get_try(p, index, offset, sizeof(*o), dlen))) {
nh = o->nexthdr; nh = o->nexthdr;
hdrlen = (o->hdrlen + 1) * 8; hdrlen = (o->hdrlen + 1) * 8;
if (IPV6_NH_OPT(nh))
offset += hdrlen;
else
goto found;
} }
return NULL;
found:
if (nh == 59) if (nh == 59)
return NULL; return NULL;
if (nh == 0 || nh == 43 || nh == 44 || nh == 50 ||
nh == 51 || nh == 60 || nh == 135 || nh == 139 ||
nh == 140 || nh == 253 || nh == 254) {
offset += hdrlen;
} else {
*proto = nh; *proto = nh;
return (char *)(ip6h + 1) + offset; return base + offset;
}
}
return NULL;
} }
/** /**

5
util.h
View file

@ -153,6 +153,8 @@ enum {
#include <limits.h> #include <limits.h>
#include <stdarg.h> #include <stdarg.h>
#include "packet.h"
enum bind_type { enum bind_type {
BIND_ANY = 0, BIND_ANY = 0,
BIND_LOOPBACK, BIND_LOOPBACK,
@ -194,7 +196,8 @@ __attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); }
void __openlog(const char *ident, int option, int facility); void __openlog(const char *ident, int option, int facility);
void passt_vsyslog(int pri, const char *format, va_list ap); void passt_vsyslog(int pri, const char *format, va_list ap);
void __setlogmask(int mask); void __setlogmask(int mask);
char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto); char *ipv6_l4hdr(struct pool *p, int index, size_t offset, uint8_t *proto,
size_t *dlen);
int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port,
enum bind_type bind_addr, uint32_t data); enum bind_type bind_addr, uint32_t data);
void sock_probe_mem(struct ctx *c); void sock_probe_mem(struct ctx *c);