Compare commits

...

4 commits

Author SHA1 Message Date
Stefano Brivio
0c6c20dee5 udp, udp_flow: Add instrumentation, handle EPOLLERR without queued errors
Link: https://github.com/containers/podman/issues/23686#issuecomment-2324945010
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2024-09-04 18:36:54 +02:00
David Gibson
d098e0527a additional debug 2024-09-04 18:36:54 +02:00
David Gibson
026fb71d1d tcp: Attempt to mitigate EPOLLRDHUP storms with half-closed connections
Link: https://github.com/containers/podman/issues/23686
2024-09-04 18:36:54 +02:00
Stefano Brivio
232e12529e log: Don't prefix log file messages with time and severity if they're continuations
In fecb1b65b1 ("log: Don't prefix message with timestamp on --debug
if it's a continuation"), I fixed this for --debug on standard error,
but not for log files: if messages are continuations, they shouldn't
be prefixed by timestamp and severity.

Otherwise, we'll print stuff like this:

  0.0028: ERROR:   Receive error on guest connection, reset0.0028:  ERROR:   : Bad file descriptor

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2024-09-04 16:22:27 +02:00
5 changed files with 91 additions and 23 deletions

12
log.c
View file

@ -224,19 +224,23 @@ static int logfile_rotate(int fd, const struct timespec *now)
/** /**
* logfile_write() - Write entry to log file, trigger rotation if full * logfile_write() - Write entry to log file, trigger rotation if full
* @newline: Append newline at the end of the message, if missing * @newline: Append newline at the end of the message, if missing
* @cont: Continuation of a previous message, on the same line
* @pri: Facility and level map, same as priority for vsyslog() * @pri: Facility and level map, same as priority for vsyslog()
* @now: Timestamp * @now: Timestamp
* @format: Same as vsyslog() format * @format: Same as vsyslog() format
* @ap: Same as vsyslog() ap * @ap: Same as vsyslog() ap
*/ */
static void logfile_write(bool newline, int pri, const struct timespec *now, static void logfile_write(bool newline, bool cont, int pri,
const struct timespec *now,
const char *format, va_list ap) const char *format, va_list ap)
{ {
char buf[BUFSIZ]; char buf[BUFSIZ];
int n; int n = 0;
n = logtime_fmt(buf, BUFSIZ, now); if (!cont) {
n += logtime_fmt(buf, BUFSIZ, now);
n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]); n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]);
}
n += vsnprintf(buf + n, BUFSIZ - n, format, ap); n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
@ -278,7 +282,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
va_copy(ap2, ap); /* Don't clobber ap, we need it again */ va_copy(ap2, ap); /* Don't clobber ap, we need it again */
if (log_file != -1) if (log_file != -1)
logfile_write(newline, pri, now, format, ap2); logfile_write(newline, cont, pri, now, format, ap2);
else if (!(log_mask & LOG_MASK(LOG_DEBUG))) else if (!(log_mask & LOG_MASK(LOG_DEBUG)))
passt_vsyslog(newline, pri, format, ap2); passt_vsyslog(newline, pri, format, ap2);

18
tcp.c
View file

@ -424,23 +424,27 @@ int tcp_set_peek_offset(int s, int offset)
*/ */
static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
{ {
uint32_t rdhup;
if (!events) if (!events)
return 0; return 0;
rdhup = (events & SOCK_FIN_RCVD) ? 0 : EPOLLRDHUP;
if (events & ESTABLISHED) { if (events & ESTABLISHED) {
if (events & TAP_FIN_SENT) if (events & TAP_FIN_SENT)
return EPOLLET; return EPOLLET;
if (conn_flags & STALLED) if (conn_flags & STALLED)
return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET; return EPOLLIN | EPOLLOUT | rdhup | EPOLLET;
return EPOLLIN | EPOLLRDHUP; return EPOLLIN | rdhup;
} }
if (events == TAP_SYN_RCVD) if (events == TAP_SYN_RCVD)
return EPOLLOUT | EPOLLET | EPOLLRDHUP; return EPOLLOUT | EPOLLET | rdhup;
return EPOLLRDHUP; return rdhup;
} }
/** /**
@ -2217,6 +2221,12 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
ASSERT(!c->no_tcp); ASSERT(!c->no_tcp);
ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP); ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP);
if (events & EPOLLRDHUP) {
flow_err(conn, "EPOLLRDHUP: events=0x%x conn->events=0x%x "
"conn->flags=0x%x\n", events, conn->events,
conn->flags);
}
if (conn->events == CLOSED) if (conn->events == CLOSED)
return; return;

72
udp.c
View file

@ -387,11 +387,11 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
* udp_sock_recverr() - Receive and clear an error from a socket * udp_sock_recverr() - Receive and clear an error from a socket
* @s: Socket to receive from * @s: Socket to receive from
* *
* Return: true if errors received and processed, false if no more errors * Return: ee_errno, 0 on empty queue
* *
* #syscalls recvmsg * #syscalls recvmsg
*/ */
static bool udp_sock_recverr(int s) static int udp_sock_recverr(int s)
{ {
const struct sock_extended_err *ee; const struct sock_extended_err *ee;
const struct cmsghdr *hdr; const struct cmsghdr *hdr;
@ -410,12 +410,13 @@ static bool udp_sock_recverr(int s)
if (rc < 0) { if (rc < 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK) if (errno != EAGAIN && errno != EWOULDBLOCK)
err_perror("Failed to read error queue"); err_perror("Failed to read error queue");
return false;
return 0;
} }
if (!(mh.msg_flags & MSG_ERRQUEUE)) { if (!(mh.msg_flags & MSG_ERRQUEUE)) {
err("Missing MSG_ERRQUEUE flag reading error queue"); err("Missing MSG_ERRQUEUE flag reading error queue");
return false; return 0;
} }
hdr = CMSG_FIRSTHDR(&mh); hdr = CMSG_FIRSTHDR(&mh);
@ -424,7 +425,7 @@ static bool udp_sock_recverr(int s)
(hdr->cmsg_level == IPPROTO_IPV6 && (hdr->cmsg_level == IPPROTO_IPV6 &&
hdr->cmsg_type == IPV6_RECVERR))) { hdr->cmsg_type == IPV6_RECVERR))) {
err("Unexpected cmsg reading error queue"); err("Unexpected cmsg reading error queue");
return false; return 0;
} }
ee = (const struct sock_extended_err *)CMSG_DATA(hdr); ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
@ -433,7 +434,7 @@ static bool udp_sock_recverr(int s)
debug("%s error on UDP socket %i: %s", debug("%s error on UDP socket %i: %s",
str_ee_origin(ee), s, strerror(ee->ee_errno)); str_ee_origin(ee), s, strerror(ee->ee_errno));
return true; return ee->ee_errno;
} }
/** /**
@ -441,12 +442,15 @@ static bool udp_sock_recverr(int s)
* @c: Execution context * @c: Execution context
* @s: Socket to receive from * @s: Socket to receive from
* @events: epoll events bitmap * @events: epoll events bitmap
* @mmh mmsghdr array to receive into * @mmh: mmsghdr array to receive into
* @recv_err: Set to last error in queue. If none: -1 on EPOLLERR, 0 otherwise
*
* Return: count of datagrams received
* *
* #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64 * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
*/ */
static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
struct mmsghdr *mmh) struct mmsghdr *mmh, int *recv_err)
{ {
/* For not entirely clear reasons (data locality?) pasta gets better /* For not entirely clear reasons (data locality?) pasta gets better
* throughput if we receive tap datagrams one at a atime. For small * throughput if we receive tap datagrams one at a atime. For small
@ -461,8 +465,13 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
/* Clear any errors first */ /* Clear any errors first */
if (events & EPOLLERR) { if (events & EPOLLERR) {
while (udp_sock_recverr(s)) bool found = false;
; int ret;
while ((ret = udp_sock_recverr(s)))
found = true;
*recv_err = found ? ret : -1;
} }
if (!(events & EPOLLIN)) if (!(events & EPOLLIN))
@ -490,9 +499,10 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now) uint32_t events, const struct timespec *now)
{ {
const socklen_t sasize = sizeof(udp_meta[0].s_in); const socklen_t sasize = sizeof(udp_meta[0].s_in);
int recv_err = 0;
int n, i; int n, i;
if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0) if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv, &recv_err)) <= 0)
return; return;
/* We divide datagrams into batches based on how we need to send them, /* We divide datagrams into batches based on how we need to send them,
@ -562,11 +572,49 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
struct udp_flow *uflow = udp_at_sidx(ref.flowside); struct udp_flow *uflow = udp_at_sidx(ref.flowside);
int from_s = uflow->s[ref.flowside.sidei]; int from_s = uflow->s[ref.flowside.sidei];
uint8_t topif = pif_at_sidx(tosidx); uint8_t topif = pif_at_sidx(tosidx);
int recv_err = 0;
int n, i; int n, i;
ASSERT(!c->no_udp && uflow); ASSERT(!c->no_udp && uflow);
if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0) n = udp_sock_recv(c, from_s, events, udp_mh_recv, &recv_err);
if (recv_err == -1) {
struct flow_common *f = &uflow->f;
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
const struct flowside *ini = &f->side[INISIDE];
const struct flowside *tgt = &f->side[TGTSIDE];
flow_err(uflow, "EPOLLERR without error queue, closing flow");
err("Last recorded errno was: %i (%s)", uflow->last_errno,
strerror(uflow->last_errno));
flow_log_(f, LOG_ERR,
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
ini->eport,
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
ini->oport,
pif_name(f->pif[TGTSIDE]),
inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)),
tgt->oport,
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
tgt->eport);
udp_flow_close(c, uflow);
return;
}
if (recv_err) {
struct udp_flow *uflow = udp_at_sidx(udp_meta[0].tosidx);
uflow->last_errno = recv_err;
flow_err(uflow, "Recorded errno %i (%s)", recv_err,
strerror(recv_err));
}
if (n <= 0)
return; return;
flow_trace(uflow, "Received %d datagrams on reply socket", n); flow_trace(uflow, "Received %d datagrams on reply socket", n);

View file

@ -34,12 +34,12 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
return &flow->udp; return &flow->udp;
} }
/* /**
* udp_flow_close() - Close and clean up UDP flow * udp_flow_close() - Close and clean up UDP flow
* @c: Execution context * @c: Execution context
* @uflow: UDP flow * @uflow: UDP flow
*/ */
static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
{ {
if (uflow->s[INISIDE] >= 0) { if (uflow->s[INISIDE] >= 0) {
/* The listening socket needs to stay in epoll */ /* The listening socket needs to stay in epoll */
@ -53,6 +53,9 @@ static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
close(uflow->s[TGTSIDE]); close(uflow->s[TGTSIDE]);
uflow->s[TGTSIDE] = -1; uflow->s[TGTSIDE] = -1;
} }
uflow->last_errno = 0;
flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
if (!pif_is_socket(uflow->f.pif[TGTSIDE])) if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE)); flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));

View file

@ -19,6 +19,8 @@ struct udp_flow {
time_t ts; time_t ts;
int s[SIDES]; int s[SIDES];
int last_errno;
}; };
struct udp_flow *udp_at_sidx(flow_sidx_t sidx); struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
@ -30,6 +32,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
const void *saddr, const void *daddr, const void *saddr, const void *daddr,
in_port_t srcport, in_port_t dstport, in_port_t srcport, in_port_t dstport,
const struct timespec *now); const struct timespec *now);
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now); const struct timespec *now);