From f132246ab9f8a7142fee6dadf595ba0d143d75ff Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Thu, 14 Dec 2023 08:19:36 -0500 Subject: [PATCH] zebra: Prevent possible wedged fpm write An operator is reporting that the dplane_fpm_nl connection has started to accumulate contexts. One such path that could cause this is that the obuf used is full and stays full. This would imply that whatever is on the receiving end has gotten wedged and is not reading from the stream of data being sent its way. After 15 seconds of no response, let's declare the connection dead and reset it. Signed-off-by: Donald Sharp --- zebra/dplane_fpm_nl.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c index c98655fdb877..1882e0c9f6f0 100644 --- a/zebra/dplane_fpm_nl.c +++ b/zebra/dplane_fpm_nl.c @@ -47,6 +47,12 @@ #define SOUTHBOUND_DEFAULT_ADDR INADDR_LOOPBACK #define SOUTHBOUND_DEFAULT_PORT 2620 +/* + * Time in seconds that if the other end is not responding + * something terrible has gone wrong. Let's fix that. + */ +#define DPLANE_FPM_NL_WEDGIE_TIME 15 + /** * FPM header: * { @@ -93,6 +99,7 @@ struct fpm_nl_ctx { struct event *t_event; struct event *t_nhg; struct event *t_dequeue; + struct event *t_wedged; /* zebra events. 
*/ struct event *t_lspreset; @@ -1367,6 +1374,18 @@ static void fpm_rmac_reset(struct event *t) &fnc->t_rmacwalk); } +static void fpm_process_wedged(struct event *t) +{ + struct fpm_nl_ctx *fnc = EVENT_ARG(t); + + zlog_warn("%s: Connection unable to write to peer for over %u seconds, reseting", + __func__, DPLANE_FPM_NL_WEDGIE_TIME); + + atomic_fetch_add_explicit(&fnc->counters.connection_errors, 1, + memory_order_relaxed); + FPM_RECONNECT(fnc); +} + static void fpm_process_queue(struct event *t) { struct fpm_nl_ctx *fnc = EVENT_ARG(t); @@ -1411,9 +1430,13 @@ static void fpm_process_queue(struct event *t) processed_contexts, memory_order_relaxed); /* Re-schedule if we ran out of buffer space */ - if (no_bufs) + if (no_bufs) { event_add_timer(fnc->fthread->master, fpm_process_queue, fnc, 0, &fnc->t_dequeue); + event_add_timer(fnc->fthread->master, fpm_process_wedged, fnc, + DPLANE_FPM_NL_WEDGIE_TIME, &fnc->t_wedged); + } else + EVENT_OFF(fnc->t_wedged); /* * Let the dataplane thread know if there are items in the