From 3209ca4b0850f6ea4970304b172a43249e38e914 Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Thu, 14 Dec 2023 08:19:36 -0500 Subject: [PATCH 1/2] zebra: Prevent possible wedged fpm write An operator is reporting that the dplane_fpm_nl connection has started to accumulate contexts. One path that could cause this is that the obuf used is full and stays full. This would imply that whatever is on the receiving end has gotten wedged and is not reading from the stream of data being sent its way. If there is no response after 15 seconds, declare the connection dead and reset it. Signed-off-by: Donald Sharp --- zebra/dplane_fpm_nl.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c index c98655fdb877..31e93d232322 100644 --- a/zebra/dplane_fpm_nl.c +++ b/zebra/dplane_fpm_nl.c @@ -47,6 +47,12 @@ #define SOUTHBOUND_DEFAULT_ADDR INADDR_LOOPBACK #define SOUTHBOUND_DEFAULT_PORT 2620 +/* + * Time in seconds that if the other end is not responding + * something terrible has gone wrong. Let's fix that. + */ +#define DPLANE_FPM_NL_WEDGIE_TIME 15 + /** * FPM header: * { @@ -93,6 +99,7 @@ struct fpm_nl_ctx { struct event *t_event; struct event *t_nhg; struct event *t_dequeue; + struct event *t_wedged; /* zebra events. 
*/ struct event *t_lspreset; @@ -1367,6 +1374,18 @@ static void fpm_rmac_reset(struct event *t) &fnc->t_rmacwalk); } +static void fpm_process_wedged(struct event *t) +{ + struct fpm_nl_ctx *fnc = EVENT_ARG(t); + + zlog_warn("%s: Connection unable to write to peer for over %u seconds, resetting", + __func__, DPLANE_FPM_NL_WEDGIE_TIME); + + atomic_fetch_add_explicit(&fnc->counters.connection_errors, 1, + memory_order_relaxed); + FPM_RECONNECT(fnc); +} + static void fpm_process_queue(struct event *t) { struct fpm_nl_ctx *fnc = EVENT_ARG(t); @@ -1411,9 +1430,13 @@ static void fpm_process_queue(struct event *t) processed_contexts, memory_order_relaxed); /* Re-schedule if we ran out of buffer space */ - if (no_bufs) + if (no_bufs) { event_add_timer(fnc->fthread->master, fpm_process_queue, fnc, 0, &fnc->t_dequeue); + event_add_timer(fnc->fthread->master, fpm_process_wedged, fnc, + DPLANE_FPM_NL_WEDGIE_TIME, &fnc->t_wedged); + } else + EVENT_OFF(fnc->t_wedged); /* * Let the dataplane thread know if there are items in the From 61af06c8135437167fcc67c23986575ecde4a17f Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Thu, 14 Dec 2023 09:14:00 -0500 Subject: [PATCH 2/2] zebra: Use event_add_event instead of _timer The t_dequeue event was being scheduled with a timer of 0; this is really an event rather than a timer. Use event_add_event instead of event_add_timer. 
Signed-off-by: Donald Sharp --- zebra/dplane_fpm_nl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c index 31e93d232322..03e3c3bcef7e 100644 --- a/zebra/dplane_fpm_nl.c +++ b/zebra/dplane_fpm_nl.c @@ -1431,7 +1431,7 @@ static void fpm_process_queue(struct event *t) /* Re-schedule if we ran out of buffer space */ if (no_bufs) { - event_add_timer(fnc->fthread->master, fpm_process_queue, fnc, 0, + event_add_event(fnc->fthread->master, fpm_process_queue, fnc, 0, &fnc->t_dequeue); event_add_timer(fnc->fthread->master, fpm_process_wedged, fnc, DPLANE_FPM_NL_WEDGIE_TIME, &fnc->t_wedged); @@ -1640,7 +1640,7 @@ static int fpm_nl_process(struct zebra_dplane_provider *prov) if (atomic_load_explicit(&fnc->counters.ctxqueue_len, memory_order_relaxed) > 0) - event_add_timer(fnc->fthread->master, fpm_process_queue, fnc, 0, + event_add_event(fnc->fthread->master, fpm_process_queue, fnc, 0, &fnc->t_dequeue); /* Ensure dataplane thread is rescheduled if we hit the work limit */