From b2d6ec398dc4a249eb508cd4488d910cccded710 Mon Sep 17 00:00:00 2001
From: Donald Sharp <sharpd@nvidia.com>
Date: Wed, 12 Jun 2024 14:14:48 -0400
Subject: [PATCH] zebra: Modify dplane loop to allow backpressure to filter up

Currently when the dplane_thread_loop is run, it moves contexts
from the dg_update_list and puts the contexts on the input queue
of the first provider.  This provider is given a chance to run
and then the items on the output queue are pulled off and placed
on the input queue of the next provider.  Rinse/Repeat down through
the entire list of providers.  Now imagine that we have a list
of multiple providers and the last provider is getting backed up.
Contexts will end up sticking in the input Queue of the `slow`
provider.  This can grow without bounds.  This is a real problem
when you have a situation where an interface is flapping and an
upper level protocol is sending a continous stream of route
updates to reflect the change in ecmp.  You can end up with
a very very large backlog of contexts.  This is bad because
zebra can easily grow to a very very large memory size and on
restricted systems you can run out of memory.  Fortunately
for us, the MetaQ already participates with this process
by not doing more route processing until the dg_update_list
goes below the working limit of dg_updates_per_cycle.  Thus
if FRR modifies the behavior of this loop to not move more
contexts onto the input queue if either the input queue
or output queue of the next provider has reached this limit.
FRR will naturaly start auto handling backpressure for the dplane
context system and memory will not go out of control.

Signed-off-by: Donald Sharp <sharpd@nvidia.com>
---
 zebra/zebra_dplane.c | 114 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 94 insertions(+), 20 deletions(-)

diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index 80f3ae285a01..0a49537f6c69 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -6706,10 +6706,10 @@ static void dplane_thread_loop(struct thread *event)
 {
 	struct dplane_ctx_list_head work_list;
 	struct dplane_ctx_list_head error_list;
-	struct zebra_dplane_provider *prov;
+	struct zebra_dplane_provider *prov, *next_prov;
 	struct zebra_dplane_ctx *ctx;
 	int limit, counter, error_counter;
-	uint64_t curr, high;
+	uint64_t curr, out_curr, high;
 	bool reschedule = false;
 
 	/* Capture work limit per cycle */
@@ -6733,18 +6733,48 @@ static void dplane_thread_loop(struct thread *event)
 	/* Locate initial registered provider */
 	prov = dplane_prov_list_first(&zdplane_info.dg_providers);
 
-	/* Move new work from incoming list to temp list */
-	for (counter = 0; counter < limit; counter++) {
-		ctx = dplane_ctx_list_pop(&zdplane_info.dg_update_list);
-		if (ctx) {
-			ctx->zd_provider = prov->dp_id;
+	curr = dplane_ctx_queue_count(&prov->dp_ctx_in_list);
+	out_curr = dplane_ctx_queue_count(&prov->dp_ctx_out_list);
 
-			dplane_ctx_list_add_tail(&work_list, ctx);
-		} else {
-			break;
+	if (curr >= (uint64_t)limit) {
+		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+			zlog_debug("%s: Current first provider(%s) Input queue is %" PRIu64
+				   ", holding off work",
+				   __func__, prov->dp_name, curr);
+		counter = 0;
+	} else if (out_curr >= (uint64_t)limit) {
+		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+			zlog_debug("%s: Current first provider(%s) Output queue is %" PRIu64
+				   ", holding off work",
+				   __func__, prov->dp_name, out_curr);
+		counter = 0;
+	} else {
+		int tlimit;
+		/*
+		 * Let's limit the work to how what can be put on the
+		 * in or out queue without going over
+		 */
+		tlimit = limit - MAX(curr, out_curr);
+		/* Move new work from incoming list to temp list */
+		for (counter = 0; counter < tlimit; counter++) {
+			ctx = dplane_ctx_list_pop(&zdplane_info.dg_update_list);
+			if (ctx) {
+				ctx->zd_provider = prov->dp_id;
+
+				dplane_ctx_list_add_tail(&work_list, ctx);
+			} else {
+				break;
+			}
 		}
 	}
 
+	/*
+	 * If there is anything still on the two input queues reschedule
+	 */
+	if (dplane_ctx_queue_count(&prov->dp_ctx_in_list) > 0 ||
+	    dplane_ctx_queue_count(&zdplane_info.dg_update_list) > 0)
+		reschedule = true;
+
 	DPLANE_UNLOCK();
 
 	atomic_fetch_sub_explicit(&zdplane_info.dg_routes_queued, counter,
@@ -6763,8 +6793,9 @@ static void dplane_thread_loop(struct thread *event)
 		 * items.
 		 */
 		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
-			zlog_debug("dplane enqueues %d new work to provider '%s'",
-				   counter, dplane_provider_get_name(prov));
+			zlog_debug("dplane enqueues %d new work to provider '%s' curr is %" PRIu64,
+				   counter, dplane_provider_get_name(prov),
+				   curr);
 
 		/* Capture current provider id in each context; check for
 		 * error status.
@@ -6822,18 +6853,61 @@ static void dplane_thread_loop(struct thread *event)
 		if (!zdplane_info.dg_run)
 			break;
 
+		/* Locate next provider */
+		next_prov = dplane_prov_list_next(&zdplane_info.dg_providers,
+						  prov);
+		if (next_prov) {
+			curr = dplane_ctx_queue_count(
+				&next_prov->dp_ctx_in_list);
+			out_curr = dplane_ctx_queue_count(
+				&next_prov->dp_ctx_out_list);
+		} else
+			out_curr = curr = 0;
+
 		/* Dequeue completed work from the provider */
 		dplane_provider_lock(prov);
 
-		while (counter < limit) {
-			ctx = dplane_provider_dequeue_out_ctx(prov);
-			if (ctx) {
-				dplane_ctx_list_add_tail(&work_list, ctx);
-				counter++;
-			} else
-				break;
+		if (curr >= (uint64_t)limit) {
+			if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+				zlog_debug("%s: Next Provider(%s) Input queue is %" PRIu64
+					   ", holding off work",
+					   __func__, next_prov->dp_name, curr);
+			counter = 0;
+		} else if (out_curr >= (uint64_t)limit) {
+			if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+				zlog_debug("%s: Next Provider(%s) Output queue is %" PRIu64
+					   ", holding off work",
+					   __func__, next_prov->dp_name,
+					   out_curr);
+			counter = 0;
+		} else {
+			int tlimit;
+
+			/*
+			 * Let's limit the work to how what can be put on the
+			 * in or out queue without going over
+			 */
+			tlimit = limit - MAX(curr, out_curr);
+			while (counter < tlimit) {
+				ctx = dplane_provider_dequeue_out_ctx(prov);
+				if (ctx) {
+					dplane_ctx_list_add_tail(&work_list,
+								 ctx);
+					counter++;
+				} else
+					break;
+			}
 		}
 
+		/*
+		 * Let's check if there are still any items on the
+		 * input or output queus of the current provider
+		 * if so then we know we need to reschedule.
+		 */
+		if (dplane_ctx_queue_count(&prov->dp_ctx_in_list) > 0 ||
+		    dplane_ctx_queue_count(&prov->dp_ctx_out_list) > 0)
+			reschedule = true;
+
 		dplane_provider_unlock(prov);
 
 		if (counter >= limit)
@@ -6844,7 +6918,7 @@ static void dplane_thread_loop(struct thread *event)
 				   counter, dplane_provider_get_name(prov));
 
 		/* Locate next provider */
-		prov = dplane_prov_list_next(&zdplane_info.dg_providers, prov);
+		prov = next_prov;
 	}
 
 	/*