From 4954d9d17cd324ebe40aae4842362c0d16e8c35a Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Wed, 23 Oct 2024 13:16:29 -0400 Subject: [PATCH 1/5] bgpd: Do not call evpn_overlay_free no matter what bgp_update is a very expensive call. Calling evpn_overlay_free even when we have no evpn data to free still has a non-trivial cost. Let's limit the call into this function to when we actually have data to free. Signed-off-by: Donald Sharp --- bgpd/bgp_route.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c index 70da39ee8b43..53cc9a10b569 100644 --- a/bgpd/bgp_route.c +++ b/bgpd/bgp_route.c @@ -4676,10 +4676,12 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, * will not be interned. In which case, it is ok to update the * attr->evpn_overlay, so that, this can be stored in adj_in. */ - if ((afi == AFI_L2VPN) && evpn) - bgp_attr_set_evpn_overlay(attr, evpn); - else - evpn_overlay_free(evpn); + if (evpn) { + if (afi == AFI_L2VPN) + bgp_attr_set_evpn_overlay(attr, evpn); + else + evpn_overlay_free(evpn); + } bgp_adj_in_set(dest, peer, attr, addpath_id, &bgp_labels); } @@ -4855,10 +4857,12 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, * attr->evpn_overlay with evpn directly. Instead memcpy * evpn to new_atr.evpn_overlay before it is interned. */ - if (soft_reconfig && (afi == AFI_L2VPN) && evpn) - bgp_attr_set_evpn_overlay(&new_attr, evpn); - else - evpn_overlay_free(evpn); + if (soft_reconfig && evpn) { + if (afi == AFI_L2VPN) + bgp_attr_set_evpn_overlay(&new_attr, evpn); + else + evpn_overlay_free(evpn); + } /* Apply incoming route-map. * NB: new_attr may now contain newly allocated values from route-map From ed94fbfe5b1ec2d2d191d30b169e8f40ce48ecdb Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Thu, 24 Oct 2024 11:27:24 -0400 Subject: [PATCH 2/5] bgpd: Store aspath count after aspath has changed When running bestpath on a very large number of ECMP paths, BGP ends up calling aspath_count a very large number of times, which results in ~15% of cpu runtime being spent in aspath_count_hops. Modify the aspath to keep track of its own count. This results in the function now taking up ~1.5% of the cpu runtime, which is low enough to be ignored for the moment.
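The change above is the classic "cache a derived value and refresh it at every mutation point" trade-off: aspath_count_hops() turns into an O(1) field read, and every place that edits the segment list pays the recount instead. Below is a minimal sketch of that pattern, using hypothetical names (as_path, as_segment, path_segments_changed) rather than FRR's actual definitions:

#include <stdint.h>

#define SEG_AS_SET	1
#define SEG_AS_SEQUENCE	2

struct as_segment {
	struct as_segment *next;
	int type;		/* SEG_AS_SET, SEG_AS_SEQUENCE, ... */
	unsigned int length;	/* number of ASNs held in this segment */
};

struct as_path {
	struct as_segment *segments;
	uint32_t count;		/* cached hop count, refreshed on every edit */
};

/* O(segments): walk the list; this is what the old code paid on every call. */
static uint32_t path_hop_count_recalc(const struct as_path *path)
{
	const struct as_segment *seg;
	uint32_t count = 0;

	for (seg = path->segments; seg; seg = seg->next)
		count += (seg->type == SEG_AS_SET) ? 1 : seg->length;

	return count;
}

/* O(1): what bestpath now pays on every comparison. */
static inline uint32_t path_hop_count(const struct as_path *path)
{
	return path->count;
}

/* Every function that edits path->segments refreshes the cache before
 * returning, mirroring the aspath_count_hops_internal() calls the patch
 * adds to the mutators. */
static void path_segments_changed(struct as_path *path)
{
	path->count = path_hop_count_recalc(path);
}

The recount still walks the segments, but mutations are far less frequent than the comparisons bestpath performs, which is why the hot-path cost drops from ~15% to ~1.5% in the measurement above.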
Signed-off-by: Donald Sharp --- bgpd/bgp_aspath.c | 37 +++++++++++++++++++++++++++++++++++-- bgpd/bgp_aspath.h | 1 + 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/bgpd/bgp_aspath.c b/bgpd/bgp_aspath.c index 4c1615a5c625..a86b42e25015 100644 --- a/bgpd/bgp_aspath.c +++ b/bgpd/bgp_aspath.c @@ -297,6 +297,8 @@ static struct aspath *aspath_new(enum asnotation_mode asnotation) as = XCALLOC(MTYPE_AS_PATH, sizeof(struct aspath)); as->asnotation = asnotation; + as->count = 0; + return as; } @@ -399,6 +401,11 @@ unsigned int aspath_count_confeds(struct aspath *aspath) } unsigned int aspath_count_hops(const struct aspath *aspath) +{ + return aspath->count; +} + +static unsigned int aspath_count_hops_internal(const struct aspath *aspath) { int count = 0; struct assegment *seg = aspath->segments; @@ -708,6 +715,7 @@ struct aspath *aspath_dup(struct aspath *aspath) else new->str[0] = '\0'; + new->count = aspath->count; return new; } @@ -729,6 +737,7 @@ static void *aspath_hash_alloc(void *arg) new->str_len = aspath->str_len; new->json = aspath->json; new->asnotation = aspath->asnotation; + new->count = aspath->count; return new; } @@ -856,6 +865,8 @@ struct aspath *aspath_parse(struct stream *s, size_t length, int use32bit, if (assegments_parse(s, length, &as.segments, use32bit) < 0) return NULL; + as.count = aspath_count_hops_internal(&as); + /* If already same aspath exist then return it. */ find = hash_get(ashash, &as, aspath_hash_alloc); @@ -1032,7 +1043,7 @@ static struct assegment *aspath_aggregate_as_set_add(struct aspath *aspath, asset->as[asset->length - 1] = as; } - + aspath->count = aspath_count_hops_internal(aspath); return asset; } @@ -1113,6 +1124,8 @@ struct aspath *aspath_aggregate(struct aspath *as1, struct aspath *as2) assegment_normalise(aspath->segments); aspath_str_update(aspath, false); + aspath->count = aspath_count_hops_internal(aspath); + return aspath; } @@ -1268,6 +1281,7 @@ struct aspath *aspath_replace_regex_asn(struct aspath *aspath, } aspath_str_update(new, false); + new->count = aspath_count_hops_internal(new); return new; } @@ -1293,6 +1307,8 @@ struct aspath *aspath_replace_specific_asn(struct aspath *aspath, } aspath_str_update(new, false); + new->count = aspath_count_hops_internal(new); + return new; } @@ -1315,6 +1331,8 @@ struct aspath *aspath_replace_all_asn(struct aspath *aspath, as_t our_asn) } aspath_str_update(new, false); + new->count = aspath_count_hops_internal(new); + return new; } @@ -1341,6 +1359,8 @@ struct aspath *aspath_replace_private_asns(struct aspath *aspath, as_t asn, } aspath_str_update(new, false); + new->count = aspath_count_hops_internal(new); + return new; } @@ -1413,6 +1433,7 @@ struct aspath *aspath_remove_private_asns(struct aspath *aspath, as_t peer_asn) if (!aspath->refcnt) aspath_free(aspath); aspath_str_update(new, false); + new->count = aspath_count_hops_internal(new); return new; } @@ -1469,6 +1490,7 @@ static struct aspath *aspath_merge(struct aspath *as1, struct aspath *as2) last->next = as2->segments; as2->segments = new; aspath_str_update(as2, false); + as2->count = aspath_count_hops_internal(as2); return as2; } @@ -1486,6 +1508,7 @@ struct aspath *aspath_prepend(struct aspath *as1, struct aspath *as2) if (as2->segments == NULL) { as2->segments = assegment_dup_all(as1->segments); aspath_str_update(as2, false); + as2->count = aspath_count_hops_internal(as2); return as2; } @@ -1506,6 +1529,7 @@ struct aspath *aspath_prepend(struct aspath *as1, struct aspath *as2) if (!as2->segments) { as2->segments = 
assegment_dup_all(as1->segments); aspath_str_update(as2, false); + as2->count = aspath_count_hops_internal(as2); return as2; } @@ -1551,6 +1575,7 @@ struct aspath *aspath_prepend(struct aspath *as1, struct aspath *as2) * the inbetween AS_SEQUENCE of seg2 in the process */ aspath_str_update(as2, false); + as2->count = aspath_count_hops_internal(as2); return as2; } else { /* AS_SET merge code is needed at here. */ @@ -1662,6 +1687,7 @@ struct aspath *aspath_filter_exclude(struct aspath *source, lastseg = newseg; } aspath_str_update(newpath, false); + newpath->count = aspath_count_hops_internal(newpath); /* We are happy returning even an empty AS_PATH, because the * administrator * might expect this very behaviour. There's a mean to avoid this, if @@ -1680,6 +1706,7 @@ struct aspath *aspath_filter_exclude_all(struct aspath *source) newpath = aspath_new(source->asnotation); aspath_str_update(newpath, false); + newpath->count = aspath_count_hops_internal(newpath); /* We are happy returning even an empty AS_PATH, because the * administrator * might expect this very behaviour. There's a mean to avoid this, if @@ -1767,6 +1794,7 @@ struct aspath *aspath_filter_exclude_acl(struct aspath *source, aspath_str_update(source, false); + source->count = aspath_count_hops_internal(source); /* We are happy returning even an empty AS_PATH, because the * administrator * might expect this very behaviour. There's a mean to avoid this, if @@ -1805,6 +1833,7 @@ static struct aspath *aspath_add_asns(struct aspath *aspath, as_t asno, } aspath_str_update(aspath, false); + aspath->count = aspath_count_hops_internal(aspath); return aspath; } @@ -1896,6 +1925,7 @@ struct aspath *aspath_reconcile_as4(struct aspath *aspath, if (!hops) { newpath = aspath_dup(as4path); aspath_str_update(newpath, false); + /* dup sets the count properly */ return newpath; } @@ -1957,6 +1987,7 @@ struct aspath *aspath_reconcile_as4(struct aspath *aspath, aspath_free(newpath); mergedpath->segments = assegment_normalise(mergedpath->segments); aspath_str_update(mergedpath, false); + mergedpath->count = aspath_count_hops_internal(mergedpath); if (BGP_DEBUG(as4, AS4)) zlog_debug("[AS4] result of synthesizing is %s", @@ -2027,8 +2058,10 @@ struct aspath *aspath_delete_confed_seq(struct aspath *aspath) seg = next; } - if (removed_confed_segment) + if (removed_confed_segment) { aspath_str_update(aspath, false); + aspath->count = aspath_count_hops_internal(aspath); + } return aspath; } diff --git a/bgpd/bgp_aspath.h b/bgpd/bgp_aspath.h index f7e57fd66dda..46202fd34afc 100644 --- a/bgpd/bgp_aspath.h +++ b/bgpd/bgp_aspath.h @@ -59,6 +59,7 @@ struct aspath { and AS path regular expression match. */ char *str; unsigned short str_len; + uint32_t count; /* AS notation used by string expression of AS path */ enum asnotation_mode asnotation; From e68550b8d8371db16d9c7600bfb625354ae4395c Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Thu, 24 Oct 2024 11:40:56 -0400 Subject: [PATCH 3/5] bgpd: Only grab the confed path count if we are comparing it This is just a small optimization but when calling path_info_cmp hundreds of millions of times this adds up. Signed-off-by: Donald Sharp --- bgpd/bgp_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c index 53cc9a10b569..6fa1505998cd 100644 --- a/bgpd/bgp_route.c +++ b/bgpd/bgp_route.c @@ -1133,9 +1133,9 @@ int bgp_path_info_cmp(struct bgp *bgp, struct bgp_path_info *new, /* 4. AS path length check. 
*/ if (!CHECK_FLAG(bgp->flags, BGP_FLAG_ASPATH_IGNORE)) { int exist_hops = aspath_count_hops(existattr->aspath); - int exist_confeds = aspath_count_confeds(existattr->aspath); if (CHECK_FLAG(bgp->flags, BGP_FLAG_ASPATH_CONFED)) { + int exist_confeds = aspath_count_confeds(existattr->aspath); int aspath_hops; aspath_hops = aspath_count_hops(newattr->aspath); From b097a3188ab89e4ec639eda9e2d25a334df8b710 Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Thu, 24 Oct 2024 14:17:51 -0400 Subject: [PATCH 4/5] bgpd: Fix deadlock in bgp_keepalive and master pthreads (gdb) bt 0 futex_wait (private=0, expected=2, futex_word=0x5c438e9a98d8) at ../sysdeps/nptl/futex-internal.h:146 1 __GI___lll_lock_wait (futex=futex@entry=0x5c438e9a98d8, private=0) at ./nptl/lowlevellock.c:49 2 0x00007af16d698002 in lll_mutex_lock_optimized (mutex=0x5c438e9a98d8) at ./nptl/pthread_mutex_lock.c:48 3 ___pthread_mutex_lock (mutex=0x5c438e9a98d8) at ./nptl/pthread_mutex_lock.c:93 4 0x00005c4369c17e70 in _frr_mtx_lock (mutex=0x5c438e9a98d8, func=0x5c4369dc2750 <__func__.265> "bgp_notify_send_internal") at ./lib/frr_pthread.h:258 5 0x00005c4369c1a07a in bgp_notify_send_internal (connection=0x5c438e9a98c0, code=8 '\b', sub_code=0 '\000', data=0x0, datalen=0, use_curr=true) at bgpd/bgp_packet.c:928 6 0x00005c4369c1a707 in bgp_notify_send (connection=0x5c438e9a98c0, code=8 '\b', sub_code=0 '\000') at bgpd/bgp_packet.c:1069 7 0x00005c4369bea422 in bgp_stop_with_notify (connection=0x5c438e9a98c0, code=8 '\b', sub_code=0 '\000') at bgpd/bgp_fsm.c:1597 8 0x00005c4369c18480 in bgp_packet_add (connection=0x5c438e9a98c0, peer=0x5c438e9b6010, s=0x7af15c06bf70) at bgpd/bgp_packet.c:151 9 0x00005c4369c19816 in bgp_keepalive_send (peer=0x5c438e9b6010) at bgpd/bgp_packet.c:639 10 0x00005c4369bf01fd in peer_process (hb=0x5c438ed05520, arg=0x7af16bdffaf0) at bgpd/bgp_keepalives.c:111 11 0x00007af16dacd8e6 in hash_iterate (hash=0x7af15c000be0, func=0x5c4369bf005e , arg=0x7af16bdffaf0) at lib/hash.c:252 12 0x00005c4369bf0679 in bgp_keepalives_start (arg=0x5c438e0db110) at bgpd/bgp_keepalives.c:214 13 0x00007af16dac9932 in frr_pthread_inner (arg=0x5c438e0db110) at lib/frr_pthread.c:180 14 0x00007af16d694ac3 in start_thread (arg=) at ./nptl/pthread_create.c:442 15 0x00007af16d726850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 (gdb) The bgp keepalive pthread deadlocks with itself, and consequently the bgp master pthread gets blocked when it attempts to lock the peerhash_mtx, since that mutex is also held by the keepalive pthread. The keepalive pthread locks the peerhash_mtx in bgp_keepalives_start. Next the connection->io_mtx mutex is locked in bgp_keepalive_send, and when a problem is noticed bgp_stop_with_notify is invoked, which relocks the same mutex (and of course the relock causes the thread to get stuck on itself). This generates a deadlock condition. Modify the code to hold the connection->io_mtx for as short a time as possible.
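The failure is a plain self-deadlock on a non-recursive mutex: the keepalive pthread takes connection->io_mtx and then reaches a teardown path that tries to take it again. Here is a hypothetical, self-contained illustration of the bug and of the "shrink the critical section" fix the patch applies (io_mtx, queue_packet_broken, queue_packet_fixed and tear_down_session are illustrative names, not the FRR functions):

#include <pthread.h>

static pthread_mutex_t io_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Teardown also needs the lock, e.g. to flush the output queue. */
static void tear_down_session(void)
{
	pthread_mutex_lock(&io_mtx);	/* second lock by the same thread */
	/* ... drain queue, send NOTIFY ... */
	pthread_mutex_unlock(&io_mtx);
}

/* Before: teardown is called while io_mtx is still held, so the relock in
 * tear_down_session() never succeeds on a default (non-recursive) mutex. */
static void queue_packet_broken(int sendq_stuck)
{
	pthread_mutex_lock(&io_mtx);
	/* ... push packet onto the output queue ... */
	if (sendq_stuck)
		tear_down_session();	/* BUG: self-deadlock */
	pthread_mutex_unlock(&io_mtx);
}

/* After: hold io_mtx only for the queue operation itself and decide about
 * teardown with the lock already released. */
static void queue_packet_fixed(int sendq_stuck)
{
	pthread_mutex_lock(&io_mtx);
	/* ... push packet onto the output queue ... */
	pthread_mutex_unlock(&io_mtx);

	if (sendq_stuck)
		tear_down_session();	/* safe: io_mtx is no longer held */
}

Patch 5 below goes one step further and moves the teardown off this code path entirely, handing it to the master pthread as an event.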
Signed-off-by: Donald Sharp --- bgpd/bgp_packet.c | 59 ++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c index 6b116db1075d..7390202e16e7 100644 --- a/bgpd/bgp_packet.c +++ b/bgpd/bgp_packet.c @@ -122,42 +122,37 @@ static void bgp_packet_add(struct peer_connection *connection, peer->last_sendq_ok = monotime(NULL); stream_fifo_push(connection->obuf, s); + } - delta = monotime(NULL) - peer->last_sendq_ok; + delta = monotime(NULL) - peer->last_sendq_ok; - if (CHECK_FLAG(peer->flags, PEER_FLAG_TIMER)) - holdtime = atomic_load_explicit(&peer->holdtime, - memory_order_relaxed); - else - holdtime = peer->bgp->default_holdtime; + if (CHECK_FLAG(peer->flags, PEER_FLAG_TIMER)) + holdtime = atomic_load_explicit(&peer->holdtime, memory_order_relaxed); + else + holdtime = peer->bgp->default_holdtime; - sendholdtime = holdtime * 2; + sendholdtime = holdtime * 2; - /* Note that when we're here, we're adding some packet to the - * OutQ. That includes keepalives when there is nothing to - * do, so there's a guarantee we pass by here once in a while. - * - * That implies there is no need to go set up another separate - * timer that ticks down SendHoldTime, as we'll be here sooner - * or later anyway and will see the checks below failing. - */ - if (!holdtime) { - /* no holdtime, do nothing. */ - } else if (delta > sendholdtime) { - flog_err( - EC_BGP_SENDQ_STUCK_PROPER, - "%pBP has not made any SendQ progress for 2 holdtimes (%jds), terminating session", - peer, sendholdtime); - bgp_stop_with_notify(connection, - BGP_NOTIFY_SEND_HOLD_ERR, 0); - } else if (delta > (intmax_t)holdtime && - monotime(NULL) - peer->last_sendq_warn > 5) { - flog_warn( - EC_BGP_SENDQ_STUCK_WARN, - "%pBP has not made any SendQ progress for 1 holdtime (%us), peer overloaded?", - peer, holdtime); - peer->last_sendq_warn = monotime(NULL); - } + /* Note that when we're here, we're adding some packet to the + * OutQ. That includes keepalives when there is nothing to + * do, so there's a guarantee we pass by here once in a while. + * + * That implies there is no need to go set up another separate + * timer that ticks down SendHoldTime, as we'll be here sooner + * or later anyway and will see the checks below failing. + */ + if (!holdtime) { + /* no holdtime, do nothing. 
*/ + } else if (delta > sendholdtime) { + flog_err(EC_BGP_SENDQ_STUCK_PROPER, + "%pBP has not made any SendQ progress for 2 holdtimes (%jds), terminating session", + peer, sendholdtime); + bgp_stop_with_notify(connection, BGP_NOTIFY_SEND_HOLD_ERR, 0); + } else if (delta > (intmax_t)holdtime && monotime(NULL) - peer->last_sendq_warn > 5) { + flog_warn(EC_BGP_SENDQ_STUCK_WARN, + "%pBP has not made any SendQ progress for 1 holdtime (%us), peer overloaded?", + peer, holdtime); + peer->last_sendq_warn = monotime(NULL); } } From 138935a5fdce5a1f29fec84ab569f9c4c6969a8d Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Thu, 24 Oct 2024 17:44:31 -0400 Subject: [PATCH 5/5] bgpd: Fix wrong pthread event cancelling 0 __pthread_kill_implementation (no_tid=0, signo=6, threadid=130719886083648) at ./nptl/pthread_kill.c:44 1 __pthread_kill_internal (signo=6, threadid=130719886083648) at ./nptl/pthread_kill.c:78 2 __GI___pthread_kill (threadid=130719886083648, signo=signo@entry=6) at ./nptl/pthread_kill.c:89 3 0x000076e399e42476 in __GI_raise (sig=6) at ../sysdeps/posix/raise.c:26 4 0x000076e39a34f950 in core_handler (signo=6, siginfo=0x76e3985fca30, context=0x76e3985fc900) at lib/sigevent.c:258 5 6 __pthread_kill_implementation (no_tid=0, signo=6, threadid=130719886083648) at ./nptl/pthread_kill.c:44 7 __pthread_kill_internal (signo=6, threadid=130719886083648) at ./nptl/pthread_kill.c:78 8 __GI___pthread_kill (threadid=130719886083648, signo=signo@entry=6) at ./nptl/pthread_kill.c:89 9 0x000076e399e42476 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26 10 0x000076e399e287f3 in __GI_abort () at ./stdlib/abort.c:79 11 0x000076e39a39874b in _zlog_assert_failed (xref=0x76e39a46cca0 <_xref.27>, extra=0x0) at lib/zlog.c:789 12 0x000076e39a369dde in cancel_event_helper (m=0x5eda32df5e40, arg=0x5eda33afeed0, flags=1) at lib/event.c:1428 13 0x000076e39a369ef6 in event_cancel_event_ready (m=0x5eda32df5e40, arg=0x5eda33afeed0) at lib/event.c:1470 14 0x00005eda0a94a5b3 in bgp_stop (connection=0x5eda33afeed0) at bgpd/bgp_fsm.c:1355 15 0x00005eda0a94b4ae in bgp_stop_with_notify (connection=0x5eda33afeed0, code=8 '\b', sub_code=0 '\000') at bgpd/bgp_fsm.c:1610 16 0x00005eda0a979498 in bgp_packet_add (connection=0x5eda33afeed0, peer=0x5eda33b11800, s=0x76e3880daf90) at bgpd/bgp_packet.c:152 17 0x00005eda0a97a80f in bgp_keepalive_send (peer=0x5eda33b11800) at bgpd/bgp_packet.c:639 18 0x00005eda0a9511fd in peer_process (hb=0x5eda33c9ab80, arg=0x76e3985ffaf0) at bgpd/bgp_keepalives.c:111 19 0x000076e39a2cd8e6 in hash_iterate (hash=0x76e388000be0, func=0x5eda0a95105e , arg=0x76e3985ffaf0) at lib/hash.c:252 20 0x00005eda0a951679 in bgp_keepalives_start (arg=0x5eda3306af80) at bgpd/bgp_keepalives.c:214 21 0x000076e39a2c9932 in frr_pthread_inner (arg=0x5eda3306af80) at lib/frr_pthread.c:180 22 0x000076e399e94ac3 in start_thread (arg=) at ./nptl/pthread_create.c:442 23 0x000076e399f26850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 (gdb) f 12 12 0x000076e39a369dde in cancel_event_helper (m=0x5eda32df5e40, arg=0x5eda33afeed0, flags=1) at lib/event.c:1428 1428 assert(m->owner == pthread_self()); In this decode the attempt to cancel the connection's events from the wrong thread is causing the crash. Modify the code to create an event on the bm->master to cancel the events for the connection. 
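The underlying rule is the one the assertion in cancel_event_helper() enforces: only the pthread that owns an event loop may cancel events on it, so the thread that detects the problem should hand the teardown request to the owning pthread instead of doing the work itself (here via event_add_event() on bm->master). A small generic sketch of that hand-off pattern follows; it deliberately does not use FRR's event API, and struct session and the function names are placeholders:

#include <pthread.h>
#include <stddef.h>

struct session;			/* stand-in for struct peer_connection */

/* A single-slot "mailbox" is enough for a sketch. */
static pthread_mutex_t req_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct session *pending_stop;

/* Called from the keepalive pthread: record the request and return.  No
 * events belonging to the master pthread are touched here. */
static void request_stop(struct session *s)
{
	pthread_mutex_lock(&req_mtx);
	pending_stop = s;
	pthread_mutex_unlock(&req_mtx);
}

/* Called from the owning (master) pthread's loop: the real teardown,
 * including cancelling that pthread's own events, happens here, where an
 * "owner == pthread_self()" assertion would hold. */
static void process_stop_requests(void (*stop_session)(struct session *))
{
	struct session *s;

	pthread_mutex_lock(&req_mtx);
	s = pending_stop;
	pending_stop = NULL;
	pthread_mutex_unlock(&req_mtx);

	if (s)
		stop_session(s);
}

In the patch itself the mailbox is the connection's new t_stop_with_notify event, and the master pthread runs bgp_event_stop_with_notify() to perform the actual bgp_stop_with_notify() call.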
Signed-off-by: Donald Sharp --- bgpd/bgp_fsm.c | 10 ++++++++++ bgpd/bgp_fsm.h | 1 + bgpd/bgp_packet.c | 3 ++- bgpd/bgpd.h | 2 ++ zebra/kernel_netlink.c | 2 +- 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c index 567af5bb7577..b3518ac69657 100644 --- a/bgpd/bgp_fsm.c +++ b/bgpd/bgp_fsm.c @@ -178,6 +178,7 @@ static struct peer *peer_xfer_conn(struct peer *from_peer) EVENT_OFF(going_away->t_delayopen); EVENT_OFF(going_away->t_connect_check_r); EVENT_OFF(going_away->t_connect_check_w); + EVENT_OFF(going_away->t_stop_with_notify); EVENT_OFF(keeper->t_routeadv); EVENT_OFF(keeper->t_connect); EVENT_OFF(keeper->t_delayopen); @@ -1475,6 +1476,8 @@ enum bgp_fsm_state_progress bgp_stop(struct peer_connection *connection) EVENT_OFF(connection->t_connect_check_r); EVENT_OFF(connection->t_connect_check_w); + EVENT_OFF(connection->t_stop_with_notify); + /* Stop all timers. */ EVENT_OFF(connection->t_start); EVENT_OFF(connection->t_connect); @@ -3032,3 +3035,10 @@ void bgp_peer_gr_flags_update(struct peer *peer) } } } + +void bgp_event_stop_with_notify(struct event *event) +{ + struct peer_connection *connection = EVENT_ARG(event); + + bgp_stop_with_notify(connection, BGP_NOTIFY_SEND_HOLD_ERR, 0); +} diff --git a/bgpd/bgp_fsm.h b/bgpd/bgp_fsm.h index 85c488962fc9..013c60ce23ae 100644 --- a/bgpd/bgp_fsm.h +++ b/bgpd/bgp_fsm.h @@ -109,6 +109,7 @@ enum bgp_fsm_state_progress { extern void bgp_fsm_nht_update(struct peer_connection *connection, struct peer *peer, bool has_valid_nexthops); extern void bgp_event(struct event *event); +extern void bgp_event_stop_with_notify(struct event *event); extern int bgp_event_update(struct peer_connection *connection, enum bgp_fsm_events event); extern enum bgp_fsm_state_progress bgp_stop(struct peer_connection *connection); diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c index 7390202e16e7..0523a4b02b4e 100644 --- a/bgpd/bgp_packet.c +++ b/bgpd/bgp_packet.c @@ -147,7 +147,8 @@ static void bgp_packet_add(struct peer_connection *connection, flog_err(EC_BGP_SENDQ_STUCK_PROPER, "%pBP has not made any SendQ progress for 2 holdtimes (%jds), terminating session", peer, sendholdtime); - bgp_stop_with_notify(connection, BGP_NOTIFY_SEND_HOLD_ERR, 0); + event_add_event(bm->master, bgp_event_stop_with_notify, connection, 0, + &connection->t_stop_with_notify); } else if (delta > (intmax_t)holdtime && monotime(NULL) - peer->last_sendq_warn > 5) { flog_warn(EC_BGP_SENDQ_STUCK_WARN, "%pBP has not made any SendQ progress for 1 holdtime (%us), peer overloaded?", diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index 852efdf19d31..5ffed544a510 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -1223,6 +1223,8 @@ struct peer_connection { struct event *t_process_packet; struct event *t_process_packet_error; + struct event *t_stop_with_notify; + union sockunion su; #define BGP_CONNECTION_SU_UNSPEC(connection) \ (connection->su.sa.sa_family == AF_UNSPEC) diff --git a/zebra/kernel_netlink.c b/zebra/kernel_netlink.c index 2148d131ecbe..0c607dfa67cb 100644 --- a/zebra/kernel_netlink.c +++ b/zebra/kernel_netlink.c @@ -932,7 +932,7 @@ static int netlink_recv_msg(struct nlsock *nl, struct msghdr *msg) } while (status == -1 && errno == EINTR); if (status == -1) { - if (errno == EWOULDBLOCK || errno == EAGAIN) + if (errno == EWOULDBLOCK || errno == EAGAIN || errno == EMSGSIZE) return 0; flog_err(EC_ZEBRA_RECVMSG_OVERRUN, "%s recvmsg overrun: %s", nl->name, safe_strerror(errno));