diff --git a/src/sonic-frr/patch/0069-lib-nexthop-code-should-use-uint16_t-for-nexthop-cou.patch b/src/sonic-frr/patch/0069-lib-nexthop-code-should-use-uint16_t-for-nexthop-cou.patch new file mode 100644 index 000000000000..5b6588ce3122 --- /dev/null +++ b/src/sonic-frr/patch/0069-lib-nexthop-code-should-use-uint16_t-for-nexthop-cou.patch @@ -0,0 +1,91 @@ +From 1102e4f6ee58d0d41596880fe3a7c82840ee679e Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Tue, 1 Oct 2024 14:31:08 -0400 +Subject: [PATCH] lib: nexthop code should use uint16_t for nexthop counting + +It's possible to specify via the cli and configure how many +nexthops that are allowed on the system. If you happen to +have > 255 then things are about to get interesting otherwise. + +Let's allow up to 65k nexthops (ha!) + +Signed-off-by: Donald Sharp + +diff --git a/lib/nexthop_group.c b/lib/nexthop_group.c +index 3f408e0a71..cb1ebb5d09 100644 +--- a/lib/nexthop_group.c ++++ b/lib/nexthop_group.c +@@ -70,10 +70,10 @@ static struct nexthop *nexthop_group_tail(const struct nexthop_group *nhg) + return nexthop; + } + +-uint8_t nexthop_group_nexthop_num(const struct nexthop_group *nhg) ++uint16_t nexthop_group_nexthop_num(const struct nexthop_group *nhg) + { + struct nexthop *nhop; +- uint8_t num = 0; ++ uint16_t num = 0; + + for (ALL_NEXTHOPS_PTR(nhg, nhop)) + num++; +@@ -81,11 +81,10 @@ uint8_t nexthop_group_nexthop_num(const struct nexthop_group *nhg) + return num; + } + +-static uint8_t +-nexthop_group_nexthop_num_no_recurse(const struct nexthop_group *nhg) ++static uint16_t nexthop_group_nexthop_num_no_recurse(const struct nexthop_group *nhg) + { + struct nexthop *nhop; +- uint8_t num = 0; ++ uint16_t num = 0; + + for (nhop = nhg->nexthop; nhop; nhop = nhop->next) + num++; +@@ -93,10 +92,10 @@ nexthop_group_nexthop_num_no_recurse(const struct nexthop_group *nhg) + return num; + } + +-uint8_t nexthop_group_active_nexthop_num(const struct nexthop_group *nhg) ++uint16_t nexthop_group_active_nexthop_num(const struct nexthop_group *nhg) + { + struct nexthop *nhop; +- uint8_t num = 0; ++ uint16_t num = 0; + + for (ALL_NEXTHOPS_PTR(nhg, nhop)) { + if (CHECK_FLAG(nhop->flags, NEXTHOP_FLAG_ACTIVE)) +@@ -184,11 +183,9 @@ static struct nexthop *nhg_nh_find(const struct nexthop_group *nhg, + return NULL; + } + +-static bool +-nexthop_group_equal_common(const struct nexthop_group *nhg1, +- const struct nexthop_group *nhg2, +- uint8_t (*nexthop_group_nexthop_num_func)( +- const struct nexthop_group *nhg)) ++static bool nexthop_group_equal_common( ++ const struct nexthop_group *nhg1, const struct nexthop_group *nhg2, ++ uint16_t (*nexthop_group_nexthop_num_func)(const struct nexthop_group *nhg)) + { + if (nhg1 && !nhg2) + return false; +diff --git a/lib/nexthop_group.h b/lib/nexthop_group.h +index 822a35439c..9103299418 100644 +--- a/lib/nexthop_group.h ++++ b/lib/nexthop_group.h +@@ -149,9 +149,8 @@ extern void nexthop_group_json_nexthop(json_object *j, + const struct nexthop *nh); + + /* Return the number of nexthops in this nhg */ +-extern uint8_t nexthop_group_nexthop_num(const struct nexthop_group *nhg); +-extern uint8_t +-nexthop_group_active_nexthop_num(const struct nexthop_group *nhg); ++extern uint16_t nexthop_group_nexthop_num(const struct nexthop_group *nhg); ++extern uint16_t nexthop_group_active_nexthop_num(const struct nexthop_group *nhg); + + extern bool nexthop_group_has_label(const struct nexthop_group *nhg); + +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0070-Allow-16-bit-size-for-nexthops.patch b/src/sonic-frr/patch/0070-Allow-16-bit-size-for-nexthops.patch new file mode 100644 index 000000000000..3ffb095db07d --- /dev/null +++ b/src/sonic-frr/patch/0070-Allow-16-bit-size-for-nexthops.patch @@ -0,0 +1,362 @@ +From ab639c4563abc5754d6a185ea3a8f29778fc3fae Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 7 Oct 2024 12:40:46 -0400 +Subject: [PATCH] *: Allow 16 bit size for nexthops + +Currently FRR is limiting the nexthop count to a uint8_t not a +uint16_t. This leads to issues when the nexthop count is 256 +which results in the count to overflow to 0 causing problems +in the code. + +Signed-off-by: Donald Sharp + +diff --git a/bgpd/bgp_addpath.c b/bgpd/bgp_addpath.c +index f391c13847..aada6e555f 100644 +--- a/bgpd/bgp_addpath.c ++++ b/bgpd/bgp_addpath.c +@@ -361,8 +361,7 @@ void bgp_addpath_type_changed(struct bgp *bgp) + } + } + +-int bgp_addpath_capability_action(enum bgp_addpath_strat addpath_type, +- uint8_t paths) ++int bgp_addpath_capability_action(enum bgp_addpath_strat addpath_type, uint16_t paths) + { + int action = CAPABILITY_ACTION_UNSET; + +@@ -392,8 +391,7 @@ int bgp_addpath_capability_action(enum bgp_addpath_strat addpath_type, + * change take effect. + */ + void bgp_addpath_set_peer_type(struct peer *peer, afi_t afi, safi_t safi, +- enum bgp_addpath_strat addpath_type, +- uint8_t paths) ++ enum bgp_addpath_strat addpath_type, uint16_t paths) + { + struct bgp *bgp = peer->bgp; + enum bgp_addpath_strat old_type; +diff --git a/bgpd/bgp_addpath.h b/bgpd/bgp_addpath.h +index d562000e30..0954a7ced1 100644 +--- a/bgpd/bgp_addpath.h ++++ b/bgpd/bgp_addpath.h +@@ -56,13 +56,11 @@ bool bgp_addpath_tx_path(enum bgp_addpath_strat strat, + * Change the type of addpath used for a peer. + */ + void bgp_addpath_set_peer_type(struct peer *peer, afi_t afi, safi_t safi, +- enum bgp_addpath_strat addpath_type, +- uint8_t paths); ++ enum bgp_addpath_strat addpath_type, uint16_t paths); + + void bgp_addpath_update_ids(struct bgp *bgp, struct bgp_dest *dest, afi_t afi, + safi_t safi); + + void bgp_addpath_type_changed(struct bgp *bgp); +-extern int bgp_addpath_capability_action(enum bgp_addpath_strat addpath_type, +- uint8_t paths); ++extern int bgp_addpath_capability_action(enum bgp_addpath_strat addpath_type, uint16_t paths); + #endif +diff --git a/bgpd/bgp_nexthop.h b/bgpd/bgp_nexthop.h +index 830883872e..410b3f7db9 100644 +--- a/bgpd/bgp_nexthop.h ++++ b/bgpd/bgp_nexthop.h +@@ -38,7 +38,7 @@ struct bgp_nexthop_cache { + uint32_t metric; + + /* Nexthop number and nexthop linked list.*/ +- uint8_t nexthop_num; ++ uint16_t nexthop_num; + + /* This flag is set to TRUE for a bnc that is gateway IP overlay index + * nexthop. +diff --git a/bgpd/bgp_nht.c b/bgpd/bgp_nht.c +index e2c103bb52..1614779ea6 100644 +--- a/bgpd/bgp_nht.c ++++ b/bgpd/bgp_nht.c +@@ -740,7 +740,7 @@ static void bgp_nht_ifp_table_handle(struct bgp *bgp, + { + struct bgp_nexthop_cache *bnc; + struct nexthop *nhop; +- uint8_t other_nh_count; ++ uint16_t other_nh_count; + bool nhop_ll_found = false; + bool nhop_found = false; + +diff --git a/pimd/pim_nht.c b/pimd/pim_nht.c +index 32cdf4bf82..b4f9aa7ca4 100644 +--- a/pimd/pim_nht.c ++++ b/pimd/pim_nht.c +@@ -543,7 +543,7 @@ static int pim_ecmp_nexthop_search(struct pim_instance *pim, + ifindex_t first_ifindex; + struct interface *ifp = NULL; + uint32_t hash_val = 0, mod_val = 0; +- uint8_t nh_iter = 0, found = 0; ++ uint16_t nh_iter = 0, found = 0; + uint32_t i, num_nbrs = 0; + struct pim_interface *pim_ifp; + +@@ -914,7 +914,7 @@ int pim_ecmp_nexthop_lookup(struct pim_instance *pim, + struct interface *ifps[router->multipath], *ifp; + int first_ifindex; + int found = 0; +- uint8_t i = 0; ++ uint16_t i = 0; + uint32_t hash_val = 0, mod_val = 0; + uint32_t num_nbrs = 0; + struct pim_interface *pim_ifp; +diff --git a/pimd/pim_nht.h b/pimd/pim_nht.h +index a1feb76e3b..27014e6898 100644 +--- a/pimd/pim_nht.h ++++ b/pimd/pim_nht.h +@@ -23,7 +23,7 @@ struct pim_nexthop_cache { + uint32_t metric; + uint32_t distance; + /* Nexthop number and nexthop linked list. */ +- uint8_t nexthop_num; ++ uint16_t nexthop_num; + struct nexthop *nexthop; + int64_t last_update; + uint16_t flags; +diff --git a/zebra/rt_netlink.c b/zebra/rt_netlink.c +index 988e67c963..4dd5f80836 100644 +--- a/zebra/rt_netlink.c ++++ b/zebra/rt_netlink.c +@@ -616,12 +616,9 @@ parse_nexthop_unicast(ns_id_t ns_id, struct rtmsg *rtm, struct rtattr **tb, + return nh; + } + +-static uint8_t parse_multipath_nexthops_unicast(ns_id_t ns_id, +- struct nexthop_group *ng, +- struct rtmsg *rtm, +- struct rtnexthop *rtnh, +- struct rtattr **tb, +- void *prefsrc, vrf_id_t vrf_id) ++static uint16_t parse_multipath_nexthops_unicast(ns_id_t ns_id, struct nexthop_group *ng, ++ struct rtmsg *rtm, struct rtnexthop *rtnh, ++ struct rtattr **tb, void *prefsrc, vrf_id_t vrf_id) + { + void *gate = NULL; + struct interface *ifp = NULL; +@@ -746,7 +743,7 @@ static uint8_t parse_multipath_nexthops_unicast(ns_id_t ns_id, + rtnh = RTNH_NEXT(rtnh); + } + +- uint8_t nhop_num = nexthop_group_nexthop_num(ng); ++ uint16_t nhop_num = nexthop_group_nexthop_num(ng); + + return nhop_num; + } +@@ -1039,7 +1036,7 @@ int netlink_route_change_read_unicast_internal(struct nlmsghdr *h, + (struct rtnexthop *)RTA_DATA(tb[RTA_MULTIPATH]); + + if (!nhe_id) { +- uint8_t nhop_num; ++ uint16_t nhop_num; + + /* Use temporary list of nexthops; parse + * message payload's nexthops. +@@ -2672,11 +2669,9 @@ int kernel_get_ipmr_sg_stats(struct zebra_vrf *zvrf, void *in) + /* Char length to debug ID with */ + #define ID_LENGTH 10 + +-static bool _netlink_nexthop_build_group(struct nlmsghdr *n, size_t req_size, +- uint32_t id, +- const struct nh_grp *z_grp, +- const uint8_t count, bool resilient, +- const struct nhg_resilience *nhgr) ++static bool _netlink_nexthop_build_group(struct nlmsghdr *n, size_t req_size, uint32_t id, ++ const struct nh_grp *z_grp, const uint16_t count, ++ bool resilient, const struct nhg_resilience *nhgr) + { + struct nexthop_grp grp[count]; + /* Need space for max group size, "/", and null term */ +@@ -3301,7 +3296,7 @@ static int netlink_nexthop_process_group(struct rtattr **tb, + struct nh_grp *z_grp, int z_grp_size, + struct nhg_resilience *nhgr) + { +- uint8_t count = 0; ++ uint16_t count = 0; + /* linux/nexthop.h group struct */ + struct nexthop_grp *n_grp = NULL; + +@@ -3374,7 +3369,7 @@ int netlink_nexthop_change(struct nlmsghdr *h, ns_id_t ns_id, int startup) + struct nexthop nh = {}; + struct nh_grp grp[MULTIPATH_NUM] = {}; + /* Count of nexthops in group array */ +- uint8_t grp_count = 0; ++ uint16_t grp_count = 0; + struct rtattr *tb[NHA_MAX + 1] = {}; + + frrtrace(3, frr_zebra, netlink_nexthop_change, h, ns_id, startup); +diff --git a/zebra/zapi_msg.c b/zebra/zapi_msg.c +index 9ae8333d6b..4ee3e18e5b 100644 +--- a/zebra/zapi_msg.c ++++ b/zebra/zapi_msg.c +@@ -517,7 +517,7 @@ int zsend_redistribute_route(int cmd, struct zserv *client, + struct zapi_nexthop *api_nh; + struct nexthop *nexthop; + const struct prefix *p, *src_p; +- uint8_t count = 0; ++ uint16_t count = 0; + afi_t afi; + size_t stream_size = + MAX(ZEBRA_MAX_PACKET_SIZ, sizeof(struct zapi_route)); +diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c +index efa795331d..5214b428b3 100644 +--- a/zebra/zebra_dplane.c ++++ b/zebra/zebra_dplane.c +@@ -79,7 +79,7 @@ struct dplane_nexthop_info { + + struct nexthop_group ng; + struct nh_grp nh_grp[MULTIPATH_NUM]; +- uint8_t nh_grp_count; ++ uint16_t nh_grp_count; + }; + + /* +@@ -2303,7 +2303,7 @@ dplane_ctx_get_nhe_nh_grp(const struct zebra_dplane_ctx *ctx) + return ctx->u.rinfo.nhe.nh_grp; + } + +-uint8_t dplane_ctx_get_nhe_nh_grp_count(const struct zebra_dplane_ctx *ctx) ++uint16_t dplane_ctx_get_nhe_nh_grp_count(const struct zebra_dplane_ctx *ctx) + { + DPLANE_CTX_VALID(ctx); + return ctx->u.rinfo.nhe.nh_grp_count; +diff --git a/zebra/zebra_dplane.h b/zebra/zebra_dplane.h +index 51c1bff5d2..63130b751f 100644 +--- a/zebra/zebra_dplane.h ++++ b/zebra/zebra_dplane.h +@@ -590,7 +590,7 @@ const struct nexthop_group * + dplane_ctx_get_nhe_ng(const struct zebra_dplane_ctx *ctx); + const struct nh_grp * + dplane_ctx_get_nhe_nh_grp(const struct zebra_dplane_ctx *ctx); +-uint8_t dplane_ctx_get_nhe_nh_grp_count(const struct zebra_dplane_ctx *ctx); ++uint16_t dplane_ctx_get_nhe_nh_grp_count(const struct zebra_dplane_ctx *ctx); + + /* Accessors for LSP information */ + +diff --git a/zebra/zebra_nhg.c b/zebra/zebra_nhg.c +index ed949da917..0c882ee1d9 100644 +--- a/zebra/zebra_nhg.c ++++ b/zebra/zebra_nhg.c +@@ -601,9 +601,8 @@ bool zebra_nhg_hash_id_equal(const void *arg1, const void *arg2) + return nhe1->id == nhe2->id; + } + +-static int zebra_nhg_process_grp(struct nexthop_group *nhg, +- struct nhg_connected_tree_head *depends, +- struct nh_grp *grp, uint8_t count, ++static int zebra_nhg_process_grp(struct nexthop_group *nhg, struct nhg_connected_tree_head *depends, ++ struct nh_grp *grp, uint16_t count, + struct nhg_resilience *resilience) + { + nhg_connected_tree_init(depends); +@@ -958,7 +957,7 @@ static struct nexthop *nhg_ctx_get_nh(struct nhg_ctx *ctx) + return &ctx->u.nh; + } + +-static uint8_t nhg_ctx_get_count(const struct nhg_ctx *ctx) ++static uint16_t nhg_ctx_get_count(const struct nhg_ctx *ctx) + { + return ctx->count; + } +@@ -1004,9 +1003,8 @@ done: + XFREE(MTYPE_NHG_CTX, *ctx); + } + +-static struct nhg_ctx *nhg_ctx_init(uint32_t id, struct nexthop *nh, +- struct nh_grp *grp, vrf_id_t vrf_id, +- afi_t afi, int type, uint8_t count, ++static struct nhg_ctx *nhg_ctx_init(uint32_t id, struct nexthop *nh, struct nh_grp *grp, ++ vrf_id_t vrf_id, afi_t afi, int type, uint16_t count, + struct nhg_resilience *resilience) + { + struct nhg_ctx *ctx = NULL; +@@ -1155,7 +1153,7 @@ static int nhg_ctx_process_new(struct nhg_ctx *ctx) + struct nhg_hash_entry *nhe = NULL; + + uint32_t id = nhg_ctx_get_id(ctx); +- uint8_t count = nhg_ctx_get_count(ctx); ++ uint16_t count = nhg_ctx_get_count(ctx); + vrf_id_t vrf_id = nhg_ctx_get_vrf_id(ctx); + int type = nhg_ctx_get_type(ctx); + afi_t afi = nhg_ctx_get_afi(ctx); +@@ -1307,9 +1305,9 @@ int nhg_ctx_process(struct nhg_ctx *ctx) + } + + /* Kernel-side, you either get a single new nexthop or a array of ID's */ +-int zebra_nhg_kernel_find(uint32_t id, struct nexthop *nh, struct nh_grp *grp, +- uint8_t count, vrf_id_t vrf_id, afi_t afi, int type, +- int startup, struct nhg_resilience *nhgr) ++int zebra_nhg_kernel_find(uint32_t id, struct nexthop *nh, struct nh_grp *grp, uint16_t count, ++ vrf_id_t vrf_id, afi_t afi, int type, int startup, ++ struct nhg_resilience *nhgr) + { + struct nhg_ctx *ctx = NULL; + +@@ -2973,14 +2971,14 @@ backups_done: + * I'm pretty sure we only allow ONE level of group within group currently. + * But making this recursive just in case that ever changes. + */ +-static uint8_t zebra_nhg_nhe2grp_internal(struct nh_grp *grp, +- uint8_t curr_index, ++static uint16_t zebra_nhg_nhe2grp_internal(struct nh_grp *grp, ++ uint16_t curr_index, + struct nhg_hash_entry *nhe, + int max_num) + { + struct nhg_connected *rb_node_dep = NULL; + struct nhg_hash_entry *depend = NULL; +- uint8_t i = curr_index; ++ uint16_t i = curr_index; + + frr_each(nhg_connected_tree, &nhe->nhg_depends, rb_node_dep) { + bool duplicate = false; +@@ -3069,8 +3067,7 @@ done: + } + + /* Convert a nhe into a group array */ +-uint8_t zebra_nhg_nhe2grp(struct nh_grp *grp, struct nhg_hash_entry *nhe, +- int max_num) ++uint16_t zebra_nhg_nhe2grp(struct nh_grp *grp, struct nhg_hash_entry *nhe, int max_num) + { + /* Call into the recursive function */ + return zebra_nhg_nhe2grp_internal(grp, 0, nhe, max_num); +diff --git a/zebra/zebra_nhg.h b/zebra/zebra_nhg.h +index 4eddecb73d..1f55a4bd96 100644 +--- a/zebra/zebra_nhg.h ++++ b/zebra/zebra_nhg.h +@@ -204,7 +204,7 @@ struct nhg_ctx { + int type; + + /* If its a group array, how many? */ +- uint8_t count; ++ uint16_t count; + + /* Its either a single nexthop or an array of ID's */ + union { +@@ -290,10 +290,8 @@ extern int nhg_ctx_process(struct nhg_ctx *ctx); + void nhg_ctx_free(struct nhg_ctx **ctx); + + /* Find via kernel nh creation */ +-extern int zebra_nhg_kernel_find(uint32_t id, struct nexthop *nh, +- struct nh_grp *grp, uint8_t count, +- vrf_id_t vrf_id, afi_t afi, int type, +- int startup, ++extern int zebra_nhg_kernel_find(uint32_t id, struct nexthop *nh, struct nh_grp *grp, ++ uint16_t count, vrf_id_t vrf_id, afi_t afi, int type, int startup, + struct nhg_resilience *resilience); + /* Del via kernel */ + extern int zebra_nhg_kernel_del(uint32_t id, vrf_id_t vrf_id); +@@ -352,8 +350,7 @@ extern void zebra_nhg_increment_ref(struct nhg_hash_entry *nhe); + extern void zebra_nhg_check_valid(struct nhg_hash_entry *nhe); + + /* Convert nhe depends to a grp context that can be passed around safely */ +-extern uint8_t zebra_nhg_nhe2grp(struct nh_grp *grp, struct nhg_hash_entry *nhe, +- int size); ++extern uint16_t zebra_nhg_nhe2grp(struct nh_grp *grp, struct nhg_hash_entry *nhe, int size); + + /* Dataplane install/uninstall */ + extern void zebra_nhg_install_kernel(struct nhg_hash_entry *nhe); +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0071-zebra-Only-notify-dplane-work-pthread-when-needed.patch b/src/sonic-frr/patch/0071-zebra-Only-notify-dplane-work-pthread-when-needed.patch new file mode 100644 index 000000000000..7373dee5eeea --- /dev/null +++ b/src/sonic-frr/patch/0071-zebra-Only-notify-dplane-work-pthread-when-needed.patch @@ -0,0 +1,112 @@ +From ccfaabcd8a9d519c22d43dbcf8f31be2f378a4bd Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 10 Oct 2024 16:00:08 -0400 +Subject: [PATCH 1/3] zebra: Only notify dplane work pthread when needed + +The fpm_nl_process function was getting the count +of the total number of ctx's processed. This leads +to after having processed 1 context to always signal +the dataplane that there is work to do. Change the +code to only notify the dplane worker when a context +was actually added to the outgoing context queue. + +Signed-off-by: Donald Sharp +--- + zebra/dplane_fpm_nl.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c +index e9942b7dbd..3e5236b077 100644 +--- a/zebra/dplane_fpm_nl.c ++++ b/zebra/dplane_fpm_nl.c +@@ -1459,7 +1459,7 @@ static void fpm_process_queue(struct event *t) + * until the dataplane thread gets scheduled for new, + * unrelated work. + */ +- if (dplane_provider_out_ctx_queue_len(fnc->prov) > 0) ++ if (processed_contexts) + dplane_provider_work_ready(); + } + +-- +2.43.2 + + +From 527923b651aa6eb6d07136fd0988c6054af65d00 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 10 Oct 2024 20:08:32 -0400 +Subject: [PATCH 2/3] zebra: Slow down fpm_process_queue + +When the fpm_process_queue has run out of space +but has written to the fpm output buffer, schedule +it to wake up immediately, as that the write will go out +pretty much immediately, since it was scheduled first. +If the fpm_process_queue has not written to the output +buffer then delay the processing by 10 milliseconds to +allow a possibly backed up write processing to have a +chance to complete it's work. + +Signed-off-by: Donald Sharp +--- + zebra/dplane_fpm_nl.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c +index 3e5236b077..66079d2dfc 100644 +--- a/zebra/dplane_fpm_nl.c ++++ b/zebra/dplane_fpm_nl.c +@@ -1446,8 +1446,12 @@ static void fpm_process_queue(struct event *t) + + /* Re-schedule if we ran out of buffer space */ + if (no_bufs) { +- event_add_event(fnc->fthread->master, fpm_process_queue, fnc, 0, +- &fnc->t_dequeue); ++ if (processed_contexts) ++ event_add_event(fnc->fthread->master, fpm_process_queue, fnc, 0, ++ &fnc->t_dequeue); ++ else ++ event_add_timer_msec(fnc->fthread->master, fpm_process_queue, fnc, 10, ++ &fnc->t_dequeue); + event_add_timer(fnc->fthread->master, fpm_process_wedged, fnc, + DPLANE_FPM_NL_WEDGIE_TIME, &fnc->t_wedged); + } else +-- +2.43.2 + + +From 46cf5903312a1a4c9833f7654779a0e3d903dad3 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Fri, 11 Oct 2024 09:33:35 -0400 +Subject: [PATCH 3/3] fpm: Allow max fpm message size to float based on ecmp + +Currently the max message size is 4k. With a 256 way +ecmp FRR is seeing message sizes that are in the +6k size. There is desire to allow this to increase as +well to 512. Since the multipath size directly effects +how big the message may be when sending the routes ecmp +let's give a bit of headroom for this value when compiling +FRR at greater sizes. Additionally since we know not everyone +is using such large ecmp, allow them to build as appropriate +for their use cases. + +Signed-off-by: Donald Sharp +--- + fpm/fpm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fpm/fpm.h b/fpm/fpm.h +index 70c0df5715..9003c643b0 100644 +--- a/fpm/fpm.h ++++ b/fpm/fpm.h +@@ -65,7 +65,7 @@ + /* + * Largest message that can be sent to or received from the FPM. + */ +-#define FPM_MAX_MSG_LEN 4096 ++#define FPM_MAX_MSG_LEN MAX(MULTIPATH_NUM * 32, 4096) + + #ifdef __SUNPRO_C + #pragma pack(1) +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0072-Fix-up-improper-handling-of-nexthops-for-nexthop-tra.patch b/src/sonic-frr/patch/0072-Fix-up-improper-handling-of-nexthops-for-nexthop-tra.patch new file mode 100644 index 000000000000..1bac1eaf5a19 --- /dev/null +++ b/src/sonic-frr/patch/0072-Fix-up-improper-handling-of-nexthops-for-nexthop-tra.patch @@ -0,0 +1,149 @@ +From 9b0f6cee3bbb8b0a42faf72015ac7e7705f8bfbd Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Fri, 11 Oct 2024 14:01:10 -0400 +Subject: [PATCH] *: Fix up improper handling of nexthops for nexthop tracking + +Currently FRR needs to send a uint16_t value for the number +of nexthops as well it needs the ability to properly decode +all of this. Find and handle all the places that this happens. + +Signed-off-by: Donald Sharp + +diff --git a/lib/zclient.c b/lib/zclient.c +index 64515c7544..a89a02997a 100644 +--- a/lib/zclient.c ++++ b/lib/zclient.c +@@ -2322,7 +2322,7 @@ static bool zapi_nexthop_update_decode(struct stream *s, struct prefix *match, + STREAM_GETW(s, nhr->instance); + STREAM_GETC(s, nhr->distance); + STREAM_GETL(s, nhr->metric); +- STREAM_GETC(s, nhr->nexthop_num); ++ STREAM_GETW(s, nhr->nexthop_num); + + for (i = 0; i < nhr->nexthop_num; i++) { + if (zapi_nexthop_decode(s, &(nhr->nexthops[i]), 0, 0) != 0) +diff --git a/pimd/pim_zlookup.c b/pimd/pim_zlookup.c +index c19119fa47..5d344f1f66 100644 +--- a/pimd/pim_zlookup.c ++++ b/pimd/pim_zlookup.c +@@ -193,7 +193,7 @@ static int zclient_read_nexthop(struct pim_instance *pim, + + distance = stream_getc(s); + metric = stream_getl(s); +- nexthop_num = stream_getc(s); ++ nexthop_num = stream_getw(s); + + if (nexthop_num < 1 || nexthop_num > router->multipath) { + if (PIM_DEBUG_PIM_NHT_DETAIL) +diff --git a/staticd/static_zebra.c b/staticd/static_zebra.c +index c4efc14a9d..f1b793524c 100644 +--- a/staticd/static_zebra.c ++++ b/staticd/static_zebra.c +@@ -43,7 +43,7 @@ struct static_nht_data { + vrf_id_t nh_vrf_id; + + uint32_t refcount; +- uint8_t nh_num; ++ uint16_t nh_num; + bool registered; + }; + +diff --git a/zebra/zapi_msg.c b/zebra/zapi_msg.c +index 4ee3e18e5b..7e3b571844 100644 +--- a/zebra/zapi_msg.c ++++ b/zebra/zapi_msg.c +@@ -647,7 +647,7 @@ static int zsend_nexthop_lookup_mrib(struct zserv *client, struct ipaddr *addr, + { + struct stream *s; + unsigned long nump; +- uint8_t num; ++ uint16_t num; + struct nexthop *nexthop; + + /* Get output stream. */ +@@ -667,7 +667,7 @@ static int zsend_nexthop_lookup_mrib(struct zserv *client, struct ipaddr *addr, + /* remember position for nexthop_num */ + nump = stream_get_endp(s); + /* reserve room for nexthop_num */ +- stream_putc(s, 0); ++ stream_putw(s, 0); + nhg = rib_get_fib_nhg(re); + for (ALL_NEXTHOPS_PTR(nhg, nexthop)) { + if (rnh_nexthop_valid(re, nexthop)) +@@ -675,11 +675,11 @@ static int zsend_nexthop_lookup_mrib(struct zserv *client, struct ipaddr *addr, + } + + /* store nexthop_num */ +- stream_putc_at(s, nump, num); ++ stream_putw_at(s, nump, num); + } else { + stream_putc(s, 0); /* distance */ + stream_putl(s, 0); /* metric */ +- stream_putc(s, 0); /* nexthop_num */ ++ stream_putw(s, 0); /* nexthop_num */ + } + + stream_putw_at(s, 0, stream_get_endp(s)); +diff --git a/zebra/zebra_rnh.c b/zebra/zebra_rnh.c +index b387e9949b..3c64ee9ddf 100644 +--- a/zebra/zebra_rnh.c ++++ b/zebra/zebra_rnh.c +@@ -1141,7 +1141,7 @@ int zebra_send_rnh_update(struct rnh *rnh, struct zserv *client, + struct stream *s = NULL; + struct route_entry *re; + unsigned long nump; +- uint8_t num; ++ uint16_t num; + struct nexthop *nh; + struct route_node *rn; + int ret; +@@ -1212,7 +1212,7 @@ int zebra_send_rnh_update(struct rnh *rnh, struct zserv *client, + stream_putl(s, re->metric); + num = 0; + nump = stream_get_endp(s); +- stream_putc(s, 0); ++ stream_putw(s, 0); + + nhg = rib_get_fib_nhg(re); + for (ALL_NEXTHOPS_PTR(nhg, nh)) +@@ -1240,13 +1240,13 @@ int zebra_send_rnh_update(struct rnh *rnh, struct zserv *client, + } + } + +- stream_putc_at(s, nump, num); ++ stream_putw_at(s, nump, num); + } else { + stream_putc(s, 0); // type + stream_putw(s, 0); // instance + stream_putc(s, 0); // distance + stream_putl(s, 0); // metric +- stream_putc(s, 0); // nexthops ++ stream_putw(s, 0); // nexthops + } + stream_putw_at(s, 0, stream_get_endp(s)); + +diff --git a/zebra/zebra_srte.c b/zebra/zebra_srte.c +index c0b83382c4..bb8d4b3b40 100644 +--- a/zebra/zebra_srte.c ++++ b/zebra/zebra_srte.c +@@ -145,7 +145,7 @@ static int zebra_sr_policy_notify_update_client(struct zebra_sr_policy *policy, + stream_putc(s, nhlfe->distance); + stream_putl(s, 0); /* metric - not available */ + nump = stream_get_endp(s); +- stream_putc(s, 0); ++ stream_putw(s, 0); + } + + zapi_nexthop_from_nexthop(&znh, nhlfe->nexthop); +@@ -155,7 +155,7 @@ static int zebra_sr_policy_notify_update_client(struct zebra_sr_policy *policy, + + num++; + } +- stream_putc_at(s, nump, num); ++ stream_putw_at(s, nump, num); + stream_putw_at(s, 0, stream_get_endp(s)); + + client->nh_last_upd_time = monotime(NULL); +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0073-remove-in6addr-cmp.patch b/src/sonic-frr/patch/0073-remove-in6addr-cmp.patch new file mode 100644 index 000000000000..94c54fdfcceb --- /dev/null +++ b/src/sonic-frr/patch/0073-remove-in6addr-cmp.patch @@ -0,0 +1,150 @@ +From 716d115e843151b30c3ab27e90182efcff44ecac Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Wed, 30 Oct 2024 10:15:42 -0400 +Subject: [PATCH 1/3] lib: Replace usage of in6addr_cmp with memcmp + +memcmp will return and act exactly the same as in6addr_cmp +but it does it significantly faster than how in6addr_cmp +does it. Let this be a lesson for implementing something +that is a duplicate of what is provided by the c library. + +Signed-off-by: Donald Sharp +--- + lib/sockunion.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/lib/sockunion.c b/lib/sockunion.c +index c37ab1d6dd..1b08a06e93 100644 +--- a/lib/sockunion.c ++++ b/lib/sockunion.c +@@ -621,7 +621,8 @@ int sockunion_cmp(const union sockunion *su1, const union sockunion *su2) + return -1; + } + if (su1->sa.sa_family == AF_INET6) +- return in6addr_cmp(&su1->sin6.sin6_addr, &su2->sin6.sin6_addr); ++ return IN6_ADDR_CMP(&su1->sin6.sin6_addr, &su2->sin6.sin6_addr); ++ + return 0; + } + +-- +2.43.2 + + +From bdce78eb3e909e788efa6a28151464b0d65b768a Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Wed, 30 Oct 2024 10:41:08 -0400 +Subject: [PATCH 2/3] lib, tests: Remove in6addr_cmp function from the system + +This function should just be memcmp. + +Signed-off-by: Donald Sharp +--- + lib/sockunion.c | 17 ----------------- + lib/sockunion.h | 1 - + tests/lib/test_frrlua.c | 2 +- + 3 files changed, 1 insertion(+), 19 deletions(-) + +diff --git a/lib/sockunion.c b/lib/sockunion.c +index 1b08a06e93..4a7c190953 100644 +--- a/lib/sockunion.c ++++ b/lib/sockunion.c +@@ -588,23 +588,6 @@ static void __attribute__((unused)) sockunion_print(const union sockunion *su) + } + } + +-int in6addr_cmp(const struct in6_addr *addr1, const struct in6_addr *addr2) +-{ +- unsigned int i; +- const uint8_t *p1, *p2; +- +- p1 = (const uint8_t *)addr1; +- p2 = (const uint8_t *)addr2; +- +- for (i = 0; i < sizeof(struct in6_addr); i++) { +- if (p1[i] > p2[i]) +- return 1; +- else if (p1[i] < p2[i]) +- return -1; +- } +- return 0; +-} +- + int sockunion_cmp(const union sockunion *su1, const union sockunion *su2) + { + if (su1->sa.sa_family > su2->sa.sa_family) +diff --git a/lib/sockunion.h b/lib/sockunion.h +index 146651225c..5152e70a23 100644 +--- a/lib/sockunion.h ++++ b/lib/sockunion.h +@@ -93,7 +93,6 @@ enum connect_result { connect_error, connect_success, connect_in_progress }; + /* Prototypes. */ + extern int str2sockunion(const char *, union sockunion *); + extern const char *sockunion2str(const union sockunion *, char *, size_t); +-int in6addr_cmp(const struct in6_addr *addr1, const struct in6_addr *addr2); + extern int sockunion_cmp(const union sockunion *, const union sockunion *); + extern int sockunion_same(const union sockunion *, const union sockunion *); + extern unsigned int sockunion_hash(const union sockunion *); +diff --git a/tests/lib/test_frrlua.c b/tests/lib/test_frrlua.c +index 2760a273bd..b9cdfd088f 100644 +--- a/tests/lib/test_frrlua.c ++++ b/tests/lib/test_frrlua.c +@@ -88,7 +88,7 @@ static void test_encode_decode(void) + + lua_pushin6addr(L, &in6addr_a); + lua_decode_in6addr(L, -1, &in6addr_a); +- assert(in6addr_cmp(&in6addr_a, &in6addr_b) == 0); ++ assert(memcmp(&in6addr_a, &in6addr_b, sizeof(struct in6_addr)) == 0); + assert(lua_gettop(L) == 0); + + union sockunion su_a, su_b; +-- +2.43.2 + + +From bf0d3bafd4b29b6f0a9bafc349b7d90dc4310a92 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 31 Oct 2024 10:06:26 -0400 +Subject: [PATCH 3/3] lib: In sockunion.c convert v6 memcmp's to IPV6_ADDR_CMP + +Signed-off-by: Donald Sharp +--- + lib/sockunion.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/lib/sockunion.c b/lib/sockunion.c +index 4a7c190953..7acb5004db 100644 +--- a/lib/sockunion.c ++++ b/lib/sockunion.c +@@ -403,8 +403,7 @@ int sockunion_same(const union sockunion *su1, const union sockunion *su2) + sizeof(struct in_addr)); + break; + case AF_INET6: +- ret = memcmp(&su1->sin6.sin6_addr, &su2->sin6.sin6_addr, +- sizeof(struct in6_addr)); ++ ret = IPV6_ADDR_CMP(&su1->sin6.sin6_addr, &su2->sin6.sin6_addr); + if ((ret == 0) && IN6_IS_ADDR_LINKLOCAL(&su1->sin6.sin6_addr)) { + /* compare interface indices */ + if (su1->sin6.sin6_scope_id && su2->sin6.sin6_scope_id) +@@ -604,7 +603,7 @@ int sockunion_cmp(const union sockunion *su1, const union sockunion *su2) + return -1; + } + if (su1->sa.sa_family == AF_INET6) +- return IN6_ADDR_CMP(&su1->sin6.sin6_addr, &su2->sin6.sin6_addr); ++ return IPV6_ADDR_CMP(&su1->sin6.sin6_addr, &su2->sin6.sin6_addr); + + return 0; + } +@@ -711,8 +710,7 @@ int sockunion_is_null(const union sockunion *su) + case AF_INET: + return (su->sin.sin_addr.s_addr == 0); + case AF_INET6: +- return !memcmp(su->sin6.sin6_addr.s6_addr, null_s6_addr, +- sizeof(null_s6_addr)); ++ return !IPV6_ADDR_CMP(su->sin6.sin6_addr.s6_addr, null_s6_addr); + default: + return 0; + } +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0074-bgp-best-port-reordering.patch b/src/sonic-frr/patch/0074-bgp-best-port-reordering.patch new file mode 100644 index 000000000000..9dd3f6c6c919 --- /dev/null +++ b/src/sonic-frr/patch/0074-bgp-best-port-reordering.patch @@ -0,0 +1,2199 @@ +From b6345ad2f34f78470f8839a21eba6e234f011cfa Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 11 Mar 2024 15:26:14 -0400 +Subject: [PATCH 01/11] tests: Explicitly call out bgp timers for bgp_evpn_mh + test + +Signed-off-by: Donald Sharp +--- + tests/topotests/bgp_evpn_mh/leaf1/evpn.conf | 1 + + tests/topotests/bgp_evpn_mh/leaf2/evpn.conf | 1 + + tests/topotests/bgp_evpn_mh/spine1/evpn.conf | 1 + + tests/topotests/bgp_evpn_mh/spine2/evpn.conf | 1 + + tests/topotests/bgp_evpn_mh/torm11/evpn.conf | 1 + + tests/topotests/bgp_evpn_mh/torm12/evpn.conf | 1 + + tests/topotests/bgp_evpn_mh/torm21/evpn.conf | 1 + + tests/topotests/bgp_evpn_mh/torm22/evpn.conf | 1 + + 8 files changed, 8 insertions(+) + +diff --git a/tests/topotests/bgp_evpn_mh/leaf1/evpn.conf b/tests/topotests/bgp_evpn_mh/leaf1/evpn.conf +index 33b6d08aba..d246517898 100644 +--- a/tests/topotests/bgp_evpn_mh/leaf1/evpn.conf ++++ b/tests/topotests/bgp_evpn_mh/leaf1/evpn.conf +@@ -1,6 +1,7 @@ + frr defaults datacenter + ! + router bgp 65101 ++ timers bgp 3 10 + bgp router-id 192.168.100.13 + no bgp ebgp-requires-policy + neighbor 192.168.50.1 remote-as external +diff --git a/tests/topotests/bgp_evpn_mh/leaf2/evpn.conf b/tests/topotests/bgp_evpn_mh/leaf2/evpn.conf +index 428998b0fe..6855a436d4 100644 +--- a/tests/topotests/bgp_evpn_mh/leaf2/evpn.conf ++++ b/tests/topotests/bgp_evpn_mh/leaf2/evpn.conf +@@ -1,6 +1,7 @@ + frr defaults datacenter + ! + router bgp 65101 ++ timers bgp 3 10 + bgp router-id 192.168.100.14 + no bgp ebgp-requires-policy + neighbor 192.168.61.1 remote-as external +diff --git a/tests/topotests/bgp_evpn_mh/spine1/evpn.conf b/tests/topotests/bgp_evpn_mh/spine1/evpn.conf +index b9fce46ea4..7d6fef699d 100644 +--- a/tests/topotests/bgp_evpn_mh/spine1/evpn.conf ++++ b/tests/topotests/bgp_evpn_mh/spine1/evpn.conf +@@ -1,6 +1,7 @@ + frr defaults datacenter + ! + router bgp 65001 ++ timers bgp 3 10 + bgp router-id 192.168.100.13 + no bgp ebgp-requires-policy + neighbor 192.168.50.2 remote-as external +diff --git a/tests/topotests/bgp_evpn_mh/spine2/evpn.conf b/tests/topotests/bgp_evpn_mh/spine2/evpn.conf +index 1430e10b68..c651ada686 100644 +--- a/tests/topotests/bgp_evpn_mh/spine2/evpn.conf ++++ b/tests/topotests/bgp_evpn_mh/spine2/evpn.conf +@@ -1,6 +1,7 @@ + frr defaults datacenter + ! + router bgp 65001 ++ timers bgp 3 10 + bgp router-id 192.168.100.14 + no bgp ebgp-requires-policy + neighbor 192.168.60.2 remote-as external +diff --git a/tests/topotests/bgp_evpn_mh/torm11/evpn.conf b/tests/topotests/bgp_evpn_mh/torm11/evpn.conf +index 2c1c695a18..62b7ec5855 100644 +--- a/tests/topotests/bgp_evpn_mh/torm11/evpn.conf ++++ b/tests/topotests/bgp_evpn_mh/torm11/evpn.conf +@@ -7,6 +7,7 @@ frr defaults datacenter + ! + ! + router bgp 65002 ++ timers bgp 3 10 + bgp router-id 192.168.100.15 + no bgp ebgp-requires-policy + neighbor 192.168.1.1 remote-as external +diff --git a/tests/topotests/bgp_evpn_mh/torm12/evpn.conf b/tests/topotests/bgp_evpn_mh/torm12/evpn.conf +index 8b0ce1d98f..3ceb974c47 100644 +--- a/tests/topotests/bgp_evpn_mh/torm12/evpn.conf ++++ b/tests/topotests/bgp_evpn_mh/torm12/evpn.conf +@@ -7,6 +7,7 @@ frr defaults datacenter + ! + ! + router bgp 65003 ++ timers bgp 3 10 + bgp router-id 192.168.100.16 + no bgp ebgp-requires-policy + neighbor 192.168.2.1 remote-as external +diff --git a/tests/topotests/bgp_evpn_mh/torm21/evpn.conf b/tests/topotests/bgp_evpn_mh/torm21/evpn.conf +index 5247dc1ebd..ecaf85ddb7 100644 +--- a/tests/topotests/bgp_evpn_mh/torm21/evpn.conf ++++ b/tests/topotests/bgp_evpn_mh/torm21/evpn.conf +@@ -7,6 +7,7 @@ frr defaults datacenter + ! + ! + router bgp 65004 ++ timers bgp 3 10 + bgp router-id 192.168.100.17 + no bgp ebgp-requires-policy + neighbor 192.168.3.1 remote-as external +diff --git a/tests/topotests/bgp_evpn_mh/torm22/evpn.conf b/tests/topotests/bgp_evpn_mh/torm22/evpn.conf +index ec56360176..c7e152498c 100644 +--- a/tests/topotests/bgp_evpn_mh/torm22/evpn.conf ++++ b/tests/topotests/bgp_evpn_mh/torm22/evpn.conf +@@ -6,6 +6,7 @@ frr defaults datacenter + ! debug bgp zebra + ! + router bgp 65005 ++ timers bgp 3 10 + bgp router-id 192.168.100.18 + no bgp ebgp-requires-policy + neighbor 192.168.4.1 remote-as external +-- +2.43.2 + + +From da25ab9fa54cad2db494becf0645da8fdad5b414 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 11 Mar 2024 14:40:49 -0400 +Subject: [PATCH 02/11] tests: teste_ospf_rte_calc.py uses bgp add pytest mark + +Signed-off-by: Donald Sharp +--- + tests/topotests/ospf_basic_functionality/test_ospf_rte_calc.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tests/topotests/ospf_basic_functionality/test_ospf_rte_calc.py b/tests/topotests/ospf_basic_functionality/test_ospf_rte_calc.py +index f0950a2db3..45c1325917 100644 +--- a/tests/topotests/ospf_basic_functionality/test_ospf_rte_calc.py ++++ b/tests/topotests/ospf_basic_functionality/test_ospf_rte_calc.py +@@ -49,7 +49,7 @@ from lib.ospf import ( + verify_ospf_interface, + ) + +-pytestmark = [pytest.mark.ospfd, pytest.mark.staticd] ++pytestmark = [pytest.mark.bgpd, pytest.mark.ospfd, pytest.mark.staticd] + + + # Global variables +-- +2.43.2 + + +From fc76785fe670153941b1f320fabc2ca9c1511e33 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 11 Mar 2024 11:22:49 -0400 +Subject: [PATCH 03/11] bgpd: Modify update_evpn_type5_route_entry to include + path_info pointer + +Modify update_evpn_type5_route_entry to return a pointer to the +struct bgp_path_info modified in this function. This code +merely follows the standards used in other bgp_evpn.c code +where the update function returns the pointer to the path +info. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_evpn.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c +index 405e9a84bb..c7f8024f79 100644 +--- a/bgpd/bgp_evpn.c ++++ b/bgpd/bgp_evpn.c +@@ -1584,7 +1584,8 @@ struct bgp_path_info *bgp_evpn_route_get_local_path( + int update_evpn_type5_route_entry(struct bgp *bgp_evpn, + struct bgp *bgp_vrf, afi_t afi, + safi_t safi, struct bgp_dest *dest, +- struct attr *attr, int *route_changed) ++ struct attr *attr, int *route_changed, ++ struct bgp_path_info **entry) + { + struct attr *attr_new = NULL; + struct bgp_path_info *pi = NULL; +@@ -1622,8 +1623,8 @@ int update_evpn_type5_route_entry(struct bgp *bgp_evpn, + + /* add the route entry to route node*/ + bgp_path_info_add(dest, pi); ++ *entry = pi; + } else { +- + tmp_pi = local_pi; + if (!attrhash_cmp(tmp_pi->attr, attr)) { + +@@ -1645,6 +1646,7 @@ int update_evpn_type5_route_entry(struct bgp *bgp_evpn, + tmp_pi->attr = attr_new; + tmp_pi->uptime = monotime(NULL); + } ++ *entry = local_pi; + } + return 0; + } +@@ -1660,6 +1662,7 @@ int update_evpn_type5_route(struct bgp *bgp_vrf, struct prefix_evpn *evp, + struct bgp_dest *dest = NULL; + struct bgp *bgp_evpn = NULL; + int route_changed = 0; ++ struct bgp_path_info *pi = NULL; + + bgp_evpn = bgp_get_evpn(); + if (!bgp_evpn) +@@ -1741,7 +1744,7 @@ int update_evpn_type5_route(struct bgp *bgp_vrf, struct prefix_evpn *evp, + + /* create or update the route entry within the route node */ + update_evpn_type5_route_entry(bgp_evpn, bgp_vrf, afi, safi, dest, &attr, +- &route_changed); ++ &route_changed, &pi); + + /* schedule for processing and unlock node */ + if (route_changed) { +-- +2.43.2 + + +From 6032fb1a37b1d195e67f0e594d762ce048df092e Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Wed, 6 Mar 2024 15:03:31 -0500 +Subject: [PATCH 04/11] bgpd: Fix indentation problem in + bgp_recalculate_afi_safi_bestpaths + +This is seriously indented. Let's make it a bit better. + +Signed-off-by: Donald Sharp +--- + bgpd/bgpd.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index cf52053c4e..b1711848e6 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -1878,17 +1878,19 @@ void bgp_recalculate_afi_safi_bestpaths(struct bgp *bgp, afi_t afi, safi_t safi) + for (dest = bgp_table_top(bgp->rib[afi][safi]); dest; + dest = bgp_route_next(dest)) { + table = bgp_dest_get_bgp_table_info(dest); +- if (table != NULL) { +- /* Special handling for 2-level routing +- * tables. */ +- if (safi == SAFI_MPLS_VPN || safi == SAFI_ENCAP +- || safi == SAFI_EVPN) { +- for (ndest = bgp_table_top(table); ndest; +- ndest = bgp_route_next(ndest)) +- bgp_process(bgp, ndest, afi, safi); +- } else +- bgp_process(bgp, dest, afi, safi); +- } ++ ++ if (!table) ++ continue; ++ ++ /* Special handling for 2-level routing ++ * tables. */ ++ if (safi == SAFI_MPLS_VPN || safi == SAFI_ENCAP ++ || safi == SAFI_EVPN) { ++ for (ndest = bgp_table_top(table); ndest; ++ ndest = bgp_route_next(ndest)) ++ bgp_process(bgp, ndest, afi, safi); ++ } else ++ bgp_process(bgp, dest, afi, safi); + } + } + +-- +2.43.2 + + +From a4a790e18469c46fe55e8bf861da4bc37108e2b7 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Fri, 1 Mar 2024 09:49:30 -0500 +Subject: [PATCH 05/11] bgpd: Add a path_info_flags dumper for bgp + +Add a debug function to allow developers to dump flags +associated with a bgp_path_info in a human readable format. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_route.c | 51 +++++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 48 insertions(+), 3 deletions(-) + +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index e05507c520..c778002dc8 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -114,6 +114,45 @@ static const struct message bgp_pmsi_tnltype_str[] = { + #define VRFID_NONE_STR "-" + #define SOFT_RECONFIG_TASK_MAX_PREFIX 25000 + ++static inline char *bgp_route_dump_path_info_flags(struct bgp_path_info *pi, ++ char *buf, size_t len) ++{ ++ uint32_t flags = pi->flags; ++ ++ if (flags == 0) { ++ snprintfrr(buf, len, "None "); ++ return buf; ++ } ++ ++ snprintfrr(buf, len, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", ++ CHECK_FLAG(flags, BGP_PATH_IGP_CHANGED) ? "IGP Changed " : "", ++ CHECK_FLAG(flags, BGP_PATH_DAMPED) ? "Damped" : "", ++ CHECK_FLAG(flags, BGP_PATH_HISTORY) ? "History " : "", ++ CHECK_FLAG(flags, BGP_PATH_SELECTED) ? "Selected " : "", ++ CHECK_FLAG(flags, BGP_PATH_VALID) ? "Valid " : "", ++ CHECK_FLAG(flags, BGP_PATH_ATTR_CHANGED) ? "Attr Changed " ++ : "", ++ CHECK_FLAG(flags, BGP_PATH_DMED_CHECK) ? "Dmed Check " : "", ++ CHECK_FLAG(flags, BGP_PATH_DMED_SELECTED) ? "Dmed Selected " ++ : "", ++ CHECK_FLAG(flags, BGP_PATH_STALE) ? "Stale " : "", ++ CHECK_FLAG(flags, BGP_PATH_REMOVED) ? "Removed " : "", ++ CHECK_FLAG(flags, BGP_PATH_COUNTED) ? "Counted " : "", ++ CHECK_FLAG(flags, BGP_PATH_MULTIPATH) ? "Mpath " : "", ++ CHECK_FLAG(flags, BGP_PATH_MULTIPATH_CHG) ? "Mpath Chg " : "", ++ CHECK_FLAG(flags, BGP_PATH_RIB_ATTR_CHG) ? "Rib Chg " : "", ++ CHECK_FLAG(flags, BGP_PATH_ANNC_NH_SELF) ? "NH Self " : "", ++ CHECK_FLAG(flags, BGP_PATH_LINK_BW_CHG) ? "LinkBW Chg " : "", ++ CHECK_FLAG(flags, BGP_PATH_ACCEPT_OWN) ? "Accept Own " : "", ++ CHECK_FLAG(flags, BGP_PATH_MPLSVPN_LABEL_NH) ? "MPLS Label " ++ : "", ++ CHECK_FLAG(flags, BGP_PATH_MPLSVPN_NH_LABEL_BIND) ++ ? "MPLS Label Bind " ++ : ""); ++ ++ return buf; ++} ++ + DEFINE_HOOK(bgp_process, + (struct bgp * bgp, afi_t afi, safi_t safi, struct bgp_dest *bn, + struct peer *peer, bool withdraw), +@@ -683,12 +722,18 @@ int bgp_path_info_cmp(struct bgp *bgp, struct bgp_path_info *new, + } + + if (debug) { ++ char buf1[256], buf2[256]; ++ + bpi_ultimate = bgp_get_imported_bpi_ultimate(exist); + bgp_path_info_path_with_addpath_rx_str(bpi_ultimate, exist_buf, + sizeof(exist_buf)); +- zlog_debug("%s(%s): Comparing %s flags 0x%x with %s flags 0x%x", +- pfx_buf, bgp->name_pretty, new_buf, new->flags, +- exist_buf, exist->flags); ++ zlog_debug("%s(%s): Comparing %s flags %s with %s flags %s", ++ pfx_buf, bgp->name_pretty, new_buf, ++ bgp_route_dump_path_info_flags(new, buf1, ++ sizeof(buf1)), ++ exist_buf, ++ bgp_route_dump_path_info_flags(exist, buf2, ++ sizeof(buf2))); + } + + newattr = new->attr; +-- +2.43.2 + + +From eeea1554360868aa5efec3ad9cd9e5d7a13621aa Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Fri, 1 Mar 2024 10:01:35 -0500 +Subject: [PATCH 06/11] bgpd: Add BGP_PATH_UNSORTED for future commits + +Add a new flag BGP_PATH_UNSORTED to keep track +of sorted -vs- unsorted path_info's. Add some +ability to the system to understand when that +flag is set. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_route.c | 36 ++++++++++++------- + bgpd/bgp_route.h | 3 +- + .../r1/show_bgp_ipv4-post4.1.ref | 2 +- + .../r1/show_bgp_ipv4-post5.0.ref | 2 +- + .../r1/show_bgp_ipv4-post6.1.ref | 2 +- + .../all_protocol_startup/r1/show_bgp_ipv4.ref | 2 +- + .../r1/show_bgp_ipv6-post4.1.ref | 2 +- + .../all_protocol_startup/r1/show_bgp_ipv6.ref | 2 +- + .../r1/show_bgp_ipv6_post6.1.ref | 2 +- + 9 files changed, 32 insertions(+), 21 deletions(-) + +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index c778002dc8..a9bbcaf37e 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -124,7 +124,7 @@ static inline char *bgp_route_dump_path_info_flags(struct bgp_path_info *pi, + return buf; + } + +- snprintfrr(buf, len, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", ++ snprintfrr(buf, len, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + CHECK_FLAG(flags, BGP_PATH_IGP_CHANGED) ? "IGP Changed " : "", + CHECK_FLAG(flags, BGP_PATH_DAMPED) ? "Damped" : "", + CHECK_FLAG(flags, BGP_PATH_HISTORY) ? "History " : "", +@@ -148,7 +148,8 @@ static inline char *bgp_route_dump_path_info_flags(struct bgp_path_info *pi, + : "", + CHECK_FLAG(flags, BGP_PATH_MPLSVPN_NH_LABEL_BIND) + ? "MPLS Label Bind " +- : ""); ++ : "", ++ CHECK_FLAG(flags, BGP_PATH_UNSORTED) ? "Unsorted " : ""); + + return buf; + } +@@ -9021,6 +9022,9 @@ static void route_vty_short_status_out(struct vty *vty, + if (path->extra && bgp_path_suppressed(path)) + json_object_boolean_true_add(json_path, "suppressed"); + ++ if (CHECK_FLAG(path->flags, BGP_PATH_UNSORTED)) ++ json_object_boolean_true_add(json_path, "unsorted"); ++ + if (CHECK_FLAG(path->flags, BGP_PATH_VALID) + && !CHECK_FLAG(path->flags, BGP_PATH_HISTORY)) + json_object_boolean_true_add(json_path, "valid"); +@@ -9083,6 +9087,8 @@ static void route_vty_short_status_out(struct vty *vty, + /* Selected */ + if (CHECK_FLAG(path->flags, BGP_PATH_HISTORY)) + vty_out(vty, "h"); ++ else if (CHECK_FLAG(path->flags, BGP_PATH_UNSORTED)) ++ vty_out(vty, "u"); + else if (CHECK_FLAG(path->flags, BGP_PATH_DAMPED)) + vty_out(vty, "d"); + else if (CHECK_FLAG(path->flags, BGP_PATH_SELECTED)) +@@ -13723,21 +13729,23 @@ enum bgp_pcounts { + PCOUNT_COUNTED, + PCOUNT_BPATH_SELECTED, + PCOUNT_PFCNT, /* the figure we display to users */ ++ PCOUNT_UNSORTED, + PCOUNT_MAX, + }; + + static const char *const pcount_strs[] = { +- [PCOUNT_ADJ_IN] = "Adj-in", +- [PCOUNT_DAMPED] = "Damped", +- [PCOUNT_REMOVED] = "Removed", +- [PCOUNT_HISTORY] = "History", +- [PCOUNT_STALE] = "Stale", +- [PCOUNT_VALID] = "Valid", +- [PCOUNT_ALL] = "All RIB", +- [PCOUNT_COUNTED] = "PfxCt counted", +- [PCOUNT_BPATH_SELECTED] = "PfxCt Best Selected", +- [PCOUNT_PFCNT] = "Useable", +- [PCOUNT_MAX] = NULL, ++ [PCOUNT_ADJ_IN] = "Adj-in", ++ [PCOUNT_DAMPED] = "Damped", ++ [PCOUNT_REMOVED] = "Removed", ++ [PCOUNT_HISTORY] = "History", ++ [PCOUNT_STALE] = "Stale", ++ [PCOUNT_VALID] = "Valid", ++ [PCOUNT_ALL] = "All RIB", ++ [PCOUNT_COUNTED] = "PfxCt counted", ++ [PCOUNT_BPATH_SELECTED] = "PfxCt Best Selected", ++ [PCOUNT_PFCNT] = "Useable", ++ [PCOUNT_UNSORTED] = "Unsorted", ++ [PCOUNT_MAX] = NULL, + }; + + struct peer_pcounts { +@@ -13778,6 +13786,8 @@ static void bgp_peer_count_proc(struct bgp_dest *rn, struct peer_pcounts *pc) + pc->count[PCOUNT_PFCNT]++; + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED)) + pc->count[PCOUNT_BPATH_SELECTED]++; ++ if (CHECK_FLAG(pi->flags, BGP_PATH_UNSORTED)) ++ pc->count[PCOUNT_UNSORTED]++; + + if (CHECK_FLAG(pi->flags, BGP_PATH_COUNTED)) { + pc->count[PCOUNT_COUNTED]++; +diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h +index 2929c75df7..5781eb70f2 100644 +--- a/bgpd/bgp_route.h ++++ b/bgpd/bgp_route.h +@@ -59,7 +59,7 @@ enum bgp_show_adj_route_type { + + #define BGP_SHOW_SCODE_HEADER \ + "Status codes: s suppressed, d damped, " \ +- "h history, * valid, > best, = multipath,\n" \ ++ "h history, u unsorted, * valid, > best, = multipath,\n" \ + " i internal, r RIB-failure, S Stale, R Removed\n" + #define BGP_SHOW_OCODE_HEADER \ + "Origin codes: i - IGP, e - EGP, ? - incomplete\n" +@@ -327,6 +327,7 @@ struct bgp_path_info { + #define BGP_PATH_ACCEPT_OWN (1 << 16) + #define BGP_PATH_MPLSVPN_LABEL_NH (1 << 17) + #define BGP_PATH_MPLSVPN_NH_LABEL_BIND (1 << 18) ++#define BGP_PATH_UNSORTED (1 << 19) + + /* BGP route type. This can be static, RIP, OSPF, BGP etc. */ + uint8_t type; +diff --git a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post4.1.ref b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post4.1.ref +index b2e8de5ce1..fca6cbed82 100644 +--- a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post4.1.ref ++++ b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post4.1.ref +@@ -1,5 +1,5 @@ + BGP table version is 1, local router ID is 192.168.0.1, vrf id 0 +-Status codes: s suppressed, d damped, h history, * valid, > best, = multipath, ++Status codes: s suppressed, d damped, h history, u unsorted, * valid, > best, = multipath, + i internal, r RIB-failure, S Stale, R Removed + Nexthop codes: @NNN nexthop's vrf id, < announce-nh-self + Origin codes: i - IGP, e - EGP, ? - incomplete +diff --git a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post5.0.ref b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post5.0.ref +index 7bee704182..bb10828815 100644 +--- a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post5.0.ref ++++ b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post5.0.ref +@@ -1,5 +1,5 @@ + BGP table version is 1, local router ID is 192.168.0.1, vrf id 0 +-Status codes: s suppressed, d damped, h history, * valid, > best, = multipath, ++Status codes: s suppressed, d damped, h history, u unsorted, * valid, > best, = multipath, + i internal, r RIB-failure, S Stale, R Removed + Nexthop codes: @NNN nexthop's vrf id, < announce-nh-self + Origin codes: i - IGP, e - EGP, ? - incomplete +diff --git a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post6.1.ref b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post6.1.ref +index 31071e760d..04ad86fbb0 100644 +--- a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post6.1.ref ++++ b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4-post6.1.ref +@@ -1,6 +1,6 @@ + BGP table version is 1, local router ID is 192.168.0.1, vrf id 0 + Default local pref 100, local AS 100 +-Status codes: s suppressed, d damped, h history, * valid, > best, = multipath, ++Status codes: s suppressed, d damped, h history, u unsorted, * valid, > best, = multipath, + i internal, r RIB-failure, S Stale, R Removed + Nexthop codes: @NNN nexthop's vrf id, < announce-nh-self + Origin codes: i - IGP, e - EGP, ? - incomplete +diff --git a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4.ref b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4.ref +index 53c4793bf4..ca95a1671b 100644 +--- a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4.ref ++++ b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv4.ref +@@ -1,5 +1,5 @@ + BGP table version is 1, local router ID is 192.168.0.1 +-Status codes: s suppressed, d damped, h history, * valid, > best, = multipath, ++Status codes: s suppressed, d damped, h history, u unsorted, * valid, > best, = multipath, + i internal, r RIB-failure, S Stale, R Removed + Origin codes: i - IGP, e - EGP, ? - incomplete + +diff --git a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6-post4.1.ref b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6-post4.1.ref +index fe3f0720d8..f1b09264b2 100644 +--- a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6-post4.1.ref ++++ b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6-post4.1.ref +@@ -1,5 +1,5 @@ + BGP table version is 1, local router ID is 192.168.0.1, vrf id 0 +-Status codes: s suppressed, d damped, h history, * valid, > best, = multipath, ++Status codes: s suppressed, d damped, h history, u unsorted, * valid, > best, = multipath, + i internal, r RIB-failure, S Stale, R Removed + Nexthop codes: @NNN nexthop's vrf id, < announce-nh-self + Origin codes: i - IGP, e - EGP, ? - incomplete +diff --git a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6.ref b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6.ref +index 363b4f5349..278fc2dad2 100644 +--- a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6.ref ++++ b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6.ref +@@ -1,5 +1,5 @@ + BGP table version is 1, local router ID is 192.168.0.1 +-Status codes: s suppressed, d damped, h history, * valid, > best, = multipath, ++Status codes: s suppressed, d damped, h history, u unsorted, * valid, > best, = multipath, + i internal, r RIB-failure, S Stale, R Removed + Origin codes: i - IGP, e - EGP, ? - incomplete + +diff --git a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6_post6.1.ref b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6_post6.1.ref +index 8c3229b45d..88f3eac178 100644 +--- a/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6_post6.1.ref ++++ b/tests/topotests/all_protocol_startup/r1/show_bgp_ipv6_post6.1.ref +@@ -1,6 +1,6 @@ + BGP table version is 1, local router ID is 192.168.0.1, vrf id 0 + Default local pref 100, local AS 100 +-Status codes: s suppressed, d damped, h history, * valid, > best, = multipath, ++Status codes: s suppressed, d damped, h history, u unsorted, * valid, > best, = multipath, + i internal, r RIB-failure, S Stale, R Removed + Nexthop codes: @NNN nexthop's vrf id, < announce-nh-self + Origin codes: i - IGP, e - EGP, ? - incomplete +-- +2.43.2 + + +From 2913f3227edd1b85927f763a4f5ca2bb6dcf9899 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 18 Mar 2024 09:33:21 -0400 +Subject: [PATCH 07/11] bgpd: Call bgp_process when bgp_path_info_delete is + called + +bgp_damp.c has an instance of bgp_path_info_delete is called. +Thus setting up the path info for deletion, but since it never +calls bgp_process, it can never be deleted. This means that in +a dampening situation, after a withdrawal the path_info would +stick around. Schedule the path for deletion. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_damp.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/bgpd/bgp_damp.c b/bgpd/bgp_damp.c +index 80425ebe7a..7893884fa6 100644 +--- a/bgpd/bgp_damp.c ++++ b/bgpd/bgp_damp.c +@@ -306,8 +306,10 @@ void bgp_damp_info_free(struct bgp_damp_info *bdi, int withdraw, afi_t afi, + bgp_path_info_unset_flag(bdi->dest, path, + BGP_PATH_HISTORY | BGP_PATH_DAMPED); + +- if (bdi->lastrecord == BGP_RECORD_WITHDRAW && withdraw) ++ if (bdi->lastrecord == BGP_RECORD_WITHDRAW && withdraw) { + bgp_path_info_delete(bdi->dest, path); ++ bgp_process(path->peer->bgp, bdi->dest, afi, safi); ++ } + + XFREE(MTYPE_BGP_DAMP_INFO, bdi); + } +-- +2.43.2 + + +From 70e1a639ac199ed5fb6df2e0a88e565472c4e65a Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 4 Mar 2024 10:41:13 -0500 +Subject: [PATCH 08/11] bgpd: Add pi to bgp_process + +This will allow a consistency of approach to adding/removing +pi's to from the workqueue for processing as well as properly +handling the dest->info pi list more appropriately. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_damp.c | 4 +-- + bgpd/bgp_evpn.c | 24 +++++++-------- + bgpd/bgp_evpn_mh.c | 8 ++--- + bgpd/bgp_label.c | 2 +- + bgpd/bgp_mplsvpn.c | 14 ++++----- + bgpd/bgp_nht.c | 2 +- + bgpd/bgp_route.c | 77 ++++++++++++++++++++-------------------------- + bgpd/bgp_route.h | 3 +- + bgpd/bgp_vty.c | 9 ++++-- + bgpd/bgp_zebra.c | 2 +- + bgpd/bgpd.c | 16 +++++++--- + bgpd/rfapi/rfapi.c | 6 ++-- + 12 files changed, 85 insertions(+), 82 deletions(-) + +diff --git a/bgpd/bgp_damp.c b/bgpd/bgp_damp.c +index 7893884fa6..6b6387b1b5 100644 +--- a/bgpd/bgp_damp.c ++++ b/bgpd/bgp_damp.c +@@ -150,7 +150,7 @@ static void bgp_reuse_timer(struct event *t) + bgp_aggregate_increment( + bgp, bgp_dest_get_prefix(bdi->dest), + bdi->path, bdi->afi, bdi->safi); +- bgp_process(bgp, bdi->dest, bdi->afi, ++ bgp_process(bgp, bdi->dest, bdi->path, bdi->afi, + bdi->safi); + } + +@@ -308,7 +308,7 @@ void bgp_damp_info_free(struct bgp_damp_info *bdi, int withdraw, afi_t afi, + + if (bdi->lastrecord == BGP_RECORD_WITHDRAW && withdraw) { + bgp_path_info_delete(bdi->dest, path); +- bgp_process(path->peer->bgp, bdi->dest, afi, safi); ++ bgp_process(path->peer->bgp, bdi->dest, path, afi, safi); + } + + XFREE(MTYPE_BGP_DAMP_INFO, bdi); +diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c +index c7f8024f79..72dfd000f6 100644 +--- a/bgpd/bgp_evpn.c ++++ b/bgpd/bgp_evpn.c +@@ -1428,7 +1428,7 @@ void evpn_delete_old_local_route(struct bgp *bgp, struct bgpevpn *vpn, + * this table. + */ + if (pi) +- bgp_process(bgp, global_dest, afi, safi); ++ bgp_process(bgp, global_dest, pi, afi, safi); + bgp_dest_unlock_node(global_dest); + } + +@@ -1748,7 +1748,7 @@ int update_evpn_type5_route(struct bgp *bgp_vrf, struct prefix_evpn *evp, + + /* schedule for processing and unlock node */ + if (route_changed) { +- bgp_process(bgp_evpn, dest, afi, safi); ++ bgp_process(bgp_evpn, dest, pi, afi, safi); + bgp_dest_unlock_node(dest); + } + +@@ -2314,7 +2314,7 @@ int update_evpn_route(struct bgp *bgp, struct bgpevpn *vpn, + false /* setup_sync */, NULL /* old_is_sync */); + + /* Schedule for processing and unlock node. */ +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, global_pi, afi, safi); + bgp_dest_unlock_node(dest); + } + +@@ -2374,7 +2374,7 @@ int delete_evpn_type5_route(struct bgp *bgp_vrf, struct prefix_evpn *evp) + + delete_evpn_route_entry(bgp_evpn, afi, safi, dest, &pi); + if (pi) +- bgp_process(bgp_evpn, dest, afi, safi); ++ bgp_process(bgp_evpn, dest, pi, afi, safi); + bgp_dest_unlock_node(dest); + return 0; + } +@@ -2414,7 +2414,7 @@ int delete_evpn_route(struct bgp *bgp, struct bgpevpn *vpn, + * this table. + */ + if (pi) +- bgp_process(bgp, global_dest, afi, safi); ++ bgp_process(bgp, global_dest, pi, afi, safi); + bgp_dest_unlock_node(global_dest); + } + +@@ -2586,7 +2586,7 @@ void bgp_evpn_update_type2_route_entry(struct bgp *bgp, struct bgpevpn *vpn, + NULL /* old_is_sync */); + + /* Schedule for processing and unlock node. */ +- bgp_process(bgp, global_dest, afi, safi); ++ bgp_process(bgp, global_dest, global_pi, afi, safi); + bgp_dest_unlock_node(global_dest); + } + +@@ -2671,7 +2671,7 @@ void delete_global_type2_routes(struct bgp *bgp, struct bgpevpn *vpn) + + delete_evpn_route_entry(bgp, afi, safi, dest, &pi); + if (pi) +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, pi, afi, safi); + } + + /* Unlock RD node. */ +@@ -3140,7 +3140,7 @@ int install_evpn_route_entry_in_vrf(struct bgp *bgp_vrf, + safi); + + /* Perform route selection and update zebra, if required. */ +- bgp_process(bgp_vrf, dest, afi, safi); ++ bgp_process(bgp_vrf, dest, pi, afi, safi); + + /* Process for route leaking. */ + vpn_leak_from_vrf_update(bgp_get_default(), bgp_vrf, pi); +@@ -3502,7 +3502,7 @@ int uninstall_evpn_route_entry_in_vrf(struct bgp *bgp_vrf, + bgp_evpn_path_nh_del(bgp_vrf, pi); + + /* Perform route selection and update zebra, if required. */ +- bgp_process(bgp_vrf, dest, afi, safi); ++ bgp_process(bgp_vrf, dest, pi, afi, safi); + + /* Unlock route node. */ + bgp_dest_unlock_node(dest); +@@ -4475,7 +4475,7 @@ void update_advertise_vni_route(struct bgp *bgp, struct bgpevpn *vpn, + } + + /* Schedule for processing and unlock node. */ +- bgp_process(bgp, global_dest, afi, safi); ++ bgp_process(bgp, global_dest, global_pi, afi, safi); + bgp_dest_unlock_node(global_dest); + } + +@@ -4525,7 +4525,7 @@ void update_advertise_vni_routes(struct bgp *bgp, struct bgpevpn *vpn) + false /* setup_sync */, NULL /* old_is_sync */); + + /* Schedule for processing and unlock node. */ +- bgp_process(bgp, global_dest, afi, safi); ++ bgp_process(bgp, global_dest, pi, afi, safi); + bgp_dest_unlock_node(global_dest); + } + +@@ -4570,7 +4570,7 @@ int delete_withdraw_vni_routes(struct bgp *bgp, struct bgpevpn *vpn) + * this table. + */ + if (pi) +- bgp_process(bgp, global_dest, afi, safi); ++ bgp_process(bgp, global_dest, pi, afi, safi); + bgp_dest_unlock_node(global_dest); + } + +diff --git a/bgpd/bgp_evpn_mh.c b/bgpd/bgp_evpn_mh.c +index f36d109b65..f5df47fbfb 100644 +--- a/bgpd/bgp_evpn_mh.c ++++ b/bgpd/bgp_evpn_mh.c +@@ -512,7 +512,7 @@ static int bgp_evpn_mh_route_delete(struct bgp *bgp, struct bgp_evpn_es *es, + * this table. + */ + if (pi) +- bgp_process(bgp, global_dest, afi, safi); ++ bgp_process(bgp, global_dest, pi, afi, safi); + bgp_dest_unlock_node(global_dest); + } + +@@ -563,7 +563,7 @@ int delete_global_ead_evi_routes(struct bgp *bgp, struct bgpevpn *vpn) + + delete_evpn_route_entry(bgp, afi, safi, bd, &pi); + if (pi) +- bgp_process(bgp, bd, afi, safi); ++ bgp_process(bgp, bd, pi, afi, safi); + } + } + +@@ -687,7 +687,7 @@ static int bgp_evpn_type4_route_update(struct bgp *bgp, + attr_new, &global_pi, &route_changed); + + /* Schedule for processing and unlock node. */ +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, global_pi, afi, safi); + bgp_dest_unlock_node(dest); + } + +@@ -1026,7 +1026,7 @@ static int bgp_evpn_type1_route_update(struct bgp *bgp, struct bgp_evpn_es *es, + attr_new, &global_pi, &route_changed); + + /* Schedule for processing and unlock node. */ +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, global_pi, afi, safi); + bgp_dest_unlock_node(dest); + } + +diff --git a/bgpd/bgp_label.c b/bgpd/bgp_label.c +index 7327ab5182..68104967b2 100644 +--- a/bgpd/bgp_label.c ++++ b/bgpd/bgp_label.c +@@ -74,7 +74,7 @@ int bgp_parse_fec_update(void) + bgp_set_valid_label(&dest->local_label); + } + SET_FLAG(dest->flags, BGP_NODE_LABEL_CHANGED); +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, NULL, afi, safi); + bgp_dest_unlock_node(dest); + return 1; + } +diff --git a/bgpd/bgp_mplsvpn.c b/bgpd/bgp_mplsvpn.c +index ef8ca39e82..a48ef875c4 100644 +--- a/bgpd/bgp_mplsvpn.c ++++ b/bgpd/bgp_mplsvpn.c +@@ -1254,7 +1254,7 @@ leak_update(struct bgp *to_bgp, struct bgp_dest *bn, + + /* Process change. */ + bgp_aggregate_increment(to_bgp, p, bpi, afi, safi); +- bgp_process(to_bgp, bn, afi, safi); ++ bgp_process(to_bgp, bn, bpi, afi, safi); + + if (debug) + zlog_debug("%s: ->%s: %pBD Found route, changed attr", +@@ -1316,7 +1316,7 @@ leak_update(struct bgp *to_bgp, struct bgp_dest *bn, + bgp_aggregate_increment(to_bgp, p, new, afi, safi); + bgp_path_info_add(bn, new); + +- bgp_process(to_bgp, bn, afi, safi); ++ bgp_process(to_bgp, bn, new, afi, safi); + + if (debug) + zlog_debug("%s: ->%s: %pBD: Added new route", __func__, +@@ -2002,7 +2002,7 @@ void vpn_leak_from_vrf_withdraw(struct bgp *to_bgp, /* to */ + + bgp_aggregate_decrement(to_bgp, p, bpi, afi, safi); + bgp_path_info_delete(bn, bpi); +- bgp_process(to_bgp, bn, afi, safi); ++ bgp_process(to_bgp, bn, bpi, afi, safi); + } + bgp_dest_unlock_node(bn); + } +@@ -2058,7 +2058,7 @@ void vpn_leak_from_vrf_withdraw_all(struct bgp *to_bgp, struct bgp *from_bgp, + to_bgp, bgp_dest_get_prefix(bn), + bpi, afi, safi); + bgp_path_info_delete(bn, bpi); +- bgp_process(to_bgp, bn, afi, safi); ++ bgp_process(to_bgp, bn, bpi, afi, safi); + bgp_mplsvpn_path_nh_label_unlink( + bpi->extra->vrfleak->parent); + } +@@ -2543,7 +2543,7 @@ void vpn_leak_to_vrf_withdraw(struct bgp_path_info *path_vpn) + bpi); + bgp_aggregate_decrement(bgp, p, bpi, afi, safi); + bgp_path_info_delete(bn, bpi); +- bgp_process(bgp, bn, afi, safi); ++ bgp_process(bgp, bn, bpi, afi, safi); + } + bgp_dest_unlock_node(bn); + } +@@ -2575,7 +2575,7 @@ void vpn_leak_to_vrf_withdraw_all(struct bgp *to_bgp, afi_t afi) + bgp_dest_get_prefix(bn), + bpi, afi, safi); + bgp_path_info_delete(bn, bpi); +- bgp_process(to_bgp, bn, afi, safi); ++ bgp_process(to_bgp, bn, bpi, afi, safi); + } + } + } +@@ -4226,7 +4226,7 @@ static int bgp_mplsvpn_nh_label_bind_get_local_label_cb(mpls_label_t label, + if (!table) + continue; + SET_FLAG(pi->net->flags, BGP_NODE_LABEL_CHANGED); +- bgp_process(table->bgp, pi->net, table->afi, table->safi); ++ bgp_process(table->bgp, pi->net, pi, table->afi, table->safi); + } + + return 0; +diff --git a/bgpd/bgp_nht.c b/bgpd/bgp_nht.c +index 1614779ea6..97ec7e33b1 100644 +--- a/bgpd/bgp_nht.c ++++ b/bgpd/bgp_nht.c +@@ -1414,7 +1414,7 @@ void evaluate_paths(struct bgp_nexthop_cache *bnc) + } + } + +- bgp_process(bgp_path, dest, afi, safi); ++ bgp_process(bgp_path, dest, path, afi, safi); + } + + if (peer) { +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index a9bbcaf37e..e3350df7b0 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -2892,6 +2892,7 @@ void bgp_best_selection(struct bgp *bgp, struct bgp_dest *dest, + (pi != NULL) && (nextpi = pi->next, 1); pi = nextpi) { + enum bgp_path_selection_reason reason; + ++ UNSET_FLAG(pi->flags, BGP_PATH_UNSORTED); + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED)) + old_select = pi; + +@@ -3748,13 +3749,22 @@ static struct bgp_process_queue *bgp_processq_alloc(struct bgp *bgp) + return pqnode; + } + +-void bgp_process(struct bgp *bgp, struct bgp_dest *dest, afi_t afi, safi_t safi) ++void bgp_process(struct bgp *bgp, struct bgp_dest *dest, ++ struct bgp_path_info *pi, afi_t afi, safi_t safi) + { + #define ARBITRARY_PROCESS_QLEN 10000 + struct work_queue *wq = bgp->process_queue; + struct bgp_process_queue *pqnode; + int pqnode_reuse = 0; + ++ /* ++ * Indicate that *this* pi is in an unsorted ++ * situation, even if the node is already ++ * scheduled. ++ */ ++ if (pi) ++ SET_FLAG(pi->flags, BGP_PATH_UNSORTED); ++ + /* already scheduled for processing? */ + if (CHECK_FLAG(dest->flags, BGP_NODE_PROCESS_SCHEDULED)) + return; +@@ -4003,7 +4013,7 @@ void bgp_rib_remove(struct bgp_dest *dest, struct bgp_path_info *pi, + } + + hook_call(bgp_process, peer->bgp, afi, safi, dest, peer, true); +- bgp_process(peer->bgp, dest, afi, safi); ++ bgp_process(peer->bgp, dest, pi, afi, safi); + } + + static void bgp_rib_withdraw(struct bgp_dest *dest, struct bgp_path_info *pi, +@@ -4603,7 +4613,7 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + != BGP_DAMP_SUPPRESSED) { + bgp_aggregate_increment(bgp, p, pi, afi, + safi); +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, pi, afi, safi); + } + } else /* Duplicate - odd */ + { +@@ -4631,7 +4641,7 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + bgp_path_info_unset_flag( + dest, pi, BGP_PATH_STALE); + bgp_dest_set_defer_flag(dest, false); +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, pi, afi, safi); + } + } + +@@ -4921,7 +4931,7 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + /* Process change. */ + bgp_aggregate_increment(bgp, p, pi, afi, safi); + +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, pi, afi, safi); + bgp_dest_unlock_node(dest); + + if (SAFI_UNICAST == safi +@@ -5066,7 +5076,7 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + hook_call(bgp_process, bgp, afi, safi, dest, peer, false); + + /* Process change. */ +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, new, afi, safi); + + if (SAFI_UNICAST == safi + && (bgp->inst_type == BGP_INSTANCE_TYPE_VRF +@@ -6573,7 +6583,7 @@ void bgp_static_update(struct bgp *bgp, const struct prefix *p, + + /* Process change. */ + bgp_aggregate_increment(bgp, p, pi, afi, safi); +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, pi, afi, safi); + + if (SAFI_MPLS_VPN == safi && + bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT) { +@@ -6631,7 +6641,7 @@ void bgp_static_update(struct bgp *bgp, const struct prefix *p, + bgp_dest_unlock_node(dest); + + /* Process change. */ +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, new, afi, safi); + + if (SAFI_UNICAST == safi && + (bgp->inst_type == BGP_INSTANCE_TYPE_VRF || +@@ -6688,7 +6698,7 @@ void bgp_static_withdraw(struct bgp *bgp, const struct prefix *p, afi_t afi, + bgp_aggregate_decrement(bgp, p, pi, afi, safi); + bgp_unlink_nexthop(pi); + bgp_path_info_delete(dest, pi); +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, pi, afi, safi); + } + + /* Unlock bgp_node_lookup. */ +@@ -7093,7 +7103,7 @@ static void bgp_purge_af_static_redist_routes(struct bgp *bgp, afi_t afi, + safi); + bgp_unlink_nexthop(pi); + bgp_path_info_delete(dest, pi); +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, pi, afi, safi); + } + } + } +@@ -7482,7 +7492,7 @@ static void bgp_aggregate_install( + SET_FLAG(new->flags, BGP_PATH_VALID); + + bgp_path_info_add(dest, new); +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, new, afi, safi); + } else { + uninstall_aggregate_route: + for (pi = orig; pi; pi = pi->next) +@@ -7494,7 +7504,7 @@ static void bgp_aggregate_install( + /* Withdraw static BGP route from routing table. */ + if (pi) { + bgp_path_info_delete(dest, pi); +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, pi, afi, safi); + } + } + +@@ -7580,7 +7590,6 @@ void bgp_aggregate_toggle_suppressed(struct bgp_aggregate *aggregate, + const struct prefix *dest_p; + struct bgp_dest *dest, *top; + struct bgp_path_info *pi; +- bool toggle_suppression; + + /* We've found a different MED we must revert any suppressed routes. */ + top = bgp_node_get(table, p); +@@ -7590,7 +7599,6 @@ void bgp_aggregate_toggle_suppressed(struct bgp_aggregate *aggregate, + if (dest_p->prefixlen <= p->prefixlen) + continue; + +- toggle_suppression = false; + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { + if (BGP_PATH_HOLDDOWN(pi)) + continue; +@@ -7601,17 +7609,14 @@ void bgp_aggregate_toggle_suppressed(struct bgp_aggregate *aggregate, + if (suppress) { + /* Suppress route if not suppressed already. */ + if (aggr_suppress_path(aggregate, pi)) +- toggle_suppression = true; ++ bgp_process(bgp, dest, pi, afi, safi); + continue; + } + + /* Install route if there is no more suppression. */ + if (aggr_unsuppress_path(aggregate, pi)) +- toggle_suppression = true; ++ bgp_process(bgp, dest, pi, afi, safi); + } +- +- if (toggle_suppression) +- bgp_process(bgp, dest, afi, safi); + } + bgp_dest_unlock_node(top); + } +@@ -7670,7 +7675,6 @@ bool bgp_aggregate_route(struct bgp *bgp, const struct prefix *p, afi_t afi, + struct ecommunity *ecommunity = NULL; + struct lcommunity *lcommunity = NULL; + struct bgp_path_info *pi; +- unsigned long match = 0; + uint8_t atomic_aggregate = 0; + + /* If the bgp instance is being deleted or self peer is deleted +@@ -7720,8 +7724,6 @@ bool bgp_aggregate_route(struct bgp *bgp, const struct prefix *p, afi_t afi, + if (!bgp_check_advertise(bgp, dest, safi)) + continue; + +- match = 0; +- + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { + if (BGP_PATH_HOLDDOWN(pi)) + continue; +@@ -7745,7 +7747,7 @@ bool bgp_aggregate_route(struct bgp *bgp, const struct prefix *p, afi_t afi, + if (aggregate->summary_only + && AGGREGATE_MED_VALID(aggregate)) { + if (aggr_suppress_path(aggregate, pi)) +- match++; ++ bgp_process(bgp, dest, pi, afi, safi); + } + + /* +@@ -7761,7 +7763,7 @@ bool bgp_aggregate_route(struct bgp *bgp, const struct prefix *p, afi_t afi, + && AGGREGATE_MED_VALID(aggregate) + && aggr_suppress_map_test(bgp, aggregate, pi)) { + if (aggr_suppress_path(aggregate, pi)) +- match++; ++ bgp_process(bgp, dest, pi, afi, safi); + } + + aggregate->count++; +@@ -7822,8 +7824,6 @@ bool bgp_aggregate_route(struct bgp *bgp, const struct prefix *p, afi_t afi, + aggregate, + bgp_attr_get_lcommunity(pi->attr)); + } +- if (match) +- bgp_process(bgp, dest, afi, safi); + } + if (aggregate->as_set) { + bgp_compute_aggregate_aspath_val(aggregate); +@@ -7883,7 +7883,6 @@ void bgp_aggregate_delete(struct bgp *bgp, const struct prefix *p, afi_t afi, + struct bgp_dest *top; + struct bgp_dest *dest; + struct bgp_path_info *pi; +- unsigned long match; + + table = bgp->rib[afi][safi]; + +@@ -7895,7 +7894,6 @@ void bgp_aggregate_delete(struct bgp *bgp, const struct prefix *p, afi_t afi, + + if (dest_p->prefixlen <= p->prefixlen) + continue; +- match = 0; + + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { + if (BGP_PATH_HOLDDOWN(pi)) +@@ -7913,7 +7911,7 @@ void bgp_aggregate_delete(struct bgp *bgp, const struct prefix *p, afi_t afi, + if (pi->extra && pi->extra->aggr_suppressors && + listcount(pi->extra->aggr_suppressors)) { + if (aggr_unsuppress_path(aggregate, pi)) +- match++; ++ bgp_process(bgp, dest, pi, afi, safi); + } + + aggregate->count--; +@@ -7955,10 +7953,6 @@ void bgp_aggregate_delete(struct bgp *bgp, const struct prefix *p, afi_t afi, + pi->attr)); + } + } +- +- /* If this node was suppressed, process the change. */ +- if (match) +- bgp_process(bgp, dest, afi, safi); + } + if (aggregate->as_set) { + aspath_free(aggregate->aspath); +@@ -8107,7 +8101,6 @@ static void bgp_remove_route_from_aggregate(struct bgp *bgp, afi_t afi, + struct community *community = NULL; + struct ecommunity *ecommunity = NULL; + struct lcommunity *lcommunity = NULL; +- unsigned long match = 0; + + /* If the bgp instance is being deleted or self peer is deleted + * then do not create aggregate route +@@ -8124,12 +8117,12 @@ static void bgp_remove_route_from_aggregate(struct bgp *bgp, afi_t afi, + + if (aggregate->summary_only && AGGREGATE_MED_VALID(aggregate)) + if (aggr_unsuppress_path(aggregate, pi)) +- match++; ++ bgp_process(bgp, pi->net, pi, afi, safi); + + if (aggregate->suppress_map_name && AGGREGATE_MED_VALID(aggregate) + && aggr_suppress_map_test(bgp, aggregate, pi)) + if (aggr_unsuppress_path(aggregate, pi)) +- match++; ++ bgp_process(bgp, pi->net, pi, afi, safi); + + /* + * This must be called after `summary`, `suppress-map` check to avoid +@@ -8171,10 +8164,6 @@ static void bgp_remove_route_from_aggregate(struct bgp *bgp, afi_t afi, + aggregate, bgp_attr_get_lcommunity(pi->attr)); + } + +- /* If this node was suppressed, process the change. */ +- if (match) +- bgp_process(bgp, pi->net, afi, safi); +- + origin = BGP_ORIGIN_IGP; + if (aggregate->incomplete_origin_count > 0) + origin = BGP_ORIGIN_INCOMPLETE; +@@ -8778,7 +8767,7 @@ void bgp_redistribute_add(struct bgp *bgp, struct prefix *p, + /* Process change. */ + bgp_aggregate_increment(bgp, p, bpi, afi, + SAFI_UNICAST); +- bgp_process(bgp, bn, afi, SAFI_UNICAST); ++ bgp_process(bgp, bn, bpi, afi, SAFI_UNICAST); + bgp_dest_unlock_node(bn); + aspath_unintern(&attr.aspath); + +@@ -8801,7 +8790,7 @@ void bgp_redistribute_add(struct bgp *bgp, struct prefix *p, + bgp_path_info_add(bn, new); + bgp_dest_unlock_node(bn); + SET_FLAG(bn->flags, BGP_NODE_FIB_INSTALLED); +- bgp_process(bgp, bn, afi, SAFI_UNICAST); ++ bgp_process(bgp, bn, new, afi, SAFI_UNICAST); + + if ((bgp->inst_type == BGP_INSTANCE_TYPE_VRF) + || (bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT)) { +@@ -8842,7 +8831,7 @@ void bgp_redistribute_delete(struct bgp *bgp, struct prefix *p, uint8_t type, + } + bgp_aggregate_decrement(bgp, p, pi, afi, SAFI_UNICAST); + bgp_path_info_delete(dest, pi); +- bgp_process(bgp, dest, afi, SAFI_UNICAST); ++ bgp_process(bgp, dest, pi, afi, SAFI_UNICAST); + } + bgp_dest_unlock_node(dest); + } +@@ -8876,7 +8865,7 @@ void bgp_redistribute_withdraw(struct bgp *bgp, afi_t afi, int type, + bgp_path_info_delete(dest, pi); + if (!CHECK_FLAG(bgp->flags, + BGP_FLAG_DELETE_IN_PROGRESS)) +- bgp_process(bgp, dest, afi, SAFI_UNICAST); ++ bgp_process(bgp, dest, pi, afi, SAFI_UNICAST); + else { + dest = bgp_path_info_reap(dest, pi); + assert(dest); +diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h +index 5781eb70f2..1e5c31afab 100644 +--- a/bgpd/bgp_route.h ++++ b/bgpd/bgp_route.h +@@ -802,7 +802,8 @@ extern void bgp_withdraw(struct peer *peer, const struct prefix *p, + struct bgp_route_evpn *evpn); + + /* for bgp_nexthop and bgp_damp */ +-extern void bgp_process(struct bgp *, struct bgp_dest *, afi_t, safi_t); ++extern void bgp_process(struct bgp *bgp, struct bgp_dest *dest, ++ struct bgp_path_info *pi, afi_t afi, safi_t safi); + + /* + * Add an end-of-initial-update marker to the process queue. This is just a +diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c +index 9530a66fca..8d442ae64a 100644 +--- a/bgpd/bgp_vty.c ++++ b/bgpd/bgp_vty.c +@@ -10672,7 +10672,10 @@ static int bgp_clear_prefix(struct vty *vty, const char *view_name, + if (rm_p->prefixlen == match.prefixlen) { + SET_FLAG(rm->flags, + BGP_NODE_USER_CLEAR); +- bgp_process(bgp, rm, afi, safi); ++ bgp_process(bgp, rm, ++ bgp_dest_get_bgp_path_info( ++ rm), ++ afi, safi); + } + bgp_dest_unlock_node(rm); + } +@@ -10684,7 +10687,9 @@ static int bgp_clear_prefix(struct vty *vty, const char *view_name, + + if (dest_p->prefixlen == match.prefixlen) { + SET_FLAG(dest->flags, BGP_NODE_USER_CLEAR); +- bgp_process(bgp, dest, afi, safi); ++ bgp_process(bgp, dest, ++ bgp_dest_get_bgp_path_info(dest), ++ afi, safi); + } + bgp_dest_unlock_node(dest); + } +diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c +index d553d3d27f..d94ec66c18 100644 +--- a/bgpd/bgp_zebra.c ++++ b/bgpd/bgp_zebra.c +@@ -2142,7 +2142,7 @@ bool bgp_redistribute_metric_set(struct bgp *bgp, struct bgp_redist *red, + + bgp_path_info_set_flag(dest, pi, + BGP_PATH_ATTR_CHANGED); +- bgp_process(bgp, dest, afi, SAFI_UNICAST); ++ bgp_process(bgp, dest, pi, afi, SAFI_UNICAST); + } + } + } +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index b1711848e6..8a3115c8fb 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -1873,6 +1873,7 @@ void bgp_peer_conf_if_to_su_update(struct peer_connection *connection) + void bgp_recalculate_afi_safi_bestpaths(struct bgp *bgp, afi_t afi, safi_t safi) + { + struct bgp_dest *dest, *ndest; ++ struct bgp_path_info *pi, *next; + struct bgp_table *table; + + for (dest = bgp_table_top(bgp->rib[afi][safi]); dest; +@@ -1887,10 +1888,17 @@ void bgp_recalculate_afi_safi_bestpaths(struct bgp *bgp, afi_t afi, safi_t safi) + if (safi == SAFI_MPLS_VPN || safi == SAFI_ENCAP + || safi == SAFI_EVPN) { + for (ndest = bgp_table_top(table); ndest; +- ndest = bgp_route_next(ndest)) +- bgp_process(bgp, ndest, afi, safi); +- } else +- bgp_process(bgp, dest, afi, safi); ++ ndest = bgp_route_next(ndest)) { ++ for (pi = bgp_dest_get_bgp_path_info(ndest); ++ (pi != NULL) && (next = pi->next, 1); ++ pi = next) ++ bgp_process(bgp, ndest, pi, afi, safi); ++ } ++ } else { ++ for (pi = bgp_dest_get_bgp_path_info(dest); ++ (pi != NULL) && (next = pi->next, 1); pi = next) ++ bgp_process(bgp, dest, pi, afi, safi); ++ } + } + } + +diff --git a/bgpd/rfapi/rfapi.c b/bgpd/rfapi/rfapi.c +index 382af05c24..ae899daf82 100644 +--- a/bgpd/rfapi/rfapi.c ++++ b/bgpd/rfapi/rfapi.c +@@ -463,7 +463,7 @@ void del_vnc_route(struct rfapi_descriptor *rfd, + + bgp_aggregate_decrement(bgp, p, bpi, afi, safi); + bgp_path_info_delete(bn, bpi); +- bgp_process(bgp, bn, afi, safi); ++ bgp_process(bgp, bn, bpi, afi, safi); + } else { + vnc_zlog_debug_verbose( + "%s: Couldn't find route (safi=%d) at prefix %pFX", +@@ -1001,7 +1001,7 @@ void add_vnc_route(struct rfapi_descriptor *rfd, /* cookie, VPN UN addr, peer */ + + /* Process change. */ + bgp_aggregate_increment(bgp, p, bpi, afi, safi); +- bgp_process(bgp, bn, afi, safi); ++ bgp_process(bgp, bn, bpi, afi, safi); + bgp_dest_unlock_node(bn); + + vnc_zlog_debug_any( +@@ -1046,7 +1046,7 @@ void add_vnc_route(struct rfapi_descriptor *rfd, /* cookie, VPN UN addr, peer */ + } + + bgp_dest_unlock_node(bn); +- bgp_process(bgp, bn, afi, safi); ++ bgp_process(bgp, bn, new, afi, safi); + + vnc_zlog_debug_any( + "%s: Added route (safi=%s) at prefix %s (bn=%p, prd=%pRDP)", +-- +2.43.2 + + +From 88c7297831d5c49b8f9208075ec2947bb4180765 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 11 Mar 2024 14:05:59 -0400 +Subject: [PATCH 09/11] bgpd: bgp_best_selection is inherently pi based + +Currently evpn code calls bgp_best_selection for local +decisions for local tables to figure out what to do. +This is also pi based so let's note that the pi has +been changed before calling bgp_best_selection. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_evpn.c | 14 ++++++++------ + bgpd/bgp_evpn_mh.c | 13 ++++++++----- + bgpd/bgp_evpn_private.h | 3 ++- + 3 files changed, 18 insertions(+), 12 deletions(-) + +diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c +index 72dfd000f6..76b005fac0 100644 +--- a/bgpd/bgp_evpn.c ++++ b/bgpd/bgp_evpn.c +@@ -1442,7 +1442,7 @@ void evpn_delete_old_local_route(struct bgp *bgp, struct bgpevpn *vpn, + * Note: vpn is NULL for local EAD-ES routes. + */ + int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn, +- struct bgp_dest *dest) ++ struct bgp_dest *dest, struct bgp_path_info *pi) + { + struct bgp_path_info *old_select, *new_select; + struct bgp_path_info_pair old_and_new; +@@ -1450,6 +1450,8 @@ int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn, + safi_t safi = SAFI_EVPN; + int ret = 0; + ++ SET_FLAG(pi->flags, BGP_PATH_UNSORTED); ++ + /* Compute the best path. */ + bgp_best_selection(bgp, dest, &bgp->maxpaths[afi][safi], &old_and_new, + afi, safi); +@@ -2258,7 +2260,7 @@ int update_evpn_route(struct bgp *bgp, struct bgpevpn *vpn, + * route would win, and we should evict the defunct local route + * and (re)install the remote route into zebra. + */ +- evpn_route_select_install(bgp, vpn, dest); ++ evpn_route_select_install(bgp, vpn, dest, pi); + /* + * If the new local route was not selected evict it and tell zebra + * to re-add the best remote dest. BGP doesn't retain non-best local +@@ -2424,7 +2426,7 @@ int delete_evpn_route(struct bgp *bgp, struct bgpevpn *vpn, + if (pi) { + dest = bgp_path_info_reap(dest, pi); + assert(dest); +- evpn_route_select_install(bgp, vpn, dest); ++ evpn_route_select_install(bgp, vpn, dest, pi); + } + + /* dest should still exist due to locking make coverity happy */ +@@ -2539,7 +2541,7 @@ void bgp_evpn_update_type2_route_entry(struct bgp *bgp, struct bgpevpn *vpn, + * advertised to peers; otherwise, ensure it is evicted and + * (re)install the remote route into zebra. + */ +- evpn_route_select_install(bgp, vpn, dest); ++ evpn_route_select_install(bgp, vpn, dest, pi); + + if (CHECK_FLAG(pi->flags, BGP_PATH_REMOVED)) { + route_change = 0; +@@ -3252,7 +3254,7 @@ int install_evpn_route_entry_in_vni_common( + bgp_evpn_remote_ip_hash_add(vpn, pi); + + /* Perform route selection and update zebra, if required. */ +- ret = evpn_route_select_install(bgp, vpn, dest); ++ ret = evpn_route_select_install(bgp, vpn, dest, pi); + + /* if the best path is a local path with a non-zero ES + * sync info against the local path may need to be updated +@@ -3294,7 +3296,7 @@ int uninstall_evpn_route_entry_in_vni_common( + bgp_path_info_delete(dest, pi); + + /* Perform route selection and update zebra, if required. */ +- ret = evpn_route_select_install(bgp, vpn, dest); ++ ret = evpn_route_select_install(bgp, vpn, dest, pi); + + /* if the best path is a local path with a non-zero ES + * sync info against the local path may need to be updated +diff --git a/bgpd/bgp_evpn_mh.c b/bgpd/bgp_evpn_mh.c +index f5df47fbfb..d3def69198 100644 +--- a/bgpd/bgp_evpn_mh.c ++++ b/bgpd/bgp_evpn_mh.c +@@ -91,7 +91,8 @@ static void bgp_evpn_path_nh_unlink(struct bgp_path_evpn_nh_info *nh_info); + */ + static int bgp_evpn_es_route_select_install(struct bgp *bgp, + struct bgp_evpn_es *es, +- struct bgp_dest *dest) ++ struct bgp_dest *dest, ++ struct bgp_path_info *pi) + { + int ret = 0; + int zret = 0; +@@ -101,6 +102,8 @@ static int bgp_evpn_es_route_select_install(struct bgp *bgp, + struct bgp_path_info *new_select; /* new best */ + struct bgp_path_info_pair old_and_new; + ++ SET_FLAG(pi->flags, BGP_PATH_UNSORTED); ++ + /* Compute the best path. */ + bgp_best_selection(bgp, dest, &bgp->maxpaths[afi][safi], &old_and_new, + afi, safi); +@@ -232,7 +235,7 @@ static int bgp_evpn_es_route_install(struct bgp *bgp, + } + + /* Perform route selection and update zebra, if required. */ +- ret = bgp_evpn_es_route_select_install(bgp, es, dest); ++ ret = bgp_evpn_es_route_select_install(bgp, es, dest, pi); + + bgp_dest_unlock_node(dest); + +@@ -273,7 +276,7 @@ static int bgp_evpn_es_route_uninstall(struct bgp *bgp, struct bgp_evpn_es *es, + bgp_path_info_delete(dest, pi); + + /* Perform route selection and update zebra, if required. */ +- ret = bgp_evpn_es_route_select_install(bgp, es, dest); ++ ret = bgp_evpn_es_route_select_install(bgp, es, dest, pi); + + /* Unlock route node. */ + bgp_dest_unlock_node(dest); +@@ -669,7 +672,7 @@ static int bgp_evpn_type4_route_update(struct bgp *bgp, + * this is just to set the flags correctly + * as local route in the ES always wins. + */ +- bgp_evpn_es_route_select_install(bgp, es, dest); ++ bgp_evpn_es_route_select_install(bgp, es, dest, pi); + bgp_dest_unlock_node(dest); + + /* If this is a new route or some attribute has changed, export the +@@ -1009,7 +1012,7 @@ static int bgp_evpn_type1_route_update(struct bgp *bgp, struct bgp_evpn_es *es, + * this is just to set the flags correctly as local route in + * the ES always wins. + */ +- evpn_route_select_install(bgp, vpn, dest); ++ evpn_route_select_install(bgp, vpn, dest, pi); + bgp_dest_unlock_node(dest); + + /* If this is a new route or some attribute has changed, export the +diff --git a/bgpd/bgp_evpn_private.h b/bgpd/bgp_evpn_private.h +index 5af99afa34..07bba9b426 100644 +--- a/bgpd/bgp_evpn_private.h ++++ b/bgpd/bgp_evpn_private.h +@@ -716,7 +716,8 @@ extern void delete_evpn_route_entry(struct bgp *bgp, afi_t afi, safi_t safi, + struct bgp_path_info **pi); + int vni_list_cmp(void *p1, void *p2); + extern int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn, +- struct bgp_dest *dest); ++ struct bgp_dest *dest, ++ struct bgp_path_info *pi); + extern struct bgp_dest * + bgp_evpn_global_node_get(struct bgp_table *table, afi_t afi, safi_t safi, + const struct prefix_evpn *evp, struct prefix_rd *prd, +-- +2.43.2 + + +From a12e265b609288d5a466103b84b26ef53cad8f9f Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Wed, 20 Mar 2024 10:13:00 -0400 +Subject: [PATCH 10/11] bgpd: Do not reap, schedule for deletion + +Do not reap instead let's schedule for deletion +and let best_path_selection take care of the deletion +as it should. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_evpn.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c +index 76b005fac0..3c89495ccc 100644 +--- a/bgpd/bgp_evpn.c ++++ b/bgpd/bgp_evpn.c +@@ -2424,8 +2424,7 @@ int delete_evpn_route(struct bgp *bgp, struct bgpevpn *vpn, + */ + delete_evpn_route_entry(bgp, afi, safi, dest, &pi); + if (pi) { +- dest = bgp_path_info_reap(dest, pi); +- assert(dest); ++ bgp_path_info_delete(dest, pi); + evpn_route_select_install(bgp, vpn, dest, pi); + } + +-- +2.43.2 + + +From c420b9efd6bcbe556cd9d801d02c5fcedd526b85 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Tue, 19 Mar 2024 12:26:14 -0400 +Subject: [PATCH 11/11] bgpd: Sort the bgp_path_info's + +Currently bgp_path_info's are stored in reverse order +received. Sort them by the best path ordering. + +This will allow for optimizations in the future on +how multipath is done. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_evpn.c | 20 ++- + bgpd/bgp_mplsvpn.c | 19 ++- + bgpd/bgp_route.c | 368 +++++++++++++++++++++++++++++++++++++-------- + bgpd/bgp_route.h | 2 + + 4 files changed, 336 insertions(+), 73 deletions(-) + +diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c +index 3c89495ccc..cb51023a45 100644 +--- a/bgpd/bgp_evpn.c ++++ b/bgpd/bgp_evpn.c +@@ -1444,13 +1444,26 @@ void evpn_delete_old_local_route(struct bgp *bgp, struct bgpevpn *vpn, + int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn, + struct bgp_dest *dest, struct bgp_path_info *pi) + { +- struct bgp_path_info *old_select, *new_select; ++ struct bgp_path_info *old_select, *new_select, *first; + struct bgp_path_info_pair old_and_new; + afi_t afi = AFI_L2VPN; + safi_t safi = SAFI_EVPN; + int ret = 0; + ++ first = bgp_dest_get_bgp_path_info(dest); + SET_FLAG(pi->flags, BGP_PATH_UNSORTED); ++ if (pi != first) { ++ if (pi->next) ++ pi->next->prev = pi->prev; ++ if (pi->prev) ++ pi->prev->next = pi->next; ++ ++ if (first) ++ first->prev = pi; ++ pi->next = first; ++ pi->prev = NULL; ++ bgp_dest_set_bgp_path_info(dest, pi); ++ } + + /* Compute the best path. */ + bgp_best_selection(bgp, dest, &bgp->maxpaths[afi][safi], &old_and_new, +@@ -6482,9 +6495,10 @@ void bgp_filter_evpn_routes_upon_martian_change( + + for (dest = bgp_table_top(table); dest; + dest = bgp_route_next(dest)) { ++ struct bgp_path_info *next; + +- for (pi = bgp_dest_get_bgp_path_info(dest); pi; +- pi = pi->next) { ++ for (pi = bgp_dest_get_bgp_path_info(dest); ++ (pi != NULL) && (next = pi->next, 1); pi = next) { + bool affected = false; + const struct prefix *p; + +diff --git a/bgpd/bgp_mplsvpn.c b/bgpd/bgp_mplsvpn.c +index a48ef875c4..b9846d7e63 100644 +--- a/bgpd/bgp_mplsvpn.c ++++ b/bgpd/bgp_mplsvpn.c +@@ -2022,7 +2022,7 @@ void vpn_leak_from_vrf_withdraw_all(struct bgp *to_bgp, struct bgp *from_bgp, + + struct bgp_table *table; + struct bgp_dest *bn; +- struct bgp_path_info *bpi; ++ struct bgp_path_info *bpi, *next; + + /* This is the per-RD table of prefixes */ + table = bgp_dest_get_bgp_table_info(pdest); +@@ -2037,7 +2037,8 @@ void vpn_leak_from_vrf_withdraw_all(struct bgp *to_bgp, struct bgp *from_bgp, + __func__, bn); + } + +- for (; bpi; bpi = bpi->next) { ++ for (; (bpi != NULL) && (next = bpi->next, 1); ++ bpi = next) { + if (debug) + zlog_debug("%s: type %d, sub_type %d", + __func__, bpi->type, +@@ -2552,7 +2553,7 @@ void vpn_leak_to_vrf_withdraw(struct bgp_path_info *path_vpn) + void vpn_leak_to_vrf_withdraw_all(struct bgp *to_bgp, afi_t afi) + { + struct bgp_dest *bn; +- struct bgp_path_info *bpi; ++ struct bgp_path_info *bpi, *next; + safi_t safi = SAFI_UNICAST; + int debug = BGP_DEBUG(vpn, VPN_LEAK_TO_VRF); + +@@ -2563,9 +2564,8 @@ void vpn_leak_to_vrf_withdraw_all(struct bgp *to_bgp, afi_t afi) + */ + for (bn = bgp_table_top(to_bgp->rib[afi][safi]); bn; + bn = bgp_route_next(bn)) { +- +- for (bpi = bgp_dest_get_bgp_path_info(bn); bpi; +- bpi = bpi->next) { ++ for (bpi = bgp_dest_get_bgp_path_info(bn); ++ (bpi != NULL) && (next = bpi->next, 1); bpi = next) { + if (bpi->extra && bpi->extra->vrfleak && + bpi->extra->vrfleak->bgp_orig != to_bgp && + bpi->extra->vrfleak->parent && +@@ -2604,8 +2604,11 @@ void vpn_leak_no_retain(struct bgp *to_bgp, struct bgp *vpn_from, afi_t afi) + continue; + + for (bn = bgp_table_top(table); bn; bn = bgp_route_next(bn)) { +- for (bpi = bgp_dest_get_bgp_path_info(bn); bpi; +- bpi = bpi->next) { ++ struct bgp_path_info *next; ++ ++ for (bpi = bgp_dest_get_bgp_path_info(bn); ++ (bpi != NULL) && (next = bpi->next, 1); ++ bpi = next) { + if (bpi->extra && bpi->extra->vrfleak && + bpi->extra->vrfleak->bgp_orig == to_bgp) + continue; +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index e3350df7b0..c33b14675a 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -482,6 +482,7 @@ void bgp_path_info_add_with_caller(const char *name, struct bgp_dest *dest, + top->prev = pi; + bgp_dest_set_bgp_path_info(dest, pi); + ++ SET_FLAG(pi->flags, BGP_PATH_UNSORTED); + bgp_path_info_lock(pi); + bgp_dest_lock_node(dest); + peer_lock(pi->peer); /* bgp_path_info peer reference */ +@@ -502,8 +503,26 @@ struct bgp_dest *bgp_path_info_reap(struct bgp_dest *dest, + bgp_dest_set_bgp_path_info(dest, pi->next); + + bgp_path_info_mpath_dequeue(pi); ++ ++ pi->next = NULL; ++ pi->prev = NULL; ++ ++ hook_call(bgp_snmp_update_stats, dest, pi, false); ++ + bgp_path_info_unlock(pi); ++ return bgp_dest_unlock_node(dest); ++} ++ ++static struct bgp_dest *bgp_path_info_reap_unsorted(struct bgp_dest *dest, ++ struct bgp_path_info *pi) ++{ ++ bgp_path_info_mpath_dequeue(pi); ++ ++ pi->next = NULL; ++ pi->prev = NULL; ++ + hook_call(bgp_snmp_update_stats, dest, pi, false); ++ bgp_path_info_unlock(pi); + + return bgp_dest_unlock_node(dest); + } +@@ -2786,17 +2805,18 @@ void bgp_best_selection(struct bgp *bgp, struct bgp_dest *dest, + struct bgp_path_info_pair *result, afi_t afi, + safi_t safi) + { +- struct bgp_path_info *new_select; +- struct bgp_path_info *old_select; ++ struct bgp_path_info *new_select, *look_thru; ++ struct bgp_path_info *old_select, *worse, *first; + struct bgp_path_info *pi; + struct bgp_path_info *pi1; + struct bgp_path_info *pi2; +- struct bgp_path_info *nextpi = NULL; + int paths_eq, do_mpath; +- bool debug; ++ bool debug, any_comparisons; + struct list mp_list; + char pfx_buf[PREFIX2STR_BUFFER] = {}; + char path_buf[PATH_ADDPATH_STR_BUFFER]; ++ enum bgp_path_selection_reason reason = bgp_path_selection_none; ++ bool unsorted_items = true; + + bgp_mp_list_init(&mp_list); + do_mpath = +@@ -2807,16 +2827,16 @@ void bgp_best_selection(struct bgp *bgp, struct bgp_dest *dest, + if (debug) + prefix2str(bgp_dest_get_prefix(dest), pfx_buf, sizeof(pfx_buf)); + +- dest->reason = bgp_path_selection_none; + /* bgp deterministic-med */ + new_select = NULL; + if (CHECK_FLAG(bgp->flags, BGP_FLAG_DETERMINISTIC_MED)) { +- + /* Clear BGP_PATH_DMED_SELECTED for all paths */ + for (pi1 = bgp_dest_get_bgp_path_info(dest); pi1; +- pi1 = pi1->next) ++ pi1 = pi1->next) { + bgp_path_info_unset_flag(dest, pi1, + BGP_PATH_DMED_SELECTED); ++ UNSET_FLAG(pi1->flags, BGP_PATH_DMED_CHECK); ++ } + + for (pi1 = bgp_dest_get_bgp_path_info(dest); pi1; + pi1 = pi1->next) { +@@ -2885,70 +2905,273 @@ void bgp_best_selection(struct bgp *bgp, struct bgp_dest *dest, + } + } + +- /* Check old selected route and new selected route. */ ++ /* ++ * Let's grab the unsorted items from the list ++ */ ++ struct bgp_path_info *unsorted_list = NULL; ++ struct bgp_path_info *unsorted_list_spot = NULL; ++ struct bgp_path_info *unsorted_holddown = NULL; ++ + old_select = NULL; +- new_select = NULL; +- for (pi = bgp_dest_get_bgp_path_info(dest); +- (pi != NULL) && (nextpi = pi->next, 1); pi = nextpi) { +- enum bgp_path_selection_reason reason; ++ pi = bgp_dest_get_bgp_path_info(dest); ++ while (pi && CHECK_FLAG(pi->flags, BGP_PATH_UNSORTED)) { ++ struct bgp_path_info *next = pi->next; + +- UNSET_FLAG(pi->flags, BGP_PATH_UNSORTED); + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED)) + old_select = pi; + +- if (BGP_PATH_HOLDDOWN(pi)) { +- /* reap REMOVED routes, if needs be ++ /* ++ * Pull off pi off the list ++ */ ++ if (pi->next) ++ pi->next->prev = NULL; ++ ++ bgp_dest_set_bgp_path_info(dest, pi->next); ++ pi->next = NULL; ++ pi->prev = NULL; ++ ++ /* ++ * Place it on the unsorted list ++ */ ++ if (unsorted_list_spot) { ++ unsorted_list_spot->next = pi; ++ pi->prev = unsorted_list_spot; ++ pi->next = NULL; ++ } else { ++ unsorted_list = pi; ++ ++ pi->next = NULL; ++ pi->prev = NULL; ++ } ++ ++ unsorted_list_spot = pi; ++ pi = next; ++ } ++ ++ if (!old_select) { ++ old_select = bgp_dest_get_bgp_path_info(dest); ++ if (old_select && ++ !CHECK_FLAG(old_select->flags, BGP_PATH_SELECTED)) ++ old_select = NULL; ++ } ++ ++ if (!unsorted_list) ++ unsorted_items = true; ++ else ++ unsorted_items = false; ++ ++ any_comparisons = false; ++ worse = NULL; ++ while (unsorted_list) { ++ first = unsorted_list; ++ unsorted_list = unsorted_list->next; ++ ++ if (unsorted_list) ++ unsorted_list->prev = NULL; ++ first->next = NULL; ++ first->prev = NULL; ++ ++ /* ++ * It's not likely that the just received unsorted entry ++ * is in holddown and scheduled for removal but we should ++ * check ++ */ ++ if (BGP_PATH_HOLDDOWN(first)) { ++ /* ++ * reap REMOVED routes, if needs be + * selected route must stay for a while longer though + */ + if (debug) +- zlog_debug( +- "%s: %pBD(%s) pi from %s in holddown", +- __func__, dest, bgp->name_pretty, +- pi->peer->host); ++ zlog_debug("%s: %pBD(%s) pi %p from %s in holddown", ++ __func__, dest, bgp->name_pretty, ++ first, first->peer->host); + +- if (CHECK_FLAG(pi->flags, BGP_PATH_REMOVED) && +- (pi != old_select)) { +- dest = bgp_path_info_reap(dest, pi); ++ if (old_select != first && ++ CHECK_FLAG(first->flags, BGP_PATH_REMOVED)) { ++ dest = bgp_path_info_reap_unsorted(dest, first); + assert(dest); +- } ++ } else { ++ /* ++ * We are in hold down, so we cannot sort this ++ * item yet. Let's wait, so hold the unsorted ++ * to the side ++ */ ++ if (unsorted_holddown) { ++ first->next = unsorted_holddown; ++ unsorted_holddown->prev = first; ++ unsorted_holddown = first; ++ } else ++ unsorted_holddown = first; + ++ UNSET_FLAG(first->flags, BGP_PATH_UNSORTED); ++ } + continue; + } + +- if (pi->peer && pi->peer != bgp->peer_self +- && !CHECK_FLAG(pi->peer->sflags, PEER_STATUS_NSF_WAIT)) +- if (!peer_established(pi->peer->connection)) { ++ bgp_path_info_unset_flag(dest, first, BGP_PATH_DMED_CHECK); ++ ++ worse = NULL; ++ ++ struct bgp_path_info *look_thru_next; ++ ++ for (look_thru = bgp_dest_get_bgp_path_info(dest); look_thru; ++ look_thru = look_thru_next) { ++ /* look thru can be reaped save the next pointer */ ++ look_thru_next = look_thru->next; ++ ++ /* ++ * Now we have the first unsorted and the best selected ++ * Let's do best path comparison ++ */ ++ if (BGP_PATH_HOLDDOWN(look_thru)) { ++ /* reap REMOVED routes, if needs be ++ * selected route must stay for a while longer though ++ */ + if (debug) +- zlog_debug( +- "%s: %pBD(%s) non self peer %s not estab state", +- __func__, dest, +- bgp->name_pretty, +- pi->peer->host); ++ zlog_debug("%s: %pBD(%s) pi from %s %p in holddown", ++ __func__, dest, ++ bgp->name_pretty, ++ look_thru->peer->host, ++ look_thru); ++ ++ if (CHECK_FLAG(look_thru->flags, ++ BGP_PATH_REMOVED) && ++ (look_thru != old_select)) { ++ dest = bgp_path_info_reap(dest, ++ look_thru); ++ assert(dest); ++ } + + continue; + } + +- if (CHECK_FLAG(bgp->flags, BGP_FLAG_DETERMINISTIC_MED) +- && (!CHECK_FLAG(pi->flags, BGP_PATH_DMED_SELECTED))) { +- bgp_path_info_unset_flag(dest, pi, BGP_PATH_DMED_CHECK); +- if (debug) +- zlog_debug("%s: %pBD(%s) pi %s dmed", __func__, +- dest, bgp->name_pretty, +- pi->peer->host); +- continue; ++ if (look_thru->peer && ++ look_thru->peer != bgp->peer_self && ++ !CHECK_FLAG(look_thru->peer->sflags, ++ PEER_STATUS_NSF_WAIT)) ++ if (!peer_established( ++ look_thru->peer->connection)) { ++ if (debug) ++ zlog_debug("%s: %pBD(%s) non self peer %s not estab state", ++ __func__, dest, ++ bgp->name_pretty, ++ look_thru->peer->host); ++ ++ continue; ++ } ++ ++ bgp_path_info_unset_flag(dest, look_thru, ++ BGP_PATH_DMED_CHECK); ++ if (CHECK_FLAG(bgp->flags, BGP_FLAG_DETERMINISTIC_MED) && ++ (!CHECK_FLAG(look_thru->flags, ++ BGP_PATH_DMED_SELECTED))) { ++ bgp_path_info_unset_flag(dest, look_thru, ++ BGP_PATH_DMED_CHECK); ++ if (debug) ++ zlog_debug("%s: %pBD(%s) pi %s dmed", ++ __func__, dest, ++ bgp->name_pretty, ++ look_thru->peer->host); ++ ++ worse = look_thru; ++ continue; ++ } ++ ++ reason = dest->reason; ++ any_comparisons = true; ++ if (bgp_path_info_cmp(bgp, first, look_thru, &paths_eq, ++ mpath_cfg, debug, pfx_buf, afi, ++ safi, &reason)) { ++ first->reason = reason; ++ worse = look_thru; ++ /* ++ * We can stop looking ++ */ ++ break; ++ } ++ ++ look_thru->reason = reason; + } + +- bgp_path_info_unset_flag(dest, pi, BGP_PATH_DMED_CHECK); ++ if (!any_comparisons) ++ first->reason = bgp_path_selection_first; ++ ++ /* ++ * At this point worse if NON-NULL is where the first ++ * pointer should be before. if worse is NULL then ++ * first is bestpath too. Let's remove first from the ++ * list and place it in the right spot ++ */ ++ ++ if (!worse) { ++ struct bgp_path_info *end = ++ bgp_dest_get_bgp_path_info(dest); ++ ++ for (; end && end->next != NULL; end = end->next) ++ ; ++ ++ if (end) ++ end->next = first; ++ else ++ bgp_dest_set_bgp_path_info(dest, first); ++ first->prev = end; ++ first->next = NULL; ++ ++ dest->reason = first->reason; ++ } else { ++ if (worse->prev) ++ worse->prev->next = first; ++ first->next = worse; ++ if (worse) { ++ first->prev = worse->prev; ++ worse->prev = first; ++ } else ++ first->prev = NULL; + +- reason = dest->reason; +- if (bgp_path_info_cmp(bgp, pi, new_select, &paths_eq, mpath_cfg, +- debug, pfx_buf, afi, safi, +- &dest->reason)) { +- if (new_select == NULL && +- reason != bgp_path_selection_none) +- dest->reason = reason; +- new_select = pi; ++ if (dest->info == worse) { ++ bgp_dest_set_bgp_path_info(dest, first); ++ dest->reason = first->reason; ++ } + } ++ UNSET_FLAG(first->flags, BGP_PATH_UNSORTED); ++ } ++ ++ if (!unsorted_items) { ++ new_select = bgp_dest_get_bgp_path_info(dest); ++ while (new_select && BGP_PATH_HOLDDOWN(new_select)) ++ new_select = new_select->next; ++ ++ if (new_select) { ++ if (new_select->reason == bgp_path_selection_none) ++ new_select->reason = bgp_path_selection_first; ++ else if (new_select == bgp_dest_get_bgp_path_info(dest) && ++ new_select->next == NULL) ++ new_select->reason = bgp_path_selection_first; ++ dest->reason = new_select->reason; ++ } else ++ dest->reason = bgp_path_selection_none; ++ } else ++ new_select = old_select; ++ ++ ++ /* ++ * Reinsert all the unsorted_holddown items for future processing ++ * at the end of the list. ++ */ ++ if (unsorted_holddown) { ++ struct bgp_path_info *top = bgp_dest_get_bgp_path_info(dest); ++ struct bgp_path_info *prev = NULL; ++ ++ while (top != NULL) { ++ prev = top; ++ top = top->next; ++ } ++ ++ if (prev) { ++ prev->next = unsorted_holddown; ++ unsorted_holddown->prev = prev; ++ } else ++ bgp_dest_set_bgp_path_info(dest, unsorted_holddown); + } + + /* Now that we know which path is the bestpath see if any of the other +@@ -2968,9 +3191,7 @@ void bgp_best_selection(struct bgp *bgp, struct bgp_dest *dest, + } + + if (do_mpath && new_select) { +- for (pi = bgp_dest_get_bgp_path_info(dest); +- (pi != NULL) && (nextpi = pi->next, 1); pi = nextpi) { +- ++ for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { + if (debug) + bgp_path_info_path_with_addpath_rx_str( + pi, path_buf, sizeof(path_buf)); +@@ -3527,7 +3748,8 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + bgp_path_info_unset_flag(dest, old_select, BGP_PATH_SELECTED); + if (new_select) { + if (debug) +- zlog_debug("%s: setting SELECTED flag", __func__); ++ zlog_debug("%s: %pBD setting SELECTED flag", __func__, ++ dest); + bgp_path_info_set_flag(dest, new_select, BGP_PATH_SELECTED); + bgp_path_info_unset_flag(dest, new_select, + BGP_PATH_ATTR_CHANGED); +@@ -3762,9 +3984,25 @@ void bgp_process(struct bgp *bgp, struct bgp_dest *dest, + * situation, even if the node is already + * scheduled. + */ +- if (pi) ++ if (pi) { ++ struct bgp_path_info *first = bgp_dest_get_bgp_path_info(dest); ++ + SET_FLAG(pi->flags, BGP_PATH_UNSORTED); + ++ if (pi != first) { ++ if (pi->next) ++ pi->next->prev = pi->prev; ++ if (pi->prev) ++ pi->prev->next = pi->next; ++ ++ if (first) ++ first->prev = pi; ++ pi->next = first; ++ pi->prev = NULL; ++ bgp_dest_set_bgp_path_info(dest, pi); ++ } ++ } ++ + /* already scheduled for processing? */ + if (CHECK_FLAG(dest->flags, BGP_NODE_PROCESS_SCHEDULED)) + return; +@@ -5647,7 +5885,7 @@ static wq_item_status bgp_clear_route_node(struct work_queue *wq, void *data) + struct bgp_clear_node_queue *cnq = data; + struct bgp_dest *dest = cnq->dest; + struct peer *peer = wq->spec.data; +- struct bgp_path_info *pi; ++ struct bgp_path_info *pi, *next; + struct bgp *bgp; + afi_t afi = bgp_dest_table(dest)->afi; + safi_t safi = bgp_dest_table(dest)->safi; +@@ -5658,7 +5896,8 @@ static wq_item_status bgp_clear_route_node(struct work_queue *wq, void *data) + /* It is possible that we have multiple paths for a prefix from a peer + * if that peer is using AddPath. + */ +- for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { ++ for (pi = bgp_dest_get_bgp_path_info(dest); ++ (pi != NULL) && (next = pi->next, 1); pi = next) { + if (pi->peer != peer) + continue; + +@@ -5920,7 +6159,7 @@ void bgp_clear_adj_in(struct peer *peer, afi_t afi, safi_t safi) + void bgp_clear_stale_route(struct peer *peer, afi_t afi, safi_t safi) + { + struct bgp_dest *dest; +- struct bgp_path_info *pi; ++ struct bgp_path_info *pi, *next; + struct bgp_table *table; + + if (safi == SAFI_MPLS_VPN || safi == SAFI_ENCAP || safi == SAFI_EVPN) { +@@ -5935,8 +6174,9 @@ void bgp_clear_stale_route(struct peer *peer, afi_t afi, safi_t safi) + + for (rm = bgp_table_top(table); rm; + rm = bgp_route_next(rm)) +- for (pi = bgp_dest_get_bgp_path_info(rm); pi; +- pi = pi->next) { ++ for (pi = bgp_dest_get_bgp_path_info(rm); ++ (pi != NULL) && (next = pi->next, 1); ++ pi = next) { + if (pi->peer != peer) + continue; + if (CHECK_FLAG( +@@ -5969,8 +6209,8 @@ void bgp_clear_stale_route(struct peer *peer, afi_t afi, safi_t safi) + } else { + for (dest = bgp_table_top(peer->bgp->rib[afi][safi]); dest; + dest = bgp_route_next(dest)) +- for (pi = bgp_dest_get_bgp_path_info(dest); pi; +- pi = pi->next) { ++ for (pi = bgp_dest_get_bgp_path_info(dest); ++ (pi != NULL) && (next = pi->next, 1); pi = next) { + if (pi->peer != peer) + continue; + if (CHECK_FLAG(peer->af_sflags[afi][safi], +@@ -6680,6 +6920,7 @@ void bgp_static_withdraw(struct bgp *bgp, const struct prefix *p, afi_t afi, + + /* Withdraw static BGP route from routing table. */ + if (pi) { ++ SET_FLAG(pi->flags, BGP_PATH_UNSORTED); + #ifdef ENABLE_BGP_VNC + if (safi == SAFI_MPLS_VPN || safi == SAFI_ENCAP) + rfapiProcessWithdraw(pi->peer, NULL, p, prd, pi->attr, +@@ -7466,8 +7707,10 @@ static void bgp_aggregate_install( + /* + * Mark the old as unusable + */ +- if (pi) ++ if (pi) { + bgp_path_info_delete(dest, pi); ++ bgp_process(bgp, dest, pi, afi, safi); ++ } + + attr = bgp_attr_aggregate_intern( + bgp, origin, aspath, community, ecommunity, lcommunity, +@@ -7914,7 +8157,8 @@ void bgp_aggregate_delete(struct bgp *bgp, const struct prefix *p, afi_t afi, + bgp_process(bgp, dest, pi, afi, safi); + } + +- aggregate->count--; ++ if (aggregate->count > 0) ++ aggregate->count--; + + if (pi->attr->origin == BGP_ORIGIN_INCOMPLETE) + aggregate->incomplete_origin_count--; +diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h +index 1e5c31afab..e6f7b9da36 100644 +--- a/bgpd/bgp_route.h ++++ b/bgpd/bgp_route.h +@@ -346,6 +346,8 @@ struct bgp_path_info { + + unsigned short instance; + ++ enum bgp_path_selection_reason reason; ++ + /* Addpath identifiers */ + uint32_t addpath_rx_id; + struct bgp_addpath_info_data tx_addpath; +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0075-bgp-mp-info-changes.patch b/src/sonic-frr/patch/0075-bgp-mp-info-changes.patch new file mode 100644 index 000000000000..de994256cb23 --- /dev/null +++ b/src/sonic-frr/patch/0075-bgp-mp-info-changes.patch @@ -0,0 +1,1542 @@ +From fb21d89e46751e56822e5bfbd44367641ccec687 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 26 Sep 2024 10:40:30 -0400 +Subject: [PATCH 1/5] bgpd: Use CHECK_FLAG to remain consistent for mp_flags + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_mpath.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/bgpd/bgp_mpath.c b/bgpd/bgp_mpath.c +index c773c21fb7..cf1326e07e 100644 +--- a/bgpd/bgp_mpath.c ++++ b/bgpd/bgp_mpath.c +@@ -468,10 +468,10 @@ bool bgp_path_info_mpath_chkwtd(struct bgp *bgp, struct bgp_path_info *path) + */ + if (bgp->lb_handling != BGP_LINK_BW_SKIP_MISSING && + bgp->lb_handling != BGP_LINK_BW_DEFWT_4_MISSING) +- return (path->mpath->mp_flags & BGP_MP_LB_ALL); ++ return CHECK_FLAG(path->mpath->mp_flags, BGP_MP_LB_ALL); + + /* At least one path should have bandwidth. */ +- return (path->mpath->mp_flags & BGP_MP_LB_PRESENT); ++ return CHECK_FLAG(path->mpath->mp_flags, BGP_MP_LB_PRESENT); + } + + /* +-- +2.43.2 + + +From 179fba970178e9199e95ef364ca50eaac287b958 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 26 Sep 2024 10:46:23 -0400 +Subject: [PATCH 2/5] bgpd: Ensure mpath data is only on bestpath + +The mpath data structure has data that is only relevant +for the first mpath in the list. It is not being used +anywhere else. Let's document that a bit more. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_mpath.c | 4 ++++ + bgpd/bgp_mpath.h | 9 ++++++--- + 2 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/bgpd/bgp_mpath.c b/bgpd/bgp_mpath.c +index cf1326e07e..635bb5236d 100644 +--- a/bgpd/bgp_mpath.c ++++ b/bgpd/bgp_mpath.c +@@ -407,6 +407,10 @@ static void bgp_path_info_mpath_count_set(struct bgp_path_info *path, + * bgp_path_info_mpath_lb_update + * + * Update cumulative info related to link-bandwidth ++ * ++ * This is only set on the first mpath of the list ++ * as such we should UNSET the flags when removing ++ * to ensure nothing accidently happens + */ + static void bgp_path_info_mpath_lb_update(struct bgp_path_info *path, bool set, + bool all_paths_lb, uint64_t cum_bw) +diff --git a/bgpd/bgp_mpath.h b/bgpd/bgp_mpath.h +index 129682d1dc..267d729e06 100644 +--- a/bgpd/bgp_mpath.h ++++ b/bgpd/bgp_mpath.h +@@ -25,15 +25,18 @@ struct bgp_path_info_mpath { + /* When attached to best path, the number of selected multipaths */ + uint16_t mp_count; + +- /* Flags - relevant as noted. */ ++ /* Flags - relevant as noted, attached to bestpath. */ + uint16_t mp_flags; + #define BGP_MP_LB_PRESENT 0x1 /* Link-bandwidth present for >= 1 path */ + #define BGP_MP_LB_ALL 0x2 /* Link-bandwidth present for all multipaths */ + +- /* Aggregated attribute for advertising multipath route */ ++ /* ++ * Aggregated attribute for advertising multipath route, ++ * attached to bestpath ++ */ + struct attr *mp_attr; + +- /* Cumulative bandiwdth of all multipaths - attached to best path. */ ++ /* Cumulative bandiwdth of all multipaths - attached to bestpath. */ + uint64_t cum_bw; + }; + +-- +2.43.2 + + +From 5c40a38e780d00cf5c3dc4752146ae9767192423 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 30 Sep 2024 14:57:45 -0400 +Subject: [PATCH 3/5] tests: Clean up some logging in + test_bgp_default_originate_2links.py + +Test was confusing. Add some useful data and clean up some debugs + +Signed-off-by: Donald Sharp +--- + .../test_bgp_default_originate_2links.py | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/tests/topotests/bgp_default_originate/test_bgp_default_originate_2links.py b/tests/topotests/bgp_default_originate/test_bgp_default_originate_2links.py +index 75e66566b7..58984221f9 100644 +--- a/tests/topotests/bgp_default_originate/test_bgp_default_originate_2links.py ++++ b/tests/topotests/bgp_default_originate/test_bgp_default_originate_2links.py +@@ -269,21 +269,21 @@ def verify_the_uptime(time_stamp_before, time_stamp_after, incremented=None): + if incremented == True: + if uptime_before < uptime_after: + logger.info( +- " The Uptime [{}] is incremented than [{}].......PASSED ".format( ++ " The Uptime before [{}] is less than [{}].......PASSED ".format( + time_stamp_before, time_stamp_after + ) + ) + return True + else: + logger.error( +- " The Uptime [{}] is expected to be incremented than [{}].......FAILED ".format( ++ " The Uptime before [{}] is greater than the uptime after [{}].......FAILED ".format( + time_stamp_before, time_stamp_after + ) + ) + return False + else: + logger.info( +- " The Uptime [{}] is not incremented than [{}] ".format( ++ " The Uptime before [{}] the same as after [{}] ".format( + time_stamp_before, time_stamp_after + ) + ) +@@ -1031,7 +1031,7 @@ def test_verify_bgp_default_originate_with_default_static_route_p1(request): + result = verify_the_uptime(uptime_before_ipv6, uptime_after_ipv6, incremented=False) + assert result is True, "Testcase {} : Failed Error: {}".format(tc_name, result) + +- step("Taking uptime snapshot before removing redisctribute static ") ++ step("Taking uptime snapshot before removing redistribute static") + uptime_before_ipv4 = get_rib_route_uptime(tgen, "ipv4", "r2", ipv4_uptime_dict) + uptime_before_ipv6 = get_rib_route_uptime(tgen, "ipv6", "r2", ipv6_uptime_dict) + sleep(1) +@@ -1078,6 +1078,7 @@ def test_verify_bgp_default_originate_with_default_static_route_p1(request): + ) + assert result is True, "Testcase {} : Failed Error: {}".format(tc_name, result) + ++ step("Now look that the route is not pointed at link2") + result = verify_rib_default_route( + tgen, + topo, +@@ -1097,7 +1098,7 @@ def test_verify_bgp_default_originate_with_default_static_route_p1(request): + ) + assert result is not True, "Testcase {} : Failed Error: {}".format(tc_name, result) + +- step("Taking uptime snapshot before removing redisctribute static ") ++ step("Taking uptime snapshot after removing redistribute static") + uptime_after_ipv4 = get_rib_route_uptime(tgen, "ipv4", "r2", ipv4_uptime_dict) + uptime_after_ipv6 = get_rib_route_uptime(tgen, "ipv6", "r2", ipv6_uptime_dict) + +-- +2.43.2 + + +From 7908175a589b41be062295d9e369a9a4168426ee Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 30 Sep 2024 15:09:42 -0400 +Subject: [PATCH 4/5] bgpd: Cleanup multipath figuring out in bgp + +Currently bgp multipath has these properties: + +a) mp_info may or may not be on a single path, based +upon path perturbations in the past. +b) mp_info->count started counting at 0( meaning 1 ). As that the +bestpath path_info was never included in the count +c) The first mp_info in the list held the multipath data associated +with the multipath. As such if you were at any other node that data +was not filled in. +d) As such the mp_info's that are not first on the list basically +were just pointers to the corresponding bgp_path_info that was in +the multipath. +e) On bestpath calculation, a linklist(struct linklist *) of bgp_path_info's was +created. +f) This linklist was passed in to a comparison function that took the +old mpinfo list and compared it item by item to the linklist and +doing magic to figure out how to create a new mp_info list. +g) the old mp_info and the link list had to be memory managed and +freed up. +h) BGP_PATH_MULTIPATH is only set on non bestpath nodes in the +multipath. + +This is really complicated. Let's change the algorithm to this: + +a) When running bestpath, mark a bgp_path_info node that could be in the ecmp path as +BGP_PATH_MULTIPATH_NEW. +b) When running multipath, just walk the list of bgp_path_info's and if +it has BGP_PATH_MULTIPATH_NEW on it, decide if it is in BGP_MULTIPATH. +If we run out of space to put in the ecmp, clear the flag on the rest. +c) Clean up the counting of sometimes adding 1 to the mpath count. +d) Only allocate a mpath_info node for the bestpath. Clean it up +when done with it. +e) remove the unneeded list management associated with the linklist and +the mp_list. + +This greatly simplifies multipath computation for bgp and reduces memory +load for large scale deployments. + +2 full feeds in work_queue_run prior: + + 0 56367.471 1123 50193 493695 50362 493791 0 0 0 TE work_queue_run + +BGP multipath info : 1941844 48 110780992 1941844 110780992 + +2 full feeds in work_queue_run after change: + + 1 52924.931 1296 40837 465968 41025 487390 0 0 1 TE work_queue_run + +BGP multipath info : 970860 32 38836880 970866 38837120 + +Aproximately 4 seconds of saved cpu time for convergence and ~75 mb +smaller run time. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_mpath.c | 390 +++++++++---------------------- + bgpd/bgp_mpath.h | 18 +- + bgpd/bgp_route.c | 21 +- + bgpd/bgp_route.h | 14 ++ + bgpd/bgp_routemap.c | 2 +- + tests/bgpd/subdir.am | 11 - + tests/bgpd/test_mpath.c | 482 --------------------------------------- + tests/bgpd/test_mpath.py | 10 - + 8 files changed, 135 insertions(+), 813 deletions(-) + delete mode 100644 tests/bgpd/test_mpath.c + delete mode 100644 tests/bgpd/test_mpath.py + +diff --git a/bgpd/bgp_mpath.c b/bgpd/bgp_mpath.c +index 635bb5236d..27c179380d 100644 +--- a/bgpd/bgp_mpath.c ++++ b/bgpd/bgp_mpath.c +@@ -2,8 +2,10 @@ + /* + * BGP Multipath + * Copyright (C) 2010 Google Inc. ++ * 2024 Nvidia Corporation ++ * Donald Sharp + * +- * This file is part of Quagga ++ * This file is part of FRR + */ + + #include +@@ -186,78 +188,6 @@ int bgp_path_info_nexthop_cmp(struct bgp_path_info *bpi1, + return compare; + } + +-/* +- * bgp_path_info_mpath_cmp +- * +- * This function determines our multipath list ordering. By ordering +- * the list we can deterministically select which paths are included +- * in the multipath set. The ordering also helps in detecting changes +- * in the multipath selection so we can detect whether to send an +- * update to zebra. +- * +- * The order of paths is determined first by received nexthop, and then +- * by peer address if the nexthops are the same. +- */ +-static int bgp_path_info_mpath_cmp(void *val1, void *val2) +-{ +- struct bgp_path_info *bpi1, *bpi2; +- int compare; +- +- bpi1 = val1; +- bpi2 = val2; +- +- compare = bgp_path_info_nexthop_cmp(bpi1, bpi2); +- +- if (!compare) { +- if (!bpi1->peer->su_remote && !bpi2->peer->su_remote) +- compare = 0; +- else if (!bpi1->peer->su_remote) +- compare = 1; +- else if (!bpi2->peer->su_remote) +- compare = -1; +- else +- compare = sockunion_cmp(bpi1->peer->su_remote, +- bpi2->peer->su_remote); +- } +- +- return compare; +-} +- +-/* +- * bgp_mp_list_init +- * +- * Initialize the mp_list, which holds the list of multipaths +- * selected by bgp_best_selection +- */ +-void bgp_mp_list_init(struct list *mp_list) +-{ +- assert(mp_list); +- memset(mp_list, 0, sizeof(struct list)); +- mp_list->cmp = bgp_path_info_mpath_cmp; +-} +- +-/* +- * bgp_mp_list_clear +- * +- * Clears all entries out of the mp_list +- */ +-void bgp_mp_list_clear(struct list *mp_list) +-{ +- assert(mp_list); +- list_delete_all_node(mp_list); +-} +- +-/* +- * bgp_mp_list_add +- * +- * Adds a multipath entry to the mp_list +- */ +-void bgp_mp_list_add(struct list *mp_list, struct bgp_path_info *mpinfo) +-{ +- assert(mp_list && mpinfo); +- listnode_add_sort(mp_list, mpinfo); +-} +- + /* + * bgp_path_info_mpath_new + * +@@ -270,6 +200,7 @@ static struct bgp_path_info_mpath *bgp_path_info_mpath_new(void) + new_mpath = XCALLOC(MTYPE_BGP_MPATH_INFO, + sizeof(struct bgp_path_info_mpath)); + ++ new_mpath->mp_count = 1; + return new_mpath; + } + +@@ -283,6 +214,8 @@ void bgp_path_info_mpath_free(struct bgp_path_info_mpath **mpath) + if (mpath && *mpath) { + if ((*mpath)->mp_attr) + bgp_attr_unintern(&(*mpath)->mp_attr); ++ (*mpath)->mp_attr = NULL; ++ + XFREE(MTYPE_BGP_MPATH_INFO, *mpath); + } + } +@@ -309,31 +242,6 @@ bgp_path_info_mpath_get(struct bgp_path_info *path) + return path->mpath; + } + +-/* +- * bgp_path_info_mpath_enqueue +- * +- * Enqueue a path onto the multipath list given the previous multipath +- * list entry +- */ +-static void bgp_path_info_mpath_enqueue(struct bgp_path_info *prev_info, +- struct bgp_path_info *path) +-{ +- struct bgp_path_info_mpath *prev, *mpath; +- +- prev = bgp_path_info_mpath_get(prev_info); +- mpath = bgp_path_info_mpath_get(path); +- if (!prev || !mpath) +- return; +- +- mpath->mp_next = prev->mp_next; +- mpath->mp_prev = prev; +- if (prev->mp_next) +- prev->mp_next->mp_prev = mpath; +- prev->mp_next = mpath; +- +- SET_FLAG(path->flags, BGP_PATH_MULTIPATH); +-} +- + /* + * bgp_path_info_mpath_dequeue + * +@@ -342,14 +250,9 @@ static void bgp_path_info_mpath_enqueue(struct bgp_path_info *prev_info, + void bgp_path_info_mpath_dequeue(struct bgp_path_info *path) + { + struct bgp_path_info_mpath *mpath = path->mpath; ++ + if (!mpath) + return; +- if (mpath->mp_prev) +- mpath->mp_prev->mp_next = mpath->mp_next; +- if (mpath->mp_next) +- mpath->mp_next->mp_prev = mpath->mp_prev; +- mpath->mp_next = mpath->mp_prev = NULL; +- UNSET_FLAG(path->flags, BGP_PATH_MULTIPATH); + } + + /* +@@ -359,9 +262,16 @@ void bgp_path_info_mpath_dequeue(struct bgp_path_info *path) + */ + struct bgp_path_info *bgp_path_info_mpath_next(struct bgp_path_info *path) + { +- if (!path->mpath || !path->mpath->mp_next) +- return NULL; +- return path->mpath->mp_next->mp_info; ++ path = path->next; ++ ++ while (path) { ++ if (CHECK_FLAG(path->flags, BGP_PATH_MULTIPATH)) ++ return path; ++ ++ path = path->next; ++ } ++ ++ return NULL; + } + + /* +@@ -382,7 +292,8 @@ struct bgp_path_info *bgp_path_info_mpath_first(struct bgp_path_info *path) + uint32_t bgp_path_info_mpath_count(struct bgp_path_info *path) + { + if (!path->mpath) +- return 0; ++ return 1; ++ + return path->mpath->mp_count; + } + +@@ -511,58 +422,51 @@ static void bgp_path_info_mpath_attr_set(struct bgp_path_info *path, + /* + * bgp_path_info_mpath_update + * +- * Compare and sync up the multipath list with the mp_list generated by +- * bgp_best_selection ++ * Compare and sync up the multipath flags with what was choosen ++ * in best selection + */ + void bgp_path_info_mpath_update(struct bgp *bgp, struct bgp_dest *dest, +- struct bgp_path_info *new_best, +- struct bgp_path_info *old_best, +- struct list *mp_list, +- struct bgp_maxpaths_cfg *mpath_cfg) ++ struct bgp_path_info *new_best, struct bgp_path_info *old_best, ++ uint32_t num_candidates, struct bgp_maxpaths_cfg *mpath_cfg) + { + uint16_t maxpaths, mpath_count, old_mpath_count; + uint32_t bwval; + uint64_t cum_bw, old_cum_bw; +- struct listnode *mp_node, *mp_next_node; +- struct bgp_path_info *cur_mpath, *new_mpath, *next_mpath, *prev_mpath; +- int mpath_changed, debug; ++ struct bgp_path_info *cur_iterator = NULL; ++ bool mpath_changed, debug; + bool all_paths_lb; + char path_buf[PATH_ADDPATH_STR_BUFFER]; ++ bool old_mpath, new_mpath; + +- mpath_changed = 0; ++ mpath_changed = false; + maxpaths = multipath_num; + mpath_count = 0; +- cur_mpath = NULL; + old_mpath_count = 0; + old_cum_bw = cum_bw = 0; +- prev_mpath = new_best; +- mp_node = listhead(mp_list); + debug = bgp_debug_bestpath(dest); + +- if (new_best) { +- mpath_count++; +- if (new_best != old_best) +- bgp_path_info_mpath_dequeue(new_best); +- maxpaths = (new_best->peer->sort == BGP_PEER_IBGP) +- ? mpath_cfg->maxpaths_ibgp +- : mpath_cfg->maxpaths_ebgp; +- } +- + if (old_best) { +- cur_mpath = bgp_path_info_mpath_first(old_best); + old_mpath_count = bgp_path_info_mpath_count(old_best); ++ if (old_mpath_count == 1) ++ SET_FLAG(old_best->flags, BGP_PATH_MULTIPATH); + old_cum_bw = bgp_path_info_mpath_cumbw(old_best); + bgp_path_info_mpath_count_set(old_best, 0); + bgp_path_info_mpath_lb_update(old_best, false, false, 0); +- bgp_path_info_mpath_dequeue(old_best); ++ bgp_path_info_mpath_free(&old_best->mpath); ++ old_best->mpath = NULL; ++ } ++ ++ if (new_best) { ++ maxpaths = (new_best->peer->sort == BGP_PEER_IBGP) ? mpath_cfg->maxpaths_ibgp ++ : mpath_cfg->maxpaths_ebgp; ++ cur_iterator = new_best; + } + + if (debug) +- zlog_debug("%pBD(%s): starting mpath update, newbest %s num candidates %d old-mpath-count %d old-cum-bw %" PRIu64, +- dest, bgp->name_pretty, +- new_best ? new_best->peer->host : "NONE", +- mp_list ? listcount(mp_list) : 0, old_mpath_count, +- old_cum_bw); ++ zlog_debug("%pBD(%s): starting mpath update, newbest %s num candidates %d old-mpath-count %d old-cum-bw %" PRIu64 ++ " maxpaths set %u", ++ dest, bgp->name_pretty, new_best ? new_best->peer->host : "NONE", ++ num_candidates, old_mpath_count, old_cum_bw, maxpaths); + + /* + * We perform an ordered walk through both lists in parallel. +@@ -576,173 +480,100 @@ void bgp_path_info_mpath_update(struct bgp *bgp, struct bgp_dest *dest, + * to skip over it + */ + all_paths_lb = true; /* We'll reset if any path doesn't have LB. */ +- while (mp_node || cur_mpath) { +- struct bgp_path_info *tmp_info; + ++ while (cur_iterator) { ++ old_mpath = CHECK_FLAG(cur_iterator->flags, BGP_PATH_MULTIPATH); ++ new_mpath = CHECK_FLAG(cur_iterator->flags, BGP_PATH_MULTIPATH_NEW); ++ ++ UNSET_FLAG(cur_iterator->flags, BGP_PATH_MULTIPATH_NEW); + /* +- * We can bail out of this loop if all existing paths on the +- * multipath list have been visited (for cleanup purposes) and +- * the maxpath requirement is fulfulled ++ * If the current mpath count is equal to the number of ++ * maxpaths that can be used then we can bail, after ++ * we clean up the flags associated with the rest of the ++ * bestpaths + */ +- if (!cur_mpath && (mpath_count >= maxpaths)) ++ if (mpath_count >= maxpaths) { ++ while (cur_iterator) { ++ UNSET_FLAG(cur_iterator->flags, BGP_PATH_MULTIPATH); ++ UNSET_FLAG(cur_iterator->flags, BGP_PATH_MULTIPATH_NEW); ++ ++ cur_iterator = cur_iterator->next; ++ } + break; + +- mp_next_node = mp_node ? listnextnode(mp_node) : NULL; +- next_mpath = +- cur_mpath ? bgp_path_info_mpath_next(cur_mpath) : NULL; +- tmp_info = mp_node ? listgetdata(mp_node) : NULL; ++ if (debug) ++ zlog_debug("%pBD(%s): Mpath count %u is equal to maximum paths allowed, finished comparision for MPATHS", ++ dest, bgp->name_pretty, mpath_count); ++ } + + if (debug) +- zlog_debug("%pBD(%s): comparing candidate %s with existing mpath %s", +- dest, bgp->name_pretty, +- tmp_info ? tmp_info->peer->host : "NONE", +- cur_mpath ? cur_mpath->peer->host : "NONE"); +- ++ zlog_debug("%pBD(%s): Candidate %s old_mpath: %u new_mpath: %u, Nexthop %pI4 current mpath count: %u", ++ dest, bgp->name_pretty, cur_iterator->peer->host, old_mpath, ++ new_mpath, &cur_iterator->attr->nexthop, mpath_count); + /* +- * If equal, the path was a multipath and is still a multipath. +- * Insert onto new multipath list if maxpaths allows. ++ * There is nothing to do if the cur_iterator is neither a old path ++ * or a new path + */ +- if (mp_node && (listgetdata(mp_node) == cur_mpath)) { +- list_delete_node(mp_list, mp_node); +- bgp_path_info_mpath_dequeue(cur_mpath); +- if ((mpath_count < maxpaths) +- && prev_mpath +- && bgp_path_info_nexthop_cmp(prev_mpath, +- cur_mpath)) { +- bgp_path_info_mpath_enqueue(prev_mpath, +- cur_mpath); +- prev_mpath = cur_mpath; +- mpath_count++; +- if (ecommunity_linkbw_present( +- bgp_attr_get_ecommunity( +- cur_mpath->attr), +- &bwval)) +- cum_bw += bwval; +- else +- all_paths_lb = false; +- if (debug) { +- bgp_path_info_path_with_addpath_rx_str( +- cur_mpath, path_buf, +- sizeof(path_buf)); +- zlog_debug("%pBD: %s is still multipath, cur count %d", +- dest, path_buf, mpath_count); +- } +- } else { +- mpath_changed = 1; +- if (debug) { +- bgp_path_info_path_with_addpath_rx_str( +- cur_mpath, path_buf, +- sizeof(path_buf)); +- zlog_debug("%pBD: remove mpath %s nexthop %pI4, cur count %d", +- dest, path_buf, +- &cur_mpath->attr->nexthop, +- mpath_count); +- } +- } +- mp_node = mp_next_node; +- cur_mpath = next_mpath; ++ if (!old_mpath && !new_mpath) { ++ UNSET_FLAG(cur_iterator->flags, BGP_PATH_MULTIPATH); ++ cur_iterator = cur_iterator->next; + continue; + } + +- if (cur_mpath +- && (!mp_node +- || (bgp_path_info_mpath_cmp(cur_mpath, +- listgetdata(mp_node)) +- < 0))) { +- /* +- * If here, we have an old multipath and either the +- * mp_list +- * is finished or the next mp_node points to a later +- * multipath, so we need to purge this path from the +- * multipath list +- */ +- bgp_path_info_mpath_dequeue(cur_mpath); +- mpath_changed = 1; ++ if (new_mpath) { ++ mpath_count++; ++ ++ if (cur_iterator != new_best) ++ SET_FLAG(cur_iterator->flags, BGP_PATH_MULTIPATH); ++ ++ if (!old_mpath) ++ mpath_changed = true; ++ ++ if (ecommunity_linkbw_present(bgp_attr_get_ecommunity(cur_iterator->attr), ++ &bwval) || ++ ecommunity_linkbw_present(bgp_attr_get_ipv6_ecommunity( ++ cur_iterator->attr), ++ &bwval)) ++ cum_bw += bwval; ++ else ++ all_paths_lb = false; ++ + if (debug) { +- bgp_path_info_path_with_addpath_rx_str( +- cur_mpath, path_buf, sizeof(path_buf)); +- zlog_debug("%pBD: remove mpath %s nexthop %pI4, cur count %d", +- dest, path_buf, +- &cur_mpath->attr->nexthop, +- mpath_count); ++ bgp_path_info_path_with_addpath_rx_str(cur_iterator, path_buf, ++ sizeof(path_buf)); ++ zlog_debug("%pBD: add mpath %s nexthop %pI4, cur count %d cum_bw: %" PRIu64 ++ " all_paths_lb: %u", ++ dest, path_buf, &cur_iterator->attr->nexthop, ++ mpath_count, cum_bw, all_paths_lb); + } +- cur_mpath = next_mpath; + } else { + /* +- * If here, we have a path on the mp_list that was not +- * previously +- * a multipath (due to non-equivalance or maxpaths +- * exceeded), +- * or the matching multipath is sorted later in the +- * multipath +- * list. Before we enqueue the path on the new multipath +- * list, +- * make sure its not on the old_best multipath list or +- * referenced +- * via next_mpath: +- * - If next_mpath points to this new path, update +- * next_mpath to +- * point to the multipath after this one +- * - Dequeue the path from the multipath list just to +- * make sure ++ * We know that old_mpath is true and new_mpath is false in this path + */ +- new_mpath = listgetdata(mp_node); +- list_delete_node(mp_list, mp_node); +- assert(new_mpath); +- assert(prev_mpath); +- if ((mpath_count < maxpaths) && (new_mpath != new_best) +- && bgp_path_info_nexthop_cmp(prev_mpath, +- new_mpath)) { +- bgp_path_info_mpath_dequeue(new_mpath); +- +- bgp_path_info_mpath_enqueue(prev_mpath, +- new_mpath); +- prev_mpath = new_mpath; +- mpath_changed = 1; +- mpath_count++; +- if (ecommunity_linkbw_present( +- bgp_attr_get_ecommunity( +- new_mpath->attr), +- &bwval)) +- cum_bw += bwval; +- else +- all_paths_lb = false; +- if (debug) { +- bgp_path_info_path_with_addpath_rx_str( +- new_mpath, path_buf, +- sizeof(path_buf)); +- zlog_debug("%pBD: add mpath %s nexthop %pI4, cur count %d", +- dest, path_buf, +- &new_mpath->attr->nexthop, +- mpath_count); +- } +- } +- mp_node = mp_next_node; ++ mpath_changed = true; ++ UNSET_FLAG(cur_iterator->flags, BGP_PATH_MULTIPATH); + } ++ ++ cur_iterator = cur_iterator->next; + } + + if (new_best) { +- bgp_path_info_mpath_count_set(new_best, mpath_count - 1); +- if (mpath_count <= 1 || +- !ecommunity_linkbw_present( +- bgp_attr_get_ecommunity(new_best->attr), &bwval)) +- all_paths_lb = false; +- else +- cum_bw += bwval; +- bgp_path_info_mpath_lb_update(new_best, true, +- all_paths_lb, cum_bw); +- ++ if (mpath_count > 1 || new_best->mpath) { ++ bgp_path_info_mpath_count_set(new_best, mpath_count); ++ bgp_path_info_mpath_lb_update(new_best, true, all_paths_lb, cum_bw); ++ } + if (debug) + zlog_debug("%pBD(%s): New mpath count (incl newbest) %d mpath-change %s all_paths_lb %d cum_bw %" PRIu64, + dest, bgp->name_pretty, mpath_count, + mpath_changed ? "YES" : "NO", all_paths_lb, + cum_bw); + ++ if (mpath_count == 1) ++ UNSET_FLAG(new_best->flags, BGP_PATH_MULTIPATH); + if (mpath_changed + || (bgp_path_info_mpath_count(new_best) != old_mpath_count)) + SET_FLAG(new_best->flags, BGP_PATH_MULTIPATH_CHG); +- if ((mpath_count - 1) != old_mpath_count || +- old_cum_bw != cum_bw) ++ if ((mpath_count) != old_mpath_count || old_cum_bw != cum_bw) + SET_FLAG(new_best->flags, BGP_PATH_LINK_BW_CHG); + } + } +@@ -755,20 +586,13 @@ void bgp_path_info_mpath_update(struct bgp *bgp, struct bgp_dest *dest, + */ + void bgp_mp_dmed_deselect(struct bgp_path_info *dmed_best) + { +- struct bgp_path_info *mpinfo, *mpnext; +- + if (!dmed_best) + return; + +- for (mpinfo = bgp_path_info_mpath_first(dmed_best); mpinfo; +- mpinfo = mpnext) { +- mpnext = bgp_path_info_mpath_next(mpinfo); +- bgp_path_info_mpath_dequeue(mpinfo); +- } +- + bgp_path_info_mpath_count_set(dmed_best, 0); + UNSET_FLAG(dmed_best->flags, BGP_PATH_MULTIPATH_CHG); + UNSET_FLAG(dmed_best->flags, BGP_PATH_LINK_BW_CHG); ++ + assert(bgp_path_info_mpath_first(dmed_best) == NULL); + } + +@@ -806,7 +630,7 @@ void bgp_path_info_mpath_aggregate_update(struct bgp_path_info *new_best, + if (!new_best) + return; + +- if (!bgp_path_info_mpath_count(new_best)) { ++ if (bgp_path_info_mpath_count(new_best) == 1) { + if ((new_attr = bgp_path_info_mpath_attr(new_best))) { + bgp_attr_unintern(&new_attr); + bgp_path_info_mpath_attr_set(new_best, NULL); +diff --git a/bgpd/bgp_mpath.h b/bgpd/bgp_mpath.h +index 267d729e06..a7107deb0e 100644 +--- a/bgpd/bgp_mpath.h ++++ b/bgpd/bgp_mpath.h +@@ -2,8 +2,9 @@ + /* + * BGP Multipath + * Copyright (C) 2010 Google Inc. ++ * 2024 Nvidia Corporation + * +- * This file is part of Quagga ++ * This file is part of FRR + */ + + #ifndef _FRR_BGP_MPATH_H +@@ -13,12 +14,6 @@ + * multipath selections, lazily allocated to save memory + */ + struct bgp_path_info_mpath { +- /* Points to the first multipath (on bestpath) or the next multipath */ +- struct bgp_path_info_mpath *mp_next; +- +- /* Points to the previous multipath or NULL on bestpath */ +- struct bgp_path_info_mpath *mp_prev; +- + /* Points to bgp_path_info associated with this multipath info */ + struct bgp_path_info *mp_info; + +@@ -50,16 +45,11 @@ extern int bgp_maximum_paths_unset(struct bgp *bgp, afi_t afi, safi_t safi, + /* Functions used by bgp_best_selection to record current + * multipath selections + */ +-extern int bgp_path_info_nexthop_cmp(struct bgp_path_info *bpi1, +- struct bgp_path_info *bpi2); +-extern void bgp_mp_list_init(struct list *mp_list); +-extern void bgp_mp_list_clear(struct list *mp_list); +-extern void bgp_mp_list_add(struct list *mp_list, struct bgp_path_info *mpinfo); ++extern int bgp_path_info_nexthop_cmp(struct bgp_path_info *bpi1, struct bgp_path_info *bpi2); + extern void bgp_mp_dmed_deselect(struct bgp_path_info *dmed_best); + extern void bgp_path_info_mpath_update(struct bgp *bgp, struct bgp_dest *dest, + struct bgp_path_info *new_best, +- struct bgp_path_info *old_best, +- struct list *mp_list, ++ struct bgp_path_info *old_best, uint32_t num_candidates, + struct bgp_maxpaths_cfg *mpath_cfg); + extern void + bgp_path_info_mpath_aggregate_update(struct bgp_path_info *new_best, +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index c33b14675a..12cbf66341 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -2143,8 +2143,7 @@ bool subgroup_announce_check(struct bgp_dest *dest, struct bgp_path_info *pi, + from = pi->peer; + filter = &peer->filter[afi][safi]; + bgp = SUBGRP_INST(subgrp); +- piattr = bgp_path_info_mpath_count(pi) ? bgp_path_info_mpath_attr(pi) +- : pi->attr; ++ piattr = bgp_path_info_mpath_count(pi) > 1 ? bgp_path_info_mpath_attr(pi) : pi->attr; + + if (CHECK_FLAG(peer->af_flags[afi][safi], PEER_FLAG_MAX_PREFIX_OUT) && + peer->pmax_out[afi][safi] != 0 && +@@ -2812,13 +2811,12 @@ void bgp_best_selection(struct bgp *bgp, struct bgp_dest *dest, + struct bgp_path_info *pi2; + int paths_eq, do_mpath; + bool debug, any_comparisons; +- struct list mp_list; + char pfx_buf[PREFIX2STR_BUFFER] = {}; + char path_buf[PATH_ADDPATH_STR_BUFFER]; + enum bgp_path_selection_reason reason = bgp_path_selection_none; + bool unsorted_items = true; ++ uint32_t num_candidates = 0; + +- bgp_mp_list_init(&mp_list); + do_mpath = + (mpath_cfg->maxpaths_ebgp > 1 || mpath_cfg->maxpaths_ibgp > 1); + +@@ -3202,7 +3200,8 @@ void bgp_best_selection(struct bgp *bgp, struct bgp_dest *dest, + "%pBD(%s): %s is the bestpath, add to the multipath list", + dest, bgp->name_pretty, + path_buf); +- bgp_mp_list_add(&mp_list, pi); ++ SET_FLAG(pi->flags, BGP_PATH_MULTIPATH_NEW); ++ num_candidates++; + continue; + } + +@@ -3234,15 +3233,14 @@ void bgp_best_selection(struct bgp *bgp, struct bgp_dest *dest, + "%pBD(%s): %s is equivalent to the bestpath, add to the multipath list", + dest, bgp->name_pretty, + path_buf); +- bgp_mp_list_add(&mp_list, pi); ++ SET_FLAG(pi->flags, BGP_PATH_MULTIPATH_NEW); ++ num_candidates++; + } + } + } + +- bgp_path_info_mpath_update(bgp, dest, new_select, old_select, &mp_list, +- mpath_cfg); ++ bgp_path_info_mpath_update(bgp, dest, new_select, old_select, num_candidates, mpath_cfg); + bgp_path_info_mpath_aggregate_update(new_select, old_select); +- bgp_mp_list_clear(&mp_list); + + bgp_addpath_update_ids(bgp, dest, afi, safi); + +@@ -11057,9 +11055,8 @@ void route_vty_out_detail(struct vty *vty, struct bgp *bgp, struct bgp_dest *bn, + vty_out(vty, ", otc %u", attr->otc); + } + +- if (CHECK_FLAG(path->flags, BGP_PATH_MULTIPATH) +- || (CHECK_FLAG(path->flags, BGP_PATH_SELECTED) +- && bgp_path_info_mpath_count(path))) { ++ if (CHECK_FLAG(path->flags, BGP_PATH_MULTIPATH) || ++ (CHECK_FLAG(path->flags, BGP_PATH_SELECTED) && bgp_path_info_mpath_count(path) > 1)) { + if (json_paths) + json_object_boolean_true_add(json_path, "multipath"); + else +diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h +index e6f7b9da36..61492980ad 100644 +--- a/bgpd/bgp_route.h ++++ b/bgpd/bgp_route.h +@@ -319,6 +319,11 @@ struct bgp_path_info { + #define BGP_PATH_STALE (1 << 8) + #define BGP_PATH_REMOVED (1 << 9) + #define BGP_PATH_COUNTED (1 << 10) ++/* ++ * A BGP_PATH_MULTIPATH flag is not set on the best path ++ * it is set on every other node that is part of ECMP ++ * for that particular dest ++ */ + #define BGP_PATH_MULTIPATH (1 << 11) + #define BGP_PATH_MULTIPATH_CHG (1 << 12) + #define BGP_PATH_RIB_ATTR_CHG (1 << 13) +@@ -328,6 +333,15 @@ struct bgp_path_info { + #define BGP_PATH_MPLSVPN_LABEL_NH (1 << 17) + #define BGP_PATH_MPLSVPN_NH_LABEL_BIND (1 << 18) + #define BGP_PATH_UNSORTED (1 << 19) ++/* ++ * BGP_PATH_MULTIPATH_NEW is set on those bgp_path_info ++ * nodes that we have decided should possibly be in the ++ * ecmp path for a particular dest. This flag is ++ * removed when the bgp_path_info's are looked at to ++ * decide on whether or not a bgp_path_info is on ++ * the actual ecmp path. ++ */ ++#define BGP_PATH_MULTIPATH_NEW (1 << 20) + + /* BGP route type. This can be static, RIP, OSPF, BGP etc. */ + uint8_t type; +diff --git a/bgpd/bgp_routemap.c b/bgpd/bgp_routemap.c +index dbc3d6445d..7f0abd062b 100644 +--- a/bgpd/bgp_routemap.c ++++ b/bgpd/bgp_routemap.c +@@ -3238,7 +3238,7 @@ route_set_ecommunity_lb(void *rule, const struct prefix *prefix, void *object) + return RMAP_OKAY; + + bw_bytes = ((uint64_t)peer->bgp->lb_ref_bw * 1000 * 1000) / 8; +- mpath_count = bgp_path_info_mpath_count(path) + 1; ++ mpath_count = bgp_path_info_mpath_count(path); + bw_bytes *= mpath_count; + } + +diff --git a/tests/bgpd/subdir.am b/tests/bgpd/subdir.am +index 5148e7e440..97845ec1aa 100644 +--- a/tests/bgpd/subdir.am ++++ b/tests/bgpd/subdir.am +@@ -52,17 +52,6 @@ tests_bgpd_test_mp_attr_LDADD = $(BGP_TEST_LDADD) + tests_bgpd_test_mp_attr_SOURCES = tests/bgpd/test_mp_attr.c + EXTRA_DIST += tests/bgpd/test_mp_attr.py + +- +-if BGPD +-check_PROGRAMS += tests/bgpd/test_mpath +-endif +-tests_bgpd_test_mpath_CFLAGS = $(TESTS_CFLAGS) +-tests_bgpd_test_mpath_CPPFLAGS = $(TESTS_CPPFLAGS) +-tests_bgpd_test_mpath_LDADD = $(BGP_TEST_LDADD) +-tests_bgpd_test_mpath_SOURCES = tests/bgpd/test_mpath.c +-EXTRA_DIST += tests/bgpd/test_mpath.py +- +- + if BGPD + check_PROGRAMS += tests/bgpd/test_packet + endif +diff --git a/tests/bgpd/test_mpath.c b/tests/bgpd/test_mpath.c +deleted file mode 100644 +index ebbe3ac3e2..0000000000 +--- a/tests/bgpd/test_mpath.c ++++ /dev/null +@@ -1,482 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-or-later +-/* +- * BGP Multipath Unit Test +- * Copyright (C) 2010 Google Inc. +- * +- * This file is part of Quagga +- */ +- +-#include +- +-#include "qobj.h" +-#include "vty.h" +-#include "stream.h" +-#include "privs.h" +-#include "linklist.h" +-#include "memory.h" +-#include "zclient.h" +-#include "queue.h" +-#include "filter.h" +- +-#include "bgpd/bgpd.h" +-#include "bgpd/bgp_table.h" +-#include "bgpd/bgp_route.h" +-#include "bgpd/bgp_attr.h" +-#include "bgpd/bgp_nexthop.h" +-#include "bgpd/bgp_mpath.h" +-#include "bgpd/bgp_evpn.h" +-#include "bgpd/bgp_network.h" +- +-#define VT100_RESET "\x1b[0m" +-#define VT100_RED "\x1b[31m" +-#define VT100_GREEN "\x1b[32m" +-#define VT100_YELLOW "\x1b[33m" +-#define OK VT100_GREEN "OK" VT100_RESET +-#define FAILED VT100_RED "failed" VT100_RESET +- +-#define TEST_PASSED 0 +-#define TEST_FAILED -1 +- +-#define EXPECT_TRUE(expr, res) \ +- if (!(expr)) { \ +- printf("Test failure in %s line %u: %s\n", __func__, __LINE__, \ +- #expr); \ +- (res) = TEST_FAILED; \ +- } +- +-typedef struct testcase_t__ testcase_t; +- +-typedef int (*test_setup_func)(testcase_t *); +-typedef int (*test_run_func)(testcase_t *); +-typedef int (*test_cleanup_func)(testcase_t *); +- +-struct testcase_t__ { +- const char *desc; +- void *test_data; +- void *verify_data; +- void *tmp_data; +- test_setup_func setup; +- test_run_func run; +- test_cleanup_func cleanup; +-}; +- +-/* need these to link in libbgp */ +-struct event_loop *master = NULL; +-extern struct zclient *zclient; +-struct zebra_privs_t bgpd_privs = { +- .user = NULL, +- .group = NULL, +- .vty_group = NULL, +-}; +- +-static int tty = 0; +- +-/* Create fake bgp instance */ +-static struct bgp *bgp_create_fake(as_t *as, const char *name) +-{ +- struct bgp *bgp; +- afi_t afi; +- safi_t safi; +- +- if ((bgp = XCALLOC(MTYPE_BGP, sizeof(struct bgp))) == NULL) +- return NULL; +- +- bgp_lock(bgp); +- // bgp->peer_self = peer_new (bgp); +- // bgp->peer_self->host = XSTRDUP (MTYPE_BGP_PEER_HOST, "Static +- // announcement"); +- +- bgp->peer = list_new(); +- // bgp->peer->cmp = (int (*)(void *, void *)) peer_cmp; +- +- bgp->group = list_new(); +- // bgp->group->cmp = (int (*)(void *, void *)) peer_group_cmp; +- +- bgp_evpn_init(bgp); +- FOREACH_AFI_SAFI (afi, safi) { +- bgp->route[afi][safi] = bgp_table_init(bgp, afi, safi); +- bgp->aggregate[afi][safi] = bgp_table_init(bgp, afi, safi); +- bgp->rib[afi][safi] = bgp_table_init(bgp, afi, safi); +- bgp->maxpaths[afi][safi].maxpaths_ebgp = MULTIPATH_NUM; +- bgp->maxpaths[afi][safi].maxpaths_ibgp = MULTIPATH_NUM; +- } +- +- bgp_scan_init(bgp); +- bgp->default_local_pref = BGP_DEFAULT_LOCAL_PREF; +- bgp->default_holdtime = BGP_DEFAULT_HOLDTIME; +- bgp->default_keepalive = BGP_DEFAULT_KEEPALIVE; +- bgp->restart_time = BGP_DEFAULT_RESTART_TIME; +- bgp->stalepath_time = BGP_DEFAULT_STALEPATH_TIME; +- +- bgp->as = *as; +- +- if (name) +- bgp->name = strdup(name); +- +- return bgp; +-} +- +-/*========================================================= +- * Testcase for maximum-paths configuration +- */ +-static int setup_bgp_cfg_maximum_paths(testcase_t *t) +-{ +- as_t asn = 1; +- t->tmp_data = bgp_create_fake(&asn, NULL); +- if (!t->tmp_data) +- return -1; +- return 0; +-} +- +-static int run_bgp_cfg_maximum_paths(testcase_t *t) +-{ +- afi_t afi; +- safi_t safi; +- struct bgp *bgp; +- int api_result; +- int test_result = TEST_PASSED; +- +- bgp = t->tmp_data; +- FOREACH_AFI_SAFI (afi, safi) { +- /* test bgp_maximum_paths_set */ +- api_result = bgp_maximum_paths_set(bgp, afi, safi, +- BGP_PEER_EBGP, 10, 0); +- EXPECT_TRUE(api_result == 0, test_result); +- api_result = bgp_maximum_paths_set(bgp, afi, safi, +- BGP_PEER_IBGP, 10, 0); +- EXPECT_TRUE(api_result == 0, test_result); +- EXPECT_TRUE(bgp->maxpaths[afi][safi].maxpaths_ebgp == 10, +- test_result); +- EXPECT_TRUE(bgp->maxpaths[afi][safi].maxpaths_ibgp == 10, +- test_result); +- +- /* test bgp_maximum_paths_unset */ +- api_result = +- bgp_maximum_paths_unset(bgp, afi, safi, BGP_PEER_EBGP); +- EXPECT_TRUE(api_result == 0, test_result); +- api_result = +- bgp_maximum_paths_unset(bgp, afi, safi, BGP_PEER_IBGP); +- EXPECT_TRUE(api_result == 0, test_result); +- EXPECT_TRUE((bgp->maxpaths[afi][safi].maxpaths_ebgp +- == MULTIPATH_NUM), +- test_result); +- EXPECT_TRUE((bgp->maxpaths[afi][safi].maxpaths_ibgp +- == MULTIPATH_NUM), +- test_result); +- } +- +- return test_result; +-} +- +-static int cleanup_bgp_cfg_maximum_paths(testcase_t *t) +-{ +- return bgp_delete((struct bgp *)t->tmp_data); +-} +- +-testcase_t test_bgp_cfg_maximum_paths = { +- .desc = "Test bgp maximum-paths config", +- .setup = setup_bgp_cfg_maximum_paths, +- .run = run_bgp_cfg_maximum_paths, +- .cleanup = cleanup_bgp_cfg_maximum_paths, +-}; +- +-/*========================================================= +- * Testcase for bgp_mp_list +- */ +-struct peer test_mp_list_peer[] = { +- {.local_as = 1, .as = 2}, {.local_as = 1, .as = 2}, +- {.local_as = 1, .as = 2}, {.local_as = 1, .as = 2}, +- {.local_as = 1, .as = 2}, +-}; +-int test_mp_list_peer_count = array_size(test_mp_list_peer); +-struct attr test_mp_list_attr[4]; +-struct bgp_path_info test_mp_list_info[] = { +- {.peer = &test_mp_list_peer[0], .attr = &test_mp_list_attr[0]}, +- {.peer = &test_mp_list_peer[1], .attr = &test_mp_list_attr[1]}, +- {.peer = &test_mp_list_peer[2], .attr = &test_mp_list_attr[1]}, +- {.peer = &test_mp_list_peer[3], .attr = &test_mp_list_attr[2]}, +- {.peer = &test_mp_list_peer[4], .attr = &test_mp_list_attr[3]}, +-}; +-int test_mp_list_info_count = array_size(test_mp_list_info); +- +-static int setup_bgp_mp_list(testcase_t *t) +-{ +- test_mp_list_attr[0].nexthop.s_addr = 0x01010101; +- test_mp_list_attr[1].nexthop.s_addr = 0x02020202; +- test_mp_list_attr[2].nexthop.s_addr = 0x03030303; +- test_mp_list_attr[3].nexthop.s_addr = 0x04040404; +- +- if ((test_mp_list_peer[0].su_remote = sockunion_str2su("1.1.1.1")) +- == NULL) +- return -1; +- if ((test_mp_list_peer[1].su_remote = sockunion_str2su("2.2.2.2")) +- == NULL) +- return -1; +- if ((test_mp_list_peer[2].su_remote = sockunion_str2su("3.3.3.3")) +- == NULL) +- return -1; +- if ((test_mp_list_peer[3].su_remote = sockunion_str2su("4.4.4.4")) +- == NULL) +- return -1; +- if ((test_mp_list_peer[4].su_remote = sockunion_str2su("5.5.5.5")) +- == NULL) +- return -1; +- +- return 0; +-} +- +-static int run_bgp_mp_list(testcase_t *t) +-{ +- struct list mp_list; +- struct listnode *mp_node; +- struct bgp_path_info *info; +- int i; +- int test_result = TEST_PASSED; +- bgp_mp_list_init(&mp_list); +- EXPECT_TRUE(listcount(&mp_list) == 0, test_result); +- +- bgp_mp_list_add(&mp_list, &test_mp_list_info[1]); +- bgp_mp_list_add(&mp_list, &test_mp_list_info[4]); +- bgp_mp_list_add(&mp_list, &test_mp_list_info[2]); +- bgp_mp_list_add(&mp_list, &test_mp_list_info[3]); +- bgp_mp_list_add(&mp_list, &test_mp_list_info[0]); +- +- for (i = 0, mp_node = mp_list.head; i < test_mp_list_info_count; +- i++, mp_node = listnextnode(mp_node)) { +- info = listgetdata(mp_node); +- info->lock++; +- EXPECT_TRUE(info == &test_mp_list_info[i], test_result); +- } +- +- bgp_mp_list_clear(&mp_list); +- EXPECT_TRUE(listcount(&mp_list) == 0, test_result); +- +- return test_result; +-} +- +-static int cleanup_bgp_mp_list(testcase_t *t) +-{ +- int i; +- +- for (i = 0; i < test_mp_list_peer_count; i++) +- sockunion_free(test_mp_list_peer[i].su_remote); +- +- return 0; +-} +- +-testcase_t test_bgp_mp_list = { +- .desc = "Test bgp_mp_list", +- .setup = setup_bgp_mp_list, +- .run = run_bgp_mp_list, +- .cleanup = cleanup_bgp_mp_list, +-}; +- +-/*========================================================= +- * Testcase for bgp_path_info_mpath_update +- */ +- +-static struct bgp_dest *dest; +- +-static int setup_bgp_path_info_mpath_update(testcase_t *t) +-{ +- int i; +- struct bgp *bgp; +- struct bgp_table *rt; +- struct prefix p; +- as_t asn = 1; +- +- t->tmp_data = bgp_create_fake(&asn, NULL); +- if (!t->tmp_data) +- return -1; +- +- bgp = t->tmp_data; +- rt = bgp->rib[AFI_IP][SAFI_UNICAST]; +- +- if (!rt) +- return -1; +- +- str2prefix("42.1.1.0/24", &p); +- dest = bgp_node_get(rt, &p); +- +- setup_bgp_mp_list(t); +- for (i = 0; i < test_mp_list_info_count; i++) +- bgp_path_info_add(dest, &test_mp_list_info[i]); +- return 0; +-} +- +-static int run_bgp_path_info_mpath_update(testcase_t *t) +-{ +- struct bgp_path_info *new_best, *old_best, *mpath; +- struct list mp_list; +- struct bgp_maxpaths_cfg mp_cfg = {3, 3}; +- +- int test_result = TEST_PASSED; +- bgp_mp_list_init(&mp_list); +- bgp_mp_list_add(&mp_list, &test_mp_list_info[4]); +- bgp_mp_list_add(&mp_list, &test_mp_list_info[3]); +- bgp_mp_list_add(&mp_list, &test_mp_list_info[0]); +- bgp_mp_list_add(&mp_list, &test_mp_list_info[1]); +- new_best = &test_mp_list_info[3]; +- old_best = NULL; +- bgp_path_info_mpath_update(NULL, dest, new_best, old_best, &mp_list, +- &mp_cfg); +- bgp_mp_list_clear(&mp_list); +- EXPECT_TRUE(bgp_path_info_mpath_count(new_best) == 2, test_result); +- mpath = bgp_path_info_mpath_first(new_best); +- EXPECT_TRUE(mpath == &test_mp_list_info[0], test_result); +- EXPECT_TRUE(CHECK_FLAG(mpath->flags, BGP_PATH_MULTIPATH), test_result); +- mpath = bgp_path_info_mpath_next(mpath); +- EXPECT_TRUE(mpath == &test_mp_list_info[1], test_result); +- EXPECT_TRUE(CHECK_FLAG(mpath->flags, BGP_PATH_MULTIPATH), test_result); +- +- bgp_mp_list_add(&mp_list, &test_mp_list_info[0]); +- bgp_mp_list_add(&mp_list, &test_mp_list_info[1]); +- new_best = &test_mp_list_info[0]; +- old_best = &test_mp_list_info[3]; +- bgp_path_info_mpath_update(NULL, dest, new_best, old_best, &mp_list, +- &mp_cfg); +- bgp_mp_list_clear(&mp_list); +- EXPECT_TRUE(bgp_path_info_mpath_count(new_best) == 1, test_result); +- mpath = bgp_path_info_mpath_first(new_best); +- EXPECT_TRUE(mpath == &test_mp_list_info[1], test_result); +- EXPECT_TRUE(CHECK_FLAG(mpath->flags, BGP_PATH_MULTIPATH), test_result); +- EXPECT_TRUE(!CHECK_FLAG(test_mp_list_info[0].flags, BGP_PATH_MULTIPATH), +- test_result); +- +- return test_result; +-} +- +-static int cleanup_bgp_path_info_mpath_update(testcase_t *t) +-{ +- int i; +- +- for (i = 0; i < test_mp_list_peer_count; i++) +- sockunion_free(test_mp_list_peer[i].su_remote); +- +- return bgp_delete((struct bgp *)t->tmp_data); +-} +- +-testcase_t test_bgp_path_info_mpath_update = { +- .desc = "Test bgp_path_info_mpath_update", +- .setup = setup_bgp_path_info_mpath_update, +- .run = run_bgp_path_info_mpath_update, +- .cleanup = cleanup_bgp_path_info_mpath_update, +-}; +- +-/*========================================================= +- * Set up testcase vector +- */ +-testcase_t *all_tests[] = { +- &test_bgp_cfg_maximum_paths, &test_bgp_mp_list, +- &test_bgp_path_info_mpath_update, +-}; +- +-int all_tests_count = array_size(all_tests); +- +-/*========================================================= +- * Test Driver Functions +- */ +-static int global_test_init(void) +-{ +- qobj_init(); +- master = event_master_create(NULL); +- zclient = zclient_new(master, &zclient_options_default, NULL, 0); +- bgp_master_init(master, BGP_SOCKET_SNDBUF_SIZE, list_new()); +- vrf_init(NULL, NULL, NULL, NULL); +- bgp_option_set(BGP_OPT_NO_LISTEN); +- +- if (fileno(stdout) >= 0) +- tty = isatty(fileno(stdout)); +- return 0; +-} +- +-static int global_test_cleanup(void) +-{ +- if (zclient != NULL) +- zclient_free(zclient); +- event_master_free(master); +- return 0; +-} +- +-static void display_result(testcase_t *test, int result) +-{ +- if (tty) +- printf("%s: %s\n", test->desc, +- result == TEST_PASSED ? OK : FAILED); +- else +- printf("%s: %s\n", test->desc, +- result == TEST_PASSED ? "OK" : "FAILED"); +-} +- +-static int setup_test(testcase_t *t) +-{ +- int res = 0; +- if (t->setup) +- res = t->setup(t); +- return res; +-} +- +-static int cleanup_test(testcase_t *t) +-{ +- int res = 0; +- if (t->cleanup) +- res = t->cleanup(t); +- return res; +-} +- +-static void run_tests(testcase_t *tests[], int num_tests, int *pass_count, +- int *fail_count) +-{ +- int test_index, result; +- testcase_t *cur_test; +- +- *pass_count = *fail_count = 0; +- +- for (test_index = 0; test_index < num_tests; test_index++) { +- cur_test = tests[test_index]; +- if (!cur_test->desc) { +- printf("error: test %d has no description!\n", +- test_index); +- continue; +- } +- if (!cur_test->run) { +- printf("error: test %s has no run function!\n", +- cur_test->desc); +- continue; +- } +- if (setup_test(cur_test) != 0) { +- printf("error: setup failed for test %s\n", +- cur_test->desc); +- continue; +- } +- result = cur_test->run(cur_test); +- if (result == TEST_PASSED) +- *pass_count += 1; +- else +- *fail_count += 1; +- display_result(cur_test, result); +- if (cleanup_test(cur_test) != 0) { +- printf("error: cleanup failed for test %s\n", +- cur_test->desc); +- continue; +- } +- } +-} +- +-int main(void) +-{ +- int pass_count, fail_count; +- time_t cur_time; +- char buf[32]; +- +- time(&cur_time); +- printf("BGP Multipath Tests Run at %s", ctime_r(&cur_time, buf)); +- if (global_test_init() != 0) { +- printf("Global init failed. Terminating.\n"); +- exit(1); +- } +- run_tests(all_tests, all_tests_count, &pass_count, &fail_count); +- global_test_cleanup(); +- printf("Total pass/fail: %d/%d\n", pass_count, fail_count); +- return fail_count; +-} +diff --git a/tests/bgpd/test_mpath.py b/tests/bgpd/test_mpath.py +deleted file mode 100644 +index 582fd25c20..0000000000 +--- a/tests/bgpd/test_mpath.py ++++ /dev/null +@@ -1,10 +0,0 @@ +-import frrtest +- +- +-class TestMpath(frrtest.TestMultiOut): +- program = "./test_mpath" +- +- +-TestMpath.okfail("bgp maximum-paths config") +-TestMpath.okfail("bgp_mp_list") +-TestMpath.okfail("bgp_path_info_mpath_update") +-- +2.43.2 + + +From 6436c4f55ceb2900d8da52411b300338b36125c1 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Tue, 1 Oct 2024 09:18:44 -0400 +Subject: [PATCH 5/5] bgpd: Remove bgp_path_info_mpath_dequeue + +This function is no doing any work. Let's remove. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_mpath.c | 13 ------------- + bgpd/bgp_mpath.h | 1 - + bgpd/bgp_route.c | 4 ---- + 3 files changed, 18 deletions(-) + +diff --git a/bgpd/bgp_mpath.c b/bgpd/bgp_mpath.c +index 27c179380d..b2e0371052 100644 +--- a/bgpd/bgp_mpath.c ++++ b/bgpd/bgp_mpath.c +@@ -242,19 +242,6 @@ bgp_path_info_mpath_get(struct bgp_path_info *path) + return path->mpath; + } + +-/* +- * bgp_path_info_mpath_dequeue +- * +- * Remove a path from the multipath list +- */ +-void bgp_path_info_mpath_dequeue(struct bgp_path_info *path) +-{ +- struct bgp_path_info_mpath *mpath = path->mpath; +- +- if (!mpath) +- return; +-} +- + /* + * bgp_path_info_mpath_next + * +diff --git a/bgpd/bgp_mpath.h b/bgpd/bgp_mpath.h +index a7107deb0e..8201896593 100644 +--- a/bgpd/bgp_mpath.h ++++ b/bgpd/bgp_mpath.h +@@ -56,7 +56,6 @@ bgp_path_info_mpath_aggregate_update(struct bgp_path_info *new_best, + struct bgp_path_info *old_best); + + /* Unlink and free multipath information associated with a bgp_path_info */ +-extern void bgp_path_info_mpath_dequeue(struct bgp_path_info *path); + extern void bgp_path_info_mpath_free(struct bgp_path_info_mpath **mpath); + + /* Walk list of multipaths associated with a best path */ +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index 12cbf66341..774ff068f8 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -502,8 +502,6 @@ struct bgp_dest *bgp_path_info_reap(struct bgp_dest *dest, + else + bgp_dest_set_bgp_path_info(dest, pi->next); + +- bgp_path_info_mpath_dequeue(pi); +- + pi->next = NULL; + pi->prev = NULL; + +@@ -516,8 +514,6 @@ struct bgp_dest *bgp_path_info_reap(struct bgp_dest *dest, + static struct bgp_dest *bgp_path_info_reap_unsorted(struct bgp_dest *dest, + struct bgp_path_info *pi) + { +- bgp_path_info_mpath_dequeue(pi); +- + pi->next = NULL; + pi->prev = NULL; + +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0076-Optimizations-and-problem-fixing-for-large-scale-ecmp-from-bgp.patch b/src/sonic-frr/patch/0076-Optimizations-and-problem-fixing-for-large-scale-ecmp-from-bgp.patch new file mode 100644 index 000000000000..fc885183034f --- /dev/null +++ b/src/sonic-frr/patch/0076-Optimizations-and-problem-fixing-for-large-scale-ecmp-from-bgp.patch @@ -0,0 +1,577 @@ +From a2d0c451c30e80754978854f316e5291c0812e75 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Wed, 23 Oct 2024 13:16:29 -0400 +Subject: [PATCH 1/5] bgpd: Do not call evpn_overlay_free no matter what + +bgp_update is a very expensive call. Calling evpn_overlay_free +even when we have no evpn data to free is not trivial. Let's +limit the call into this function until we actually have data to +free. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_route.c | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index 774ff068f8..55e0a7bc2c 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -4542,9 +4542,10 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + * will not be interned. In which case, it is ok to update the + * attr->evpn_overlay, so that, this can be stored in adj_in. + */ +- if ((afi == AFI_L2VPN) && evpn) { +- memcpy(&attr->evpn_overlay, evpn, +- sizeof(struct bgp_route_evpn)); ++ if (evpn) { ++ if (afi == AFI_L2VPN) ++ memcpy(&attr->evpn_overlay, evpn, ++ sizeof(struct bgp_route_evpn)); + } + bgp_adj_in_set(dest, peer, attr, addpath_id); + } +@@ -4706,9 +4707,11 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + * attr->evpn_overlay with evpn directly. Instead memcpy + * evpn to new_atr.evpn_overlay before it is interned. + */ +- if (soft_reconfig && (afi == AFI_L2VPN) && evpn) +- memcpy(&new_attr.evpn_overlay, evpn, +- sizeof(struct bgp_route_evpn)); ++ if (soft_reconfig && evpn) { ++ if (afi == AFI_L2VPN) ++ memcpy(&new_attr.evpn_overlay, evpn, ++ sizeof(struct bgp_route_evpn)); ++ } + + /* Apply incoming route-map. + * NB: new_attr may now contain newly allocated values from route-map +-- +2.43.2 + + +From 1fb6e42a69b92e7a9f7ee338389f4941a624b98b Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 24 Oct 2024 11:27:24 -0400 +Subject: [PATCH 2/5] bgpd: Store aspath count after aspath has changed + +When running bestpath on a very large number of ecmp. +BGP ends up calling aspath_count a very very large number +of times, which results in ~15% cpu runtime in aspath_count_hops. +Modify the aspath to keep track of it's own count. This results +in the function now taking up ~1.5% of the cpu runtime. Enough +for the moment to be ignored. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_aspath.c | 37 +++++++++++++++++++++++++++++++++++-- + bgpd/bgp_aspath.h | 1 + + 2 files changed, 36 insertions(+), 2 deletions(-) + +diff --git a/bgpd/bgp_aspath.c b/bgpd/bgp_aspath.c +index bc7e8939b4..9241231382 100644 +--- a/bgpd/bgp_aspath.c ++++ b/bgpd/bgp_aspath.c +@@ -294,6 +294,8 @@ static struct aspath *aspath_new(enum asnotation_mode asnotation) + + as = XCALLOC(MTYPE_AS_PATH, sizeof(struct aspath)); + as->asnotation = asnotation; ++ as->count = 0; ++ + return as; + } + +@@ -396,6 +398,11 @@ unsigned int aspath_count_confeds(struct aspath *aspath) + } + + unsigned int aspath_count_hops(const struct aspath *aspath) ++{ ++ return aspath->count; ++} ++ ++static unsigned int aspath_count_hops_internal(const struct aspath *aspath) + { + int count = 0; + struct assegment *seg = aspath->segments; +@@ -705,6 +712,7 @@ struct aspath *aspath_dup(struct aspath *aspath) + else + new->str[0] = '\0'; + ++ new->count = aspath->count; + return new; + } + +@@ -726,6 +734,7 @@ static void *aspath_hash_alloc(void *arg) + new->str_len = aspath->str_len; + new->json = aspath->json; + new->asnotation = aspath->asnotation; ++ new->count = aspath->count; + + return new; + } +@@ -853,6 +862,8 @@ struct aspath *aspath_parse(struct stream *s, size_t length, int use32bit, + if (assegments_parse(s, length, &as.segments, use32bit) < 0) + return NULL; + ++ as.count = aspath_count_hops_internal(&as); ++ + /* If already same aspath exist then return it. */ + find = hash_get(ashash, &as, aspath_hash_alloc); + +@@ -1029,7 +1040,7 @@ static struct assegment *aspath_aggregate_as_set_add(struct aspath *aspath, + asset->as[asset->length - 1] = as; + } + +- ++ aspath->count = aspath_count_hops_internal(aspath); + return asset; + } + +@@ -1110,6 +1121,8 @@ struct aspath *aspath_aggregate(struct aspath *as1, struct aspath *as2) + + assegment_normalise(aspath->segments); + aspath_str_update(aspath, false); ++ aspath->count = aspath_count_hops_internal(aspath); ++ + return aspath; + } + +@@ -1265,6 +1278,7 @@ struct aspath *aspath_replace_regex_asn(struct aspath *aspath, + } + + aspath_str_update(new, false); ++ new->count = aspath_count_hops_internal(new); + return new; + } + +@@ -1290,6 +1304,8 @@ struct aspath *aspath_replace_specific_asn(struct aspath *aspath, + } + + aspath_str_update(new, false); ++ new->count = aspath_count_hops_internal(new); ++ + return new; + } + +@@ -1312,6 +1328,8 @@ struct aspath *aspath_replace_all_asn(struct aspath *aspath, as_t our_asn) + } + + aspath_str_update(new, false); ++ new->count = aspath_count_hops_internal(new); ++ + return new; + } + +@@ -1338,6 +1356,8 @@ struct aspath *aspath_replace_private_asns(struct aspath *aspath, as_t asn, + } + + aspath_str_update(new, false); ++ new->count = aspath_count_hops_internal(new); ++ + return new; + } + +@@ -1410,6 +1430,7 @@ struct aspath *aspath_remove_private_asns(struct aspath *aspath, as_t peer_asn) + if (!aspath->refcnt) + aspath_free(aspath); + aspath_str_update(new, false); ++ new->count = aspath_count_hops_internal(new); + return new; + } + +@@ -1466,6 +1487,7 @@ static struct aspath *aspath_merge(struct aspath *as1, struct aspath *as2) + last->next = as2->segments; + as2->segments = new; + aspath_str_update(as2, false); ++ as2->count = aspath_count_hops_internal(as2); + return as2; + } + +@@ -1483,6 +1505,7 @@ struct aspath *aspath_prepend(struct aspath *as1, struct aspath *as2) + if (as2->segments == NULL) { + as2->segments = assegment_dup_all(as1->segments); + aspath_str_update(as2, false); ++ as2->count = aspath_count_hops_internal(as2); + return as2; + } + +@@ -1503,6 +1526,7 @@ struct aspath *aspath_prepend(struct aspath *as1, struct aspath *as2) + if (!as2->segments) { + as2->segments = assegment_dup_all(as1->segments); + aspath_str_update(as2, false); ++ as2->count = aspath_count_hops_internal(as2); + return as2; + } + +@@ -1548,6 +1572,7 @@ struct aspath *aspath_prepend(struct aspath *as1, struct aspath *as2) + * the inbetween AS_SEQUENCE of seg2 in the process + */ + aspath_str_update(as2, false); ++ as2->count = aspath_count_hops_internal(as2); + return as2; + } else { + /* AS_SET merge code is needed at here. */ +@@ -1627,6 +1652,7 @@ struct aspath *aspath_filter_exclude(struct aspath *source, + lastseg = newseg; + } + aspath_str_update(newpath, false); ++ newpath->count = aspath_count_hops_internal(newpath); + /* We are happy returning even an empty AS_PATH, because the + * administrator + * might expect this very behaviour. There's a mean to avoid this, if +@@ -1645,6 +1671,7 @@ struct aspath *aspath_filter_exclude_all(struct aspath *source) + newpath = aspath_new(source->asnotation); + + aspath_str_update(newpath, false); ++ newpath->count = aspath_count_hops_internal(newpath); + /* We are happy returning even an empty AS_PATH, because the + * administrator + * might expect this very behaviour. There's a mean to avoid this, if +@@ -1732,6 +1759,7 @@ struct aspath *aspath_filter_exclude_acl(struct aspath *source, + + + aspath_str_update(source, false); ++ source->count = aspath_count_hops_internal(source); + /* We are happy returning even an empty AS_PATH, because the + * administrator + * might expect this very behaviour. There's a mean to avoid this, if +@@ -1770,6 +1798,7 @@ static struct aspath *aspath_add_asns(struct aspath *aspath, as_t asno, + } + + aspath_str_update(aspath, false); ++ aspath->count = aspath_count_hops_internal(aspath); + return aspath; + } + +@@ -1861,6 +1890,7 @@ struct aspath *aspath_reconcile_as4(struct aspath *aspath, + if (!hops) { + newpath = aspath_dup(as4path); + aspath_str_update(newpath, false); ++ /* dup sets the count properly */ + return newpath; + } + +@@ -1922,6 +1952,7 @@ struct aspath *aspath_reconcile_as4(struct aspath *aspath, + aspath_free(newpath); + mergedpath->segments = assegment_normalise(mergedpath->segments); + aspath_str_update(mergedpath, false); ++ mergedpath->count = aspath_count_hops_internal(mergedpath); + + if (BGP_DEBUG(as4, AS4)) + zlog_debug("[AS4] result of synthesizing is %s", +@@ -1992,8 +2023,10 @@ struct aspath *aspath_delete_confed_seq(struct aspath *aspath) + seg = next; + } + +- if (removed_confed_segment) ++ if (removed_confed_segment) { + aspath_str_update(aspath, false); ++ aspath->count = aspath_count_hops_internal(aspath); ++ } + + return aspath; + } +diff --git a/bgpd/bgp_aspath.h b/bgpd/bgp_aspath.h +index 2a831c3a55..8a7a734e6a 100644 +--- a/bgpd/bgp_aspath.h ++++ b/bgpd/bgp_aspath.h +@@ -58,6 +58,7 @@ struct aspath { + and AS path regular expression match. */ + char *str; + unsigned short str_len; ++ uint32_t count; + + /* AS notation used by string expression of AS path */ + enum asnotation_mode asnotation; +-- +2.43.2 + + +From f41a92c74a0e10defdb5bd739b6596799e43f83c Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 24 Oct 2024 11:40:56 -0400 +Subject: [PATCH 3/5] bgpd: Only grab the confed path count if we are comparing + it + +This is just a small optimization but when calling path_info_cmp +hundreds of millions of times this adds up. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_route.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index 55e0a7bc2c..7c596f02a7 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -1096,9 +1096,9 @@ int bgp_path_info_cmp(struct bgp *bgp, struct bgp_path_info *new, + /* 4. AS path length check. */ + if (!CHECK_FLAG(bgp->flags, BGP_FLAG_ASPATH_IGNORE)) { + int exist_hops = aspath_count_hops(existattr->aspath); +- int exist_confeds = aspath_count_confeds(existattr->aspath); + + if (CHECK_FLAG(bgp->flags, BGP_FLAG_ASPATH_CONFED)) { ++ int exist_confeds = aspath_count_confeds(existattr->aspath); + int aspath_hops; + + aspath_hops = aspath_count_hops(newattr->aspath); +-- +2.43.2 + + +From 3c74e1d0dec60abfc8032c7edbebe4dc84e23a24 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 24 Oct 2024 14:17:51 -0400 +Subject: [PATCH 4/5] bgpd: Fix deadlock in bgp_keepalive and master pthreads + +(gdb) bt +0 futex_wait (private=0, expected=2, futex_word=0x5c438e9a98d8) at ../sysdeps/nptl/futex-internal.h:146 +1 __GI___lll_lock_wait (futex=futex@entry=0x5c438e9a98d8, private=0) at ./nptl/lowlevellock.c:49 +2 0x00007af16d698002 in lll_mutex_lock_optimized (mutex=0x5c438e9a98d8) at ./nptl/pthread_mutex_lock.c:48 +3 ___pthread_mutex_lock (mutex=0x5c438e9a98d8) at ./nptl/pthread_mutex_lock.c:93 +4 0x00005c4369c17e70 in _frr_mtx_lock (mutex=0x5c438e9a98d8, func=0x5c4369dc2750 <__func__.265> "bgp_notify_send_internal") at ./lib/frr_pthread.h:258 +5 0x00005c4369c1a07a in bgp_notify_send_internal (connection=0x5c438e9a98c0, code=8 '\b', sub_code=0 '\000', data=0x0, datalen=0, use_curr=true) at bgpd/bgp_packet.c:928 +6 0x00005c4369c1a707 in bgp_notify_send (connection=0x5c438e9a98c0, code=8 '\b', sub_code=0 '\000') at bgpd/bgp_packet.c:1069 +7 0x00005c4369bea422 in bgp_stop_with_notify (connection=0x5c438e9a98c0, code=8 '\b', sub_code=0 '\000') at bgpd/bgp_fsm.c:1597 +8 0x00005c4369c18480 in bgp_packet_add (connection=0x5c438e9a98c0, peer=0x5c438e9b6010, s=0x7af15c06bf70) at bgpd/bgp_packet.c:151 +9 0x00005c4369c19816 in bgp_keepalive_send (peer=0x5c438e9b6010) at bgpd/bgp_packet.c:639 +10 0x00005c4369bf01fd in peer_process (hb=0x5c438ed05520, arg=0x7af16bdffaf0) at bgpd/bgp_keepalives.c:111 +11 0x00007af16dacd8e6 in hash_iterate (hash=0x7af15c000be0, func=0x5c4369bf005e , arg=0x7af16bdffaf0) at lib/hash.c:252 +12 0x00005c4369bf0679 in bgp_keepalives_start (arg=0x5c438e0db110) at bgpd/bgp_keepalives.c:214 +13 0x00007af16dac9932 in frr_pthread_inner (arg=0x5c438e0db110) at lib/frr_pthread.c:180 +14 0x00007af16d694ac3 in start_thread (arg=) at ./nptl/pthread_create.c:442 +15 0x00007af16d726850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 +(gdb) + +The bgp keepalive pthread gets deadlocked with itself and consequently +the bgp master pthread gets locked when it attempts to lock +the peerhash_mtx, since it is also locked by the keepalive_pthread + +The keepalive pthread is locking the peerhash_mtx in +bgp_keepalives_start. Next the connection->io_mtx mutex in +bgp_keepalives_send is locked and then when it notices a problem it invokes +bgp_stop_with_notify which relocks the same mutex ( and of course +the relock causes it to get stuck on itself ). This generates a +deadlock condition. + +Modify the code to only hold the connection->io_mtx as short as +possible. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_packet.c | 60 +++++++++++++++++++++++------------------------ + 1 file changed, 29 insertions(+), 31 deletions(-) + +diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c +index 1f808eea72..effe20ab92 100644 +--- a/bgpd/bgp_packet.c ++++ b/bgpd/bgp_packet.c +@@ -122,41 +122,39 @@ static void bgp_packet_add(struct peer_connection *connection, + peer->last_sendq_ok = monotime(NULL); + + stream_fifo_push(connection->obuf, s); ++ } + +- delta = monotime(NULL) - peer->last_sendq_ok; ++ delta = monotime(NULL) - peer->last_sendq_ok; + +- if (CHECK_FLAG(peer->flags, PEER_FLAG_TIMER)) +- holdtime = atomic_load_explicit(&peer->holdtime, +- memory_order_relaxed); +- else +- holdtime = peer->bgp->default_holdtime; ++ if (CHECK_FLAG(peer->flags, PEER_FLAG_TIMER)) ++ holdtime = atomic_load_explicit(&peer->holdtime, ++ memory_order_relaxed); ++ else ++ holdtime = peer->bgp->default_holdtime; + +- sendholdtime = holdtime * 2; ++ sendholdtime = holdtime * 2; + +- /* Note that when we're here, we're adding some packet to the +- * OutQ. That includes keepalives when there is nothing to +- * do, so there's a guarantee we pass by here once in a while. +- * +- * That implies there is no need to go set up another separate +- * timer that ticks down SendHoldTime, as we'll be here sooner +- * or later anyway and will see the checks below failing. +- */ +- if (!holdtime) { +- /* no holdtime, do nothing. */ +- } else if (delta > sendholdtime) { +- flog_err( +- EC_BGP_SENDQ_STUCK_PROPER, +- "%pBP has not made any SendQ progress for 2 holdtimes (%jds), terminating session", +- peer, sendholdtime); +- BGP_EVENT_ADD(connection, TCP_fatal_error); +- } else if (delta > (intmax_t)holdtime && +- monotime(NULL) - peer->last_sendq_warn > 5) { +- flog_warn( +- EC_BGP_SENDQ_STUCK_WARN, +- "%pBP has not made any SendQ progress for 1 holdtime (%us), peer overloaded?", +- peer, holdtime); +- peer->last_sendq_warn = monotime(NULL); +- } ++ /* Note that when we're here, we're adding some packet to the ++ * OutQ. That includes keepalives when there is nothing to ++ * do, so there's a guarantee we pass by here once in a while. ++ * ++ * That implies there is no need to go set up another separate ++ * timer that ticks down SendHoldTime, as we'll be here sooner ++ * or later anyway and will see the checks below failing. ++ */ ++ if (!holdtime) { ++ /* no holdtime, do nothing. */ ++ } else if (delta > sendholdtime) { ++ flog_err(EC_BGP_SENDQ_STUCK_PROPER, ++ "%pBP has not made any SendQ progress for 2 holdtimes (%jds), terminating session", ++ peer, sendholdtime); ++ BGP_EVENT_ADD(connection, TCP_fatal_error); ++ } else if (delta > (intmax_t)holdtime && ++ monotime(NULL) - peer->last_sendq_warn > 5) { ++ flog_warn(EC_BGP_SENDQ_STUCK_WARN, ++ "%pBP has not made any SendQ progress for 1 holdtime (%us), peer overloaded?", ++ peer, holdtime); ++ peer->last_sendq_warn = monotime(NULL); + } + } + +-- +2.43.2 + + +From 2cf93b11d29475d0a7f0a0ee0759129669aa03ba Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 24 Oct 2024 17:44:31 -0400 +Subject: [PATCH 5/5] bgpd: Fix wrong pthread event cancelling + +0 __pthread_kill_implementation (no_tid=0, signo=6, threadid=130719886083648) at ./nptl/pthread_kill.c:44 +1 __pthread_kill_internal (signo=6, threadid=130719886083648) at ./nptl/pthread_kill.c:78 +2 __GI___pthread_kill (threadid=130719886083648, signo=signo@entry=6) at ./nptl/pthread_kill.c:89 +3 0x000076e399e42476 in __GI_raise (sig=6) at ../sysdeps/posix/raise.c:26 +4 0x000076e39a34f950 in core_handler (signo=6, siginfo=0x76e3985fca30, context=0x76e3985fc900) at lib/sigevent.c:258 +5 +6 __pthread_kill_implementation (no_tid=0, signo=6, threadid=130719886083648) at ./nptl/pthread_kill.c:44 +7 __pthread_kill_internal (signo=6, threadid=130719886083648) at ./nptl/pthread_kill.c:78 +8 __GI___pthread_kill (threadid=130719886083648, signo=signo@entry=6) at ./nptl/pthread_kill.c:89 +9 0x000076e399e42476 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26 +10 0x000076e399e287f3 in __GI_abort () at ./stdlib/abort.c:79 +11 0x000076e39a39874b in _zlog_assert_failed (xref=0x76e39a46cca0 <_xref.27>, extra=0x0) at lib/zlog.c:789 +12 0x000076e39a369dde in cancel_event_helper (m=0x5eda32df5e40, arg=0x5eda33afeed0, flags=1) at lib/event.c:1428 +13 0x000076e39a369ef6 in event_cancel_event_ready (m=0x5eda32df5e40, arg=0x5eda33afeed0) at lib/event.c:1470 +14 0x00005eda0a94a5b3 in bgp_stop (connection=0x5eda33afeed0) at bgpd/bgp_fsm.c:1355 +15 0x00005eda0a94b4ae in bgp_stop_with_notify (connection=0x5eda33afeed0, code=8 '\b', sub_code=0 '\000') at bgpd/bgp_fsm.c:1610 +16 0x00005eda0a979498 in bgp_packet_add (connection=0x5eda33afeed0, peer=0x5eda33b11800, s=0x76e3880daf90) at bgpd/bgp_packet.c:152 +17 0x00005eda0a97a80f in bgp_keepalive_send (peer=0x5eda33b11800) at bgpd/bgp_packet.c:639 +18 0x00005eda0a9511fd in peer_process (hb=0x5eda33c9ab80, arg=0x76e3985ffaf0) at bgpd/bgp_keepalives.c:111 +19 0x000076e39a2cd8e6 in hash_iterate (hash=0x76e388000be0, func=0x5eda0a95105e , arg=0x76e3985ffaf0) at lib/hash.c:252 +20 0x00005eda0a951679 in bgp_keepalives_start (arg=0x5eda3306af80) at bgpd/bgp_keepalives.c:214 +21 0x000076e39a2c9932 in frr_pthread_inner (arg=0x5eda3306af80) at lib/frr_pthread.c:180 +22 0x000076e399e94ac3 in start_thread (arg=) at ./nptl/pthread_create.c:442 +23 0x000076e399f26850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 +(gdb) f 12 +12 0x000076e39a369dde in cancel_event_helper (m=0x5eda32df5e40, arg=0x5eda33afeed0, flags=1) at lib/event.c:1428 +1428 assert(m->owner == pthread_self()); + +In this decode the attempt to cancel the connection's events from +the wrong thread is causing the crash. Modify the code to create an +event on the bm->master to cancel the events for the connection. + +Signed-off-by: Donald Sharp +--- + bgpd/bgp_fsm.c | 10 ++++++++++ + bgpd/bgp_fsm.h | 1 + + bgpd/bgp_packet.c | 3 ++- + bgpd/bgpd.h | 2 ++ + zebra/kernel_netlink.c | 2 +- + 5 files changed, 16 insertions(+), 2 deletions(-) + +diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c +index f58ab7c027..650301163d 100644 +--- a/bgpd/bgp_fsm.c ++++ b/bgpd/bgp_fsm.c +@@ -178,6 +178,7 @@ static struct peer *peer_xfer_conn(struct peer *from_peer) + EVENT_OFF(going_away->t_delayopen); + EVENT_OFF(going_away->t_connect_check_r); + EVENT_OFF(going_away->t_connect_check_w); ++ EVENT_OFF(going_away->t_stop_with_notify); + EVENT_OFF(keeper->t_routeadv); + EVENT_OFF(keeper->t_connect); + EVENT_OFF(keeper->t_delayopen); +@@ -1472,6 +1473,8 @@ enum bgp_fsm_state_progress bgp_stop(struct peer_connection *connection) + EVENT_OFF(connection->t_connect_check_r); + EVENT_OFF(connection->t_connect_check_w); + ++ EVENT_OFF(connection->t_stop_with_notify); ++ + /* Stop all timers. */ + EVENT_OFF(connection->t_start); + EVENT_OFF(connection->t_connect); +@@ -3143,3 +3146,10 @@ void bgp_peer_gr_flags_update(struct peer *peer) + } + } + } ++ ++void bgp_event_stop_with_notify(struct event *event) ++{ ++ struct peer_connection *connection = EVENT_ARG(event); ++ ++ bgp_stop_with_notify(connection, BGP_NOTIFY_HOLD_ERR, 0); ++} +diff --git a/bgpd/bgp_fsm.h b/bgpd/bgp_fsm.h +index 2e96ac4c10..4e76262909 100644 +--- a/bgpd/bgp_fsm.h ++++ b/bgpd/bgp_fsm.h +@@ -109,6 +109,7 @@ enum bgp_fsm_state_progress { + extern void bgp_fsm_nht_update(struct peer_connection *connection, + struct peer *peer, bool has_valid_nexthops); + extern void bgp_event(struct event *event); ++extern void bgp_event_stop_with_notify(struct event *event); + extern int bgp_event_update(struct peer_connection *connection, + enum bgp_fsm_events event); + extern enum bgp_fsm_state_progress bgp_stop(struct peer_connection *connection); +diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c +index effe20ab92..2e682c7733 100644 +--- a/bgpd/bgp_packet.c ++++ b/bgpd/bgp_packet.c +@@ -148,7 +148,8 @@ static void bgp_packet_add(struct peer_connection *connection, + flog_err(EC_BGP_SENDQ_STUCK_PROPER, + "%pBP has not made any SendQ progress for 2 holdtimes (%jds), terminating session", + peer, sendholdtime); +- BGP_EVENT_ADD(connection, TCP_fatal_error); ++ event_add_event(bm->master, bgp_event_stop_with_notify, ++ connection, 0, &connection->t_stop_with_notify); + } else if (delta > (intmax_t)holdtime && + monotime(NULL) - peer->last_sendq_warn > 5) { + flog_warn(EC_BGP_SENDQ_STUCK_WARN, +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index 057e26a83d..1da1a17e8b 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -1189,6 +1189,8 @@ struct peer_connection { + struct event *t_process_packet; + struct event *t_process_packet_error; + ++ struct event *t_stop_with_notify; ++ + union sockunion su; + #define BGP_CONNECTION_SU_UNSPEC(connection) \ + (connection->su.sa.sa_family == AF_UNSPEC) +diff --git a/zebra/kernel_netlink.c b/zebra/kernel_netlink.c +index 8a64a1ea48..cdfb90e0f8 100644 +--- a/zebra/kernel_netlink.c ++++ b/zebra/kernel_netlink.c +@@ -931,7 +931,7 @@ static int netlink_recv_msg(struct nlsock *nl, struct msghdr *msg) + } while (status == -1 && errno == EINTR); + + if (status == -1) { +- if (errno == EWOULDBLOCK || errno == EAGAIN) ++ if (errno == EWOULDBLOCK || errno == EAGAIN || errno == EMSGSIZE) + return 0; + flog_err(EC_ZEBRA_RECVMSG_OVERRUN, "%s recvmsg overrun: %s", + nl->name, safe_strerror(errno)); +-- +2.43.2 + diff --git a/src/sonic-frr/patch/series b/src/sonic-frr/patch/series index 8381f11a0d7b..bd88dbc2bb41 100644 --- a/src/sonic-frr/patch/series +++ b/src/sonic-frr/patch/series @@ -48,3 +48,11 @@ 0066-lib-fix-srv6-locator-flags-propagated-to-isis.patch 0067-Add-support-for-SRv6-SID-Manager.patch 0068-bgpd-Extend-BGP-to-communicate-with-the-SRv6-SID-Manager-to-allocate-release-SRv6-SIDs.patch +0069-lib-nexthop-code-should-use-uint16_t-for-nexthop-cou.patch +0070-Allow-16-bit-size-for-nexthops.patch +0071-zebra-Only-notify-dplane-work-pthread-when-needed.patch +0072-Fix-up-improper-handling-of-nexthops-for-nexthop-tra.patch +0073-remove-in6addr-cmp.patch +0074-bgp-best-port-reordering.patch +0075-bgp-mp-info-changes.patch +0076-Optimizations-and-problem-fixing-for-large-scale-ecmp-from-bgp.patch