From be4cd99a3688cef19f66e1c8b8e0506ffc1e13fc Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Thu, 22 Dec 2011 13:20:29 +0100 Subject: Implements deterministic MED handling. Thanks to Alexander V. Chernikov for many suggestions. --- proto/bgp/attrs.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++--- proto/bgp/bgp.c | 4 ++ proto/bgp/bgp.h | 2 + proto/bgp/config.Y | 3 +- 4 files changed, 172 insertions(+), 9 deletions(-) (limited to 'proto') diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 2832f42..93c1f6d 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -1125,6 +1125,14 @@ bgp_rte_better(rte *new, rte *old) eattr *x, *y; u32 n, o; + /* Skip suppressed routes (see bgp_rte_recalculate()) */ + n = new->u.bgp.suppressed; + o = old->u.bgp.suppressed; + if (n > o) + return 0; + if (n < o) + return 1; + /* RFC 4271 9.1.2.1. Route resolvability test */ n = rte_resolvable(new); o = rte_resolvable(old); @@ -1167,14 +1175,15 @@ bgp_rte_better(rte *new, rte *old) return 0; /* RFC 4271 9.1.2.2. c) Compare MED's */ - /* This is noncompliant. Proper RFC 4271 path selection cannot be - * interpreted as finding the best path in some ordering. - * Therefore, it cannot be implemented in BIRD without some ugly - * hacks. This is just an approximation, which in specific - * situations may lead to persistent routing loops, because it is - * nondeterministic - it depends on the order in which routes - * appeared. But it is also the same behavior as used by default in - * Cisco routers, so it is probably not a big issue. + /* Proper RFC 4271 path selection cannot be interpreted as finding + * the best path in some ordering. It is implemented partially in + * bgp_rte_recalculate() when deterministic_med option is + * active. Without that option, the behavior is just an + * approximation, which in specific situations may lead to + * persistent routing loops, because it is nondeterministic - it + * depends on the order in which routes appeared. But it is also the + * same behavior as used by default in Cisco routers, so it is + * probably not a big issue. */ if (new_bgp->cf->med_metric || old_bgp->cf->med_metric || (bgp_get_neighbor(new) == bgp_get_neighbor(old))) @@ -1236,6 +1245,148 @@ bgp_rte_better(rte *new, rte *old) return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0); } + +static inline int +same_group(rte *r, u32 lpref, u32 lasn) +{ + return (r->pref == lpref) && (bgp_get_neighbor(r) == lasn); +} + +static inline int +use_deterministic_med(rte *r) +{ + return ((struct bgp_proto *) r->attrs->proto)->cf->deterministic_med; +} + +int +bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) +{ + rte *r, *s; + rte *key = new ? new : old; + u32 lpref = key->pref; + u32 lasn = bgp_get_neighbor(key); + int old_is_group_best = 0; + + /* + * Proper RFC 4271 path selection is a bit complicated, it cannot be + * implemented just by rte_better(), because it is not a linear + * ordering. But it can be splitted to two levels, where the lower + * level chooses the best routes in each group of routes from the + * same neighboring AS and higher level chooses the best route (with + * a slightly different ordering) between the best-in-group routes. + * + * When deterministic_med is disabled, we just ignore this issue and + * choose the best route by bgp_rte_better() alone. If enabled, the + * lower level of the route selection is done here (for the group + * to which the changed route belongs), all routes in group are + * marked as suppressed, just chosen best-in-group is not. + * + * Global best route selection then implements higher level by + * choosing between non-suppressed routes (as they are always + * preferred over suppressed routes). Routes from BGP protocols + * that do not set deterministic_med are just never suppressed. As + * they do not participate in the lower level selection, it is OK + * that this fn is not called for them. + * + * The idea is simple, the implementation is more problematic, + * mostly because of optimizations in rte_recalculate() that + * avoids full recalculation in most cases. + * + * We can assume that at least one of new, old is non-NULL and both + * are from the same protocol with enabled deterministic_med. We + * group routes by both neighbor AS (lasn) and preference (lpref), + * because bgp_rte_better() does not handle preference itself. + */ + + /* If new and old are from different groups, we just process that + as two independent events */ + if (new && old && !same_group(old, lpref, lasn)) + { + int i1, i2; + i1 = bgp_rte_recalculate(table, net, NULL, old, old_best); + i2 = bgp_rte_recalculate(table, net, new, NULL, old_best); + return i1 || i2; + } + + /* + * We could find the best-in-group and then make some shortcuts like + * in rte_recalculate, but as we would have to walk through all + * net->routes just to find it, it is probably not worth. So we + * just have two simpler fast cases that use just the old route. + * We also set suppressed flag to avoid using it in bgp_rte_better(). + */ + + if (new) + new->u.bgp.suppressed = 1; + + if (old) + { + old_is_group_best = !old->u.bgp.suppressed; + old->u.bgp.suppressed = 1; + int new_is_better = new && bgp_rte_better(new, old); + + /* The first case - replace not best with worse (or remove not best) */ + if (!old_is_group_best && !new_is_better) + return 0; + + /* The second case - replace the best with better */ + if (old_is_group_best && new_is_better) + { + /* new is best-in-group, the see discussion below - this is + a special variant of NBG && OBG. From OBG we can deduce + that same_group(old_best) iff (old == old_best) */ + new->u.bgp.suppressed = 0; + return (old == old_best); + } + } + + /* The default case - find a new best-in-group route */ + r = new; /* new may not be in the list */ + for (s=net->routes; s; s=s->next) + if (use_deterministic_med(s) && same_group(s, lpref, lasn)) + { + s->u.bgp.suppressed = 1; + if (!r || bgp_rte_better(s, r)) + r = s; + } + + /* Simple case - the last route in group disappears */ + if (!r) + return 0; + + /* Found best-in-group */ + r->u.bgp.suppressed = 0; + + /* + * There are generally two reasons why we have to force + * recalculation (return 1): First, the new route may be wrongfully + * chosen to be the best in the first case check in + * rte_recalculate(), this may happen only if old_best is from the + * same group. Second, another (different than new route) + * best-in-group is chosen and that may be the proper best (although + * rte_recalculate() without ignore that possibility). + * + * There are three possible cases according to whether the old route + * was the best in group (OBG, stored in old_is_group_best) and + * whether the new route is the best in group (NBG, tested by r == new). + * These cases work even if old or new is NULL. + * + * NBG -> new is a possible candidate for the best route, so we just + * check for the first reason using same_group(). + * + * !NBG && OBG -> Second reason applies, return 1 + * + * !NBG && !OBG -> Best in group does not change, old != old_best, + * rte_better(new, old_best) is false and therefore + * the first reason does not apply, return 0 + */ + + if (r == new) + return old_best && same_group(old_best, lpref, lasn); + else + return old_is_group_best; +} + static struct adata * bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool) { @@ -1614,6 +1765,11 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs) eattr *o = ea_find(attrs, EA_CODE(EAP_BGP, BA_ORIGIN)); u32 origas; + /* + if (e->u.bgp.suppressed) + buf += bsprintf(buf, " -"); + */ + buf += bsprintf(buf, " (%d", e->pref); if (e->attrs->hostentry) { diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 675342d..28396a5 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -908,6 +908,10 @@ bgp_init(struct proto_config *C) P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; + + if (c->deterministic_med) + P->rte_recalculate = bgp_rte_recalculate; + p->cf = c; p->local_as = c->local_as; p->remote_as = c->remote_as; diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 437ba33..0c50583 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -29,6 +29,7 @@ struct bgp_config { int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */ int igp_metric; /* Use IGP metrics when selecting best route */ int prefer_older; /* Prefer older routes according to RFC 5004 */ + int deterministic_med; /* Use more complicated algo to have strict RFC 4271 MED comparison */ u32 default_local_pref; /* Default value for LOCAL_PREF attribute */ u32 default_med; /* Default value for MULTI_EXIT_DISC attribute */ int capabilities; /* Enable capability handshake [RFC3392] */ @@ -185,6 +186,7 @@ byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned att struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, unsigned int len, struct linpool *pool, int mandatory); int bgp_get_attr(struct eattr *e, byte *buf, int buflen); int bgp_rte_better(struct rte *, struct rte *); +int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs); int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *); void bgp_attr_init(struct bgp_proto *); diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 03c233d..3ef9b29 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -25,7 +25,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE, PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP, - TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY) + TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC) CF_GRAMMAR @@ -82,6 +82,7 @@ bgp_proto: | bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; } | bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; } | bgp_proto PREFER OLDER bool ';' { BGP_CFG->prefer_older = $4; } + | bgp_proto DETERMINISTIC MED bool ';' { BGP_CFG->deterministic_med = $4; } | bgp_proto DEFAULT BGP_MED expr ';' { BGP_CFG->default_med = $4; } | bgp_proto DEFAULT BGP_LOCAL_PREF expr ';' { BGP_CFG->default_local_pref = $4; } | bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->source_addr = $4; } -- cgit v1.2.3