summaryrefslogtreecommitdiffstats
path: root/proto/bgp
diff options
context:
space:
mode:
authorOndrej Zajicek <santiago@crfreenet.org>2011-12-22 13:20:29 +0100
committerOndrej Zajicek <santiago@crfreenet.org>2011-12-22 13:20:29 +0100
commitbe4cd99a3688cef19f66e1c8b8e0506ffc1e13fc (patch)
tree2ac6aed8d703b4150976493ddcc179b395a795c8 /proto/bgp
parentcf7f0645316f5df0984467cf7001f5466254eaf3 (diff)
downloadbird-be4cd99a3688cef19f66e1c8b8e0506ffc1e13fc.tar
bird-be4cd99a3688cef19f66e1c8b8e0506ffc1e13fc.zip
Implements deterministic MED handling.
Thanks to Alexander V. Chernikov for many suggestions.
Diffstat (limited to 'proto/bgp')
-rw-r--r--proto/bgp/attrs.c172
-rw-r--r--proto/bgp/bgp.c4
-rw-r--r--proto/bgp/bgp.h2
-rw-r--r--proto/bgp/config.Y3
4 files changed, 172 insertions, 9 deletions
diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c
index 2832f42..93c1f6d 100644
--- a/proto/bgp/attrs.c
+++ b/proto/bgp/attrs.c
@@ -1125,6 +1125,14 @@ bgp_rte_better(rte *new, rte *old)
eattr *x, *y;
u32 n, o;
+ /* Skip suppressed routes (see bgp_rte_recalculate()) */
+ n = new->u.bgp.suppressed;
+ o = old->u.bgp.suppressed;
+ if (n > o)
+ return 0;
+ if (n < o)
+ return 1;
+
/* RFC 4271 9.1.2.1. Route resolvability test */
n = rte_resolvable(new);
o = rte_resolvable(old);
@@ -1167,14 +1175,15 @@ bgp_rte_better(rte *new, rte *old)
return 0;
/* RFC 4271 9.1.2.2. c) Compare MED's */
- /* This is noncompliant. Proper RFC 4271 path selection cannot be
- * interpreted as finding the best path in some ordering.
- * Therefore, it cannot be implemented in BIRD without some ugly
- * hacks. This is just an approximation, which in specific
- * situations may lead to persistent routing loops, because it is
- * nondeterministic - it depends on the order in which routes
- * appeared. But it is also the same behavior as used by default in
- * Cisco routers, so it is probably not a big issue.
+ /* Proper RFC 4271 path selection cannot be interpreted as finding
+ * the best path in some ordering. It is implemented partially in
+ * bgp_rte_recalculate() when deterministic_med option is
+ * active. Without that option, the behavior is just an
+ * approximation, which in specific situations may lead to
+ * persistent routing loops, because it is nondeterministic - it
+ * depends on the order in which routes appeared. But it is also the
+ * same behavior as used by default in Cisco routers, so it is
+ * probably not a big issue.
*/
if (new_bgp->cf->med_metric || old_bgp->cf->med_metric ||
(bgp_get_neighbor(new) == bgp_get_neighbor(old)))
@@ -1236,6 +1245,148 @@ bgp_rte_better(rte *new, rte *old)
return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0);
}
+
+static inline int
+same_group(rte *r, u32 lpref, u32 lasn)
+{
+ return (r->pref == lpref) && (bgp_get_neighbor(r) == lasn);
+}
+
+static inline int
+use_deterministic_med(rte *r)
+{
+ return ((struct bgp_proto *) r->attrs->proto)->cf->deterministic_med;
+}
+
+int
+bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
+{
+ rte *r, *s;
+ rte *key = new ? new : old;
+ u32 lpref = key->pref;
+ u32 lasn = bgp_get_neighbor(key);
+ int old_is_group_best = 0;
+
+ /*
+ * Proper RFC 4271 path selection is a bit complicated, it cannot be
+ * implemented just by rte_better(), because it is not a linear
+ * ordering. But it can be splitted to two levels, where the lower
+ * level chooses the best routes in each group of routes from the
+ * same neighboring AS and higher level chooses the best route (with
+ * a slightly different ordering) between the best-in-group routes.
+ *
+ * When deterministic_med is disabled, we just ignore this issue and
+ * choose the best route by bgp_rte_better() alone. If enabled, the
+ * lower level of the route selection is done here (for the group
+ * to which the changed route belongs), all routes in group are
+ * marked as suppressed, just chosen best-in-group is not.
+ *
+ * Global best route selection then implements higher level by
+ * choosing between non-suppressed routes (as they are always
+ * preferred over suppressed routes). Routes from BGP protocols
+ * that do not set deterministic_med are just never suppressed. As
+ * they do not participate in the lower level selection, it is OK
+ * that this fn is not called for them.
+ *
+ * The idea is simple, the implementation is more problematic,
+ * mostly because of optimizations in rte_recalculate() that
+ * avoids full recalculation in most cases.
+ *
+ * We can assume that at least one of new, old is non-NULL and both
+ * are from the same protocol with enabled deterministic_med. We
+ * group routes by both neighbor AS (lasn) and preference (lpref),
+ * because bgp_rte_better() does not handle preference itself.
+ */
+
+ /* If new and old are from different groups, we just process that
+ as two independent events */
+ if (new && old && !same_group(old, lpref, lasn))
+ {
+ int i1, i2;
+ i1 = bgp_rte_recalculate(table, net, NULL, old, old_best);
+ i2 = bgp_rte_recalculate(table, net, new, NULL, old_best);
+ return i1 || i2;
+ }
+
+ /*
+ * We could find the best-in-group and then make some shortcuts like
+ * in rte_recalculate, but as we would have to walk through all
+ * net->routes just to find it, it is probably not worth. So we
+ * just have two simpler fast cases that use just the old route.
+ * We also set suppressed flag to avoid using it in bgp_rte_better().
+ */
+
+ if (new)
+ new->u.bgp.suppressed = 1;
+
+ if (old)
+ {
+ old_is_group_best = !old->u.bgp.suppressed;
+ old->u.bgp.suppressed = 1;
+ int new_is_better = new && bgp_rte_better(new, old);
+
+ /* The first case - replace not best with worse (or remove not best) */
+ if (!old_is_group_best && !new_is_better)
+ return 0;
+
+ /* The second case - replace the best with better */
+ if (old_is_group_best && new_is_better)
+ {
+ /* new is best-in-group, the see discussion below - this is
+ a special variant of NBG && OBG. From OBG we can deduce
+ that same_group(old_best) iff (old == old_best) */
+ new->u.bgp.suppressed = 0;
+ return (old == old_best);
+ }
+ }
+
+ /* The default case - find a new best-in-group route */
+ r = new; /* new may not be in the list */
+ for (s=net->routes; s; s=s->next)
+ if (use_deterministic_med(s) && same_group(s, lpref, lasn))
+ {
+ s->u.bgp.suppressed = 1;
+ if (!r || bgp_rte_better(s, r))
+ r = s;
+ }
+
+ /* Simple case - the last route in group disappears */
+ if (!r)
+ return 0;
+
+ /* Found best-in-group */
+ r->u.bgp.suppressed = 0;
+
+ /*
+ * There are generally two reasons why we have to force
+ * recalculation (return 1): First, the new route may be wrongfully
+ * chosen to be the best in the first case check in
+ * rte_recalculate(), this may happen only if old_best is from the
+ * same group. Second, another (different than new route)
+ * best-in-group is chosen and that may be the proper best (although
+ * rte_recalculate() without ignore that possibility).
+ *
+ * There are three possible cases according to whether the old route
+ * was the best in group (OBG, stored in old_is_group_best) and
+ * whether the new route is the best in group (NBG, tested by r == new).
+ * These cases work even if old or new is NULL.
+ *
+ * NBG -> new is a possible candidate for the best route, so we just
+ * check for the first reason using same_group().
+ *
+ * !NBG && OBG -> Second reason applies, return 1
+ *
+ * !NBG && !OBG -> Best in group does not change, old != old_best,
+ * rte_better(new, old_best) is false and therefore
+ * the first reason does not apply, return 0
+ */
+
+ if (r == new)
+ return old_best && same_group(old_best, lpref, lasn);
+ else
+ return old_is_group_best;
+}
+
static struct adata *
bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool)
{
@@ -1614,6 +1765,11 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs)
eattr *o = ea_find(attrs, EA_CODE(EAP_BGP, BA_ORIGIN));
u32 origas;
+ /*
+ if (e->u.bgp.suppressed)
+ buf += bsprintf(buf, " -");
+ */
+
buf += bsprintf(buf, " (%d", e->pref);
if (e->attrs->hostentry)
{
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index 675342d..28396a5 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -908,6 +908,10 @@ bgp_init(struct proto_config *C)
P->import_control = bgp_import_control;
P->neigh_notify = bgp_neigh_notify;
P->reload_routes = bgp_reload_routes;
+
+ if (c->deterministic_med)
+ P->rte_recalculate = bgp_rte_recalculate;
+
p->cf = c;
p->local_as = c->local_as;
p->remote_as = c->remote_as;
diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h
index 437ba33..0c50583 100644
--- a/proto/bgp/bgp.h
+++ b/proto/bgp/bgp.h
@@ -29,6 +29,7 @@ struct bgp_config {
int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */
int igp_metric; /* Use IGP metrics when selecting best route */
int prefer_older; /* Prefer older routes according to RFC 5004 */
+ int deterministic_med; /* Use more complicated algo to have strict RFC 4271 MED comparison */
u32 default_local_pref; /* Default value for LOCAL_PREF attribute */
u32 default_med; /* Default value for MULTI_EXIT_DISC attribute */
int capabilities; /* Enable capability handshake [RFC3392] */
@@ -185,6 +186,7 @@ byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned att
struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, unsigned int len, struct linpool *pool, int mandatory);
int bgp_get_attr(struct eattr *e, byte *buf, int buflen);
int bgp_rte_better(struct rte *, struct rte *);
+int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best);
void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs);
int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *);
void bgp_attr_init(struct bgp_proto *);
diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y
index 03c233d..3ef9b29 100644
--- a/proto/bgp/config.Y
+++ b/proto/bgp/config.Y
@@ -25,7 +25,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY,
CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE,
PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH,
INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP,
- TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY)
+ TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC)
CF_GRAMMAR
@@ -82,6 +82,7 @@ bgp_proto:
| bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; }
| bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; }
| bgp_proto PREFER OLDER bool ';' { BGP_CFG->prefer_older = $4; }
+ | bgp_proto DETERMINISTIC MED bool ';' { BGP_CFG->deterministic_med = $4; }
| bgp_proto DEFAULT BGP_MED expr ';' { BGP_CFG->default_med = $4; }
| bgp_proto DEFAULT BGP_LOCAL_PREF expr ';' { BGP_CFG->default_local_pref = $4; }
| bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->source_addr = $4; }