Implements deterministic MED handling.

Thanks to Alexander V. Chernikov for many suggestions.
This commit is contained in:
Ondrej Zajicek 2011-12-22 13:20:29 +01:00
parent cf7f064531
commit be4cd99a36
9 changed files with 245 additions and 27 deletions

View file

@ -1316,6 +1316,21 @@ for each neighbor using the following configuration parameters:
received from the same AS (which is the standard behavior).
Default: off.
<tag>deterministic med <m/switch/</tag> BGP route selection
algorithm is often viewed as a comparison between individual
routes (e.g. if a new route appears and is better than the
current best one, it is chosen as the new best one). But the
proper route selection, as specified by RFC 4271, cannot be
fully implemented in that way. The problem is mainly in
handling the MED attribute. BIRD, by default, uses an
simplification based on individual route comparison, which in
some cases may lead to temporally dependent behavior (i.e. the
selection is dependent on the order in which routes appeared).
This option enables a different (and slower) algorithm
implementing proper RFC 4271 route selection, which is
deterministic. Alternative way how to get deterministic
behavior is to use <cf/med metric/ option. Default: off.
<tag>igp metric <m/switch/</tag> Enable comparison of internal
distances to boundary routers during best route selection. Default: on.

View file

@ -267,6 +267,26 @@ void store_tmp_attrs(rte *e, ea_list *attrs)
int import_control(struct proto *p, rte **e, ea_list **attrs, struct linpool *pool)
{ DUMMY; }
/**
* rte_recalculate - prepare routes for comparison
* @table: a routing table
* @net: a network entry
* @new: new route for the network
* @old: old route for the network
* @old_best: old best route for the network (may be NULL)
*
* This hook is called when a route change (from @old to @new for a
* @net entry) is propagated to a @table. It may be used to prepare
* routes for comparison by rte_better() in the best route
* selection. @new may or may not be in @net->routes list,
* @old is not there.
*
* Result: 1 if the ordering implied by rte_better() changes enough
* that full best route calculation have to be done, 0 otherwise.
*/
int rte_recalculate(struct rtable *table, struct network *net, struct rte *new, struct rte *old, struct rte *old_best)
{ DUMMY; }
/**
* rte_better - compare metrics of two routes
* @new: the new route

View file

@ -178,12 +178,14 @@ struct proto {
/*
* Routing entry hooks (called only for rte's belonging to this protocol):
*
* rte_recalculate Called at the beginning of the best route selection
* rte_better Compare two rte's and decide which one is better (1=first, 0=second).
* rte_same Compare two rte's and decide whether they are identical (1=yes, 0=no).
* rte_insert Called whenever a rte is inserted to a routing table.
* rte_remove Called whenever a rte is removed from the routing table.
*/
int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *);
int (*rte_better)(struct rte *, struct rte *);
int (*rte_same)(struct rte *, struct rte *);
void (*rte_insert)(struct network *, struct rte *);

View file

@ -200,6 +200,11 @@ typedef struct rte {
u32 tag; /* External route tag */
u32 router_id; /* Router that originated this route */
} ospf;
#endif
#ifdef CONFIG_BGP
struct {
u8 suppressed; /* Used for deterministic MED comparison */
} bgp;
#endif
struct { /* Routes generated by krt sync (both temporary and inherited ones) */
s8 src; /* Alleged route source (see krt.h) */

View file

@ -498,6 +498,9 @@ rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte
rte_announce(table, RA_ANY, net, new, old, tmpa);
if (src->rte_recalculate && src->rte_recalculate(table, net, new, old, old_best))
goto do_recalculate;
if (new && rte_better(new, old_best))
{
/* The first case - the new route is cleary optimal, we link it
@ -516,6 +519,7 @@ rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte
that route at the first position and announce it. New optimal
route might be NULL if there is no more routes */
do_recalculate:
/* Add the new route to the list */
if (new)
{
@ -1015,27 +1019,36 @@ rt_next_hop_update_net(rtable *tab, net *n)
if (!old_best)
return 0;
new_best = NULL;
for (k = &n->routes; e = *k; k = &e->next)
if (rta_next_hop_outdated(e->attrs))
{
new = rt_next_hop_update_rte(tab, e);
*k = new;
rte_announce_i(tab, RA_ANY, n, new, e);
rte_trace_in(D_ROUTES, new->sender, new, "updated");
/* Call a pre-comparison hook */
/* Not really an efficient way to compute this */
if (e->attrs->proto->rte_recalculate)
e->attrs->proto->rte_recalculate(tab, n, new, e, NULL);
if (e != old_best)
rte_free_quick(e);
else /* Freeing of the old best rte is postponed */
free_old_best = 1;
e = new;
count++;
}
if (!count)
return 0;
/* Find the new best route */
new_best = NULL;
for (k = &n->routes; e = *k; k = &e->next)
{
if (rta_next_hop_outdated(e->attrs))
{
new = rt_next_hop_update_rte(tab, e);
*k = new;
rte_announce_i(tab, RA_ANY, n, new, e);
rte_trace_in(D_ROUTES, new->sender, new, "updated");
if (e != old_best)
rte_free_quick(e);
else /* Freeing of the old best rte is postponed */
free_old_best = 1;
e = new;
count++;
}
if (!new_best || rte_better(e, *new_best))
new_best = k;
}

View file

@ -1125,6 +1125,14 @@ bgp_rte_better(rte *new, rte *old)
eattr *x, *y;
u32 n, o;
/* Skip suppressed routes (see bgp_rte_recalculate()) */
n = new->u.bgp.suppressed;
o = old->u.bgp.suppressed;
if (n > o)
return 0;
if (n < o)
return 1;
/* RFC 4271 9.1.2.1. Route resolvability test */
n = rte_resolvable(new);
o = rte_resolvable(old);
@ -1167,14 +1175,15 @@ bgp_rte_better(rte *new, rte *old)
return 0;
/* RFC 4271 9.1.2.2. c) Compare MED's */
/* This is noncompliant. Proper RFC 4271 path selection cannot be
* interpreted as finding the best path in some ordering.
* Therefore, it cannot be implemented in BIRD without some ugly
* hacks. This is just an approximation, which in specific
* situations may lead to persistent routing loops, because it is
* nondeterministic - it depends on the order in which routes
* appeared. But it is also the same behavior as used by default in
* Cisco routers, so it is probably not a big issue.
/* Proper RFC 4271 path selection cannot be interpreted as finding
* the best path in some ordering. It is implemented partially in
* bgp_rte_recalculate() when deterministic_med option is
* active. Without that option, the behavior is just an
* approximation, which in specific situations may lead to
* persistent routing loops, because it is nondeterministic - it
* depends on the order in which routes appeared. But it is also the
* same behavior as used by default in Cisco routers, so it is
* probably not a big issue.
*/
if (new_bgp->cf->med_metric || old_bgp->cf->med_metric ||
(bgp_get_neighbor(new) == bgp_get_neighbor(old)))
@ -1236,6 +1245,148 @@ bgp_rte_better(rte *new, rte *old)
return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0);
}
static inline int
same_group(rte *r, u32 lpref, u32 lasn)
{
return (r->pref == lpref) && (bgp_get_neighbor(r) == lasn);
}
static inline int
use_deterministic_med(rte *r)
{
return ((struct bgp_proto *) r->attrs->proto)->cf->deterministic_med;
}
int
bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
{
rte *r, *s;
rte *key = new ? new : old;
u32 lpref = key->pref;
u32 lasn = bgp_get_neighbor(key);
int old_is_group_best = 0;
/*
* Proper RFC 4271 path selection is a bit complicated, it cannot be
* implemented just by rte_better(), because it is not a linear
* ordering. But it can be splitted to two levels, where the lower
* level chooses the best routes in each group of routes from the
* same neighboring AS and higher level chooses the best route (with
* a slightly different ordering) between the best-in-group routes.
*
* When deterministic_med is disabled, we just ignore this issue and
* choose the best route by bgp_rte_better() alone. If enabled, the
* lower level of the route selection is done here (for the group
* to which the changed route belongs), all routes in group are
* marked as suppressed, just chosen best-in-group is not.
*
* Global best route selection then implements higher level by
* choosing between non-suppressed routes (as they are always
* preferred over suppressed routes). Routes from BGP protocols
* that do not set deterministic_med are just never suppressed. As
* they do not participate in the lower level selection, it is OK
* that this fn is not called for them.
*
* The idea is simple, the implementation is more problematic,
* mostly because of optimizations in rte_recalculate() that
* avoids full recalculation in most cases.
*
* We can assume that at least one of new, old is non-NULL and both
* are from the same protocol with enabled deterministic_med. We
* group routes by both neighbor AS (lasn) and preference (lpref),
* because bgp_rte_better() does not handle preference itself.
*/
/* If new and old are from different groups, we just process that
as two independent events */
if (new && old && !same_group(old, lpref, lasn))
{
int i1, i2;
i1 = bgp_rte_recalculate(table, net, NULL, old, old_best);
i2 = bgp_rte_recalculate(table, net, new, NULL, old_best);
return i1 || i2;
}
/*
* We could find the best-in-group and then make some shortcuts like
* in rte_recalculate, but as we would have to walk through all
* net->routes just to find it, it is probably not worth. So we
* just have two simpler fast cases that use just the old route.
* We also set suppressed flag to avoid using it in bgp_rte_better().
*/
if (new)
new->u.bgp.suppressed = 1;
if (old)
{
old_is_group_best = !old->u.bgp.suppressed;
old->u.bgp.suppressed = 1;
int new_is_better = new && bgp_rte_better(new, old);
/* The first case - replace not best with worse (or remove not best) */
if (!old_is_group_best && !new_is_better)
return 0;
/* The second case - replace the best with better */
if (old_is_group_best && new_is_better)
{
/* new is best-in-group, the see discussion below - this is
a special variant of NBG && OBG. From OBG we can deduce
that same_group(old_best) iff (old == old_best) */
new->u.bgp.suppressed = 0;
return (old == old_best);
}
}
/* The default case - find a new best-in-group route */
r = new; /* new may not be in the list */
for (s=net->routes; s; s=s->next)
if (use_deterministic_med(s) && same_group(s, lpref, lasn))
{
s->u.bgp.suppressed = 1;
if (!r || bgp_rte_better(s, r))
r = s;
}
/* Simple case - the last route in group disappears */
if (!r)
return 0;
/* Found best-in-group */
r->u.bgp.suppressed = 0;
/*
* There are generally two reasons why we have to force
* recalculation (return 1): First, the new route may be wrongfully
* chosen to be the best in the first case check in
* rte_recalculate(), this may happen only if old_best is from the
* same group. Second, another (different than new route)
* best-in-group is chosen and that may be the proper best (although
* rte_recalculate() without ignore that possibility).
*
* There are three possible cases according to whether the old route
* was the best in group (OBG, stored in old_is_group_best) and
* whether the new route is the best in group (NBG, tested by r == new).
* These cases work even if old or new is NULL.
*
* NBG -> new is a possible candidate for the best route, so we just
* check for the first reason using same_group().
*
* !NBG && OBG -> Second reason applies, return 1
*
* !NBG && !OBG -> Best in group does not change, old != old_best,
* rte_better(new, old_best) is false and therefore
* the first reason does not apply, return 0
*/
if (r == new)
return old_best && same_group(old_best, lpref, lasn);
else
return old_is_group_best;
}
static struct adata *
bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool)
{
@ -1614,6 +1765,11 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs)
eattr *o = ea_find(attrs, EA_CODE(EAP_BGP, BA_ORIGIN));
u32 origas;
/*
if (e->u.bgp.suppressed)
buf += bsprintf(buf, " -");
*/
buf += bsprintf(buf, " (%d", e->pref);
if (e->attrs->hostentry)
{

View file

@ -908,6 +908,10 @@ bgp_init(struct proto_config *C)
P->import_control = bgp_import_control;
P->neigh_notify = bgp_neigh_notify;
P->reload_routes = bgp_reload_routes;
if (c->deterministic_med)
P->rte_recalculate = bgp_rte_recalculate;
p->cf = c;
p->local_as = c->local_as;
p->remote_as = c->remote_as;

View file

@ -29,6 +29,7 @@ struct bgp_config {
int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */
int igp_metric; /* Use IGP metrics when selecting best route */
int prefer_older; /* Prefer older routes according to RFC 5004 */
int deterministic_med; /* Use more complicated algo to have strict RFC 4271 MED comparison */
u32 default_local_pref; /* Default value for LOCAL_PREF attribute */
u32 default_med; /* Default value for MULTI_EXIT_DISC attribute */
int capabilities; /* Enable capability handshake [RFC3392] */
@ -185,6 +186,7 @@ byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned att
struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, unsigned int len, struct linpool *pool, int mandatory);
int bgp_get_attr(struct eattr *e, byte *buf, int buflen);
int bgp_rte_better(struct rte *, struct rte *);
int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best);
void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs);
int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *);
void bgp_attr_init(struct bgp_proto *);

View file

@ -25,7 +25,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY,
CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE,
PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH,
INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP,
TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY)
TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC)
CF_GRAMMAR
@ -82,6 +82,7 @@ bgp_proto:
| bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; }
| bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; }
| bgp_proto PREFER OLDER bool ';' { BGP_CFG->prefer_older = $4; }
| bgp_proto DETERMINISTIC MED bool ';' { BGP_CFG->deterministic_med = $4; }
| bgp_proto DEFAULT BGP_MED expr ';' { BGP_CFG->default_med = $4; }
| bgp_proto DEFAULT BGP_LOCAL_PREF expr ';' { BGP_CFG->default_local_pref = $4; }
| bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->source_addr = $4; }