/* * BIRD -- BGP Packet Processing * * (c) 2000 Martin Mares <mj@ucw.cz> * * Can be freely distributed and used under the terms of the GNU GPL. */ #undef LOCAL_DEBUG #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" #include "nest/route.h" #include "conf/conf.h" #include "lib/unaligned.h" #include "lib/socket.h" #include "bgp.h" static byte * bgp_create_notification(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode); buf[0] = conn->notify_code; buf[1] = conn->notify_subcode; memcpy(buf+2, conn->notify_data, conn->notify_size); return buf + 2 + conn->notify_size; } static byte * bgp_create_open(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)", BGP_VERSION, p->local_as, p->cf->hold_time, p->local_id); buf[0] = BGP_VERSION; put_u16(buf+1, p->local_as); put_u16(buf+3, p->cf->hold_time); put_u32(buf+5, p->local_id); buf[9] = 0; /* No optional parameters */ return buf+10; } static unsigned int bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, unsigned int remains) { byte *start = w; ip_addr a; int bytes; while (!EMPTY_LIST(buck->prefixes) && remains >= 5) { struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen); *w++ = px->n.pxlen; bytes = (px->n.pxlen + 7) / 8; a = px->n.prefix; ipa_hton(a); memcpy(w, &a, bytes); w += bytes; remains -= bytes + 1; rem_node(&px->bucket_node); fib_delete(&p->prefix_fib, px); } return w - start; } #ifndef IPV6 /* IPv4 version */ static byte * bgp_create_update(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; struct bgp_bucket *buck; int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4; byte *w; int wd_size = 0; int r_size = 0; int a_size = 0; w = buf+2; if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) { DBG("Withdrawn routes:\n"); wd_size = bgp_encode_prefixes(p, w, buck, remains); w += wd_size; remains -= wd_size; } put_u16(buf, wd_size); if (remains >= 2048) { while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next) { if (EMPTY_LIST(buck->prefixes)) { DBG("Deleting empty bucket %p\n", buck); rem_node(&buck->send_node); bgp_free_bucket(p, buck); continue; } DBG("Processing bucket %p\n", buck); a_size = bgp_encode_attrs(w+2, buck->eattrs, 1024); put_u16(w, a_size); w += a_size + 2; r_size = bgp_encode_prefixes(p, w, buck, remains - a_size); w += r_size; break; } } if (!a_size) /* Attributes not already encoded */ { put_u16(w, 0); w += 2; } if (wd_size || r_size) { BGP_TRACE(D_PACKETS, "Sending UPDATE"); return w; } else return NULL; } #else /* IPv6 version */ static byte * bgp_create_update(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; struct bgp_bucket *buck; int size, is_ll; int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4; byte *w, *tmp, *tstart; ip_addr ip, ip_ll; ea_list *ea; eattr *nh; neighbor *n; put_u16(buf, 0); w = buf+4; if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) { DBG("Withdrawn routes:\n"); tmp = bgp_attach_attr(&ea, bgp_linpool, BA_MP_UNREACH_NLRI, remains-8); *tmp++ = 0; *tmp++ = BGP_AF_IPV6; *tmp++ = 1; ea->attrs[0].u.ptr->length = bgp_encode_prefixes(p, tmp, buck, remains-11); size = bgp_encode_attrs(w, ea, remains); w += size; remains -= size; } if (remains >= 2048) { while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next) { if (EMPTY_LIST(buck->prefixes)) { DBG("Deleting empty bucket %p\n", buck); rem_node(&buck->send_node); bgp_free_bucket(p, buck); continue; } DBG("Processing bucket %p\n", buck); size = bgp_encode_attrs(w, buck->eattrs, 1024); w += size; remains -= size; tstart = tmp = bgp_attach_attr(&ea, bgp_linpool, BA_MP_REACH_NLRI, remains-8); *tmp++ = 0; *tmp++ = BGP_AF_IPV6; *tmp++ = 1; nh = ea_find(buck->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); ASSERT(nh); ip = *(ip_addr *) nh->u.ptr->data; if (ipa_equal(ip, p->local_addr)) is_ll = 1; else { n = neigh_find(&p->p, &ip, 0); if (n && n->iface == p->neigh->iface) is_ll = 1; else is_ll = 0; } if (is_ll) { *tmp++ = 32; ip_ll = ipa_or(ipa_build(0xfe80,0,0,0), ipa_and(ip, ipa_build(0,0,~0,~0))); ipa_hton(ip); memcpy(tmp, &ip, 16); ipa_hton(ip_ll); memcpy(tmp+16, &ip_ll, 16); tmp += 32; } else { *tmp++ = 16; ipa_hton(ip); memcpy(tmp, &ip, 16); tmp += 16; } *tmp++ = 0; /* No SNPA information */ tmp += bgp_encode_prefixes(p, tmp, buck, remains - (8+3+32+1)); ea->attrs[0].u.ptr->length = tmp - tstart; w += bgp_encode_attrs(w, ea, remains); break; } } size = w - (buf+4); put_u16(buf+2, size); lp_flush(bgp_linpool); if (size) { BGP_TRACE(D_PACKETS, "Sending UPDATE"); return w; } else return NULL; } #endif static void bgp_create_header(byte *buf, unsigned int len, unsigned int type) { memset(buf, 0xff, 16); /* Marker */ put_u16(buf+16, len); buf[18] = type; } static int bgp_fire_tx(struct bgp_conn *conn) { struct bgp_proto *p = conn->bgp; unsigned int s = conn->packets_to_send; sock *sk = conn->sk; byte *buf, *pkt, *end; int type; if (!sk) { conn->packets_to_send = 0; return 0; } buf = sk->tbuf; pkt = buf + BGP_HEADER_LENGTH; if (s & (1 << PKT_SCHEDULE_CLOSE)) { bgp_close_conn(conn); return 0; } if (s & (1 << PKT_NOTIFICATION)) { s = 1 << PKT_SCHEDULE_CLOSE; type = PKT_NOTIFICATION; end = bgp_create_notification(conn, pkt); } else if (s & (1 << PKT_KEEPALIVE)) { s &= ~(1 << PKT_KEEPALIVE); type = PKT_KEEPALIVE; end = pkt; /* Keepalives carry no data */ BGP_TRACE(D_PACKETS, "Sending KEEPALIVE"); bgp_start_timer(conn->keepalive_timer, conn->keepalive_time); } else if (s & (1 << PKT_OPEN)) { s &= ~(1 << PKT_OPEN); type = PKT_OPEN; end = bgp_create_open(conn, pkt); } else if (s & (1 << PKT_UPDATE)) { end = bgp_create_update(conn, pkt); type = PKT_UPDATE; if (!end) { conn->packets_to_send = 0; return 0; } } else return 0; conn->packets_to_send = s; bgp_create_header(buf, end - buf, type); return sk_send(sk, end - buf); } void bgp_schedule_packet(struct bgp_conn *conn, int type) { DBG("BGP: Scheduling packet type %d\n", type); conn->packets_to_send |= 1 << type; if (conn->sk && conn->sk->tpos == conn->sk->tbuf) while (bgp_fire_tx(conn)) ; } void bgp_tx(sock *sk) { struct bgp_conn *conn = sk->data; DBG("BGP: TX hook\n"); while (bgp_fire_tx(conn)) ; } static int bgp_parse_options(struct bgp_conn *conn, byte *opt, int len) { while (len > 0) { if (len < 2 || len < 2 + opt[1]) { bgp_error(conn, 2, 0, NULL, 0); return 0; } #ifdef LOCAL_DEBUG { int i; DBG("\tOption %02x:", opt[0]); for(i=0; i<opt[1]; i++) DBG(" %02x", opt[2+i]); DBG("\n"); } #endif switch (opt[0]) { case 2: /* Defined in draft-ietf-idr-bgp4-cap-neg-06 */ /* We can safely ignore all capabilities */ break; default: /* * BGP specs don't tell us to send which option * we didn't recognize, but it's common practice * to do so. Also, capability negotiation with * Cisco routers doesn't work without that. */ bgp_error(conn, 2, 4, opt, opt[1]); return 0; } len -= 2 + opt[1]; opt += 2 + opt[1]; } return 0; } static void bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) { struct bgp_conn *other; struct bgp_proto *p = conn->bgp; struct bgp_config *cf = p->cf; unsigned as, hold; u32 id; /* Check state */ if (conn->state != BS_OPENSENT) { bgp_error(conn, 5, 0, NULL, 0); } /* Check message contents */ if (len < 29 || len != 29 + pkt[28]) { bgp_error(conn, 1, 2, pkt+16, 2); return; } if (pkt[19] != BGP_VERSION) { bgp_error(conn, 2, 1, pkt+19, 1); return; } /* RFC 1771 says 16 bits, draft-09 tells to use 8 */ as = get_u16(pkt+20); hold = get_u16(pkt+22); id = get_u32(pkt+24); BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%08x)", as, hold, id); if (cf->remote_as && as != p->remote_as) { bgp_error(conn, 2, 2, pkt+20, -2); return; } if (hold > 0 && hold < 3) { bgp_error(conn, 2, 6, pkt+22, 2); return; } p->remote_id = id; if (bgp_parse_options(conn, pkt+29, pkt[28])) return; if (!id || id == 0xffffffff || id == p->local_id) { bgp_error(conn, 2, 3, pkt+24, -4); return; } /* Check the other connection */ other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; switch (other->state) { case BS_IDLE: break; case BS_CONNECT: case BS_ACTIVE: case BS_OPENSENT: BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection"); bgp_close_conn(other); break; case BS_OPENCONFIRM: if ((p->local_id < id) == (conn == &p->incoming_conn)) { /* Should close the other connection */ BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection"); bgp_error(other, 6, 0, NULL, 0); break; } /* Fall thru */ case BS_ESTABLISHED: /* Should close this connection */ BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection"); bgp_error(conn, 6, 0, NULL, 0); return; default: bug("bgp_rx_open: Unknown state"); } /* Make this connection primary */ conn->primary = 1; p->conn = conn; /* Update our local variables */ if (hold < p->cf->hold_time) conn->hold_time = hold; else conn->hold_time = p->cf->hold_time; conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3; p->remote_as = as; p->remote_id = id; DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id); bgp_schedule_packet(conn, PKT_KEEPALIVE); bgp_start_timer(conn->hold_timer, conn->hold_time); conn->state = BS_OPENCONFIRM; } #define DECODE_PREFIX(pp, ll) do { \ int b = *pp++; \ int q; \ ll--; \ if (b > BITS_PER_IP_ADDRESS) { err=10; goto bad; } \ q = (b+7) / 8; \ if (ll < q) { err=1; goto bad; } \ memcpy(&prefix, pp, q); \ pp += q; \ ll -= q; \ ipa_ntoh(prefix); \ prefix = ipa_and(prefix, ipa_mkmask(b)); \ pxlen = b; \ } while (0) static inline int bgp_get_nexthop(struct bgp_proto *bgp, rta *a) { neighbor *neigh; ip_addr nexthop; struct eattr *nh = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); ASSERT(nh); nexthop = *(ip_addr *) nh->u.ptr->data; if (ipa_equal(nexthop, bgp->local_addr)) { DBG("BGP: Loop!\n"); return 0; } neigh = neigh_find(&bgp->p, &nexthop, 0) ? : bgp->neigh; a->gw = neigh->addr; a->iface = neigh->iface; return 1; } #ifndef IPV6 /* IPv4 version */ static void bgp_do_rx_update(struct bgp_conn *conn, byte *withdrawn, int withdrawn_len, byte *nlri, int nlri_len, byte *attrs, int attr_len) { struct bgp_proto *p = conn->bgp; rta *a0; rta *a = NULL; ip_addr prefix; net *n; rte e; int err = 0, pxlen; /* Withdraw routes */ while (withdrawn_len) { DECODE_PREFIX(withdrawn, withdrawn_len); DBG("Withdraw %I/%d\n", prefix, pxlen); if (n = net_find(p->p.table, prefix, pxlen)) rte_update(p->p.table, n, &p->p, NULL); } if (!attr_len && !nlri_len) /* shortcut */ return; a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, nlri_len); if (a0 && nlri_len && bgp_get_nexthop(p, a0)) { a = rta_lookup(a0); while (nlri_len) { rte *e; DECODE_PREFIX(nlri, nlri_len); DBG("Add %I/%d\n", prefix, pxlen); e = rte_get_temp(rta_clone(a)); n = net_get(p->p.table, prefix, pxlen); e->net = n; e->pflags = 0; rte_update(p->p.table, n, &p->p, e); } } bad: if (a) rta_free(a); if (err) bgp_error(conn, 3, err, NULL, 0); return; } #else /* IPv6 version */ #define DO_NLRI(name) \ start = x = p->name##_start; \ len = len0 = p->name##_len; \ if (len) \ { \ if (len < 3) goto bad; \ af = get_u16(x); \ sub = x[2]; \ x += 3; \ len -= 3; \ DBG("\tNLRI AF=%d sub=%d len=%d\n", af, sub, len);\ } \ else \ af = 0; \ if (af == BGP_AF_IPV6) static void bgp_do_rx_update(struct bgp_conn *conn, byte *withdrawn, int withdrawn_len, byte *nlri, int nlri_len, byte *attrs, int attr_len) { struct bgp_proto *p = conn->bgp; byte *start, *x; int len, len0; unsigned af, sub; rta *a0; rta *a = NULL; ip_addr prefix; net *n; rte e; int err = 0, pxlen; p->mp_reach_len = 0; p->mp_unreach_len = 0; a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, 0); if (!a0) return; DO_NLRI(mp_unreach) { while (len) { DECODE_PREFIX(x, len); DBG("Withdraw %I/%d\n", prefix, pxlen); if (n = net_find(p->p.table, prefix, pxlen)) rte_update(p->p.table, n, &p->p, NULL); } } DO_NLRI(mp_reach) { int i; /* Create fake NEXT_HOP attribute */ if (len < 1 || (*x != 16 && *x != 32) || len < *x + 2) goto bad; memcpy(bgp_attach_attr(&a0->eattrs, bgp_linpool, BA_NEXT_HOP, 16), x+1, 16); len -= *x + 1; x += *x + 1; /* Ignore SNPA info */ i = *x++; while (i--) { if (len < 1 || len < 1 + *x) goto bad; len -= *x + 1; x += *x + 1; } if (bgp_get_nexthop(p, a0)) { a = rta_lookup(a0); while (len) { rte *e; DECODE_PREFIX(x, len); DBG("Add %I/%d\n", prefix, pxlen); e = rte_get_temp(rta_clone(a)); n = net_get(p->p.table, prefix, pxlen); e->net = n; e->pflags = 0; rte_update(p->p.table, n, &p->p, e); } rta_free(a); } } return; bad: bgp_error(conn, 3, 9, start, len0); if (a) rta_free(a); return; } #endif static void bgp_rx_update(struct bgp_conn *conn, byte *pkt, int len) { struct bgp_proto *p = conn->bgp; byte *withdrawn, *attrs, *nlri; int withdrawn_len, attr_len, nlri_len; BGP_TRACE(D_PACKETS, "Got UPDATE"); if (conn->state != BS_ESTABLISHED) { bgp_error(conn, 5, 0, NULL, 0); return; } bgp_start_timer(conn->hold_timer, conn->hold_time); /* Find parts of the packet and check sizes */ if (len < 23) { bgp_error(conn, 1, 2, pkt+16, 2); return; } withdrawn = pkt + 21; withdrawn_len = get_u16(pkt + 19); if (withdrawn_len + 23 > len) goto malformed; attrs = withdrawn + withdrawn_len + 2; attr_len = get_u16(attrs - 2); if (withdrawn_len + attr_len + 23 > len) goto malformed; nlri = attrs + attr_len; nlri_len = len - withdrawn_len - attr_len - 23; if (!attr_len && nlri_len) goto malformed; DBG("Sizes: withdrawn=%d, attrs=%d, NLRI=%d\n", withdrawn_len, attr_len, nlri_len); lp_flush(bgp_linpool); bgp_do_rx_update(conn, withdrawn, withdrawn_len, nlri, nlri_len, attrs, attr_len); return; malformed: bgp_error(conn, 3, 1, NULL, 0); } static struct { byte major, minor; byte *msg; } bgp_msg_table[] = { { 1, 0, "Invalid message header" }, { 1, 1, "Connection not synchronized" }, { 1, 2, "Bad message length" }, { 1, 3, "Bad message type" }, { 2, 0, "Invalid OPEN message" }, { 2, 1, "Unsupported version number" }, { 2, 2, "Bad peer AS" }, { 2, 3, "Bad BGP identifier" }, { 2, 4, "Unsupported optional parameter" }, { 2, 5, "Authentication failure" }, { 2, 6, "Unacceptable hold time" }, { 2, 7, "Required capability missing" }, /* capability negotiation draft */ { 3, 0, "Invalid UPDATE message" }, { 3, 1, "Malformed attribute list" }, { 3, 2, "Unrecognized well-known attribute" }, { 3, 3, "Missing mandatory attribute" }, { 3, 4, "Invalid attribute flags" }, { 3, 5, "Invalid attribute length" }, { 3, 6, "Invalid ORIGIN attribute" }, { 3, 7, "AS routing loop" }, /* Deprecated */ { 3, 8, "Invalid NEXT_HOP attribute" }, { 3, 9, "Optional attribute error" }, { 3, 10, "Invalid network field" }, { 3, 11, "Malformed AS_PATH" }, { 4, 0, "Hold timer expired" }, { 5, 0, "Finite state machine error" }, { 6, 0, "Cease" } }; void bgp_log_error(struct bgp_proto *p, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len) { byte *name, namebuf[16]; byte *t, argbuf[36]; unsigned i; if (code == 6 && !subcode) /* Don't report Cease messages */ return; bsprintf(namebuf, "%d.%d", code, subcode); name = namebuf; for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++) if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode) { name = bgp_msg_table[i].msg; break; } t = argbuf; if (len) { *t++ = ':'; *t++ = ' '; if (len > 16) len = 16; for (i=0; i<len; i++) t += bsprintf(t, "%02x", data[i]); } *t = 0; log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, name, argbuf); } static void bgp_rx_notification(struct bgp_conn *conn, byte *pkt, int len) { if (len < 21) { bgp_error(conn, 1, 2, pkt+16, 2); return; } bgp_log_error(conn->bgp, "Received error notification", pkt[19], pkt[20], pkt+21, len-21); conn->error_flag = 1; if (conn->primary) proto_notify_state(&conn->bgp->p, PS_STOP); bgp_schedule_packet(conn, PKT_SCHEDULE_CLOSE); } static void bgp_rx_keepalive(struct bgp_conn *conn, byte *pkt, unsigned len) { struct bgp_proto *p = conn->bgp; BGP_TRACE(D_PACKETS, "Got KEEPALIVE"); bgp_start_timer(conn->hold_timer, conn->hold_time); switch (conn->state) { case BS_OPENCONFIRM: DBG("BGP: UP!!!\n"); conn->state = BS_ESTABLISHED; bgp_attr_init(conn->bgp); proto_notify_state(&conn->bgp->p, PS_UP); break; case BS_ESTABLISHED: break; default: bgp_error(conn, 5, 0, NULL, 0); } } static void bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len) { DBG("BGP: Got packet %02x (%d bytes)\n", pkt[18], len); switch (pkt[18]) { case PKT_OPEN: return bgp_rx_open(conn, pkt, len); case PKT_UPDATE: return bgp_rx_update(conn, pkt, len); case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len); case PKT_KEEPALIVE: return bgp_rx_keepalive(conn, pkt, len); default: bgp_error(conn, 1, 3, pkt+18, 1); } } int bgp_rx(sock *sk, int size) { struct bgp_conn *conn = sk->data; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; unsigned i, len; DBG("BGP: RX hook: Got %d bytes\n", size); while (end >= pkt_start + BGP_HEADER_LENGTH) { if (conn->error_flag) { /* * We still need to remember the erroneous packet, so that * we can generate error notifications properly. To avoid * subsequent reads rewriting the buffer, we just reset the * rx_hook. */ DBG("BGP: Error, dropping input\n"); sk->rx_hook = NULL; return 0; } for(i=0; i<16; i++) if (pkt_start[i] != 0xff) { bgp_error(conn, 1, 1, NULL, 0); break; } len = get_u16(pkt_start+16); if (len < BGP_HEADER_LENGTH || len > BGP_MAX_PACKET_LENGTH) { bgp_error(conn, 1, 2, pkt_start+16, 2); break; } if (end < pkt_start + len) break; bgp_rx_packet(conn, pkt_start, len); pkt_start += len; } if (pkt_start != sk->rbuf) { memmove(sk->rbuf, pkt_start, end - pkt_start); sk->rpos = sk->rbuf + (end - pkt_start); } return 0; }