bird/sysdep/unix/io.c

2406 lines
49 KiB
C
Raw Permalink Normal View History

/*
* BIRD Internet Routing Daemon -- Unix I/O
*
* (c) 1998--2004 Martin Mares <mj@ucw.cz>
* (c) 2004 Ondrej Filip <feela@network.cz>
*
* Can be freely distributed and used under the terms of the GNU GPL.
*/
2010-04-14 21:35:08 +08:00
/* Unfortunately, some glibc versions hide parts of RFC 3542 API
if _GNU_SOURCE is not defined. */
2016-11-09 00:46:29 +08:00
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
2010-04-14 21:35:08 +08:00
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
2010-04-03 16:45:21 +08:00
#include <sys/uio.h>
1999-10-29 20:09:29 +08:00
#include <sys/un.h>
#include <poll.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
2014-05-18 17:42:26 +08:00
#include <net/if.h>
2011-03-24 00:15:11 +08:00
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/icmp6.h>
#include "nest/bird.h"
#include "lib/lists.h"
#include "lib/resource.h"
#include "lib/socket.h"
#include "lib/event.h"
#include "lib/timer.h"
#include "lib/string.h"
#include "nest/iface.h"
#include "conf/conf.h"
#include "sysdep/unix/unix.h"
#include CONFIG_INCLUDE_SYSIO_H
/* Maximum number of calls of tx handler for one socket in one
* poll iteration. Should be small enough to not monopolize CPU by
* one protocol instance.
*/
#define MAX_STEPS 4
/* Maximum number of calls of rx handler for all sockets in one poll
iteration. RX callbacks are often much more costly so we limit
this to get small latencies */
#define MAX_RX_STEPS 4
/*
* Tracked Files
*/
/* A stdio stream tracked as a resource (see rf_class). */
struct rfile {
/* Resource header; rf_free() and rf_dump() cast a resource pointer back to struct rfile */
resource r;
/* The wrapped stdio stream; closed by rf_free() */
FILE *f;
};
/* Free hook of rf_class: close the stdio stream wrapped by the resource. */
static void
rf_free(resource *r)
{
  fclose(((struct rfile *) r)->f);
}
/* Dump hook of rf_class: print the address of the wrapped stdio stream. */
static void
rf_dump(resource *r)
{
  FILE *stream = ((struct rfile *) r)->f;

  debug("(FILE *%p)\n", stream);
}
/*
 * Resource class describing struct rfile. The initializer is positional;
 * the meaning of each slot is defined by struct resclass (declared outside
 * this file).
 */
static struct resclass rf_class = {
/* Class name */
"FILE",
/* Size of one instance */
sizeof(struct rfile),
/* Called when the resource is freed: closes the stream */
rf_free,
/* Called to dump the resource for debugging */
rf_dump,
/* Remaining two hooks are left unset */
NULL,
NULL
};
/*
 * Open file @name with fopen() mode @mode and wrap the stream in a tracked
 * resource allocated from pool @p.
 *
 * Returns the new resource, or NULL when fopen() fails (nothing is allocated
 * in that case).
 */
struct rfile *
rf_open(pool *p, const char *name, const char *mode)
{
  struct rfile *r = NULL;
  FILE *f = fopen(name, mode);

  if (f)
  {
    r = ralloc(p, &rf_class);
    r->f = f;
  }

  return r;
}
/* Return the stdio stream wrapped by @f as an untyped pointer. */
void *
rf_file(struct rfile *f)
{
  FILE *stream = f->f;

  return stream;
}
/* Return the descriptor number that fileno() reports for the stream wrapped by @f. */
int
rf_fileno(struct rfile *f)
{
  FILE *stream = f->f;

  return fileno(stream);
}
2014-05-18 17:42:26 +08:00
/*
* Time clock
*/
btime boot_time;
/*
 * Initialize the clock state of @loop: store the current CLOCK_MONOTONIC
 * reading in last_time and reset real_time to 0.
 *
 * Dies when the monotonic clock cannot be read. A reading that is negative
 * or above 2^40 seconds only produces a warning.
 */
void
times_init(struct timeloop *loop)
{
  struct timespec ts;

  if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
    die("Monotonic clock is missing");

  int plausible = (ts.tv_sec >= 0) && (((u64) ts.tv_sec) <= ((u64) 1 << 40));
  if (!plausible)
    log(L_WARN "Monotonic clock is crazy");

  loop->last_time = ts.tv_sec S + ts.tv_nsec NS;
  loop->real_time = 0;
}
/*
 * Refresh @loop->last_time from CLOCK_MONOTONIC and reset real_time to 0.
 *
 * Dies when the clock cannot be read. If the new reading is older than the
 * previous one, an error is logged and the new value is stored anyway.
 */
void
times_update(struct timeloop *loop)
{
  struct timespec ts;

  if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
    die("clock_gettime: %m");

  btime mono = ts.tv_sec S + ts.tv_nsec NS;

  if (mono < loop->last_time)
    log(L_ERR "Monotonic clock is broken");

  loop->last_time = mono;
  loop->real_time = 0;
}
/*
 * Store the current CLOCK_REALTIME reading in @loop->real_time.
 * Dies when the clock cannot be read.
 */
void
times_update_real_time(struct timeloop *loop)
{
  struct timespec ts;

  if (clock_gettime(CLOCK_REALTIME, &ts) < 0)
    die("clock_gettime: %m");

  loop->real_time = ts.tv_sec S + ts.tv_nsec NS;
}
2000-06-05 20:19:12 +08:00
/**
* DOC: Sockets
*
* Socket resources represent network connections. Their data structure (&socket)
* contains a lot of fields defining the exact type of the socket, the local and
* remote addresses and ports, pointers to socket buffers and finally pointers to
* hook functions to be called when new data have arrived to the receive buffer
* (@rx_hook), when the contents of the transmit buffer have been transmitted
* (@tx_hook) and when an error or connection close occurs (@err_hook).
*
* Freeing of sockets from inside socket hooks is perfectly safe.
*/
1999-04-01 23:33:52 +08:00
#ifndef SOL_IP
#define SOL_IP IPPROTO_IP
#endif
#ifndef SOL_IPV6
#define SOL_IPV6 IPPROTO_IPV6
#endif
#ifndef SOL_ICMPV6
#define SOL_ICMPV6 IPPROTO_ICMPV6
#endif
2014-05-18 17:42:26 +08:00
/*
* Sockaddr helper functions
*/
/* Size of the sockaddr structure for @af: sockaddr_in for AF_INET, sockaddr_in6 for any other value. */
static inline int UNUSED sockaddr_length(int af)
{
  if (af == AF_INET)
    return sizeof(struct sockaddr_in);

  return sizeof(struct sockaddr_in6);
}
/*
 * Fill @sa as an IPv4 socket address with address @a and port @port
 * (converted to network byte order). The structure is zeroed first.
 */
static inline void
sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
{
  memset(sa, 0, sizeof(*sa));

#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
  sa->sin_len = sizeof(*sa);
#endif

  sa->sin_family = AF_INET;
  sa->sin_addr = ipa_to_in4(a);
  sa->sin_port = htons(port);
}
2014-05-18 17:42:26 +08:00
/*
 * Fill @sa as an IPv6 socket address with address @a and port @port
 * (converted to network byte order). The structure is zeroed first.
 * The scope id is set to the index of @ifa only when @ifa is given and
 * @a is link-local; otherwise it stays 0.
 */
static inline void
sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
{
  memset(sa, 0, sizeof(*sa));

#ifdef SIN6_LEN
  sa->sin6_len = sizeof(*sa);
#endif

  sa->sin6_family = AF_INET6;
  sa->sin6_flowinfo = 0;
  sa->sin6_addr = ipa_to_in6(a);
  sa->sin6_port = htons(port);

  if (!ifa || !ipa_is_link_local(a))
    return;

  sa->sin6_scope_id = ifa->index;
}
2014-05-18 17:42:26 +08:00
/*
 * Fill the generic socket address @sa for address family @af from address
 * @a, interface @ifa (used only for IPv6) and @port.
 * Any family other than AF_INET / AF_INET6 is reported through bug().
 */
void
sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
{
  switch (af)
  {
  case AF_INET:
    sockaddr_fill4((struct sockaddr_in *) sa, a, port);
    break;

  case AF_INET6:
    sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
    break;

  default:
    bug("Unknown AF");
  }
}
2014-05-18 17:42:26 +08:00
/* Extract address (@a) and host-order port (@port) from the IPv4 socket address @sa. */
static inline void
sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
{
  *a = ipa_from_in4(sa->sin_addr);
  *port = ntohs(sa->sin_port);
}
2014-05-18 17:42:26 +08:00
/*
 * Extract address (@a) and host-order port (@port) from the IPv6 socket
 * address @sa. When @ifa is non-NULL and the address is link-local, *@ifa is
 * set to the result of looking up the scope id as an interface index;
 * otherwise *@ifa is left untouched.
 */
static inline void
sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
{
  *a = ipa_from_in6(sa->sin6_addr);
  *port = ntohs(sa->sin6_port);

  if (!ifa || !ipa_is_link_local(*a))
    return;

  *ifa = if_find_by_index(sa->sin6_scope_id);
}
2014-05-18 17:42:26 +08:00
/*
 * Decode the generic socket address @sa, which is expected to be of family
 * @af, into @a, @port and (IPv6 only) @ifa.
 *
 * Returns 0 on success. Returns -1 when the stored family differs from @af
 * or @af is neither AF_INET nor AF_INET6; in that case *@a is set to
 * IPA_NONE and *@port to 0 (@ifa is not touched).
 */
int
sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
{
  if (sa->sa.sa_family == af)
  {
    switch (af)
    {
    case AF_INET:
      sockaddr_read4((struct sockaddr_in *) sa, a, port);
      return 0;

    case AF_INET6:
      sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
      return 0;

    default:
      break;
    }
  }

  *a = IPA_NONE;
  *port = 0;
  return -1;
}
2014-05-18 17:42:26 +08:00
/*
* IPv6 multicast syscalls
*/
2014-05-18 17:42:26 +08:00
/* Fortunately standardized in RFC 3493 */
2014-05-18 17:42:26 +08:00
/*
 * Initializer for struct ipv6_mreq: multicast group @maddr on the interface
 * pointed to by @ifa. The @ifa argument is parenthesized so that any pointer
 * expression (not just a plain identifier or member access) expands correctly.
 */
#define INIT_MREQ6(maddr,ifa) \
  { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = (ifa)->index }
2014-05-18 17:42:26 +08:00
/*
 * Configure IPv6 multicast transmission on @s: outgoing interface is
 * s->iface, hop limit is s->ttl and multicast loopback is switched off.
 *
 * Returns 0 on success. On a failed setsockopt() the ERR() macro (defined
 * outside this chunk; presumably records the option name and returns -1)
 * takes over.
 */
static inline int
sk_setup_multicast6(sock *s)
{
  int ifindex = s->iface->index;
  int hops = s->ttl;
  int loopback = 0;

  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &ifindex, sizeof(ifindex)) < 0)
    ERR("IPV6_MULTICAST_IF");

  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &hops, sizeof(hops)) < 0)
    ERR("IPV6_MULTICAST_HOPS");

  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &loopback, sizeof(loopback)) < 0)
    ERR("IPV6_MULTICAST_LOOP");

  return 0;
}
2014-05-18 17:42:26 +08:00
/*
 * Join IPv6 multicast group @maddr on the interface s->iface.
 * Returns 0 on success; failure is handled by ERR() (defined elsewhere).
 */
static inline int
sk_join_group6(sock *s, ip_addr maddr)
{
  struct ipv6_mreq req = INIT_MREQ6(maddr, s->iface);
  int rv = setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &req, sizeof(req));

  if (rv < 0)
    ERR("IPV6_JOIN_GROUP");

  return 0;
}
2014-05-18 17:42:26 +08:00
/*
 * Leave IPv6 multicast group @maddr on the interface s->iface.
 * Returns 0 on success; failure is handled by ERR() (defined elsewhere).
 */
static inline int
sk_leave_group6(sock *s, ip_addr maddr)
{
  struct ipv6_mreq req = INIT_MREQ6(maddr, s->iface);
  int rv = setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &req, sizeof(req));

  if (rv < 0)
    ERR("IPV6_LEAVE_GROUP");

  return 0;
}
2010-04-02 17:31:20 +08:00
2014-05-18 17:42:26 +08:00
/*
* IPv6 packet control messages
*/
2010-04-02 17:31:20 +08:00
2014-05-18 17:42:26 +08:00
/* Also standardized, in RFC 3542 */
2010-04-02 17:31:20 +08:00
2010-04-14 20:46:21 +08:00
/*
* RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
* type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
* don't have IPV6_RECVPKTINFO we suppose the OS implements the older
* RFC and we use IPV6_PKTINFO.
*/
#ifndef IPV6_RECVPKTINFO
#define IPV6_RECVPKTINFO IPV6_PKTINFO
#endif
/*
* Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
*/
#ifndef IPV6_RECVHOPLIMIT
#define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
#endif
2010-04-14 20:46:21 +08:00
2014-05-18 17:42:26 +08:00
#define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
#define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
2010-04-02 17:31:20 +08:00
2014-05-18 17:42:26 +08:00
/*
 * Ask the kernel to deliver IPv6 packet-info control messages on @s by
 * enabling IPV6_RECVPKTINFO.
 * Returns 0 on success; failure is handled by ERR() (defined elsewhere).
 */
static inline int
sk_request_cmsg6_pktinfo(sock *s)
{
  int enable = 1;
  int rv = setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &enable, sizeof(enable));

  if (rv < 0)
    ERR("IPV6_RECVPKTINFO");

  return 0;
}
2014-05-18 17:42:26 +08:00
/*
 * Ask the kernel to deliver the received hop limit as a control message on
 * @s by enabling IPV6_RECVHOPLIMIT.
 * Returns 0 on success; failure is handled by ERR() (defined elsewhere).
 */
static inline int
sk_request_cmsg6_ttl(sock *s)
{
  int enable = 1;
  int rv = setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &enable, sizeof(enable));

  if (rv < 0)
    ERR("IPV6_RECVHOPLIMIT");

  return 0;
}
2014-05-18 17:42:26 +08:00
/*
 * If control message @cm is of type IPV6_PKTINFO, store the destination
 * address and the receiving interface index it carries in s->laddr and
 * s->lifindex. Other message types are ignored; the cmsg level is not
 * checked here.
 *
 * The payload is copied out with memcpy() because the pointer returned by
 * CMSG_DATA() is not guaranteed to be suitably aligned for direct access
 * as a struct in6_pktinfo.
 */
static inline void
sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
{
  struct in6_pktinfo pi;

  if (cm->cmsg_type != IPV6_PKTINFO)
    return;

  memcpy(&pi, CMSG_DATA(cm), sizeof(pi));
  s->laddr = ipa_from_in6(pi.ipi6_addr);
  s->lifindex = pi.ipi6_ifindex;
}
2014-05-18 17:42:26 +08:00
/*
 * If control message @cm is of type IPV6_HOPLIMIT, store the hop limit it
 * carries in s->rcv_ttl. Other message types are ignored; the cmsg level is
 * not checked here.
 *
 * The value is copied out with memcpy() because the pointer returned by
 * CMSG_DATA() is not guaranteed to be suitably aligned for an int access.
 */
static inline void
sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
{
  int hop_limit;

  if (cm->cmsg_type != IPV6_HOPLIMIT)
    return;

  memcpy(&hop_limit, CMSG_DATA(cm), sizeof(hop_limit));
  s->rcv_ttl = hop_limit;
}
2014-05-18 17:42:26 +08:00
/*
 * Attach an IPV6_PKTINFO control message to @msg, built in the caller's
 * buffer @cbuf of @cbuflen bytes. The message carries the source address
 * s->saddr and the index of s->iface (0 when no interface is set).
 * On return msg->msg_controllen covers exactly this one control message.
 *
 * The caller must supply a buffer large enough for one pktinfo message
 * (CMSG6_SPACE_PKTINFO); this is not checked here.
 *
 * The payload is assembled in a local structure and copied in with memcpy()
 * because the pointer returned by CMSG_DATA() is not guaranteed to be
 * suitably aligned for direct access as a struct in6_pktinfo.
 */
static inline void
sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
{
  struct in6_pktinfo pi;
  struct cmsghdr *cm;

  /* The full buffer must be attached before CMSG_FIRSTHDR() is used */
  msg->msg_control = cbuf;
  msg->msg_controllen = cbuflen;

  cm = CMSG_FIRSTHDR(msg);
  cm->cmsg_level = SOL_IPV6;
  cm->cmsg_type = IPV6_PKTINFO;
  cm->cmsg_len = CMSG_LEN(sizeof(pi));

  pi.ipi6_ifindex = s->iface ? s->iface->index : 0;
  pi.ipi6_addr = ipa_to_in6(s->saddr);
  memcpy(CMSG_DATA(cm), &pi, sizeof(pi));

  /* Shrink the control length to what was actually filled in */
  msg->msg_controllen = CMSG_SPACE(sizeof(pi));
}
2010-04-02 17:31:20 +08:00
2014-05-18 17:42:26 +08:00
/*
* Miscellaneous socket syscalls
*/
/*
 * Set the IPv4 TTL of outgoing packets on @s to @ttl (IP_TTL).
 * Returns 0 on success; failure is handled by ERR() (defined elsewhere).
 */
static inline int
sk_set_ttl4(sock *s, int ttl)
{
  int rv = setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl));

  if (rv < 0)
    ERR("IP_TTL");

  return 0;
}
2014-05-18 17:42:26 +08:00
/*
 * Set the IPv6 unicast hop limit of outgoing packets on @s to @ttl
 * (IPV6_UNICAST_HOPS).
 * Returns 0 on success; failure is handled by ERR() (defined elsewhere).
 */
static inline int
sk_set_ttl6(sock *s, int ttl)
{
  int rv = setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl));

  if (rv < 0)
    ERR("IPV6_UNICAST_HOPS");

  return 0;
}
/*
 * Set the IPv4 type-of-service value of outgoing packets on @s to @tos (IP_TOS).
 * Returns 0 on success; failure is handled by ERR() (defined elsewhere).
 */
static inline int
sk_set_tos4(sock *s, int tos)
{
  int rv = setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos));

  if (rv < 0)
    ERR("IP_TOS");

  return 0;
}
2014-05-18 17:42:26 +08:00
/*
 * Set the IPv6 traffic class of outgoing packets on @s to @tos (IPV6_TCLASS).
 * Returns 0 on success; failure is handled by ERR() (defined elsewhere).
 */
static inline int
sk_set_tos6(sock *s, int tos)
{
  int rv = setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos));

  if (rv < 0)
    ERR("IPV6_TCLASS");

  return 0;
}
2015-04-19 06:19:56 +08:00
/*
 * Request that the socket gets its local port from the high port range,
 * using IP_PORTRANGE / IPV6_PORTRANGE where the platform defines them.
 * On platforms without these options the function does nothing.
 *
 * Returns 0 on success (or when unsupported); a failing setsockopt() is
 * handled by ERR() (defined elsewhere).
 */
static inline int
sk_set_high_port(sock *s UNUSED)
{
  /* Port range setting is optional, ignore it if not supported */

#ifdef IP_PORTRANGE
  if (sk_is_ipv4(s))
  {
    int high = IP_PORTRANGE_HIGH;

    if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &high, sizeof(high)) < 0)
      ERR("IP_PORTRANGE");
  }
#endif

#ifdef IPV6_PORTRANGE
  if (sk_is_ipv6(s))
  {
    int high = IPV6_PORTRANGE_HIGH;

    if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &high, sizeof(high)) < 0)
      ERR("IPV6_PORTRANGE");
  }
#endif

  return 0;
}
2014-10-24 17:11:43 +08:00
/*
 * Skip the IPv4 header at the start of packet @pkt of length *@len.
 *
 * Returns a pointer just past the header and reduces *@len by the header
 * length. Returns NULL (leaving *@len unchanged) when the packet is shorter
 * than 20 bytes, the version nibble is not 4, or the header length field is
 * below 20 bytes or exceeds the packet length.
 */
static inline byte *
sk_skip_ip_header(byte *pkt, int *len)
{
  int total = *len;

  /* The first byte is inspected only when a minimal header is present */
  if (total < 20)
    return NULL;

  if ((pkt[0] & 0xf0) != 0x40)
    return NULL;

  /* IHL is counted in 32-bit words */
  int hlen = (pkt[0] & 0x0f) * 4;

  if ((hlen < 20) || (hlen > total))
    return NULL;

  *len = total - hlen;
  return pkt + hlen;
}
/*
 * Return the start of the received payload in the receive buffer of @s.
 * For raw IPv4 sockets (SK_IP) the IP header is skipped and *@len adjusted,
 * and the result may be NULL when the header is malformed; for all other
 * sockets the receive buffer itself is returned and *@len is untouched.
 */
byte *
sk_rx_buffer(sock *s, int *len)
{
  int raw_ip4 = sk_is_ipv4(s) && (s->type == SK_IP);

  return raw_ip4 ? sk_skip_ip_header(s->rbuf, len) : s->rbuf;
}
2014-05-18 17:42:26 +08:00
/*
* Public socket functions
*/
2014-05-18 17:42:26 +08:00
/**
* sk_setup_multicast - enable multicast for given socket
* @s: socket
*
* Prepare transmission of multicast packets for given datagram socket.
* The socket must have defined @iface.
*
* Result: 0 for success, -1 for an error.
*/
2014-05-18 17:42:26 +08:00
int
sk_setup_multicast(sock *s)
{
ASSERT(s->iface);
2014-05-18 17:42:26 +08:00
if (sk_is_ipv4(s))
return sk_setup_multicast4(s);
else
return sk_setup_multicast6(s);
}
2014-05-18 17:42:26 +08:00
/**
* sk_join_group - join multicast group for given socket
* @s: socket
* @maddr: multicast address
*
* Join multicast group for given datagram socket and associated interface.
* The socket must have defined @iface.
*
* Result: 0 for success, -1 for an error.
*/
2014-05-18 17:42:26 +08:00
int
sk_join_group(sock *s, ip_addr maddr)
{
if (sk_is_ipv4(s))
return sk_join_group4(s, maddr);
else
return sk_join_group6(s, maddr);
}
2014-05-18 17:42:26 +08:00
/**
* sk_leave_group - leave multicast group for given socket
* @s: socket
* @maddr: multicast address
*
* Leave multicast group for given datagram socket and associated interface.
* The socket must have defined @iface.
*
* Result: 0 for success, -1 for an error.
*/
2014-05-18 17:42:26 +08:00
int
sk_leave_group(sock *s, ip_addr maddr)
{
if (sk_is_ipv4(s))
return sk_leave_group4(s, maddr);
else
return sk_leave_group6(s, maddr);
}
/**
 * sk_setup_broadcast - enable broadcast for given socket
 * @s: socket
 *
 * Allow reception and transmission of broadcast packets on the given
 * datagram socket by enabling SO_BROADCAST. The socket is expected to have
 * @iface set (not checked here). For transmission, packets should be sent to
 * the @brd address of @iface.
 *
 * Result: 0 for success, -1 for an error.
 */
int
sk_setup_broadcast(sock *s)
{
  int enable = 1;
  int rv = setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &enable, sizeof(enable));

  if (rv < 0)
    ERR("SO_BROADCAST");

  return 0;
}
/**
 * sk_set_ttl - set transmit TTL for given socket
 * @s: socket
 * @ttl: TTL value
 *
 * Set TTL for an already opened connection when TTL was not set before.
 * Useful for accepted connections when different ones should have different
 * TTL. The value is remembered in the socket before the system call is
 * attempted, so it is stored even when the call fails.
 *
 * Result: 0 for success, -1 for an error.
 */
int
sk_set_ttl(sock *s, int ttl)
{
  s->ttl = ttl;

  return sk_is_ipv4(s) ? sk_set_ttl4(s, ttl) : sk_set_ttl6(s, ttl);
}
/**
 * sk_set_min_ttl - set minimal accepted TTL for given socket
 * @s: socket
 * @ttl: TTL value
 *
 * Set minimal accepted TTL for given socket. Can be used in TTL security
 * implementations. The work is delegated to the IPv4 or IPv6 variant
 * according to the socket's address family.
 *
 * Result: 0 for success, -1 for an error.
 */
int
sk_set_min_ttl(sock *s, int ttl)
{
  return sk_is_ipv4(s) ? sk_set_min_ttl4(s, ttl) : sk_set_min_ttl6(s, ttl);
}
2014-05-18 17:42:26 +08:00
#if 0
/**
2014-05-18 17:42:26 +08:00
* sk_set_md5_auth - add / remove MD5 security association for given socket
* @s: socket
* @local: IP address of local side
* @remote: IP address of remote side
* @ifa: Interface for link-local IP address
* @passwd: Password used for MD5 authentication
* @setkey: Update also system SA/SP database
*
* In TCP MD5 handling code in kernel, there is a set of security associations
* used for choosing password and other authentication parameters according to
* the local and remote address. This function is useful for listening socket,
* for active sockets it may be enough to set s->password field.
*
* When called with passwd != NULL, the new pair is added,
* When called with passwd == NULL, the existing pair is removed.
*
* Note that while in Linux, the MD5 SAs are specific to socket, in BSD they are
* stored in global SA/SP database (but the behavior also must be enabled on
* per-socket basis). In case of multiple sockets to the same neighbor, the
* socket-specific state must be configured for each socket while global state
* just once per src-dst pair. The @setkey argument controls whether the global
* state (SA/SP database) is also updated.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
2014-05-18 17:42:26 +08:00
{ DUMMY; }
#endif
2014-05-18 17:42:26 +08:00
/**
* sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
* @s: socket
* @offset: offset
*
* Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
* kernel will automatically fill it for outgoing packets and check it for
* incoming packets. Should not be used on ICMPv6 sockets, where the position is
* known to the kernel.
*
* Result: 0 for success, -1 for an error.
*/
2009-11-10 06:22:53 +08:00
int
sk_set_ipv6_checksum(sock *s, int offset)
{
if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
2014-05-18 17:42:26 +08:00
ERR("IPV6_CHECKSUM");
2009-11-10 06:22:53 +08:00
return 0;
}
int
2014-05-18 17:42:26 +08:00
sk_set_icmp6_filter(sock *s, int p1, int p2)
{
/* a bit of lame interface, but it is here only for Radv */
struct icmp6_filter f;
ICMP6_FILTER_SETBLOCKALL(&f);
ICMP6_FILTER_SETPASS(p1, &f);
ICMP6_FILTER_SETPASS(p2, &f);
if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
2014-05-18 17:42:26 +08:00
ERR("ICMP6_FILTER");
return 0;
}
2014-05-18 17:42:26 +08:00
/*
 * Log the error recorded in s->err at L_ERR level, prefixed by @p
 * (the caller's identification). The "%#m" part of the format is expanded
 * by the logging code.
 */
void
sk_log_error(sock *s, const char *p)
{
  const char *what = s->err;

  log(L_ERR "%s: Socket error: %s%#m", p, what);
}
/*
* Actual struct birdsock code
*/
static list sock_list;
static struct birdsock *current_sock;
static struct birdsock *stored_sock;
/*
 * Return the socket following @s in its list, or NULL when the following
 * node has no successor of its own (i.e. it is not a real list member).
 */
static inline sock *
sk_next(sock *s)
{
  return s->n.next->next ? SKIP_BACK(sock, n, s->n.next) : NULL;
}
/*
 * Make sure @s has its receive and transmit buffers and reset the buffer
 * positions. A buffer is allocated only when none is set yet and the
 * configured size is non-zero; an already set buffer is kept. The positions
 * rpos, tpos and ttx are always reset to the start of their buffer.
 */
static void
sk_alloc_bufs(sock *s)
{
  if (s->rbsize && !s->rbuf)
    s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
  s->rpos = s->rbuf;

  if (s->tbsize && !s->tbuf)
    s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
  s->tpos = s->ttx = s->tbuf;
}
/*
 * Release the buffers of @s that were allocated by the socket code itself
 * (rbuf_alloc / tbuf_alloc) and clear the corresponding pointers.
 * A buffer pointer whose *_alloc counterpart is NULL is left untouched.
 */
static void
sk_free_bufs(sock *s)
{
  byte *rx = s->rbuf_alloc;
  byte *tx = s->tbuf_alloc;

  if (rx)
  {
    xfree(rx);
    s->rbuf = s->rbuf_alloc = NULL;
  }

  if (tx)
  {
    xfree(tx);
    s->tbuf = s->tbuf_alloc = NULL;
  }
}
#ifdef HAVE_LIBSSH
/*
 * Tear down the libssh state attached to @s: close (if open) and free the
 * channel, then disconnect and free the session. s->ssh is detached from
 * the socket first; the ssh_sock structure itself is not freed here.
 * Does nothing when the socket has no SSH state.
 */
static void
sk_ssh_free(sock *s)
{
  struct ssh_sock *ssh = s->ssh;

  if (!ssh)
    return;

  s->ssh = NULL;

  if (ssh->channel)
  {
    if (ssh_channel_is_open(ssh->channel))
      ssh_channel_close(ssh->channel);

    ssh_channel_free(ssh->channel);
    ssh->channel = NULL;
  }

  if (ssh->session)
  {
    ssh_disconnect(ssh->session);
    ssh_free(ssh->session);
    ssh->session = NULL;
  }
}
#endif
2014-05-18 17:42:26 +08:00
/*
 * Free hook of sk_class: release everything held by the socket.
 *
 * Buffers (and SSH state, when built with libssh) are always released.
 * If the socket has a valid descriptor, it is additionally unlinked from
 * the socket list (unless flagged SKF_THREAD) and the descriptor is closed.
 * SSH sockets do not get close() called here, since their descriptor is
 * not closed by this code path.
 */
static void
sk_free(resource *r)
{
  sock *s = (sock *) r;

  sk_free_bufs(s);

#ifdef HAVE_LIBSSH
  if (s->type == SK_SSH || s->type == SK_SSH_ACTIVE)
    sk_ssh_free(s);
#endif

  if (s->fd >= 0)
  {
    /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
    if (!(s->flags & SKF_THREAD))
    {
      /* Keep the iteration cursors valid when the socket is freed from a hook */
      if (current_sock == s)
	current_sock = sk_next(s);
      if (stored_sock == s)
	stored_sock = sk_next(s);

      rem_node(&s->n);
    }

    if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE)
      close(s->fd);

    s->fd = -1;
  }
}
/*
 * Change the receive buffer size of @s to @val bytes. The old buffer is
 * freed and a new one allocated, so buffered data is discarded and rpos is
 * reset to the start. Requires that the socket uses its own allocated
 * receive buffer (asserted). Does nothing when the size is unchanged.
 */
void
sk_set_rbsize(sock *s, uint val)
{
  ASSERT(s->rbuf_alloc == s->rbuf);

  if (val != s->rbsize)
  {
    s->rbsize = val;
    xfree(s->rbuf_alloc);
    s->rbuf_alloc = xmalloc(val);
    s->rpos = s->rbuf = s->rbuf_alloc;
  }
}
/*
 * Change the transmit buffer size of @s to @val bytes, keeping the buffer
 * contents and the relative positions of tpos and ttx. Requires that the
 * socket uses its own allocated transmit buffer (asserted). Does nothing
 * when the size is unchanged.
 *
 * The offsets of tpos and ttx are computed before xrealloc() is called:
 * once the buffer has been reallocated, the old pointer value may refer to
 * freed memory and must no longer be used in pointer arithmetic.
 */
void
sk_set_tbsize(sock *s, uint val)
{
  ASSERT(s->tbuf_alloc == s->tbuf);

  if (s->tbsize == val)
    return;

  long tpos_off = s->tpos - s->tbuf;
  long ttx_off = s->ttx - s->tbuf;

  s->tbsize = val;
  s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
  s->tpos = s->tbuf + tpos_off;
  s->ttx = s->tbuf + ttx_off;
}
/*
 * Make @tbuf the transmit buffer of @s, or switch back to the socket's own
 * allocated buffer when @tbuf is NULL. Both transmit positions are reset to
 * the start of the selected buffer.
 */
void
sk_set_tbuf(sock *s, void *tbuf)
{
  s->tbuf = tbuf ? tbuf : s->tbuf_alloc;
  s->tpos = s->tbuf;
  s->ttx = s->tbuf;
}
/*
 * Drop the socket's own buffers and allocate them again according to the
 * current rbsize / tbsize; buffer positions are reset in the process.
 */
void
sk_reallocate(sock *s)
{
  sk_free_bufs(s);	/* release what the socket allocated itself */
  sk_alloc_bufs(s);	/* allocate afresh and reset positions */
}
/*
 * Dump hook of sk_class: print socket type, user data pointer, source and
 * destination address/port, ToS, TTL and interface name ("none" when no
 * interface is set). The type name is looked up by s->type in a fixed
 * table; no range check is done on the index.
 */
static void
sk_dump(resource *r)
{
  static const char * const type_label[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "SSH>", "SSH", "DEL!" };
  sock *s = (sock *) r;
  const char *ifname = s->iface ? s->iface->name : "none";

  debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
	type_label[s->type],
	s->data,
	s->saddr, s->sport,
	s->daddr, s->dport,
	s->tos, s->ttl,
	ifname);
}
/*
 * Resource class describing sockets. The initializer is positional; the
 * meaning of each slot is defined by struct resclass (declared outside this
 * file).
 */
static struct resclass sk_class = {
/* Class name */
"Socket",
/* Size of one instance */
sizeof(sock),
/* Called when the resource is freed: releases buffers and the descriptor */
sk_free,
/* Called to dump the resource for debugging */
sk_dump,
/* Remaining two hooks are left unset */
NULL,
NULL
};
/**
 * sk_new - create a socket
 * @p: pool
 *
 * This function creates a new socket resource in pool @p. The socket is
 * not usable yet: the caller has to fill in all the required fields of the
 * structure and then call sk_open() to do the actual opening. The fields
 * tos, priority and ttl start at -1 and fd at -1; the source and destination
 * addresses are not explicitly initialized here.
 *
 * The real function name is sock_new(), sk_new() is a macro wrapper
 * to avoid collision with OpenSSL.
 */
sock *
sock_new(pool *p)
{
  sock *s = ralloc(p, &sk_class);

  s->pool = p;
  s->fd = -1;
  s->tos = s->priority = s->ttl = -1;

  return s;
}
/*
 * Apply the configured options to the already created descriptor s->fd.
 *
 * Steps, in order:
 *  - SK_SSH_ACTIVE sockets are left alone;
 *  - the descriptor is switched to non-blocking mode;
 *  - sockets without an address family (s->af == 0) need nothing more;
 *  - SKF_PKTINFO is enabled when a source address is set without SKF_BIND
 *    (and replaced by SKF_HDRINCL for raw IPv4 sockets on platforms built
 *    with CONFIG_USE_HDRINCL);
 *  - the socket is bound to its interface, or to its VRF device when it has
 *    a VRF but no interface (only where SO_BINDTODEVICE exists);
 *  - family specific options: local address / TTL reception, MTU discovery
 *    for UDP and raw sockets, TTL and ToS;
 *  - finally the priority.
 *
 * Returns 0 on success and -1 on failure. Failing system calls are handled
 * by ERR() (defined outside this chunk; presumably records the name of the
 * failed option in s->err and returns -1).
 */
static int
sk_setup(sock *s)
{
  int y = 1;
  int fd = s->fd;

  if (s->type == SK_SSH_ACTIVE)
    return 0;

  if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
    ERR("O_NONBLOCK");

  if (!s->af)
    return 0;

  if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
    s->flags |= SKF_PKTINFO;

#ifdef CONFIG_USE_HDRINCL
  if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
  {
    s->flags &= ~SKF_PKTINFO;
    s->flags |= SKF_HDRINCL;

    if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
      ERR("IP_HDRINCL");
  }
#endif

  if (s->iface)
  {
#ifdef SO_BINDTODEVICE
    struct ifreq ifr = {};

    strcpy(ifr.ifr_name, s->iface->name);
    if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
      ERR("SO_BINDTODEVICE");
#endif

#ifdef CONFIG_UNIX_DONTROUTE
    if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
      ERR("SO_DONTROUTE");
#endif
  }
  else if (s->vrf)
  {
    /* Bind socket to associated VRF interface.
       This is Linux-specific, but so is SO_BINDTODEVICE. */
#ifdef SO_BINDTODEVICE
    struct ifreq ifr = {};

    strcpy(ifr.ifr_name, s->vrf->name);
    if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
      ERR("SO_BINDTODEVICE");
#endif
  }

  if (sk_is_ipv4(s))
  {
    if ((s->flags & SKF_LADDR_RX) && (sk_request_cmsg4_pktinfo(s) < 0))
      return -1;

    if ((s->flags & SKF_TTL_RX) && (sk_request_cmsg4_ttl(s) < 0))
      return -1;

    if (((s->type == SK_UDP) || (s->type == SK_IP)) && (sk_disable_mtu_disc4(s) < 0))
      return -1;

    if ((s->ttl >= 0) && (sk_set_ttl4(s, s->ttl) < 0))
      return -1;

    if ((s->tos >= 0) && (sk_set_tos4(s, s->tos) < 0))
      return -1;
  }

  if (sk_is_ipv6(s))
  {
    if ((s->type == SK_TCP_PASSIVE) || (s->type == SK_TCP_ACTIVE) || (s->type == SK_UDP))
      if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
	ERR("IPV6_V6ONLY");

    if ((s->flags & SKF_LADDR_RX) && (sk_request_cmsg6_pktinfo(s) < 0))
      return -1;

    if ((s->flags & SKF_TTL_RX) && (sk_request_cmsg6_ttl(s) < 0))
      return -1;

    if (((s->type == SK_UDP) || (s->type == SK_IP)) && (sk_disable_mtu_disc6(s) < 0))
      return -1;

    if ((s->ttl >= 0) && (sk_set_ttl6(s, s->ttl) < 0))
      return -1;

    if ((s->tos >= 0) && (sk_set_tos6(s, s->tos) < 0))
      return -1;
  }

  /* Must be after sk_set_tos4() as setting ToS on Linux also mangles priority */
  if ((s->priority >= 0) && (sk_set_priority(s, s->priority) < 0))
    return -1;

  return 0;
}
2014-05-18 17:42:26 +08:00
/* Append socket @s to the end of the global socket list. */
static void
sk_insert(sock *s)
{
  add_tail(&sock_list, &s->n);
}
1999-10-29 20:09:29 +08:00
/*
 * Finish an outgoing TCP connection on @s: read the local address and port
 * chosen by the system into the socket (a warning is logged when that
 * fails), switch the type to SK_TCP, allocate the buffers and call the
 * socket's tx_hook.
 *
 * The address length is passed to getsockname() as socklen_t, the type the
 * call actually takes.
 */
static void
sk_tcp_connected(sock *s)
{
  sockaddr sa;
  socklen_t sa_len = sizeof(sa);

  if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
      (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
    log(L_WARN "SOCK: Cannot get local IP address for TCP>");

  s->type = SK_TCP;
  sk_alloc_bufs(s);
  s->tx_hook(s);
}
2017-05-23 19:12:25 +08:00
#ifdef HAVE_LIBSSH
/*
 * Finish an outgoing SSH connection on @s: switch the type to SK_SSH,
 * allocate the buffers and call the socket's tx_hook.
 */
static void
sk_ssh_connected(sock *s)
{
  s->type = SK_SSH;
  sk_alloc_bufs(s);
  s->tx_hook(s);
}
2017-05-23 19:12:25 +08:00
#endif
1999-10-29 20:09:29 +08:00
static int
2014-05-18 17:42:26 +08:00
sk_passive_connected(sock *s, int type)
1999-10-29 20:09:29 +08:00
{
2014-05-18 17:42:26 +08:00
sockaddr loc_sa, rem_sa;
int loc_sa_len = sizeof(loc_sa);
int rem_sa_len = sizeof(rem_sa);
2010-01-03 19:17:52 +08:00
2014-05-18 17:42:26 +08:00
int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
if (fd < 0)
{
if ((errno != EINTR) && (errno != EAGAIN))
s->err_hook(s, errno);
2014-05-18 17:42:26 +08:00
return 0;
}
sock *t = sk_new(s->pool);
t->type = type;
t->data = s->data;
t->af = s->af;
t->fd = fd;
2014-05-18 17:42:26 +08:00
t->ttl = s->ttl;
t->tos = s->tos;
t->vrf = s->vrf;
2014-05-18 17:42:26 +08:00
t->rbsize = s->rbsize;
t->tbsize = s->tbsize;
if (type == SK_TCP)
{
if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
(sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
2014-05-18 17:42:26 +08:00
log(L_WARN "SOCK: Cannot get local IP address for TCP<");
if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
2014-05-18 17:42:26 +08:00
log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
}
if (sk_setup(t) < 0)
{
/* FIXME: Call err_hook instead ? */
log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
/* FIXME: handle it better in rfree() */
close(t->fd);
2014-05-18 17:42:26 +08:00
t->fd = -1;
rfree(t);
return 1;
}
sk_insert(t);
sk_alloc_bufs(t);
s->rx_hook(t, 0);
return 1;
1999-10-29 20:09:29 +08:00
}
#ifdef HAVE_LIBSSH
/*
 * Advance the non-blocking SSH connection setup of @s as far as possible.
 *
 * The progress is kept in s->ssh->state; every call re-enters the automaton
 * at the recorded state and falls through the following stages (connect,
 * server identity check, public key authentication, channel creation,
 * session open, optional subsystem request) until one of them would block
 * or fails. s->fd is refreshed from the libssh session on every call.
 *
 * Return SSH_OK or SSH_AGAIN or SSH_ERROR
 */
static int
sk_ssh_connect(sock *s)
{
  struct ssh_sock *ssh = s->ssh;

  s->fd = ssh_get_fd(ssh->session);

  /* Big fall thru automata */
  switch (ssh->state)
  {
  case SK_SSH_CONNECT:
  {
    switch (ssh_connect(ssh->session))
    {
    case SSH_AGAIN:
      /* A quick look into libSSH shows that ssh_get_fd() should return non-(-1)
       * after SSH_AGAIN is returned by ssh_connect(). This is however nowhere
       * documented but our code relies on that.
       */
      return SSH_AGAIN;

    case SSH_OK:
      break;

    default:
      return SSH_ERROR;
    }
  } /* fallthrough */

  case SK_SSH_SERVER_KNOWN:
  {
    ssh->state = SK_SSH_SERVER_KNOWN;

    /* The identity is verified only when a known-hosts file is configured */
    if (ssh->server_hostkey_path)
    {
      int identity_ok = 1;

      /* Check server identity */
      switch (ssh_is_server_known(ssh->session))
      {
#define LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s,msg,args...) log(L_WARN "SSH Identity %s@%s:%u: " msg, (s)->ssh->username, (s)->host, (s)->dport, ## args);
      case SSH_SERVER_KNOWN_OK:
	/* The server is known and has not changed. */
	break;

      case SSH_SERVER_NOT_KNOWN:
	/* Only a warning: an unknown server is still accepted */
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server is unknown, its public key was not found in the known host file %s", s->ssh->server_hostkey_path);
	break;

      case SSH_SERVER_KNOWN_CHANGED:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server key has changed. Either you are under attack or the administrator changed the key.");
	identity_ok = 0;
	break;

      case SSH_SERVER_FILE_NOT_FOUND:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The known host file %s does not exist", s->ssh->server_hostkey_path);
	identity_ok = 0;
	break;

      case SSH_SERVER_ERROR:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "Some error happened");
	identity_ok = 0;
	break;

      case SSH_SERVER_FOUND_OTHER:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server gave use a key of a type while we had an other type recorded. " \
					     "It is a possible attack.");
	identity_ok = 0;
	break;
      }

      if (!identity_ok)
	return SSH_ERROR;
    }
  } /* fallthrough */

  case SK_SSH_USERAUTH:
  {
    ssh->state = SK_SSH_USERAUTH;

    switch (ssh_userauth_publickey_auto(ssh->session, NULL, NULL))
    {
    case SSH_AUTH_AGAIN:
      return SSH_AGAIN;

    case SSH_AUTH_SUCCESS:
      break;

    default:
      return SSH_ERROR;
    }
  } /* fallthrough */

  case SK_SSH_CHANNEL:
  {
    ssh->state = SK_SSH_CHANNEL;

    ssh->channel = ssh_channel_new(ssh->session);
    if (ssh->channel == NULL)
      return SSH_ERROR;
  } /* fallthrough */

  case SK_SSH_SESSION:
  {
    ssh->state = SK_SSH_SESSION;

    switch (ssh_channel_open_session(ssh->channel))
    {
    case SSH_AGAIN:
      return SSH_AGAIN;

    case SSH_OK:
      break;

    default:
      return SSH_ERROR;
    }
  } /* fallthrough */

  case SK_SSH_SUBSYSTEM:
  {
    ssh->state = SK_SSH_SUBSYSTEM;

    /* The subsystem request is optional */
    if (ssh->subsystem)
    {
      switch (ssh_channel_request_subsystem(ssh->channel, ssh->subsystem))
      {
      case SSH_AGAIN:
	return SSH_AGAIN;

      case SSH_OK:
	break;

      default:
	return SSH_ERROR;
      }
    }
  } /* fallthrough */

  case SK_SSH_ESTABLISHED:
    ssh->state = SK_SSH_ESTABLISHED;
  }

  return SSH_OK;
}
/*
 * Create a libssh session for @s, configure it from the socket (host, port,
 * user name, optional known-hosts file and private key), switch it to
 * non-blocking mode and start connecting. When the connection completes
 * immediately, sk_ssh_connected() is called right away.
 *
 * s->ssh must be allocated by the caller; otherwise bug() is called.
 *
 * Return file descriptor number if success
 * Return -1 if failed
 */
static int
sk_open_ssh(sock *s)
{
  if (!s->ssh)
    bug("sk_open() sock->ssh is not allocated");

  ssh_session sess = ssh_new();
  if (sess == NULL)
    ERR2("Cannot create a ssh session");

  struct ssh_sock *ssh = s->ssh;
  ssh->session = sess;

  const int verbosity = SSH_LOG_NOLOG;
  ssh_options_set(sess, SSH_OPTIONS_LOG_VERBOSITY, &verbosity);
  ssh_options_set(sess, SSH_OPTIONS_HOST, s->host);
  ssh_options_set(sess, SSH_OPTIONS_PORT, &(s->dport));
  /* TODO: Add SSH_OPTIONS_BINDADDR */
  ssh_options_set(sess, SSH_OPTIONS_USER, ssh->username);

  if (ssh->server_hostkey_path)
    ssh_options_set(sess, SSH_OPTIONS_KNOWNHOSTS, ssh->server_hostkey_path);

  if (ssh->client_privkey_path)
    ssh_options_set(sess, SSH_OPTIONS_IDENTITY, ssh->client_privkey_path);

  ssh_set_blocking(sess, 0);

  int rv = sk_ssh_connect(s);
  if (rv == SSH_OK)
  {
    sk_ssh_connected(s);
  }
  else if (rv == SSH_ERROR)
  {
    ERR2(ssh_get_error(sess));
  }
  /* SSH_AGAIN: the connection is still in progress */

  return ssh_get_fd(sess);

 err:
  return -1;
}
#endif
2000-06-05 20:19:12 +08:00
/**
 * sk_open - open a socket
 * @s: socket
 *
 * This function takes a socket resource created by sk_new() and
 * initialized by the user and binds a corresponding network connection
 * to it.
 *
 * Result: 0 for success, -1 for an error.
 */
int
sk_open(sock *s)
{
  int af = AF_UNSPEC;
  int fd = -1;
  int do_bind = 0;
  int bind_port = 0;
  ip_addr bind_addr = IPA_NONE;
  sockaddr sa;

  /*
   * For TCP/IP sockets, Address family (IPv4 or IPv6) can be specified either
   * explicitly (SK_IPV4 or SK_IPV6) or implicitly (based on saddr, daddr).
   * But the specifications have to be consistent.
   */
  if (s->type <= SK_IP)
  {
    switch (s->subtype)
    {
    case 0:
      ASSERT(ipa_zero(s->saddr) || ipa_zero(s->daddr) ||
	     (ipa_is_ip4(s->saddr) == ipa_is_ip4(s->daddr)));
      af = (ipa_is_ip4(s->saddr) || ipa_is_ip4(s->daddr)) ? AF_INET : AF_INET6;
      break;

    case SK_IPV4:
      ASSERT(ipa_zero(s->saddr) || ipa_is_ip4(s->saddr));
      ASSERT(ipa_zero(s->daddr) || ipa_is_ip4(s->daddr));
      af = AF_INET;
      break;

    case SK_IPV6:
      ASSERT(ipa_zero(s->saddr) || !ipa_is_ip4(s->saddr));
      ASSERT(ipa_zero(s->daddr) || !ipa_is_ip4(s->daddr));
      af = AF_INET6;
      break;

    default:
      bug("Invalid subtype %d", s->subtype);
    }
  }

  /* Step 1: obtain the file descriptor and decide whether/what to bind */
  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    s->ttx = "";			/* Force s->ttx != s->tpos */
    /* Fall thru */
  case SK_TCP_PASSIVE:
    fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
    bind_port = s->sport;
    bind_addr = s->saddr;
    do_bind = bind_port || ipa_nonzero(bind_addr);
    break;

#ifdef HAVE_LIBSSH
  case SK_SSH_ACTIVE:
    s->ttx = "";			/* Force s->ttx != s->tpos */
    fd = sk_open_ssh(s);
    break;
#endif

  case SK_UDP:
    fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
    bind_port = s->sport;
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    do_bind = 1;
    break;

  case SK_IP:
    fd = socket(af, SOCK_RAW, s->dport);
    bind_port = 0;
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    do_bind = ipa_nonzero(bind_addr);
    break;

  case SK_MAGIC:
    /* The descriptor is supplied by the caller in s->fd */
    af = 0;
    fd = s->fd;
    break;

  default:
    bug("sk_open() called for invalid sock type %d", s->type);
  }

  if (fd < 0)
    ERR("socket");

  s->af = af;
  s->fd = fd;

  if (sk_setup(s) < 0)
    goto err;

  /* Step 2: optional bind, preceded by the socket options it depends on */
  if (do_bind)
  {
    if (bind_port)
    {
      int one = 1;

      if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0)
	ERR2("SO_REUSEADDR");

#ifdef CONFIG_NO_IFACE_BIND
      /* Workaround missing ability to bind to an iface */
      if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
      {
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0)
	  ERR2("SO_REUSEPORT");
      }
#endif
    }
    else if ((s->flags & SKF_HIGH_PORT) && (sk_set_high_port(s) < 0))
      log(L_WARN "Socket error: %s%#m", s->err);

    /* A failed freebind is only logged, it does not abort the open */
    if ((s->flags & SKF_FREEBIND) && (sk_set_freebind(s) < 0))
      log(L_WARN "Socket error: %s%#m", s->err);

    sockaddr_fill(&sa, s->af, bind_addr, s->iface, bind_port);
    if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
      ERR2("bind");
  }

  if (s->password &&
      (sk_set_md5_auth(s, s->saddr, s->daddr, -1, s->iface, s->password, 0) < 0))
    goto err;

  /* Step 3: type-specific finalization */
  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
    if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
      sk_tcp_connected(s);
    else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
	     errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
      ERR2("connect");
    break;

  case SK_TCP_PASSIVE:
    if (listen(fd, 8) < 0)
      ERR2("listen");
    break;

  case SK_SSH_ACTIVE:
  case SK_MAGIC:
    break;

  default:
    sk_alloc_bufs(s);
  }

  /* Sockets flagged SKF_THREAD are not put into the main socket list */
  if (!(s->flags & SKF_THREAD))
    sk_insert(s);

  return 0;

err:
  close(fd);
  s->fd = -1;
  return -1;
}
2014-05-18 17:42:26 +08:00
int
1999-10-29 20:09:29 +08:00
sk_open_unix(sock *s, char *name)
{
struct sockaddr_un sa;
2014-05-18 17:42:26 +08:00
int fd;
/* We are sloppy during error (leak fd and not set s->err), but we die anyway */
1999-10-29 20:09:29 +08:00
fd = socket(AF_UNIX, SOCK_STREAM, 0);
if (fd < 0)
2014-05-18 17:42:26 +08:00
return -1;
if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
return -1;
2008-10-27 06:55:38 +08:00
/* Path length checked in test_old_bird() but we may need unix sockets for other reasons in future */
ASSERT_DIE(strlen(name) < sizeof(sa.sun_path));
1999-10-29 20:09:29 +08:00
sa.sun_family = AF_UNIX;
2008-08-25 20:06:20 +08:00
strcpy(sa.sun_path, name);
2014-05-18 17:42:26 +08:00
if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
2014-05-18 17:42:26 +08:00
return -1;
if (listen(fd, 8) < 0)
return -1;
s->fd = fd;
sk_insert(s);
2014-05-18 17:42:26 +08:00
return 0;
}
/*
 * Sizes of the on-stack control-message buffers used by sk_recvmsg() and
 * sk_sendmsg(). Each is the larger of the IPv4 and IPv6 variant, so one
 * buffer fits either address family. RX reserves room for PKTINFO plus TTL,
 * TX for PKTINFO only.
 */
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
#define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
/*
 * Prepare ancillary data of @msg for transmission, using the IPv4 or the
 * IPv6 helper depending on the address family of socket @s.
 */
static void
sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
{
  if (!sk_is_ipv4(s))
  {
    sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
    return;
  }

  sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
}
/*
 * Walk all control messages received in @msg and hand them to the
 * per-family PKTINFO and TTL handlers. The per-packet fields laddr,
 * lifindex and rcv_ttl are reset first.
 */
static void
sk_process_cmsgs(sock *s, struct msghdr *msg)
{
  s->laddr = IPA_NONE;
  s->lifindex = 0;
  s->rcv_ttl = -1;

  struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

  while (hdr != NULL)
  {
    if ((hdr->cmsg_level == SOL_IP) && sk_is_ipv4(s))
    {
      sk_process_cmsg4_pktinfo(s, hdr);
      sk_process_cmsg4_ttl(s, hdr);
    }

    if ((hdr->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
    {
      sk_process_cmsg6_pktinfo(s, hdr);
      sk_process_cmsg6_ttl(s, hdr);
    }

    hdr = CMSG_NXTHDR(msg, hdr);
  }
}
/*
 * Send the content of the TX buffer (tbuf .. tpos) of a datagram/raw socket
 * as one message to s->daddr / s->dport. Returns the result of sendmsg().
 */
static inline int
sk_sendmsg(sock *s)
{
  struct iovec iov = { .iov_base = s->tbuf, .iov_len = s->tpos - s->tbuf };
  byte cmsg_buf[CMSG_TX_SPACE];
  sockaddr dst;
  int flags = 0;

  /* The destination must be filled before SA_LEN(dst) is evaluated below */
  sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);

  struct msghdr msg = {
    .msg_name = &dst.sa,
    .msg_namelen = SA_LEN(dst),
    .msg_iov = &iov,
    .msg_iovlen = 1
  };

#ifdef CONFIG_DONTROUTE_UNICAST
  /* FreeBSD silently changes TTL to 1 when MSG_DONTROUTE is used, therefore we
     cannot use it for other cases (e.g. when TTL security is used). */
  if (ipa_is_ip4(s->daddr) && ip4_is_unicast(ipa_to_ip4(s->daddr)) && (s->ttl == 1))
    flags = MSG_DONTROUTE;
#endif

#ifdef CONFIG_USE_HDRINCL
  /* With SKF_HDRINCL a 20-byte header block is sent in front of the payload */
  byte hdr[20];
  struct iovec iov2[2] = { { .iov_base = hdr, .iov_len = sizeof(hdr) }, iov };

  if (s->flags & SKF_HDRINCL)
  {
    sk_prepare_ip_header(s, hdr, iov.iov_len);
    msg.msg_iov = iov2;
    msg.msg_iovlen = 2;
  }
#endif

  if (s->flags & SKF_PKTINFO)
    sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));

  return sendmsg(s->fd, &msg, flags);
}
/*
 * Receive one message into the RX buffer of a datagram/raw socket.
 *
 * On success the sender address/port go to s->faddr / s->fport, control
 * messages are processed and SKF_TRUNCATED mirrors MSG_TRUNC. Returns the
 * result of recvmsg(); a negative result is returned without touching @s.
 */
static inline int
sk_recvmsg(sock *s)
{
  struct iovec iov = { .iov_base = s->rbuf, .iov_len = s->rbsize };
  byte cmsg_buf[CMSG_RX_SPACE];
  sockaddr src;

  struct msghdr msg = {
    .msg_name = &src.sa,
    .msg_namelen = sizeof(src),		// XXXX ??
    .msg_iov = &iov,
    .msg_iovlen = 1,
    .msg_control = cmsg_buf,
    .msg_controllen = sizeof(cmsg_buf),
    .msg_flags = 0
  };

  const int len = recvmsg(s->fd, &msg, 0);
  if (len < 0)
    return len;

  /* NOTE: an IPv4 header-skipping step for SK_IP used to sit here, disabled */

  sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
  sk_process_cmsgs(s, &msg);

  if (msg.msg_flags & MSG_TRUNC)
    s->flags |= SKF_TRUNCATED;
  else
    s->flags &= ~SKF_TRUNCATED;

  return len;
}
/* Mark the TX buffer as empty: both ttx and tpos point back to its start */
static inline void
reset_tx_buffer(sock *s)
{
  s->tpos = s->tbuf;
  s->ttx = s->tbuf;
}
/*
 * Try to push pending data from the TX buffer of @s to the network.
 *
 * Returns 1 when everything was sent (the TX buffer is reset), 0 when the
 * write would block or was interrupted and has to be retried later, and -1
 * on a hard error. On error the TX buffer is reset and s->err_hook is called
 * exactly once; the hook may free the socket, so @s must not be touched
 * after it returns.
 */
static int
sk_maybe_write(sock *s)
{
  int e;

  switch (s->type)
  {
  case SK_TCP:
  case SK_MAGIC:
  case SK_UNIX:
    while (s->ttx != s->tpos)
    {
      e = write(s->fd, s->ttx, s->tpos - s->ttx);

      if (e < 0)
      {
	if (errno != EINTR && errno != EAGAIN)
	{
	  reset_tx_buffer(s);
	  /* EPIPE is just a connection close notification during TX */
	  s->err_hook(s, (errno != EPIPE) ? errno : 0);
	  return -1;
	}
	return 0;
      }
      s->ttx += e;
    }
    reset_tx_buffer(s);
    return 1;

#ifdef HAVE_LIBSSH
  case SK_SSH:
    while (s->ttx != s->tpos)
    {
      e = ssh_channel_write(s->ssh->channel, s->ttx, s->tpos - s->ttx);

      if (e < 0)
      {
	/*
	 * Report the libssh error once. The buffer is reset before the hook
	 * runs, and the hook is not called a second time with errno, which
	 * libssh does not use for reporting this failure.
	 */
	s->err = ssh_get_error(s->ssh->session);
	reset_tx_buffer(s);
	s->err_hook(s, ssh_get_error_code(s->ssh->session));
	return -1;
      }
      s->ttx += e;
    }
    reset_tx_buffer(s);
    return 1;
#endif

  case SK_UDP:
  case SK_IP:
    {
      /* Nothing queued */
      if (s->tbuf == s->tpos)
	return 1;

      e = sk_sendmsg(s);

      if (e < 0)
      {
	if (errno != EINTR && errno != EAGAIN)
	{
	  reset_tx_buffer(s);
	  s->err_hook(s, errno);
	  return -1;
	}

	/* Without a tx_hook nobody would retry, so the packet is dropped */
	if (!s->tx_hook)
	  reset_tx_buffer(s);
	return 0;
      }
      reset_tx_buffer(s);
      return 1;
    }

  default:
    bug("sk_maybe_write: unknown socket type %d", s->type);
  }
}
/*
 * Check without blocking whether socket @s is readable.
 *
 * Returns the result of a zero-timeout poll() for POLLIN on s->fd; the call
 * is repeated while it fails with EINTR or EAGAIN.
 */
int
sk_rx_ready(sock *s)
{
  struct pollfd pfd = { .fd = s->fd, .events = POLLIN };
  int rv;

  do
    rv = poll(&pfd, 1, 0);
  while ((rv < 0) && (errno == EINTR || errno == EAGAIN));

  return rv;
}
2000-06-05 20:19:12 +08:00
/**
 * sk_send - send data to a socket
 * @s: socket
 * @len: number of bytes to send
 *
 * This function sends @len bytes of data prepared in the
 * transmit buffer of the socket @s to the network connection.
 * If the packet can be sent immediately, it does so and returns
 * 1, else it queues the packet for later processing, returns 0
 * and calls the @tx_hook of the socket when the transmission
 * takes place. The result of sk_maybe_write() is passed through,
 * so -1 is returned when the write fails.
 */
int
sk_send(sock *s, unsigned len)
{
  /* The pending region is the first @len bytes of the TX buffer */
  s->tpos = s->tbuf + len;
  s->ttx = s->tbuf;

  return sk_maybe_write(s);
}
2000-06-05 20:19:12 +08:00
/**
 * sk_send_to - send data to a specific destination
 * @s: socket
 * @len: number of bytes to send
 * @addr: IP address to send the packet to
 * @port: port to send the packet to
 *
 * This is a sk_send() replacement for connection-less packet sockets
 * which allows destination of the packet to be chosen dynamically.
 * Raw IP sockets should use 0 for @port; in that case the destination
 * port stored in the socket is left unchanged.
 */
int
sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
{
  s->daddr = addr;

  if (port != 0)
    s->dport = port;

  s->tpos = s->tbuf + len;
  s->ttx = s->tbuf;

  return sk_maybe_write(s);
}
/*
int
sk_send_full(sock *s, unsigned len, struct iface *ifa,
ip_addr saddr, ip_addr daddr, unsigned dport)
{
s->iface = ifa;
s->saddr = saddr;
s->daddr = daddr;
s->dport = dport;
s->ttx = s->tbuf;
s->tpos = s->tbuf + len;
return sk_maybe_write(s);
}
*/
/*
 * Run the rx_hook of @s on @size buffered bytes. When the hook returns
 * non-zero, the RX buffer is rewound to its start.
 */
static void
call_rx_hook(sock *s, int size)
{
  /*
   * The hook runs first. The socket could have been deleted by it, so @s is
   * only touched if it is still the socket currently being processed.
   */
  if (s->rx_hook(s, size) && (current_sock == s))
    s->rpos = s->rbuf;
}
#ifdef HAVE_LIBSSH
/*
 * Read available data from the SSH channel of @s into its RX buffer.
 *
 * Returns 1 when data were received (rx_hook was called) or when the select
 * was interrupted and should be retried, 0 otherwise. End of stream and
 * read errors are reported through err_hook.
 */
static int
sk_read_ssh(sock *s)
{
  ssh_channel rchans[2] = { s->ssh->channel, NULL };
  struct timeval timev = { .tv_sec = 1, .tv_usec = 0 };

  if (ssh_channel_select(rchans, NULL, NULL, &timev) == SSH_EINTR)
    return 1;			/* Try again */

  if (ssh_channel_is_eof(s->ssh->channel) != 0)
  {
    /* The remote side is closing the connection */
    s->err_hook(s, 0);
    return 0;
  }

  /* No data is available on the socket */
  if (rchans[0] == NULL)
    return 0;

  const uint used_bytes = s->rpos - s->rbuf;
  const int read_bytes = ssh_channel_read_nonblocking(s->ssh->channel, s->rpos, s->rbsize - used_bytes, 0);

  if (read_bytes > 0)
  {
    /* Received data */
    s->rpos += read_bytes;
    call_rx_hook(s, used_bytes + read_bytes);
    return 1;
  }

  if (read_bytes < 0)
  {
    s->err = ssh_get_error(s->ssh->session);
    s->err_hook(s, ssh_get_error_code(s->ssh->session));
  }
  else if (ssh_channel_is_eof(s->ssh->channel) != 0)
  {
    /* The remote side is closing the connection */
    s->err_hook(s, 0);
  }

  return 0;			/* No data is available on the socket */
}
#endif
/* sk_read() and sk_write() are called from BFD's event loop */

/*
 * Perform one read step on socket @s according to its type. @revents are the
 * poll() events reported for the socket. Returns non-zero when something was
 * processed and another step may be worthwhile, 0 otherwise.
 */
static inline int
sk_read_noflush(sock *s, int revents)
{
  switch (s->type)
  {
  case SK_TCP_PASSIVE:
    return sk_passive_connected(s, SK_TCP);

  case SK_UNIX_PASSIVE:
    return sk_passive_connected(s, SK_UNIX);

  case SK_TCP:
  case SK_UNIX:
    {
      /* Append to whatever is already buffered */
      int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);

      if (c > 0)
      {
	s->rpos += c;
	call_rx_hook(s, s->rpos - s->rbuf);
	return 1;
      }

      if (c == 0)
      {
	/* End of stream is reported as an error with code 0 */
	s->err_hook(s, 0);
      }
      else if (errno != EINTR && errno != EAGAIN)
      {
	s->err_hook(s, errno);
      }
      else if (errno == EAGAIN && !(revents & POLLIN))
      {
	log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
	s->err_hook(s, 0);
      }

      return 0;
    }

#ifdef HAVE_LIBSSH
  case SK_SSH:
    return sk_read_ssh(s);
#endif

  case SK_MAGIC:
    return s->rx_hook(s, 0);

  default:
    {
      /* Datagram-like sockets: one message per step */
      int e = sk_recvmsg(s);

      if (e < 0)
      {
	if (errno != EINTR && errno != EAGAIN)
	  s->err_hook(s, errno);
	return 0;
      }

      s->rpos = s->rbuf + e;
      s->rx_hook(s, e);
      return 1;
    }
  }
}
/* One read step on @s (see sk_read_noflush()) followed by tmp_flush() */
int
sk_read(sock *s, int revents)
{
  const int rv = sk_read_noflush(s, revents);

  tmp_flush();
  return rv;
}
/*
 * Perform one write step on socket @s according to its type: progress a
 * pending TCP or SSH connect, or flush queued TX data and call tx_hook.
 * Returns 1 when another step may be worthwhile, 0 otherwise.
 */
static inline int
sk_write_noflush(sock *s)
{
  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    {
      sockaddr sa;
      sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);

      /* EISCONN means the connection has already been established */
      if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
	sk_tcp_connected(s);
      else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
	s->err_hook(s, errno);

      return 0;
    }

#ifdef HAVE_LIBSSH
  case SK_SSH_ACTIVE:
    {
      const int rc = sk_ssh_connect(s);

      if (rc == SSH_AGAIN)
	return 1;

      if (rc == SSH_OK)
	sk_ssh_connected(s);
      else if (rc == SSH_ERROR)
      {
	s->err = ssh_get_error(s->ssh->session);
	s->err_hook(s, ssh_get_error_code(s->ssh->session));
      }

      return 0;
    }
#endif

  default:
    /* Nothing pending, or the pending data could not be fully written */
    if (s->ttx == s->tpos || sk_maybe_write(s) <= 0)
      return 0;

    if (s->tx_hook)
      s->tx_hook(s);

    return 1;
  }
}
/* One write step on @s (see sk_write_noflush()) followed by tmp_flush() */
int
sk_write(sock *s)
{
  const int rv = sk_write_noflush(s);

  tmp_flush();
  return rv;
}
/* Non-zero iff the address family of @s is AF_INET */
int
sk_is_ipv4(sock *s)
{
  return s->af == AF_INET;
}
/* Non-zero iff the address family of @s is AF_INET6 */
int
sk_is_ipv6(sock *s)
{
  return s->af == AF_INET6;
}
/*
 * Report an error condition on socket @s to its err_hook.
 *
 * @revents are the poll() events seen on the socket. When POLLERR is set
 * (and the socket is not SK_MAGIC), the pending error is fetched using
 * SO_ERROR and passed to the hook; otherwise, or when the query itself
 * fails, the hook gets 0.
 */
void
sk_err(sock *s, int revents)
{
  int se = 0;
  socklen_t sse = sizeof(se);	/* getsockopt() takes a socklen_t length */

  if ((s->type != SK_MAGIC) && (revents & POLLERR))
    if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
    {
      log(L_ERR "IO: Socket error: SO_ERROR: %m");
      se = 0;
    }

  s->err_hook(s, se);
  tmp_flush();
}
void
sk_dump_all(void)
{
node *n;
sock *s;
debug("Open sockets:\n");
WALK_LIST(n, sock_list)
2014-05-18 17:42:26 +08:00
{
s = SKIP_BACK(sock, n, n);
debug("%p ", s);
sk_dump(&s->r);
}
debug("\n");
}
/*
* Internal event log and watchdog
*/
/* Number of slots in the circular event log */
#define EVENT_LOG_LENGTH 32
/* One record of the event log, filled by io_log_event() */
struct event_log_entry
{
/* Address of the hook that is about to run */
void *hook;
/* Data pointer associated with the hook */
void *data;
/* Value of last_time when the entry was recorded */
btime timestamp;
/* Filled by io_update_time() while the entry is open; 0 until then */
btime duration;
};
/* Circular buffer of recent events */
static struct event_log_entry event_log[EVENT_LOG_LENGTH];
/* Entry whose duration is still to be measured, or NULL */
static struct event_log_entry *event_open;
/* Next slot to write, events counted since watchdog_start(), alarm armed flag */
static int event_log_pos, event_log_num, watchdog_active;
/* Monotonic time sampled by the latest io_update_time() */
static btime last_time;
/* last_time at the start of the current I/O loop cycle */
static btime loop_time;
/*
 * Sample the monotonic clock into last_time and, if an event log entry is
 * open, close it: its duration is stored and a warning is logged when it
 * exceeds the configured latency limit.
 */
static void
io_update_time(void)
{
  struct timespec ts;

  /*
   * This is third time-tracking procedure (after update_times() above and
   * times_update() in BFD), dedicated to internal event log and latency
   * tracking. Hopefully, we consolidate these sometimes.
   */
  if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
    die("clock_gettime: %m");

  last_time = ts.tv_sec S + ts.tv_nsec NS;

  if (!event_open)
    return;

  event_open->duration = last_time - event_open->timestamp;

  if (event_open->duration > config->latency_limit)
    log(L_WARN "Event 0x%p 0x%p took %u.%03u ms",
	event_open->hook, event_open->data, (uint) (event_open->duration TO_MS), (uint) (event_open->duration % 1000));

  event_open = NULL;
}
/**
 * io_log_event - mark approaching event into event log
 * @hook: event hook address
 * @data: event data address
 *
 * Store info (hook, data, timestamp) about the following internal event into
 * a circular event log (@event_log). When latency tracking is enabled, the log
 * entry is kept open (in @event_open) so the duration can be filled later.
 */
void
io_log_event(void *hook, void *data)
{
  /* With latency tracking, refresh the clock (this closes any open entry) */
  if (config->latency_debug)
    io_update_time();

  struct event_log_entry *slot = &event_log[event_log_pos];

  slot->hook = hook;
  slot->data = data;
  slot->timestamp = last_time;
  slot->duration = 0;

  event_log_num++;
  event_log_pos = (event_log_pos + 1) % EVENT_LOG_LENGTH;

  if (config->latency_debug)
    event_open = slot;
  else
    event_open = NULL;
}
/* Close the currently open event log entry, if there is one */
static inline void
io_close_event(void)
{
  if (!event_open)
    return;

  io_update_time();
}
/*
 * Log the content of the circular event log, starting at the slot that
 * will be overwritten next. Slots with no hook recorded are skipped.
 */
void
io_log_dump(void)
{
  log(L_DEBUG "Event log:");

  for (int off = 0; off < EVENT_LOG_LENGTH; off++)
  {
    struct event_log_entry *en = &event_log[(event_log_pos + off) % EVENT_LOG_LENGTH];

    if (!en->hook)
      continue;

    log(L_DEBUG "  Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
	(int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
  }
}
/*
 * SIGALRM handler of the watchdog armed in watchdog_start(): refresh the
 * event log timing and abort the process.
 */
void
watchdog_sigalrm(int sig UNUSED)
{
/* Update last_time and duration, but skip latency check */
/* The limit is raised so that io_update_time() does not log a latency warning */
config->latency_limit = 0xffffffff;
io_update_time();
/* We want core dump */
abort();
}
/*
 * Initial variant of watchdog_start(), called once before the main loop is
 * entered: only records the loop start time, no alarm is armed.
 */
static inline void
watchdog_start1(void)
{
io_update_time();
loop_time = last_time;
}
/*
 * Begin a new I/O loop cycle: record its start time, reset the per-cycle
 * event counter and arm the watchdog alarm if a timeout is configured.
 */
static inline void
watchdog_start(void)
{
  io_update_time();

  loop_time = last_time;
  event_log_num = 0;

  if (!config->watchdog_timeout)
    return;

  alarm(config->watchdog_timeout);
  watchdog_active = 1;
}
/*
 * End the current I/O loop cycle: disarm the watchdog alarm if it is active
 * and warn when the cycle took longer than the configured warning limit.
 */
static inline void
watchdog_stop(void)
{
  io_update_time();

  if (watchdog_active)
  {
    alarm(0);
    watchdog_active = 0;
  }

  const btime duration = last_time - loop_time;

  if (!(duration > config->watchdog_warning))
    return;

  log(L_WARN "I/O loop cycle took %u.%03u ms for %d events",
      (uint) (duration TO_MS), (uint) (duration % 1000), event_log_num);
}
/*
* Main I/O Loop
*/
/*
 * Initialize the I/O subsystem: the socket list, the global event and work
 * lists, kernel-route I/O, the boot time and the random() seed.
 */
void
io_init(void)
{
init_list(&sock_list);
init_list(&global_event_list);
init_list(&global_work_list);
krt_io_init();
// XXX init_times();
// XXX update_times();
boot_time = current_time();
/* Seed random() with both halves of the current real time folded together */
u64 now = (u64) current_real_time();
srandom((uint) (now ^ (now >> 32)));
}
/* Consecutive io_loop() iterations that skipped the slow RX pass */
static int short_loops = 0;
/* The slow RX pass is skipped while events are pending, until short_loops reaches this */
#define SHORT_LOOP_MAX 10
/* Limit passed to ev_run_list_limited() for the global work list in each iteration */
#define WORK_EVENTS_MAX 10
/*
 * Main loop of the daemon; never returns.
 *
 * Each iteration runs pending events and timers, builds the poll() set from
 * sock_list, handles asynchronous requests (reconfigure, dump, shutdown),
 * polls, and then serves ready sockets in two passes: a fast pass (fast_rx
 * reads and all writes) and a slow pass (remaining reads and error events).
 */
void
io_loop(void)
{
int poll_tout, timeout;
int nfds, events, pout;
timer *t;
sock *s;
node *n;
/* The pollfd array starts at 256 entries and is doubled when it fills up */
int fdmax = 256;
struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));
watchdog_start1();
for(;;)
{
times_update(&main_timeloop);
/* events ends up non-zero if either list reported something */
events = ev_run_list(&global_event_list);
events = ev_run_list_limited(&global_work_list, WORK_EVENTS_MAX) || events;
timers_fire(&main_timeloop);
io_close_event();
// FIXME
/* Do not sleep in poll() when events are pending; otherwise at most 3 s */
poll_tout = (events ? 0 : 3000); /* Time in milliseconds */
if (t = timers_first(&main_timeloop))
{
/* Never sleep past the nearest timer */
times_update(&main_timeloop);
timeout = (tm_remains(t) TO_MS) + 1;
poll_tout = MIN(poll_tout, timeout);
}
/* Build the poll set; s->index maps each socket to its pfd slot or -1 */
nfds = 0;
WALK_LIST(n, sock_list)
{
pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */
s = SKIP_BACK(sock, n, n);
if (s->rx_hook)
{
pfd[nfds].fd = s->fd;
pfd[nfds].events |= POLLIN;
}
/* POLLOUT is requested only when there is a tx_hook and TX data pending */
if (s->tx_hook && s->ttx != s->tpos)
{
pfd[nfds].fd = s->fd;
pfd[nfds].events |= POLLOUT;
}
if (pfd[nfds].fd != -1)
{
s->index = nfds;
nfds++;
}
else
s->index = -1;
/* Grow before the next iteration writes to pfd[nfds] */
if (nfds >= fdmax)
{
fdmax *= 2;
pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
}
}
/*
* Yes, this is racy. But even if the signal comes before this test
* and entering poll(), it gets caught on the next timer tick.
*/
if (async_config_flag)
{
io_log_event(async_config, NULL);
async_config();
async_config_flag = 0;
continue;
}
if (async_dump_flag)
{
io_log_event(async_dump, NULL);
async_dump();
async_dump_flag = 0;
continue;
}
if (async_shutdown_flag)
{
io_log_event(async_shutdown, NULL);
async_shutdown();
async_shutdown_flag = 0;
continue;
}
/* And finally enter poll() to find active sockets */
/* The watchdog is stopped around poll() so waiting is not counted */
watchdog_stop();
pout = poll(pfd, nfds, poll_tout);
watchdog_start();
if (pout < 0)
{
if (errno == EINTR || errno == EAGAIN)
continue;
die("poll: %m");
}
if (pout)
{
times_update(&main_timeloop);
/* Fast pass: fast_rx reads and writes, at most MAX_STEPS steps each per socket */
/* guaranteed to be non-empty */
current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
while (current_sock)
{
sock *s = current_sock;
if (s->index == -1)
{
current_sock = sk_next(s);
goto next;
}
int e;
int steps;
steps = MAX_STEPS;
if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
do
{
steps--;
io_log_event(s->rx_hook, s->data);
e = sk_read(s, pfd[s->index].revents);
/* current_sock changed during the call: do not touch s, continue from the new one */
if (s != current_sock)
goto next;
}
while (e && s->rx_hook && steps);
steps = MAX_STEPS;
if (pfd[s->index].revents & POLLOUT)
do
{
steps--;
io_log_event(s->tx_hook, s->data);
e = sk_write(s);
if (s != current_sock)
goto next;
}
while (e && steps);
current_sock = sk_next(s);
next: ;
}
/* Skip the slow pass while events are pending, but not more than SHORT_LOOP_MAX times in a row */
short_loops++;
if (events && (short_loops < SHORT_LOOP_MAX))
continue;
short_loops = 0;
/* Slow pass: at most MAX_RX_STEPS reads in total, resuming where the previous pass stopped */
int count = 0;
current_sock = stored_sock;
if (current_sock == NULL)
current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
while (current_sock && count < MAX_RX_STEPS)
{
sock *s = current_sock;
if (s->index == -1)
{
current_sock = sk_next(s);
goto next2;
}
if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
{
count++;
io_log_event(s->rx_hook, s->data);
sk_read(s, pfd[s->index].revents);
if (s != current_sock)
goto next2;
}
/* Hang-ups and errors are handled in the slow pass */
if (pfd[s->index].revents & (POLLHUP | POLLERR))
{
sk_err(s, pfd[s->index].revents);
if (s != current_sock)
goto next2;
}
current_sock = sk_next(s);
next2: ;
}
/* Remember the position for the next slow pass */
stored_sock = current_sock;
}
}
}
2004-06-07 01:05:25 +08:00
/*
 * Check whether another BIRD instance is listening on the control socket
 * @path. Dies when the socket cannot be created, when @path does not fit
 * into sun_path, or when a connection to @path succeeds.
 */
void
test_old_bird(char *path)
{
  struct sockaddr_un sa;
  const int fd = socket(AF_UNIX, SOCK_STREAM, 0);

  if (fd < 0)
    die("Cannot create socket: %m");

  if (strlen(path) >= sizeof(sa.sun_path))
    die("Socket path too long");

  bzero(&sa, sizeof(sa));
  sa.sun_family = AF_UNIX;
  strcpy(sa.sun_path, path);

  /* A successful connect means somebody is already serving this socket */
  if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
    die("I found another BIRD running.");

  close(fd);
}