@@ -56,12 +56,26 @@ struct clsmap_node {
uint32_t table;
};
+struct router_rule { /* One source-based policy rule selecting a routing table. */
+ struct rculist node; /* Element of the 'rules' list. */
+ uint32_t prio; /* Rule priority as passed to ovs_router_add_rule(). */
+ bool invert; /* If true, the result of the source match is negated. */
+ uint8_t src_len; /* Prefix length for 'from_addr'; 0 matches any source. */
+ struct in6_addr from_addr; /* Source address compared under 'src_len'. */
+ uint32_t lookup_table; /* ID of the table consulted when the rule matches. */
+};
+
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER;
static struct hmap clsmap = HMAP_INITIALIZER(&clsmap);
static struct classifier default_cls;
+static struct router_rule rules = { /* Head of the rule list; rules are linked through 'node'. */
+ .node = RCUOVS_LIST_INITIALIZER(&rules.node)
+};
+static bool rules_have_from_all; /* Set when a rule with src_len 0 is added; cleared by ovs_router_rules_flush(). */
+
/* By default, use the system routing table. For system-independent testing,
* the unit tests disable using the system routing table. */
static bool use_system_routing_table = true;
@@ -131,6 +145,14 @@ cls_destroy(struct classifier *cls)
classifier_publish(cls);
}
+/* Returns true if routing table 'table' is unknown to the router or holds
+ * no rules, false if it contains at least one rule. */
+bool
+ovs_router_is_empty(uint32_t table)
+{
+    struct classifier *table_cls = cls_find(table);
+
+    if (!table_cls) {
+        /* A table that was never created counts as empty. */
+        return true;
+    }
+    return table_cls->n_rules == 0;
+}
+
+
static struct ovs_router_entry *
ovs_router_entry_cast(const struct cls_rule *cr)
{
@@ -170,10 +192,49 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr *ip6_dst,
char output_netdev[],
struct in6_addr *src, struct in6_addr *gw)
{
- const struct cls_rule *cr;
struct flow flow = {.ipv6_dst = *ip6_dst, .pkt_mark = mark};
+ const struct cls_rule *cr;
+ struct router_rule *rule;
+ bool match_found = false;
+
+ if (rules_have_from_all || (src && ipv6_addr_is_set(src))) {
+ const struct in6_addr *from_src = src;
- if (src && ipv6_addr_is_set(src)) {
+ if (rules_have_from_all && !from_src) {
+ from_src = &in6addr_any;
+ }
+
+ /* Rules list is always sorted by router_rule::prio so here it is
+ * traversed starting from the higher priority rules first. */
+ RCULIST_FOR_EACH (rule, node, &rules.node) {
+ bool matched = !!(!rule->src_len ||
+ ipv6_addr_equals_masked(&rule->from_addr,
+ from_src, rule->src_len)
+ );
+
+ if (rule->invert) {
+ matched = !matched;
+ }
+
+ if (matched) {
+ struct classifier *cls = cls_find(rule->lookup_table);
+
+ if (!cls) {
+ VLOG_WARN_RL(&rl, "rule %u: route table %u not found",
+ rule->prio, rule->lookup_table);
+ continue;
+ }
+ cr = classifier_lookup(cls, OVS_VERSION_MAX, &flow, NULL,
+ NULL);
+ if (cr) {
+ match_found = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!match_found && src && ipv6_addr_is_set(src)) {
const struct cls_rule *cr_src;
struct flow flow_src = {.ipv6_dst = *src, .pkt_mark = mark};
@@ -189,7 +250,11 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr *ip6_dst,
}
}
- cr = classifier_lookup(&default_cls, OVS_VERSION_MAX, &flow, NULL, NULL);
+ if (!match_found) {
+ cr = classifier_lookup(&default_cls, OVS_VERSION_MAX, &flow, NULL,
+ NULL);
+ }
+
if (cr) {
struct ovs_router_entry *p = ovs_router_entry_cast(cr);
@@ -750,9 +815,24 @@ ovs_router_flush(void)
seq_change(tnl_conf_seq);
}
+void
+ovs_router_rules_flush(void)
+{ /* Empties the rule list and resets the "from all" flag. */
+ struct router_rule *rule;
+
+ rules_have_from_all = false;
+ ovsrcu_quiesce(); /* NOTE(review): why a quiescent point is needed here is not evident from this code; confirm. */
+
+ RCULIST_FOR_EACH_SAFE_PROTECTED (rule, node, &rules.node) {
+ rculist_remove(&rule->node);
+ ovsrcu_postpone(free, rule); /* The free is deferred via ovsrcu_postpone() rather than done inline. */
+ }
+}
+
static void
ovs_router_flush_handler(void *aux OVS_UNUSED)
{
+ ovs_router_rules_flush();
ovs_router_flush();
ovs_mutex_lock(&mutex);
@@ -761,6 +841,44 @@ ovs_router_flush_handler(void *aux OVS_UNUSED)
ovs_mutex_unlock(&mutex);
}
+/* Returns true if routing table 'table' is one of the standard tables or is
+ * the lookup target of at least one rule in the rule list. */
+bool
+ovs_router_is_referenced(uint32_t table)
+{
+    bool referenced = ovs_router_is_standard_table_id(table);
+    struct router_rule *r;
+
+    if (!referenced) {
+        RCULIST_FOR_EACH (r, node, &rules.node) {
+            if (r->lookup_table == table) {
+                referenced = true;
+                break;
+            }
+        }
+    }
+    return referenced;
+}
+
+/* Adds a policy rule that directs lookups whose source address matches
+ * 'from'/'src_len' (or does not match, if 'invert' is true) to routing table
+ * 'lookup_table'.
+ *
+ * 'src_len' == 0 makes the rule apply to any source address.  'from' may be
+ * NULL, in which case the all-zeros address is stored.
+ *
+ * The rule list is kept ordered by ascending 'prio' value, which is the order
+ * in which ovs_router_lookup() walks it.  The new rule is placed after any
+ * existing rules with the same 'prio', so callers that already add rules in
+ * priority order get the same result as a plain append. */
+void
+ovs_router_add_rule(uint32_t prio, bool invert, uint8_t src_len,
+                    const struct in6_addr *from, uint32_t lookup_table)
+{
+    struct router_rule *rule = xzalloc(sizeof *rule);
+    struct rculist *before = &rules.node;
+    struct router_rule *iter;
+
+    rculist_init(&rule->node);
+
+    rule->prio = prio;
+    rule->invert = invert;
+    rule->src_len = src_len;
+    if (from) {
+        /* Otherwise xzalloc() already left 'from_addr' all-zeros. */
+        rule->from_addr = *from;
+    }
+    rule->lookup_table = lookup_table;
+
+    /* Find the first rule that must come after the new one.  'before' stays
+     * at the list head when there is none, which makes the insertion below
+     * equivalent to a push_back. */
+    RCULIST_FOR_EACH_PROTECTED (iter, node, &rules.node) {
+        if (iter->prio > prio) {
+            before = &iter->node;
+            break;
+        }
+    }
+    rculist_insert(before, &rule->node);
+
+    if (!src_len) {
+        rules_have_from_all = true;
+    }
+}
+
+
void
ovs_router_init(void)
{
@@ -770,6 +888,7 @@ ovs_router_init(void)
ovs_mutex_lock(&mutex);
hmap_init(&clsmap);
classifier_init(&default_cls, NULL);
+ rculist_init(&rules.node);
ovs_mutex_unlock(&mutex);
fatal_signal_add_hook(ovs_router_flush_handler, NULL, NULL, true);
unixctl_command_register("ovs/route/add",
@@ -57,6 +57,8 @@ bool ovs_router_lookup(uint32_t mark, const struct in6_addr *ip_dst,
char output_netdev[],
struct in6_addr *src, struct in6_addr *gw);
void ovs_router_init(void);
+bool ovs_router_is_empty(uint32_t table);
+bool ovs_router_is_referenced(uint32_t table);
void ovs_router_insert(uint32_t table, uint32_t mark,
const struct in6_addr *ip_dst,
uint8_t plen, bool local,
@@ -68,7 +70,10 @@ void ovs_router_force_insert(uint32_t table, uint32_t mark,
const char output_netdev[],
const struct in6_addr *gw,
const struct in6_addr *prefsrc);
+void ovs_router_add_rule(uint32_t prio, bool invert, uint8_t src_len,
+ const struct in6_addr *from, uint32_t lookup_table);
void ovs_router_flush(void);
+void ovs_router_rules_flush(void);
void ovs_router_disable_system_routing_table(void);
@@ -1081,6 +1081,26 @@ ipv6_is_cidr(const struct in6_addr *netmask)
return true;
}
+/* Returns true if 'a' and 'b' are equal in their leading 'm' bits.
+ *
+ * If either address is IPv4-mapped and 'm' is in the range 1...32, 'm' is
+ * taken as an IPv4 prefix length and is applied to the embedded IPv4 address,
+ * that is, 96 bits further into the IPv6 representation.  Without this shift
+ * the mask would only cover bits that are identical in every IPv4-mapped
+ * address, so any two of them would compare equal for every 'm' below 32.
+ *
+ * Any other value of 'm' is used as an IPv6 prefix length as is; 128 requires
+ * the two addresses to be identical. */
+bool
+ipv6_addr_equals_masked(const struct in6_addr *a, const struct in6_addr *b,
+                        int m)
+{
+    struct in6_addr mask;
+    struct in6_addr ma;
+    struct in6_addr mb;
+
+    if (m > 0 && m <= 32
+        && (IN6_IS_ADDR_V4MAPPED(a) || IN6_IS_ADDR_V4MAPPED(b))) {
+        /* Skip the 96-bit "::ffff:" prefix of the mapped form. */
+        m += 96;
+    }
+
+    if (m == 128) {
+        return ipv6_addr_equals(a, b);
+    }
+
+    mask = ipv6_create_mask(m);
+    ma = ipv6_addr_bitand(a, &mask);
+    mb = ipv6_addr_bitand(b, &mask);
+
+    return ipv6_addr_equals(&ma, &mb);
+}
+
+
/* Populates 'b' with an Ethernet II packet headed with the given 'eth_dst',
* 'eth_src' and 'eth_type' parameters. A payload of 'size' bytes is allocated
* in 'b' and returned. This payload may be populated with appropriate
@@ -1606,6 +1606,8 @@ bool ipv6_is_zero(const struct in6_addr *a);
struct in6_addr ipv6_create_mask(int mask);
int ipv6_count_cidr_bits(const struct in6_addr *netmask);
bool ipv6_is_cidr(const struct in6_addr *netmask);
+bool ipv6_addr_equals_masked(const struct in6_addr *a,
+ const struct in6_addr *b, int m);
bool ipv6_parse(const char *s, struct in6_addr *ip);
char *ipv6_parse_masked(const char *s, struct in6_addr *ipv6,
@@ -23,6 +23,7 @@
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
+#include <linux/fib_rules.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
@@ -59,13 +60,23 @@ static struct nln *nln = NULL;
static struct route_table_msg nln_rtmsg_change;
static struct nln_notifier *route_notifier = NULL;
static struct nln_notifier *route6_notifier = NULL;
+static struct nln_notifier *rule_notifier = NULL;
+static struct nln_notifier *rule6_notifier = NULL;
static struct nln_notifier *name_notifier = NULL;
static bool route_table_valid = false;
+static bool rules_valid = false;
+
+static int route_nln_parse(struct ofpbuf *, void *change);
+
+static void rule_handle_msg(const struct route_table_msg *);
+static int rule_parse(struct ofpbuf *, void *change);
static void route_table_reset(void);
-static void route_table_handle_msg(const struct route_table_msg *, void *aux);
+static void route_table_handle_msg(const struct route_table_msg *, void *aux,
+ uint32_t table);
static void route_table_change(struct route_table_msg *, void *aux);
+static void rules_change(const struct route_table_msg *, void *aux);
static void route_map_clear(void);
static void name_table_init(void);
@@ -105,9 +116,11 @@ route_table_init(void)
ovs_assert(!nln);
ovs_assert(!route_notifier);
ovs_assert(!route6_notifier);
+ ovs_assert(!rule_notifier);
+ ovs_assert(!rule6_notifier);
ovs_router_init();
- nln = nln_create(NETLINK_ROUTE, route_table_parse, &nln_rtmsg_change);
+ nln = nln_create(NETLINK_ROUTE, route_nln_parse, &nln_rtmsg_change);
route_notifier =
nln_notifier_create(nln, RTNLGRP_IPV4_ROUTE,
@@ -116,6 +129,13 @@ route_table_init(void)
nln_notifier_create(nln, RTNLGRP_IPV6_ROUTE,
(nln_notify_func *) route_table_change, NULL);
+ rule_notifier =
+ nln_notifier_create(nln, RTNLGRP_IPV4_RULE,
+ (nln_notify_func *) rules_change, NULL);
+ rule6_notifier =
+ nln_notifier_create(nln, RTNLGRP_IPV6_RULE,
+ (nln_notify_func *) rules_change, NULL);
+
route_table_reset();
name_table_init();
@@ -132,7 +152,7 @@ route_table_run(void)
rtnetlink_run();
nln_run(nln);
- if (!route_table_valid) {
+ if (!route_table_valid || !rules_valid) {
route_table_reset();
}
}
@@ -191,7 +211,7 @@ route_table_dump_one_table(uint32_t id,
if (!(nlmsghdr->nlmsg_flags & NLM_F_DUMP_FILTERED)) {
filtered = false;
}
- handle_msg_cb(&msg, aux);
+ handle_msg_cb(&msg, aux, id);
route_data_destroy(&msg.rd);
}
}
@@ -201,6 +221,36 @@ route_table_dump_one_table(uint32_t id,
return filtered;
}
+/* Requests a dump of the kernel's routing policy rules for all address
+ * families and hands every successfully parsed rule to rule_handle_msg(). */
+static void
+rules_dump(void)
+{
+    uint64_t stub[NL_DUMP_BUFSIZE / 8];
+    struct ofpbuf req, rule_reply, dump_buf;
+    struct fib_rule_hdr *hdr;
+    struct nl_dump dump;
+
+    /* Build the RTM_GETRULE request: a netlink header followed by an
+     * all-zeros rule header with an unspecified address family. */
+    ofpbuf_init(&req, 0);
+    nl_msg_put_nlmsghdr(&req, sizeof *hdr, RTM_GETRULE, NLM_F_REQUEST);
+    hdr = ofpbuf_put_zeros(&req, sizeof *hdr);
+    hdr->family = AF_UNSPEC;
+
+    nl_dump_start(&dump, NETLINK_ROUTE, &req);
+    ofpbuf_uninit(&req);
+
+    ofpbuf_use_stub(&dump_buf, stub, sizeof stub);
+    for (;;) {
+        struct route_table_msg msg;
+
+        if (!nl_dump_next(&dump, &rule_reply, &dump_buf)) {
+            break;
+        }
+        if (!rule_parse(&rule_reply, &msg)) {
+            /* Not a rule this module can represent; skip it. */
+            continue;
+        }
+        rule_handle_msg(&msg);
+    }
+    ofpbuf_uninit(&dump_buf);
+    nl_dump_done(&dump);
+}
+
+
static void
route_table_reset(void)
{
@@ -213,6 +263,7 @@ route_table_reset(void)
route_map_clear();
netdev_get_addrs_list_flush();
route_table_valid = true;
+ rules_valid = true;
rt_change_seq++;
COVERAGE_INC(route_table_dump);
@@ -224,6 +275,166 @@ route_table_reset(void)
break;
}
}
+ rules_dump();
+}
+
+/* Processes one parsed policy rule: imports the routing table it points to
+ * (unless that is a standard table) and then registers the rule with the
+ * router, provided the table ended up with at least one route. */
+static void
+rule_handle_msg(const struct route_table_msg *change)
+{
+    const struct rule_data *rule;
+    uint32_t table_id;
+
+    if (!change->relevant) {
+        return;
+    }
+
+    rule = &change->rud;
+    table_id = rule->lookup_table;
+
+    if (!ovs_router_is_standard_table_id(table_id)) {
+        route_table_dump_one_table(table_id, route_table_handle_msg, NULL);
+    }
+
+    /* OVS accepts fewer kinds of routes than the kernel does, so the dump
+     * above may import nothing at all.  A rule that would point to such an
+     * empty table is dropped as well. */
+    if (ovs_router_is_empty(table_id)) {
+        return;
+    }
+
+    /* rules_dump() receives the rules from the kernel ordered from highest
+     * to lowest priority, so adding them one by one preserves that order in
+     * the OVS list. */
+    ovs_router_add_rule(rule->prio, rule->invert, rule->src_len,
+                        &rule->from_addr, table_id);
+}
+
+
+/* Netlink parse callback shared by route and rule notifications.  Dispatches
+ * on the message type to route_table_parse() or rule_parse() and returns
+ * their result; returns 0 for any other message type. */
+static int
+route_nln_parse(struct ofpbuf *buf, void *change_)
+{
+    const struct nlmsghdr *hdr = buf->data;
+
+    switch (hdr->nlmsg_type) {
+    case RTM_NEWROUTE:
+    case RTM_DELROUTE:
+        return route_table_parse(buf, change_);
+
+    case RTM_NEWRULE:
+    case RTM_DELRULE:
+        return rule_parse(buf, change_);
+
+    default:
+        VLOG_DBG_RL(&rl, "received unsupported rtnetlink route message");
+        return 0;
+    }
+}
+
+
+/* Parses the rtnetlink policy rule message in 'buf' into 'change_', which
+ * must point to a struct route_table_msg.
+ *
+ * Only rules with a table lookup action, no TOS and no destination selector
+ * are accepted.  A message that parses correctly but describes a rule OVS
+ * cannot honor (standard table ID, missing FRA_TABLE, inverted match without
+ * a source prefix, or an attribute outside the supported set) is reported
+ * with 'relevant' set to false.
+ *
+ * 'src_len' is stored exactly as found in the rule header, i.e. in units of
+ * the rule's own address family; IPv4 source addresses are stored in
+ * IPv4-mapped form.
+ *
+ * Returns RTNLGRP_IPV4_RULE or RTNLGRP_IPV6_RULE on success, 0 on parse
+ * error or if the rule header itself is unsupported. */
+static int
+rule_parse(struct ofpbuf *buf, void *change_)
+{
+    /* FRA_PRIORITY is optional: the kernel leaves the attribute out for a
+     * rule whose priority is 0, and 'prio' then keeps the 0 it gets from the
+     * memset() below. */
+    static const struct nl_policy policy[] = {
+        [FRA_PRIORITY] = { .type = NL_A_U32, .optional = true },
+        [FRA_SRC] = { .type = NL_A_U32, .optional = true },
+        [FRA_TABLE] = { .type = NL_A_U32, .optional = true },
+    };
+
+    static const struct nl_policy policy6[] = {
+        [FRA_PRIORITY] = { .type = NL_A_U32, .optional = true },
+        [FRA_SRC] = { .type = NL_A_IPV6, .optional = true },
+        [FRA_TABLE] = { .type = NL_A_U32, .optional = true },
+    };
+
+    const size_t attrs_offset = NLMSG_HDRLEN + sizeof(struct fib_rule_hdr);
+    struct route_table_msg *change = change_;
+    struct nlattr *attrs[ARRAY_SIZE(policy)];
+    const struct fib_rule_hdr *frh;
+    const struct nlmsghdr *nlmsg;
+    bool parsed, ipv4 = false;
+
+    frh = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *frh);
+    if (!frh) {
+        /* The message is too short to hold a rule header. */
+        VLOG_DBG_RL(&rl, "received truncated rtnetlink rule message");
+        return 0;
+    }
+
+    if (frh->action != FR_ACT_TO_TBL || frh->tos || frh->dst_len) {
+        /* Unsupported rule. */
+        return 0;
+    }
+
+    if (frh->family == AF_INET) {
+        parsed = nl_policy_parse(buf, attrs_offset, policy, attrs,
+                                 ARRAY_SIZE(policy));
+        ipv4 = true;
+    } else if (frh->family == AF_INET6) {
+        parsed = nl_policy_parse(buf, attrs_offset, policy6, attrs,
+                                 ARRAY_SIZE(policy6));
+    } else {
+        VLOG_DBG_RL(&rl, "received rtnetlink rule message with unsupported "
+                         "address family");
+        return 0;
+    }
+
+    if (!parsed) {
+        VLOG_DBG_RL(&rl, "received unparseable rtnetlink rule message");
+        return 0;
+    }
+
+    nlmsg = buf->data;
+
+    memset(change, 0, sizeof *change);
+    change->relevant = true;
+    change->nlmsg_type = nlmsg->nlmsg_type;
+    /* FIB_RULE_INVERT inverts the matching of the rule selector. */
+    change->rud.invert = !!(frh->flags & FIB_RULE_INVERT);
+    change->rud.src_len = frh->src_len;
+    change->rud.lookup_table = frh->table;
+
+    if (attrs[FRA_PRIORITY]) {
+        change->rud.prio = nl_attr_get_u32(attrs[FRA_PRIORITY]);
+    }
+
+    if (attrs[FRA_SRC]) {
+        if (ipv4) {
+            ovs_be32 src = nl_attr_get_be32(attrs[FRA_SRC]);
+
+            in6_addr_set_mapped_ipv4(&change->rud.from_addr, src);
+        } else {
+            change->rud.from_addr = nl_attr_get_in6_addr(attrs[FRA_SRC]);
+        }
+    } else if (ipv4) {
+        in6_addr_set_mapped_ipv4(&change->rud.from_addr, 0);
+    }
+
+    if (attrs[FRA_TABLE]) {
+        change->rud.lookup_table = nl_attr_get_u32(attrs[FRA_TABLE]);
+        if (ovs_router_is_standard_table_id(change->rud.lookup_table)) {
+            change->relevant = false;
+        }
+    } else {
+        change->relevant = false;
+    }
+
+    if (change->rud.invert && !change->rud.src_len) {
+        change->relevant = false;
+    }
+
+    /* Check if there are any additional attributes that aren't supported
+     * currently by OVS rule-based route lookup. */
+    if (change->relevant) {
+        struct nlattr *nla;
+        size_t left;
+
+        NL_ATTR_FOR_EACH (nla, left, ofpbuf_at(buf, attrs_offset, 0),
+                          buf->size - attrs_offset) {
+            uint16_t type = nl_attr_type(nla);
+
+            if ((type > FRA_SRC && type < FRA_PRIORITY) ||
+                (type > FRA_PRIORITY && type < FRA_SUPPRESS_PREFIXLEN) ||
+                (type > FRA_TABLE && type < FRA_PROTOCOL) ||
+                type > FRA_PROTOCOL) {
+                change->relevant = false;
+                break;
+            }
+        }
+    }
+
+    /* Success. */
+    return ipv4 ? RTNLGRP_IPV4_RULE : RTNLGRP_IPV6_RULE;
}
/* Returns true if the given route requires nexthop information (output
@@ -524,7 +735,7 @@ route_table_change(struct route_table_msg *change, void *aux OVS_UNUSED)
{
if (!change
|| (change->relevant
- && ovs_router_is_standard_table_id(change->rd.rta_table_id))) {
+ && ovs_router_is_referenced(change->rd.rta_table_id))) {
route_table_valid = false;
}
if (change) {
@@ -534,7 +745,7 @@ route_table_change(struct route_table_msg *change, void *aux OVS_UNUSED)
static void
route_table_handle_msg(const struct route_table_msg *change,
- void *aux OVS_UNUSED)
+ void *aux OVS_UNUSED, uint32_t table)
{
if (change->relevant && change->nlmsg_type == RTM_NEWROUTE
&& !ovs_list_is_empty(&change->rd.nexthops)) {
@@ -547,7 +758,7 @@ route_table_handle_msg(const struct route_table_msg *change,
rdnh = CONTAINER_OF(ovs_list_front(&change->rd.nexthops),
const struct route_data_nexthop, nexthop_node);
- ovs_router_insert(CLS_DEFAULT, rd->rta_mark, &rd->rta_dst,
+ ovs_router_insert(table, rd->rta_mark, &rd->rta_dst,
IN6_IS_ADDR_V4MAPPED(&rd->rta_dst)
? rd->rtm_dst_len + 96 : rd->rtm_dst_len,
rd->rtn_local, rdnh->ifname, &rdnh->addr,
@@ -555,9 +766,19 @@ route_table_handle_msg(const struct route_table_msg *change,
}
}
+/* Notifier callback for policy rule changes.  Marks the imported rules as
+ * stale when the notification carries no message ('change' is NULL) or when
+ * the changed rule is one OVS considers relevant. */
+static void
+rules_change(const struct route_table_msg *change, void *aux OVS_UNUSED)
+{
+    if (!change || change->relevant) {
+        rules_valid = false;
+    }
+}
+
static void
route_map_clear(void)
{
+ ovs_router_rules_flush();
ovs_router_flush();
}
@@ -143,12 +143,24 @@ struct route_data {
uint32_t rta_priority; /* 0 if missing. */
};
+struct rule_data { /* Fields extracted from a routing policy rule message. */
+ bool invert; /* True if FIB_RULE_INVERT is set in the rule flags. */
+ uint32_t prio; /* FRA_PRIORITY value; 0 if the attribute is absent. */
+ uint8_t src_len; /* Source prefix length copied from the rule header. */
+ struct in6_addr from_addr; /* FRA_SRC value; IPv4 sources are stored IPv4-mapped. */
+ uint32_t lookup_table; /* FRA_TABLE value if present, else the header's table field. */
+};
+
/* A digested version of a route message sent down by the kernel to indicate
- * that a route has changed. */
+ * that a route or a rule has changed. */
struct route_table_msg {
bool relevant; /* Should this message be processed? */
uint16_t nlmsg_type; /* e.g. RTM_NEWROUTE, RTM_DELROUTE. */
- struct route_data rd; /* Data parsed from this message. */
+ union { /* Data parsed from this message, depending on
+ * nlmsg_type. */
+ struct route_data rd;
+ struct rule_data rud;
+ };
};
uint64_t route_table_get_change_seq(void);
@@ -160,7 +172,7 @@ bool route_table_fallback_lookup(const struct in6_addr *ip6_dst,
struct in6_addr *gw6);
typedef void route_table_handle_msg_callback(const struct route_table_msg *,
- void *aux);
+ void *aux, uint32_t table);
bool route_table_dump_one_table(uint32_t id,
route_table_handle_msg_callback *,
@@ -67,7 +67,8 @@ rt_table_name(uint32_t id)
static void
test_lib_route_table_handle_msg(const struct route_table_msg *change,
- void *data OVS_UNUSED)
+ void *data OVS_UNUSED,
+ uint32_t table OVS_UNUSED)
{
struct ds nexthop_addr = DS_EMPTY_INITIALIZER;
struct ds rta_prefsrc = DS_EMPTY_INITIALIZER;
@@ -115,7 +116,7 @@ static void
test_lib_route_table_change(struct route_table_msg *change,
void *aux OVS_UNUSED)
{
- test_lib_route_table_handle_msg(change, NULL);
+ test_lib_route_table_handle_msg(change, NULL, 0);
route_data_destroy(&change->rd);
}
Introduce support for route lookup across several routing tables, consulted in order of priority from highest to lowest, with the main routing table having the lowest priority. Additional routing tables are created by reading the Routing Policy Database (RPDB) from the kernel and importing only those tables which are referenced in the RPDB rules with a table lookup action. The table IDs and rule priorities are copied from the kernel RPDB as is. The current implementation only supports RPDB rules with a source address selector, in the form of a '[not] from IP' match. Signed-off-by: Dima Chumak <dchumak@nvidia.com> --- lib/ovs-router.c | 125 ++++++++++++++++++- lib/ovs-router.h | 5 + lib/packets.c | 20 +++ lib/packets.h | 2 + lib/route-table.c | 235 +++++++++++++++++++++++++++++++++-- lib/route-table.h | 18 ++- tests/test-lib-route-table.c | 5 +- 7 files changed, 395 insertions(+), 15 deletions(-)