@@ -2248,6 +2248,15 @@ output;
</p>
<p>
+ For ECMP routes, i.e. multiple static routes with same policy and
+ prefix but different nexthops, the above actions are deferred to next
+ table. This table, instead, is responsible for determine the ECMP
+ group id and select a member id within the group based on 5-tuple
+ hashing. It stores group id in <code>reg8[0..15]</code> and member id in
+ <code>reg8[16..31]</code>.
+ </p>
+
+ <p>
This table contains the following logical flows:
</p>
@@ -2333,6 +2342,7 @@ outport = <code>redirect-chassis-port</code>;
<pre>
ip.ttl--;
+reg8[0..15] = 0;
reg0 = <var>G</var>;
reg1 = <var>A</var>;
eth.src = <var>E</var>;
@@ -2366,6 +2376,7 @@ next;
<pre>
ip.ttl--;
+reg8[0..15] = 0;
xxreg0 = <var>G</var>;
xxreg1 = <var>A</var>;
eth.src = <var>E</var>;
@@ -2390,9 +2401,79 @@ next;
route will be limited to sending on the ingress port.
</p>
</li>
+
+ <li>
+ <p>
+ For ECMP routes, they are grouped by policy and prefix. An unique id
+ (non-zero) is assigned to each group, and each member is also
+ assigned an unique id (non-zero) within each group.
+ </p>
+
+ <p>
+ For each IPv4/IPv6 ECMP group with group id <var>GID</var> and member
+ ids <var>MID1</var>, <var>MID2</var>, ..., a logical flow with match
+ in CIDR notation <code>ip4.dst == <var>N</var>/<var>M</var></code>,
+ or <code>ip6.dst == <var>N</var>/<var>M</var></code>, whose priority
+ is the integer value of <var>M</var>, has the following actions:
+ </p>
+
+ <pre>
+ip.ttl--;
+flags.loopback = 1;
+reg8[0..15] = <var>GID</var>;
+select(reg8[16..31], <var>MID1</var>, <var>MID2</var>, ...);
+ </pre>
+ </li>
+ </ul>
+
+ <h3>Ingress Table 10: IP_ROUTING_ECMP</h3>
+
+ <p>
+ This table implements the second part of IP routing for ECMP routes
+ following the previous table. If a packet matched a ECMP group in the
+ previous table, this table matches the group id and member id stored
+ from the previous table, setting <code>reg0</code>
+ (or <code>xxreg0</code> for IPv6) to the next-hop IP address
+ (leaving <code>ip4.dst</code> or <code>ip6.dst</code>, the
+ packet's final destination, unchanged) and advances to the next
+ table for ARP resolution. It also sets <code>reg1</code> (or
+ <code>xxreg1</code>) to the IP address owned by the selected router
+ port (ingress table <code>ARP Request</code> will generate an ARP
+ request, if needed, with <code>reg0</code> as the target protocol
+ address and <code>reg1</code> as the source protocol address).
+ </p>
+
+ <p>
+ This table contains the following logical flows:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ A priority-150 flow that matches <code>reg8[0..15] == 0</code>
+ with action <code>next;</code> directly bypasses packets of non-ECMP
+ routes.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For each member with ID <var>MID</var> in each ECMP group with ID
+ <var>GID</var>, a priority-100 flow with match
+ <code>reg8[0..15] == <var>GID</var> && reg8[16..31] == <var>MID</var></code>
+ has following actions:
+ </p>
+
+ <pre>
+[xx]reg0 = <var>G</var>;
+[xx]reg1 = <var>A</var>;
+eth.src = <var>E</var>;
+outport = <var>P</var>;
+ </pre>
+ </li>
</ul>
- <h3>Ingress Table 10: ARP/ND Resolution</h3>
+ <h3>Ingress Table 12: ARP/ND Resolution</h3>
<p>
Any packet that reaches this table is an IP packet whose next-hop
@@ -2539,7 +2620,7 @@ next;
</ul>
- <h3>Ingress Table 11: Check packet length</h3>
+ <h3>Ingress Table 13: Check packet length</h3>
<p>
For distributed logical routers with distributed gateway port configured
@@ -2569,7 +2650,7 @@ REGBIT_PKT_LARGER = check_pkt_larger(<var>L</var>); next;
and advances to the next table.
</p>
- <h3>Ingress Table 12: Handle larger packets</h3>
+ <h3>Ingress Table 14: Handle larger packets</h3>
<p>
For distributed logical routers with distributed gateway port configured
@@ -2618,7 +2699,7 @@ icmp4 {
and advances to the next table.
</p>
- <h3>Ingress Table 13: Gateway Redirect</h3>
+ <h3>Ingress Table 15: Gateway Redirect</h3>
<p>
For distributed logical routers where one of the logical router
@@ -2680,7 +2761,7 @@ icmp4 {
</li>
</ul>
- <h3>Ingress Table 14: ARP Request</h3>
+ <h3>Ingress Table 16: ARP Request</h3>
<p>
In the common case where the Ethernet destination has been resolved, this
@@ -176,12 +176,13 @@ enum ovn_stage {
PIPELINE_STAGE(ROUTER, IN, ND_RA_OPTIONS, 7, "lr_in_nd_ra_options") \
PIPELINE_STAGE(ROUTER, IN, ND_RA_RESPONSE, 8, "lr_in_nd_ra_response") \
PIPELINE_STAGE(ROUTER, IN, IP_ROUTING, 9, "lr_in_ip_routing") \
- PIPELINE_STAGE(ROUTER, IN, POLICY, 10, "lr_in_policy") \
- PIPELINE_STAGE(ROUTER, IN, ARP_RESOLVE, 11, "lr_in_arp_resolve") \
- PIPELINE_STAGE(ROUTER, IN, CHK_PKT_LEN , 12, "lr_in_chk_pkt_len") \
- PIPELINE_STAGE(ROUTER, IN, LARGER_PKTS, 13,"lr_in_larger_pkts") \
- PIPELINE_STAGE(ROUTER, IN, GW_REDIRECT, 14, "lr_in_gw_redirect") \
- PIPELINE_STAGE(ROUTER, IN, ARP_REQUEST, 15, "lr_in_arp_request") \
+ PIPELINE_STAGE(ROUTER, IN, IP_ROUTING_ECMP, 10, "lr_in_ip_routing_ecmp") \
+ PIPELINE_STAGE(ROUTER, IN, POLICY, 11, "lr_in_policy") \
+ PIPELINE_STAGE(ROUTER, IN, ARP_RESOLVE, 12, "lr_in_arp_resolve") \
+ PIPELINE_STAGE(ROUTER, IN, CHK_PKT_LEN , 13, "lr_in_chk_pkt_len") \
+ PIPELINE_STAGE(ROUTER, IN, LARGER_PKTS, 14,"lr_in_larger_pkts") \
+ PIPELINE_STAGE(ROUTER, IN, GW_REDIRECT, 15, "lr_in_gw_redirect") \
+ PIPELINE_STAGE(ROUTER, IN, ARP_REQUEST, 16, "lr_in_arp_request") \
\
/* Logical router egress stages. */ \
PIPELINE_STAGE(ROUTER, OUT, UNDNAT, 0, "lr_out_undnat") \
@@ -222,6 +223,9 @@ enum ovn_stage {
#define REGBIT_PKT_LARGER "reg9[3]"
#define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[4]"
#define REGBIT_SKIP_LOOKUP_NEIGHBOR "reg9[5]"
+/* Register for ECMP bucket selection. */
+#define REG_ECMP_GROUP_ID "reg8[0..15]"
+#define REG_ECMP_MEMBER_ID "reg8[16..31]"
#define FLAGBIT_NOT_VXLAN "flags[1] == 0"
@@ -6712,132 +6716,290 @@ add_distributed_nat_routes(struct hmap *lflows, const struct ovn_port *op)
ds_destroy(&actions);
}
-static void
-add_route(struct hmap *lflows, const struct ovn_port *op,
- const char *lrp_addr_s, const char *network_s, int plen,
- const char *gateway, const char *policy)
+static bool
+ip46_parse_cidr(const char *str, struct v46_ip *prefix, unsigned int *plen)
{
- bool is_ipv4 = strchr(network_s, '.') ? true : false;
- struct ds match = DS_EMPTY_INITIALIZER;
- const char *dir;
- uint16_t priority;
-
- if (policy && !strcmp(policy, "src-ip")) {
- dir = "src";
- priority = plen * 2;
- } else {
- dir = "dst";
- priority = (plen * 2) + 1;
+ char *error = ip_parse_cidr(str, &prefix->ipv4, plen);
+ if (!error) {
+ prefix->family = AF_INET;
+ return true;
}
+ free(error);
+ error = ipv6_parse_cidr(str, &prefix->ipv6, plen);
+ if (!error) {
+ prefix->family = AF_INET6;
+ return true;
+ }
+ free(error);
+ return false;
+}
- /* IPv6 link-local addresses must be scoped to the local router port. */
- if (!is_ipv4) {
- struct in6_addr network;
- ovs_assert(ipv6_parse(network_s, &network));
- if (in6_is_lla(&network)) {
- ds_put_format(&match, "inport == %s && ", op->json_key);
- }
+static bool
+ip46_equals(const struct v46_ip *addr1, const struct v46_ip *addr2)
+{
+ return (addr1->family == addr2->family &&
+ (addr1->family == AF_INET ? addr1->ipv4 == addr2->ipv4 :
+ IN6_ARE_ADDR_EQUAL(&addr1->ipv6, &addr2->ipv6)));
+}
+
+struct parsed_route {
+ struct ovs_list list_node;
+ struct v46_ip prefix;
+ unsigned int plen;
+ bool is_src_route;
+ uint32_t hash;
+ const struct nbrec_logical_router_static_route *route;
+};
+
+static uint32_t
+route_hash(struct parsed_route *route)
+{
+ return hash_bytes(&route->prefix, sizeof route->prefix,
+ (uint32_t)route->plen);
+}
+
+/* Parse and validate the route. Return the parsed route if successful.
+ * Otherwise return NULL. */
+static struct parsed_route *
+parsed_routes_add(struct ovs_list *routes,
+ const struct nbrec_logical_router_static_route *route)
+{
+ /* Verify that the next hop is an IP address with an all-ones mask. */
+ struct v46_ip nexthop;
+ unsigned int plen;
+ if (!ip46_parse_cidr(route->nexthop, &nexthop, &plen)) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+ VLOG_WARN_RL(&rl, "bad 'nexthop' %s in static route"
+ UUID_FMT, route->nexthop,
+ UUID_ARGS(&route->header_.uuid));
+ return NULL;
+ }
+ if ((nexthop.family == AF_INET && plen != 32) ||
+ (nexthop.family == AF_INET6 && plen != 128)) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+ VLOG_WARN_RL(&rl, "bad next hop mask %s in static route"
+ UUID_FMT, route->nexthop,
+ UUID_ARGS(&route->header_.uuid));
+ return NULL;
}
- ds_put_format(&match, "ip%s.%s == %s/%d", is_ipv4 ? "4" : "6", dir,
- network_s, plen);
- struct ds actions = DS_EMPTY_INITIALIZER;
- ds_put_format(&actions, "ip.ttl--; %sreg0 = ", is_ipv4 ? "" : "xx");
+ /* Parse ip_prefix */
+ struct v46_ip prefix;
+ if (!ip46_parse_cidr(route->ip_prefix, &prefix, &plen)) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+ VLOG_WARN_RL(&rl, "bad 'ip_prefix' %s in static route"
+ UUID_FMT, route->ip_prefix,
+ UUID_ARGS(&route->header_.uuid));
+ return NULL;
+ }
- if (gateway) {
- ds_put_cstr(&actions, gateway);
- } else {
- ds_put_format(&actions, "ip%s.dst", is_ipv4 ? "4" : "6");
+ /* Verify that ip_prefix and nexthop have same address familiy. */
+ if (prefix.family != nexthop.family) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+ VLOG_WARN_RL(&rl, "Address family doesn't match between 'ip_prefix' %s"
+ " and 'nexthop' %s in static route"UUID_FMT,
+ route->ip_prefix, route->nexthop,
+ UUID_ARGS(&route->header_.uuid));
+ return NULL;
}
- ds_put_format(&actions, "; "
- "%sreg1 = %s; "
- "eth.src = %s; "
- "outport = %s; "
- "flags.loopback = 1; "
- "next;",
- is_ipv4 ? "" : "xx",
- lrp_addr_s,
- op->lrp_networks.ea_s,
- op->json_key);
- /* The priority here is calculated to implement longest-prefix-match
- * routing. */
- ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_ROUTING, priority,
- ds_cstr(&match), ds_cstr(&actions));
- ds_destroy(&match);
- ds_destroy(&actions);
+ struct parsed_route *pr = xzalloc(sizeof *pr);
+ pr->prefix = prefix;
+ pr->plen = plen;
+ pr->is_src_route = (route->policy && !strcmp(route->policy,
+ "src-ip"));
+ pr->hash = route_hash(pr);
+ pr->route = route;
+ ovs_list_insert(routes, &pr->list_node);
+ return pr;
}
static void
-build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
- struct hmap *ports,
- const struct nbrec_logical_router_static_route *route)
+parsed_routes_destroy(struct ovs_list *routes)
{
- ovs_be32 nexthop;
- const char *lrp_addr_s = NULL;
+ struct parsed_route *pr, *next;
+ LIST_FOR_EACH_SAFE (pr, next, list_node, routes) {
+ ovs_list_remove(&pr->list_node);
+ free(pr);
+ }
+}
+
+struct ecmp_route_list_node {
+ struct ovs_list list_node;
+ uint16_t id; /* starts from 1 */
+ const struct parsed_route *route;
+};
+
+struct ecmp_groups_node {
+ struct hmap_node hmap_node; /* In ecmp_groups */
+ uint16_t id; /* starts from 1 */
+ struct v46_ip prefix;
unsigned int plen;
- bool is_ipv4;
+ bool is_src_route;
+ uint16_t route_count;
+ struct ovs_list route_list; /* Contains ecmp_route_list_node */
+};
- /* Verify that the next hop is an IP address with an all-ones mask. */
- char *error = ip_parse_cidr(route->nexthop, &nexthop, &plen);
- if (!error) {
- if (plen != 32) {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
- VLOG_WARN_RL(&rl, "bad next hop mask %s", route->nexthop);
- return;
+static void
+ecmp_groups_add_route(struct ecmp_groups_node *group,
+ const struct parsed_route *route)
+{
+ struct ecmp_route_list_node *er = xmalloc(sizeof *er);
+ er->route = route;
+ er->id = ++group->route_count;
+ if (er->id == 0) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+ VLOG_WARN_RL(&rl, "too many routes in a single ecmp group.");
+ return;
+ }
+
+ ovs_list_insert(&group->route_list, &er->list_node);
+}
+
+static struct ecmp_groups_node *
+ecmp_groups_add(struct hmap *ecmp_groups,
+ const struct parsed_route *route)
+{
+ if (hmap_count(ecmp_groups) == UINT16_MAX) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+ VLOG_WARN_RL(&rl, "too many ecmp groups.");
+ return NULL;
+ }
+
+ struct ecmp_groups_node *eg = xzalloc(sizeof *eg);
+ hmap_insert(ecmp_groups, &eg->hmap_node, route->hash);
+
+ eg->id = hmap_count(ecmp_groups);
+ eg->prefix = route->prefix;
+ eg->plen = route->plen;
+ eg->is_src_route = route->is_src_route;
+ ovs_list_init(&eg->route_list);
+ ecmp_groups_add_route(eg, route);
+
+ return eg;
+}
+
+static struct ecmp_groups_node *
+ecmp_groups_find(struct hmap *ecmp_groups, struct parsed_route *route)
+{
+ struct ecmp_groups_node *eg;
+ HMAP_FOR_EACH_WITH_HASH (eg, hmap_node, route->hash, ecmp_groups) {
+ if (ip46_equals(&eg->prefix, &route->prefix) &&
+ eg->plen == route->plen &&
+ eg->is_src_route == route->is_src_route) {
+ return eg;
}
- is_ipv4 = true;
- } else {
- free(error);
+ }
+ return NULL;
+}
- struct in6_addr ip6;
- error = ipv6_parse_cidr(route->nexthop, &ip6, &plen);
- if (!error) {
- if (plen != 128) {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
- VLOG_WARN_RL(&rl, "bad next hop mask %s", route->nexthop);
- return;
- }
- is_ipv4 = false;
- } else {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
- VLOG_WARN_RL(&rl, "bad next hop ip address %s", route->nexthop);
- free(error);
- return;
+static void
+ecmp_groups_destroy(struct hmap *ecmp_groups)
+{
+ struct ecmp_groups_node *eg, *next;
+ HMAP_FOR_EACH_SAFE (eg, next, hmap_node, ecmp_groups) {
+ struct ecmp_route_list_node *er, *er_next;
+ LIST_FOR_EACH_SAFE (er, er_next, list_node, &eg->route_list) {
+ ovs_list_remove(&er->list_node);
+ free(er);
}
+ hmap_remove(ecmp_groups, &eg->hmap_node);
+ free(eg);
}
+ hmap_destroy(ecmp_groups);
+}
- char *prefix_s;
- if (is_ipv4) {
- ovs_be32 prefix;
- /* Verify that ip prefix is a valid IPv4 address. */
- error = ip_parse_cidr(route->ip_prefix, &prefix, &plen);
- if (error) {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
- VLOG_WARN_RL(&rl, "bad 'ip_prefix' in static routes %s",
- route->ip_prefix);
- free(error);
- return;
+struct unique_routes_node {
+ struct hmap_node hmap_node;
+ const struct parsed_route *route;
+};
+
+static void
+unique_routes_add(struct hmap *unique_routes,
+ const struct parsed_route *route)
+{
+ struct unique_routes_node *ur = xmalloc(sizeof *ur);
+ ur->route = route;
+ hmap_insert(unique_routes, &ur->hmap_node, route->hash);
+}
+
+/* Remove the unique_routes_node from the hmap, and return the parsed_route
+ * pointed by the removed node. */
+static const struct parsed_route *
+unique_routes_remove(struct hmap *unique_routes,
+ const struct parsed_route *route)
+{
+ struct unique_routes_node *ur;
+ HMAP_FOR_EACH_WITH_HASH (ur, hmap_node, route->hash, unique_routes) {
+ if (ip46_equals(&route->prefix, &ur->route->prefix) &&
+ route->plen == ur->route->plen &&
+ route->is_src_route == ur->route->is_src_route) {
+ hmap_remove(unique_routes, &ur->hmap_node);
+ const struct parsed_route *existed_route = ur->route;
+ free(ur);
+ return existed_route;
}
- prefix_s = xasprintf(IP_FMT, IP_ARGS(prefix & be32_prefix_mask(plen)));
+ }
+ return NULL;
+}
+
+static void
+unique_routes_destroy(struct hmap *unique_routes)
+{
+ struct unique_routes_node *ur, *next;
+ HMAP_FOR_EACH_SAFE (ur, next, hmap_node, unique_routes) {
+ hmap_remove(unique_routes, &ur->hmap_node);
+ free(ur);
+ }
+ hmap_destroy(unique_routes);
+}
+
+static char *
+build_route_prefix_s(const struct v46_ip *prefix, unsigned int plen)
+{
+ char *prefix_s;
+ if (prefix->family == AF_INET) {
+ prefix_s = xasprintf(IP_FMT, IP_ARGS(prefix->ipv4 &
+ be32_prefix_mask(plen)));
} else {
- /* Verify that ip prefix is a valid IPv6 address. */
- struct in6_addr prefix;
- error = ipv6_parse_cidr(route->ip_prefix, &prefix, &plen);
- if (error) {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
- VLOG_WARN_RL(&rl, "bad 'ip_prefix' in static routes %s",
- route->ip_prefix);
- free(error);
- return;
- }
struct in6_addr mask = ipv6_create_mask(plen);
- struct in6_addr network = ipv6_addr_bitand(&prefix, &mask);
+ struct in6_addr network = ipv6_addr_bitand(&prefix->ipv6, &mask);
prefix_s = xmalloc(INET6_ADDRSTRLEN);
inet_ntop(AF_INET6, &network, prefix_s, INET6_ADDRSTRLEN);
}
+ return prefix_s;
+}
- /* Find the outgoing port. */
+static void
+build_route_match(const struct ovn_port *op_inport, const char *network_s,
+ int plen, bool is_src_route, bool is_ipv4, struct ds *match,
+ uint16_t *priority)
+{
+ const char *dir;
+ /* The priority here is calculated to implement longest-prefix-match
+ * routing. */
+ if (is_src_route) {
+ dir = "src";
+ *priority = plen * 2;
+ } else {
+ dir = "dst";
+ *priority = (plen * 2) + 1;
+ }
+
+ if (op_inport) {
+ ds_put_format(match, "inport == %s && ", op_inport->json_key);
+ }
+ ds_put_format(match, "ip%s.%s == %s/%d", is_ipv4 ? "4" : "6", dir,
+ network_s, plen);
+}
+
+/* Output: p_lrp_addr_s and p_out_port. */
+static bool
+find_static_route_outport(struct ovn_datapath *od, struct hmap *ports,
+ const struct nbrec_logical_router_static_route *route, bool is_ipv4,
+ const char **p_lrp_addr_s, struct ovn_port **p_out_port)
+{
+ const char *lrp_addr_s = NULL;
struct ovn_port *out_port = NULL;
if (route->output_port) {
out_port = ovn_port_find(ports, route->output_port);
@@ -6845,7 +7007,7 @@ build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "Bad out port %s for static route %s",
route->output_port, route->ip_prefix);
- goto free_prefix_s;
+ return false;
}
lrp_addr_s = find_lrp_member_ip(out_port, route->nexthop);
if (!lrp_addr_s) {
@@ -6883,20 +7045,151 @@ build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
}
}
}
-
if (!out_port || !lrp_addr_s) {
/* There is no matched out port. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "No path for static route %s; next hop %s",
route->ip_prefix, route->nexthop);
- goto free_prefix_s;
+ return false;
}
+ *p_out_port = out_port;
+ *p_lrp_addr_s = lrp_addr_s;
+
+ return true;
+}
- char *policy = route->policy ? route->policy : "dst-ip";
- add_route(lflows, out_port, lrp_addr_s, prefix_s, plen, route->nexthop,
- policy);
+static void
+build_ecmp_route_flow(struct hmap *lflows, struct ovn_datapath *od,
+ struct hmap *ports, struct ecmp_groups_node *eg)
+
+{
+ bool is_ipv4 = (eg->prefix.family == AF_INET);
+ struct ds match = DS_EMPTY_INITIALIZER;
+ uint16_t priority;
+
+ char *prefix_s = build_route_prefix_s(&eg->prefix, eg->plen);
+ build_route_match(NULL, prefix_s, eg->plen, eg->is_src_route, is_ipv4,
+ &match, &priority);
+ free(prefix_s);
+
+ struct ds actions = DS_EMPTY_INITIALIZER;
+ ds_put_format(&actions, "ip.ttl--; flags.loopback = 1; %s = %"PRIu16
+ "; select(%s", REG_ECMP_GROUP_ID, eg->id,
+ REG_ECMP_MEMBER_ID);
+
+ struct ecmp_route_list_node *er;
+ LIST_FOR_EACH (er, list_node, &eg->route_list) {
+ ds_put_format(&actions, ", %"PRIu16, er->id);
+ }
+
+ ds_put_cstr(&actions, ");");
+
+ ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, priority,
+ ds_cstr(&match), ds_cstr(&actions));
+
+ /* Add per member flow */
+ LIST_FOR_EACH (er, list_node, &eg->route_list) {
+
+ const struct parsed_route *route_ = er->route;
+ const struct nbrec_logical_router_static_route *route = route_->route;
+ /* Find the outgoing port. */
+ const char *lrp_addr_s = NULL;
+ struct ovn_port *out_port = NULL;
+ if (!find_static_route_outport(od, ports, route, is_ipv4, &lrp_addr_s,
+ &out_port)) {
+ continue;
+ }
+ ds_clear(&match);
+ ds_put_format(&match, REG_ECMP_GROUP_ID" == %"PRIu16" && "
+ REG_ECMP_MEMBER_ID" == %"PRIu16,
+ eg->id, er->id);
+ ds_clear(&actions);
+ ds_put_format(&actions, "%sreg0 = %s; "
+ "%sreg1 = %s; "
+ "eth.src = %s; "
+ "outport = %s; "
+ "next;",
+ is_ipv4 ? "" : "xx",
+ route->nexthop,
+ is_ipv4 ? "" : "xx",
+ lrp_addr_s,
+ out_port->lrp_networks.ea_s,
+ out_port->json_key);
+ ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING_ECMP, 100,
+ ds_cstr(&match), ds_cstr(&actions));
+ }
+ ds_destroy(&match);
+ ds_destroy(&actions);
+}
+
+static void
+add_route(struct hmap *lflows, const struct ovn_port *op,
+ const char *lrp_addr_s, const char *network_s, int plen,
+ const char *gateway, bool is_src_route)
+{
+ bool is_ipv4 = strchr(network_s, '.') ? true : false;
+ struct ds match = DS_EMPTY_INITIALIZER;
+ uint16_t priority;
+ const struct ovn_port *op_inport = NULL;
+
+ /* IPv6 link-local addresses must be scoped to the local router port. */
+ if (!is_ipv4) {
+ struct in6_addr network;
+ ovs_assert(ipv6_parse(network_s, &network));
+ if (in6_is_lla(&network)) {
+ op_inport = op;
+ }
+ }
+ build_route_match(op_inport, network_s, plen, is_src_route, is_ipv4,
+ &match, &priority);
+
+ struct ds actions = DS_EMPTY_INITIALIZER;
+ ds_put_format(&actions, "ip.ttl--; "REG_ECMP_GROUP_ID" = 0; %sreg0 = ",
+ is_ipv4 ? "" : "xx");
+
+ if (gateway) {
+ ds_put_cstr(&actions, gateway);
+ } else {
+ ds_put_format(&actions, "ip%s.dst", is_ipv4 ? "4" : "6");
+ }
+ ds_put_format(&actions, "; "
+ "%sreg1 = %s; "
+ "eth.src = %s; "
+ "outport = %s; "
+ "flags.loopback = 1; "
+ "next;",
+ is_ipv4 ? "" : "xx",
+ lrp_addr_s,
+ op->lrp_networks.ea_s,
+ op->json_key);
+
+ ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_ROUTING, priority,
+ ds_cstr(&match), ds_cstr(&actions));
+ ds_destroy(&match);
+ ds_destroy(&actions);
+}
+
+static void
+build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
+ struct hmap *ports,
+ const struct parsed_route *route_)
+{
+ const char *lrp_addr_s = NULL;
+ struct ovn_port *out_port = NULL;
+ bool is_ipv4 = (route_->prefix.family == AF_INET);
+
+ const struct nbrec_logical_router_static_route *route = route_->route;
+
+ /* Find the outgoing port. */
+ if (!find_static_route_outport(od, ports, route, is_ipv4, &lrp_addr_s,
+ &out_port)) {
+ return;
+ }
+
+ char *prefix_s = build_route_prefix_s(&route_->prefix, route_->plen);
+ add_route(lflows, out_port, lrp_addr_s, prefix_s, route_->plen,
+ route->nexthop, route_->is_src_route);
-free_prefix_s:
free(prefix_s);
}
@@ -8731,14 +9024,21 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
ovn_lflow_add(lflows, od, S_ROUTER_IN_ND_RA_RESPONSE, 0, "1", "next;");
}
- /* Logical router ingress table IP_ROUTING: IP Routing.
+ /* Logical router ingress table IP_ROUTING & IP_ROUTING_ECMP: IP Routing.
*
* A packet that arrives at this table is an IP packet that should be
- * routed to the address in 'ip[46].dst'. This table sets outport to
- * the correct output port, eth.src to the output port's MAC
- * address, and '[xx]reg0' to the next-hop IP address (leaving
- * 'ip[46].dst', the packet’s final destination, unchanged), and
- * advances to the next table for ARP/ND resolution. */
+ * routed to the address in 'ip[46].dst'.
+ *
+ * For regular routes without ECMP, table IP_ROUTING sets outport to the
+ * correct output port, eth.src to the output port's MAC address, and
+ * '[xx]reg0' to the next-hop IP address (leaving 'ip[46].dst', the
+ * packet’s final destination, unchanged), and advances to the next table.
+ *
+ * For ECMP routes, i.e. multiple routes with same policy and prefix, table
+ * IP_ROUTING remembers ECMP group id and selects a member id, and advances
+ * to table IP_ROUTING_ECMP, which sets outport, eth.src and '[xx]reg0' for
+ * the selected ECMP member.
+ * */
HMAP_FOR_EACH (op, key_node, ports) {
if (!op->nbrp) {
continue;
@@ -8765,13 +9065,47 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
if (!od->nbr) {
continue;
}
+ ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING_ECMP, 150,
+ REG_ECMP_GROUP_ID" == 0", "next;");
+ struct hmap ecmp_groups = HMAP_INITIALIZER(&ecmp_groups);
+ struct hmap unique_routes = HMAP_INITIALIZER(&unique_routes);
+ struct ovs_list parsed_routes = OVS_LIST_INITIALIZER(&parsed_routes);
+ struct ecmp_groups_node *group;
for (int i = 0; i < od->nbr->n_static_routes; i++) {
- const struct nbrec_logical_router_static_route *route;
-
- route = od->nbr->static_routes[i];
- build_static_route_flow(lflows, od, ports, route);
+ struct parsed_route *route =
+ parsed_routes_add(&parsed_routes, od->nbr->static_routes[i]);
+ if (!route) {
+ continue;
+ }
+ group = ecmp_groups_find(&ecmp_groups, route);
+ if (group) {
+ ecmp_groups_add_route(group, route);
+ } else {
+ const struct parsed_route *existed_route =
+ unique_routes_remove(&unique_routes, route);
+ if (existed_route) {
+ group = ecmp_groups_add(&ecmp_groups, existed_route);
+ if (group) {
+ ecmp_groups_add_route(group, route);
+ }
+ } else {
+ unique_routes_add(&unique_routes, route);
+ }
+ }
+ }
+ HMAP_FOR_EACH (group, hmap_node, &ecmp_groups) {
+ /* add a flow in IP_ROUTING, and one flow for each member in
+ * IP_ROUTING_ECMP. */
+ build_ecmp_route_flow(lflows, od, ports, group);
+ }
+ const struct unique_routes_node *ur;
+ HMAP_FOR_EACH (ur, hmap_node, &unique_routes) {
+ build_static_route_flow(lflows, od, ports, ur->route);
}
+ ecmp_groups_destroy(&ecmp_groups);
+ unique_routes_destroy(&unique_routes);
+ parsed_routes_destroy(&parsed_routes);
}
/* IP Multicast lookup. Here we set the output port, adjust TTL and
@@ -2199,6 +2199,12 @@
a <code>src-ip</code> route.
</p>
+ <p>
+ When there are ECMP routes, i.e. multiple routes with same prefix and
+ policy, one of them will be selected based on the 5-tuple hashing of the
+ packet header.
+ </p>
+
<column name="ip_prefix">
<p>
IP prefix of this route (e.g. 192.168.100.0/24).
@@ -1379,7 +1379,13 @@ IPv4 Routes
dnl Delete non-existent prefix
AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.2.1/24], [1], [],
- [ovn-nbctl: no matching prefix: 10.0.2.0/24 (policy: any)
+ [ovn-nbctl: no matching route: policy 'any', prefix '10.0.2.0/24', nexthop 'any', output_port 'any'.
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.1.0/24 11.0.1.2], [1], [],
+ [ovn-nbctl: no matching route: policy 'any', prefix '10.0.1.0/24', nexthop '11.0.1.2', output_port 'any'.
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.1.0/24 11.0.1.1 lp1], [1], [],
+ [ovn-nbctl: no matching route: policy 'any', prefix '10.0.1.0/24', nexthop '11.0.1.1', output_port 'lp1'.
])
AT_CHECK([ovn-nbctl --if-exists lr-route-del lr0 10.0.2.1/24])
@@ -1415,6 +1421,45 @@ AT_CHECK([ovn-nbctl lr-route-del lr0])
AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
])
+dnl Add ecmp routes
+AT_CHECK([ovn-nbctl lr-route-add lr0 10.0.0.0/24 11.0.0.1])
+AT_CHECK([ovn-nbctl --ecmp lr-route-add lr0 10.0.0.0/24 11.0.0.2])
+AT_CHECK([ovn-nbctl --ecmp lr-route-add lr0 10.0.0.0/24 11.0.0.2])
+AT_CHECK([ovn-nbctl --ecmp lr-route-add lr0 10.0.0.0/24 11.0.0.3])
+AT_CHECK([ovn-nbctl --ecmp lr-route-add lr0 10.0.0.0/24 11.0.0.3 lp0])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+IPv4 Routes
+ 10.0.0.0/24 11.0.0.1 dst-ip
+ 10.0.0.0/24 11.0.0.2 dst-ip
+ 10.0.0.0/24 11.0.0.2 dst-ip
+ 10.0.0.0/24 11.0.0.3 dst-ip
+ 10.0.0.0/24 11.0.0.3 dst-ip lp0
+])
+
+dnl Delete ecmp routes
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.0.0/24 11.0.0.1])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+IPv4 Routes
+ 10.0.0.0/24 11.0.0.2 dst-ip
+ 10.0.0.0/24 11.0.0.2 dst-ip
+ 10.0.0.0/24 11.0.0.3 dst-ip
+ 10.0.0.0/24 11.0.0.3 dst-ip lp0
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.0.0/24 11.0.0.2])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+IPv4 Routes
+ 10.0.0.0/24 11.0.0.3 dst-ip
+ 10.0.0.0/24 11.0.0.3 dst-ip lp0
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.0.0/24 11.0.0.3 lp0])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+IPv4 Routes
+ 10.0.0.0/24 11.0.0.3 dst-ip
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.0.0/24 11.0.0.3])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+])
+
dnl Check IPv6 routes
AT_CHECK([ovn-nbctl lr-route-add lr0 0:0:0:0:0:0:0:0/0 2001:0db8:0:f101::1])
AT_CHECK([ovn-nbctl lr-route-add lr0 2001:0db8:0::/64 2001:0db8:0:f102::1 lp0])
@@ -14876,7 +14876,7 @@ ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
# Since the sw0-vir is not claimed by any chassis, eth.dst should be set to
# zero if the ip4.dst is the virtual ip in the router pipeline.
AT_CHECK([cat lflows.txt], [0], [dnl
- table=11(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 00:00:00:00:00:00; next;)
+ table=12(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 00:00:00:00:00:00; next;)
])
ip_to_hex() {
@@ -14912,7 +14912,7 @@ ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
# There should be an arp resolve flow to resolve the virtual_ip with the
# sw0-p1's MAC.
AT_CHECK([cat lflows.txt], [0], [dnl
- table=11(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:03; next;)
+ table=12(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:03; next;)
])
# From sw0-p3 send GARP for 10.0.0.10. hv1 should claim sw0-vir
@@ -14935,7 +14935,7 @@ ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
# There should be an arp resolve flow to resolve the virtual_ip with the
# sw0-p2's MAC.
AT_CHECK([cat lflows.txt], [0], [dnl
- table=11(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:05; next;)
+ table=12(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:05; next;)
])
# send the garp from sw0-p2 (in hv2). hv2 should claim sw0-vir
@@ -14958,7 +14958,7 @@ ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
# There should be an arp resolve flow to resolve the virtual_ip with the
# sw0-p3's MAC.
AT_CHECK([cat lflows.txt], [0], [dnl
- table=11(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:04; next;)
+ table=12(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:04; next;)
])
# Now send arp reply from sw0-p1. hv1 should claim sw0-vir
@@ -14979,7 +14979,7 @@ ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
> lflows.txt
AT_CHECK([cat lflows.txt], [0], [dnl
- table=11(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:03; next;)
+ table=12(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:03; next;)
])
# Delete hv1-vif1 port. hv1 should release sw0-vir
@@ -14997,7 +14997,7 @@ ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
> lflows.txt
AT_CHECK([cat lflows.txt], [0], [dnl
- table=11(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 00:00:00:00:00:00; next;)
+ table=12(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 00:00:00:00:00:00; next;)
])
# Now send arp reply from sw0-p2. hv2 should claim sw0-vir
@@ -15018,7 +15018,7 @@ ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
> lflows.txt
AT_CHECK([cat lflows.txt], [0], [dnl
- table=11(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:04; next;)
+ table=12(lr_in_arp_resolve ), priority=100 , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:04; next;)
])
# Delete sw0-p2 logical port
@@ -17364,3 +17364,103 @@ OVS_WAIT_UNTIL([
OVN_CLEANUP([hv1])
AT_CLEANUP
+
+AT_SETUP([ovn -- ECMP static routes])
+ovn_start
+
+# Logical network:
+# ls1 (192.168.1.0/24) - lr1 - ls2 (192.168.2.0/24)
+# lsl has lsp11 (192.168.1.11) and ls2 has lsp21 (192.168.2.21) and lsp22
+# (192.168.2.22)
+#
+# Static routes on lr1:
+# 10.0.0.0/24 nexthop 192.168.2.21
+# 10.0.0.0/24 nexthop 192.168.2.22
+#
+# Test:
+# lsp11 send packets to 10.0.0.*
+#
+# Expected result:
+# Both lsp21 and lsp22 should received some of the packets.
+
+ovn-nbctl lr-add lr1
+
+ovn-nbctl ls-add ls1
+ovn-nbctl ls-add ls2
+
+for i in 1 2; do
+ ovn-nbctl lrp-add lr1 lrp-lr1-ls${i} 00:00:00:01:0${i}:01 192.168.${i}.1/24
+ ovn-nbctl lsp-add ls${i} lsp-ls${i}-lr1 -- lsp-set-type lsp-ls${i}-lr1 router \
+ -- lsp-set-options lsp-ls${i}-lr1 router-port=lrp-lr1-ls${i} \
+ -- lsp-set-addresses lsp-ls${i}-lr1 router
+done
+
+#install static routes
+ovn-nbctl lr-route-add lr1 10.0.0.0/24 192.168.2.21
+ovn-nbctl --ecmp lr-route-add lr1 10.0.0.0/24 192.168.2.22
+
+# Create logical ports
+ovn-nbctl lsp-add ls1 lsp11 -- \
+ lsp-set-addresses lsp11 "f0:00:00:00:01:11 192.168.1.11"
+ovn-nbctl lsp-add ls2 lsp21 -- \
+ lsp-set-addresses lsp21 "f0:00:00:00:02:21 192.168.2.21"
+ovn-nbctl lsp-add ls2 lsp22 -- \
+ lsp-set-addresses lsp22 "f0:00:00:00:02:22 192.168.2.22"
+
+net_add n1
+sim_add hv1
+as hv1
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+ovs-vsctl -- add-port br-int hv1-vif1 -- \
+ set interface hv1-vif1 external-ids:iface-id=lsp11 \
+ options:tx_pcap=hv1/vif1-tx.pcap \
+ options:rxq_pcap=hv1/vif1-rx.pcap \
+ ofport-request=1
+
+ovs-vsctl -- add-port br-int hv1-vif2 -- \
+ set interface hv1-vif2 external-ids:iface-id=lsp21 \
+ options:tx_pcap=hv1/vif2-tx.pcap \
+ options:rxq_pcap=hv1/vif2-rx.pcap \
+ ofport-request=2
+
+ovs-vsctl -- add-port br-int hv1-vif3 -- \
+ set interface hv1-vif3 external-ids:iface-id=lsp22 \
+ options:tx_pcap=hv1/vif3-tx.pcap \
+ options:rxq_pcap=hv1/vif3-rx.pcap \
+ ofport-request=3
+
+# wait for earlier changes to take effect
+AT_CHECK([ovn-nbctl --timeout=3 --wait=hv sync], [0], [ignore])
+
+for i in $(seq 5001 5010); do
+ packet="inport==\"lsp11\" && eth.src==f0:00:00:00:01:11 && eth.dst==00:00:00:01:01:01 &&
+ ip4 && ip.ttl==64 && ip4.src==192.168.1.11 && ip4.dst==10.0.0.123 &&
+ tcp && tcp.src==$i && tcp.dst==80"
+ AT_CHECK([as hv1 ovs-appctl -t ovn-controller inject-pkt "$packet"])
+
+ for j in 1 2; do
+ # Assume all packets go to lsp2${j}.
+ exp_packet="eth.src==00:00:00:01:02:01 && eth.dst==f0:00:00:00:02:2${j} &&
+ ip4 && ip.ttl==63 && ip4.src==192.168.1.11 && ip4.dst==10.0.0.123 &&
+ tcp && tcp.src==$i && tcp.dst==80"
+ echo $exp_packet | ovstest test-ovn expr-to-packets >> expected_lsp2${j}
+ done
+done
+
+# Each port should receive some packets and the total number should be 10
+OVS_WAIT_UNTIL([
+ rcv_n1=`$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/vif2-tx.pcap > lsp21.packets && cat lsp21.packets | wc -l`
+ rcv_n2=`$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/vif3-tx.pcap > lsp22.packets && cat lsp22.packets | wc -l`
+ echo $rcv_n1 $rcv_n2
+ test $rcv_n1 -ge 1 -a $rcv_n2 -ge 1 -a $(($rcv_n1 + $rcv_n2)) -ge 10])
+
+# Each port should receive a subset of expected packets
+for i in 1 2; do
+ sort expected_lsp2$i > expout
+ AT_CHECK([cat lsp2${i}.packets expected_lsp2$i | sort | uniq], [0], [expout])
+done
+
+OVN_CLEANUP([hv1])
+
+AT_CLEANUP
@@ -612,7 +612,9 @@
<h1>Logical Router Static Route Commands</h1>
<dl>
- <dt>[<code>--may-exist</code>] [<code>--policy</code>=<var>POLICY</var>] <code>lr-route-add</code> <var>router</var> <var>prefix</var> <var>nexthop</var> [<var>port</var>]</dt>
+ <dt>[<code>--may-exist</code>] [<code>--policy</code>=<var>POLICY</var>]
+ [<code>--ecmp</code>] <code>lr-route-add</code> <var>router</var>
+ <var>prefix</var> <var>nexthop</var> [<var>port</var>]</dt>
<dd>
<p>
Adds the specified route to <var>router</var>.
@@ -635,19 +637,27 @@
<p>
It is an error if a route with <var>prefix</var> and
- <var>POLICY</var> already exists, unless <code>--may-exist</code> is
- specified.
+ <var>POLICY</var> already exists, unless <code>--may-exist</code> or
+ <code>--ecmp</code> is specified. If <code>--may-exist</code> is
+ specified but not <code>--ecmp</code>, the existed route will be
+ updated with the new nexthop and port. If <code>--ecmp</code> is
+ specified, a new route will be added, regardless of the existed
+ route, which is useful when adding ECMP routes, i.e. routes with same
+ <var>POLICY</var> and <var>prefix</var> but different
+ <var>nexthop</var> and <var>port</var>.
</p>
</dd>
- <dt>[<code>--if-exists</code>] [<code>--policy</code>=<var>POLICY</var>] <code>lr-route-del</code> <var>router</var> [<var>prefix</var>]</dt>
+ <dt>[<code>--if-exists</code>] [<code>--policy</code>=<var>POLICY</var>]
+ <code>lr-route-del</code> <var>router</var> [<var>prefix</var>
+ [<var>nexthop</var> [<var>port</var>]]]</dt>
<dd>
<p>
Deletes routes from <var>router</var>. If only <var>router</var>
- is supplied, all the routes from the logical router are
- deleted. If <var>POLICY</var> and/or <var>prefix</var> are also
- specified, then all the routes that match the conditions will be
- deleted from the logical router.
+ is supplied, all the routes from the logical router are deleted. If
+ <var>POLICY</var>, <var>prefix</var>, <var>nexthop</var> and/or
+ <var>port</var> are also specified, then all the routes that match
+ the conditions will be deleted from the logical router.
</p>
<p>
@@ -680,9 +680,9 @@ Logical router port commands:\n\
('overlay' or 'vlan')\n\
\n\
Route commands:\n\
- [--policy=POLICY] lr-route-add ROUTER PREFIX NEXTHOP [PORT]\n\
+ [--policy=POLICY] [--ecmp] lr-route-add ROUTER PREFIX NEXTHOP [PORT]\n\
add a route to ROUTER\n\
- [--policy=POLICY] lr-route-del ROUTER [PREFIX]\n\
+ [--policy=POLICY] lr-route-del ROUTER [PREFIX [NEXTHOP [PORT]]]\n\
remove routes from ROUTER\n\
lr-route-list ROUTER print routes for ROUTER\n\
\n\
@@ -3739,59 +3739,62 @@ nbctl_lr_route_add(struct ctl_context *ctx)
}
bool may_exist = shash_find(&ctx->options, "--may-exist") != NULL;
- for (int i = 0; i < lr->n_static_routes; i++) {
- const struct nbrec_logical_router_static_route *route
- = lr->static_routes[i];
- char *rt_prefix;
+ bool ecmp = shash_find(&ctx->options, "--ecmp") != NULL;
+ if (!ecmp) {
+ for (int i = 0; i < lr->n_static_routes; i++) {
+ const struct nbrec_logical_router_static_route *route
+ = lr->static_routes[i];
+ char *rt_prefix;
+
+ /* Compare route policy. */
+ char *nb_policy = lr->static_routes[i]->policy;
+ bool nb_is_src_route = false;
+ if (nb_policy && !strcmp(nb_policy, "src-ip")) {
+ nb_is_src_route = true;
+ }
+ if (is_src_route != nb_is_src_route) {
+ continue;
+ }
- /* Compare route policy. */
- char *nb_policy = lr->static_routes[i]->policy;
- bool nb_is_src_route = false;
- if (nb_policy && !strcmp(nb_policy, "src-ip")) {
- nb_is_src_route = true;
- }
- if (is_src_route != nb_is_src_route) {
- continue;
- }
+ /* Compare route prefix. */
+ rt_prefix = normalize_prefix_str(lr->static_routes[i]->ip_prefix);
+ if (!rt_prefix) {
+ /* Ignore existing prefix we couldn't parse. */
+ continue;
+ }
- /* Compare route prefix. */
- rt_prefix = normalize_prefix_str(lr->static_routes[i]->ip_prefix);
- if (!rt_prefix) {
- /* Ignore existing prefix we couldn't parse. */
- continue;
- }
+ if (strcmp(rt_prefix, prefix)) {
+ free(rt_prefix);
+ continue;
+ }
- if (strcmp(rt_prefix, prefix)) {
- free(rt_prefix);
- continue;
- }
+ if (!may_exist) {
+ ctl_error(ctx, "duplicate prefix: %s (policy: %s)",
+ prefix, is_src_route ? "src-ip" : "dst-ip");
+ free(next_hop);
+ free(rt_prefix);
+ free(prefix);
+ return;
+ }
- if (!may_exist) {
- ctl_error(ctx, "duplicate prefix: %s (policy: %s)",
- prefix, is_src_route ? "src-ip" : "dst-ip");
- free(next_hop);
+ /* Update the next hop for an existing route. */
+ nbrec_logical_router_verify_static_routes(lr);
+ nbrec_logical_router_static_route_verify_ip_prefix(route);
+ nbrec_logical_router_static_route_verify_nexthop(route);
+ nbrec_logical_router_static_route_set_ip_prefix(route, prefix);
+ nbrec_logical_router_static_route_set_nexthop(route, next_hop);
+ if (ctx->argc == 5) {
+ nbrec_logical_router_static_route_set_output_port(
+ route, ctx->argv[4]);
+ }
+ if (policy) {
+ nbrec_logical_router_static_route_set_policy(route, policy);
+ }
free(rt_prefix);
+ free(next_hop);
free(prefix);
return;
}
-
- /* Update the next hop for an existing route. */
- nbrec_logical_router_verify_static_routes(lr);
- nbrec_logical_router_static_route_verify_ip_prefix(route);
- nbrec_logical_router_static_route_verify_nexthop(route);
- nbrec_logical_router_static_route_set_ip_prefix(route, prefix);
- nbrec_logical_router_static_route_set_nexthop(route, next_hop);
- if (ctx->argc == 5) {
- nbrec_logical_router_static_route_set_output_port(route,
- ctx->argv[4]);
- }
- if (policy) {
- nbrec_logical_router_static_route_set_policy(route, policy);
- }
- free(rt_prefix);
- free(next_hop);
- free(prefix);
- return;
}
struct nbrec_logical_router_static_route *route;
@@ -3854,6 +3857,20 @@ nbctl_lr_route_del(struct ctl_context *ctx)
}
}
+ char *nexthop = NULL;
+ if (ctx->argc >= 4) {
+ nexthop = normalize_prefix_str(ctx->argv[3]);
+ if (!nexthop) {
+ ctl_error(ctx, "bad nexthop argument: %s", ctx->argv[3]);
+ return;
+ }
+ }
+
+ char *output_port = NULL;
+ if (ctx->argc == 5) {
+ output_port = ctx->argv[4];
+ }
+
struct nbrec_logical_router_static_route **new_routes
= xmemdup(lr->static_routes,
sizeof *new_routes * lr->n_static_routes);
@@ -3882,27 +3899,59 @@ nbctl_lr_route_del(struct ctl_context *ctx)
continue;
}
- if (strcmp(prefix, rt_prefix)) {
+ int ret = strcmp(prefix, rt_prefix);
+ free(rt_prefix);
+ if (ret) {
+ new_routes[n_new++] = lr->static_routes[i];
+ continue;
+ }
+ }
+
+ /* Compare nexthop, if specified. */
+ if (nexthop) {
+ char *rt_nexthop =
+ normalize_prefix_str(lr->static_routes[i]->nexthop);
+ if (!rt_nexthop) {
+ /* Ignore existing nexthop we couldn't parse. */
+ new_routes[n_new++] = lr->static_routes[i];
+ continue;
+ }
+ int ret = strcmp(nexthop, rt_nexthop);
+ free(rt_nexthop);
+ if (ret) {
+ new_routes[n_new++] = lr->static_routes[i];
+ continue;
+ }
+ }
+
+ /* Compare output_port, if specified. */
+ if (output_port) {
+ char *rt_output_port = lr->static_routes[i]->output_port;
+ if (!rt_output_port || strcmp(output_port, rt_output_port)) {
new_routes[n_new++] = lr->static_routes[i];
}
- free(rt_prefix);
}
}
if (n_new < lr->n_static_routes) {
nbrec_logical_router_verify_static_routes(lr);
nbrec_logical_router_set_static_routes(lr, new_routes, n_new);
- free(new_routes);
- free(prefix);
- return;
+ goto out;
}
if (!shash_find(&ctx->options, "--if-exists")) {
- ctl_error(ctx, "no matching prefix: %s (policy: %s)",
- prefix, policy ? policy : "any");
+ ctl_error(ctx, "no matching route: policy '%s', prefix '%s', nexthop "
+ "'%s', output_port '%s'.",
+ policy ? policy : "any",
+ prefix,
+ nexthop ? nexthop : "any",
+ output_port ? output_port : "any");
}
+
+out:
free(new_routes);
free(prefix);
+ free(nexthop);
}
static void
@@ -4773,6 +4822,26 @@ nbctl_lrp_get_redirect_type(struct ctl_context *ctx)
}
+static int
+route_cmp_details(const struct nbrec_logical_router_static_route *r1,
+ const struct nbrec_logical_router_static_route *r2)
+{
+ int ret = strcmp(r1->nexthop, r2->nexthop);
+ if (ret) {
+ return ret;
+ }
+ if (r1->output_port && r2->output_port) {
+ ret = strcmp(r1->output_port, r2->output_port);
+ if (ret) {
+ return ret;
+ }
+ return 0;
+ }
+ if (!r1->output_port && !r2->output_port) {
+ return 0;
+ }
+ return r1->output_port ? 1 : -1;
+}
struct ipv4_route {
int priority;
ovs_be32 addr;
@@ -4787,11 +4856,11 @@ ipv4_route_cmp(const void *route1_, const void *route2_)
if (route1p->priority != route2p->priority) {
return route1p->priority > route2p->priority ? -1 : 1;
- } else if (route1p->addr != route2p->addr) {
+ }
+ if (route1p->addr != route2p->addr) {
return ntohl(route1p->addr) < ntohl(route2p->addr) ? -1 : 1;
- } else {
- return 0;
}
+ return route_cmp_details(route1p->route, route2p->route);
}
struct ipv6_route {
@@ -4809,7 +4878,11 @@ ipv6_route_cmp(const void *route1_, const void *route2_)
if (route1p->priority != route2p->priority) {
return route1p->priority > route2p->priority ? -1 : 1;
}
- return memcmp(&route1p->addr, &route2p->addr, sizeof(route1p->addr));
+ int ret = memcmp(&route1p->addr, &route2p->addr, sizeof(route1p->addr));
+ if (ret) {
+ return ret;
+ }
+ return route_cmp_details(route1p->route, route2p->route);
}
static void
@@ -5786,9 +5859,9 @@ static const struct ctl_command_syntax nbctl_commands[] = {
/* logical router route commands. */
{ "lr-route-add", 3, 4, "ROUTER PREFIX NEXTHOP [PORT]", NULL,
- nbctl_lr_route_add, NULL, "--may-exist,--policy=", RW },
- { "lr-route-del", 1, 2, "ROUTER [PREFIX]", NULL, nbctl_lr_route_del,
- NULL, "--if-exists,--policy=", RW },
+ nbctl_lr_route_add, NULL, "--may-exist,--ecmp,--policy=", RW },
+ { "lr-route-del", 1, 4, "ROUTER [PREFIX [NEXTHOP [PORT]]]", NULL,
+ nbctl_lr_route_del, NULL, "--if-exists,--policy=", RW },
{ "lr-route-list", 1, 1, "ROUTER", NULL, nbctl_lr_route_list, NULL,
"", RO },
This patch supports ECMP routes for OVN routers. When there are multiple routes with same prefix and policy, OVN will select one of the routes based on 5-tuple hash of the packet header with the help of the "select" action added earlier. This patch also updates ovn-nbctl so that it allows adding multiple routes with same prefix and policy when option --ecmp is specified, and also supports deleting a specific route among ECMP routes by specifying nexthop and output_port fields. Signed-off-by: Han Zhou <hzhou@ovn.org> --- northd/ovn-northd.8.xml | 91 +++++++- northd/ovn-northd.c | 580 ++++++++++++++++++++++++++++++++++++---------- ovn-nb.xml | 6 + tests/ovn-nbctl.at | 47 +++- tests/ovn.at | 114 ++++++++- utilities/ovn-nbctl.8.xml | 26 ++- utilities/ovn-nbctl.c | 195 +++++++++++----- 7 files changed, 854 insertions(+), 205 deletions(-)