[ovs-dev,ovn,v2,4/4] ovn-northd: Support ECMP routes.
diff mbox series

Message ID 1579560985-36919-4-git-send-email-hzhou@ovn.org
State New
Headers show
Series
  • Untitled series #154327
Related show

Commit Message

Han Zhou Jan. 20, 2020, 10:56 p.m. UTC
This patch supports ECMP routes for OVN routers. When there are multiple
routes with same prefix and policy, OVN will select one of the routes
based on 5-tuple hash of the packet header with the help of the "select"
action added earlier.

This patch also updates ovn-nbctl so that it allows adding multiple
routes with same prefix and policy when option --ecmp is specified,
and also supports deleting a specific route among ECMP routes
by specifying nexthop and output_port fields.

Signed-off-by: Han Zhou <hzhou@ovn.org>
---
v1 -> v2: updated according to Numan's comment. Use the new format of select
    in logical flows.
 northd/ovn-northd.8.xml   |  91 ++++++-
 northd/ovn-northd.c       | 590 ++++++++++++++++++++++++++++++++++++----------
 ovn-nb.xml                |   6 +
 tests/ovn-nbctl.at        |  47 +++-
 tests/ovn.at              | 114 ++++++++-
 utilities/ovn-nbctl.8.xml |  26 +-
 utilities/ovn-nbctl.c     | 195 ++++++++++-----
 7 files changed, 862 insertions(+), 207 deletions(-)

Patch
diff mbox series

diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index b0ee73e..3628d35 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -2262,6 +2262,15 @@  output;
     </p>
 
     <p>
+      For ECMP routes, i.e. multiple static routes with same policy and
+      prefix but different nexthops, the above actions are deferred to next
+      table.  This table, instead, is responsible for determine the ECMP
+      group id and select a member id within the group based on 5-tuple
+      hashing.  It stores group id in <code>reg8[0..15]</code> and member id in
+      <code>reg8[16..31]</code>.
+    </p>
+
+    <p>
       This table contains the following logical flows:
     </p>
 
@@ -2347,6 +2356,7 @@  outport = <code>redirect-chassis-port</code>;
 
         <pre>
 ip.ttl--;
+reg8[0..15] = 0;
 reg0 = <var>G</var>;
 reg1 = <var>A</var>;
 eth.src = <var>E</var>;
@@ -2380,6 +2390,7 @@  next;
 
         <pre>
 ip.ttl--;
+reg8[0..15] = 0;
 xxreg0 = <var>G</var>;
 xxreg1 = <var>A</var>;
 eth.src = <var>E</var>;
@@ -2404,9 +2415,79 @@  next;
           route will be limited to sending on the ingress port.
         </p>
       </li>
+
+      <li>
+        <p>
+          For ECMP routes, they are grouped by policy and prefix.  An unique id
+          (non-zero) is assigned to each group, and each member is also
+          assigned an unique id (non-zero) within each group.
+        </p>
+
+        <p>
+          For each IPv4/IPv6 ECMP group with group id <var>GID</var> and member
+          ids <var>MID1</var>, <var>MID2</var>, ..., a logical flow with match
+          in CIDR notation <code>ip4.dst == <var>N</var>/<var>M</var></code>,
+          or <code>ip6.dst == <var>N</var>/<var>M</var></code>, whose priority
+          is the integer value of <var>M</var>, has the following actions:
+        </p>
+
+        <pre>
+ip.ttl--;
+flags.loopback = 1;
+reg8[0..15] = <var>GID</var>;
+select(reg8[16..31], <var>MID1</var>, <var>MID2</var>, ...);
+        </pre>
+      </li>
+    </ul>
+
+    <h3>Ingress Table 10: IP_ROUTING_ECMP</h3>
+
+    <p>
+      This table implements the second part of IP routing for ECMP routes
+      following the previous table.  If a packet matched a ECMP group in the
+      previous table, this table matches the group id and member id stored
+      from the previous table, setting <code>reg0</code>
+      (or <code>xxreg0</code> for IPv6) to the next-hop IP address
+      (leaving <code>ip4.dst</code> or <code>ip6.dst</code>, the
+      packet's final destination, unchanged) and advances to the next
+      table for ARP resolution.  It also sets <code>reg1</code> (or
+      <code>xxreg1</code>) to the IP address owned by the selected router
+      port (ingress table <code>ARP Request</code> will generate an ARP
+      request, if needed, with <code>reg0</code> as the target protocol
+      address and <code>reg1</code> as the source protocol address).
+    </p>
+
+    <p>
+      This table contains the following logical flows:
+    </p>
+
+    <ul>
+      <li>
+        <p>
+          A priority-150 flow that matches <code>reg8[0..15] == 0</code>
+          with action <code>next;</code> directly bypasses packets of non-ECMP
+          routes.
+        </p>
+      </li>
+
+      <li>
+        <p>
+          For each member with ID <var>MID</var> in each ECMP group with ID
+          <var>GID</var>, a priority-100 flow with match
+          <code>reg8[0..15] == <var>GID</var> &amp;&amp; reg8[16..31] == <var>MID</var></code>
+          has following actions:
+        </p>
+
+        <pre>
+[xx]reg0 = <var>G</var>;
+[xx]reg1 = <var>A</var>;
+eth.src = <var>E</var>;
+outport = <var>P</var>;
+        </pre>
+      </li>
     </ul>
 
-    <h3>Ingress Table 10: ARP/ND Resolution</h3>
+    <h3>Ingress Table 12: ARP/ND Resolution</h3>
 
     <p>
       Any packet that reaches this table is an IP packet whose next-hop
@@ -2553,7 +2634,7 @@  next;
 
     </ul>
 
-    <h3>Ingress Table 11: Check packet length</h3>
+    <h3>Ingress Table 13: Check packet length</h3>
 
     <p>
       For distributed logical routers with distributed gateway port configured
@@ -2583,7 +2664,7 @@  REGBIT_PKT_LARGER = check_pkt_larger(<var>L</var>); next;
       and advances to the next table.
     </p>
 
-    <h3>Ingress Table 12: Handle larger packets</h3>
+    <h3>Ingress Table 14: Handle larger packets</h3>
 
     <p>
       For distributed logical routers with distributed gateway port configured
@@ -2632,7 +2713,7 @@  icmp4 {
       and advances to the next table.
     </p>
 
-    <h3>Ingress Table 13: Gateway Redirect</h3>
+    <h3>Ingress Table 15: Gateway Redirect</h3>
 
     <p>
       For distributed logical routers where one of the logical router
@@ -2694,7 +2775,7 @@  icmp4 {
       </li>
     </ul>
 
-    <h3>Ingress Table 14: ARP Request</h3>
+    <h3>Ingress Table 16: ARP Request</h3>
 
     <p>
       In the common case where the Ethernet destination has been resolved, this
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index a967f35..b1e782e 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -176,12 +176,13 @@  enum ovn_stage {
     PIPELINE_STAGE(ROUTER, IN,  ND_RA_OPTIONS,   7, "lr_in_nd_ra_options") \
     PIPELINE_STAGE(ROUTER, IN,  ND_RA_RESPONSE,  8, "lr_in_nd_ra_response") \
     PIPELINE_STAGE(ROUTER, IN,  IP_ROUTING,      9, "lr_in_ip_routing")   \
-    PIPELINE_STAGE(ROUTER, IN,  POLICY,          10, "lr_in_policy")       \
-    PIPELINE_STAGE(ROUTER, IN,  ARP_RESOLVE,     11, "lr_in_arp_resolve")  \
-    PIPELINE_STAGE(ROUTER, IN,  CHK_PKT_LEN   ,  12, "lr_in_chk_pkt_len")   \
-    PIPELINE_STAGE(ROUTER, IN,  LARGER_PKTS,     13,"lr_in_larger_pkts")   \
-    PIPELINE_STAGE(ROUTER, IN,  GW_REDIRECT,     14, "lr_in_gw_redirect")  \
-    PIPELINE_STAGE(ROUTER, IN,  ARP_REQUEST,     15, "lr_in_arp_request")  \
+    PIPELINE_STAGE(ROUTER, IN,  IP_ROUTING_ECMP, 10, "lr_in_ip_routing_ecmp") \
+    PIPELINE_STAGE(ROUTER, IN,  POLICY,          11, "lr_in_policy")       \
+    PIPELINE_STAGE(ROUTER, IN,  ARP_RESOLVE,     12, "lr_in_arp_resolve")  \
+    PIPELINE_STAGE(ROUTER, IN,  CHK_PKT_LEN   ,  13, "lr_in_chk_pkt_len")   \
+    PIPELINE_STAGE(ROUTER, IN,  LARGER_PKTS,     14,"lr_in_larger_pkts")   \
+    PIPELINE_STAGE(ROUTER, IN,  GW_REDIRECT,     15, "lr_in_gw_redirect")  \
+    PIPELINE_STAGE(ROUTER, IN,  ARP_REQUEST,     16, "lr_in_arp_request")  \
                                                                       \
     /* Logical router egress stages. */                               \
     PIPELINE_STAGE(ROUTER, OUT, UNDNAT,    0, "lr_out_undnat")        \
@@ -222,6 +223,9 @@  enum ovn_stage {
 #define REGBIT_PKT_LARGER        "reg9[3]"
 #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[4]"
 #define REGBIT_SKIP_LOOKUP_NEIGHBOR "reg9[5]"
+/* Register for ECMP bucket selection. */
+#define REG_ECMP_GROUP_ID       "reg8[0..15]"
+#define REG_ECMP_MEMBER_ID      "reg8[16..31]"
 
 #define FLAGBIT_NOT_VXLAN "flags[1] == 0"
 
@@ -6717,132 +6721,290 @@  add_distributed_nat_routes(struct hmap *lflows, const struct ovn_port *op)
     ds_destroy(&actions);
 }
 
-static void
-add_route(struct hmap *lflows, const struct ovn_port *op,
-          const char *lrp_addr_s, const char *network_s, int plen,
-          const char *gateway, const char *policy)
+static bool
+ip46_parse_cidr(const char *str, struct v46_ip *prefix, unsigned int *plen)
 {
-    bool is_ipv4 = strchr(network_s, '.') ? true : false;
-    struct ds match = DS_EMPTY_INITIALIZER;
-    const char *dir;
-    uint16_t priority;
-
-    if (policy && !strcmp(policy, "src-ip")) {
-        dir = "src";
-        priority = plen * 2;
-    } else {
-        dir = "dst";
-        priority = (plen * 2) + 1;
+    char *error = ip_parse_cidr(str, &prefix->ipv4, plen);
+    if (!error) {
+        prefix->family = AF_INET;
+        return true;
     }
+    free(error);
+    error = ipv6_parse_cidr(str, &prefix->ipv6, plen);
+    if (!error) {
+        prefix->family = AF_INET6;
+        return true;
+    }
+    free(error);
+    return false;
+}
 
-    /* IPv6 link-local addresses must be scoped to the local router port. */
-    if (!is_ipv4) {
-        struct in6_addr network;
-        ovs_assert(ipv6_parse(network_s, &network));
-        if (in6_is_lla(&network)) {
-            ds_put_format(&match, "inport == %s && ", op->json_key);
-        }
+static bool
+ip46_equals(const struct v46_ip *addr1, const struct v46_ip *addr2)
+{
+    return (addr1->family == addr2->family &&
+            (addr1->family == AF_INET ? addr1->ipv4 == addr2->ipv4 :
+             IN6_ARE_ADDR_EQUAL(&addr1->ipv6, &addr2->ipv6)));
+}
+
+struct parsed_route {
+    struct ovs_list list_node;
+    struct v46_ip prefix;
+    unsigned int plen;
+    bool is_src_route;
+    uint32_t hash;
+    const struct nbrec_logical_router_static_route *route;
+};
+
+static uint32_t
+route_hash(struct parsed_route *route)
+{
+    return hash_bytes(&route->prefix, sizeof route->prefix,
+                      (uint32_t)route->plen);
+}
+
+/* Parse and validate the route. Return the parsed route if successful.
+ * Otherwise return NULL. */
+static struct parsed_route *
+parsed_routes_add(struct ovs_list *routes,
+                  const struct nbrec_logical_router_static_route *route)
+{
+    /* Verify that the next hop is an IP address with an all-ones mask. */
+    struct v46_ip nexthop;
+    unsigned int plen;
+    if (!ip46_parse_cidr(route->nexthop, &nexthop, &plen)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+        VLOG_WARN_RL(&rl, "bad 'nexthop' %s in static route"
+                     UUID_FMT, route->nexthop,
+                     UUID_ARGS(&route->header_.uuid));
+        return NULL;
+    }
+    if ((nexthop.family == AF_INET && plen != 32) ||
+        (nexthop.family == AF_INET6 && plen != 128)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+        VLOG_WARN_RL(&rl, "bad next hop mask %s in static route"
+                     UUID_FMT, route->nexthop,
+                     UUID_ARGS(&route->header_.uuid));
+        return NULL;
     }
-    ds_put_format(&match, "ip%s.%s == %s/%d", is_ipv4 ? "4" : "6", dir,
-                  network_s, plen);
 
-    struct ds actions = DS_EMPTY_INITIALIZER;
-    ds_put_format(&actions, "ip.ttl--; %sreg0 = ", is_ipv4 ? "" : "xx");
+    /* Parse ip_prefix */
+    struct v46_ip prefix;
+    if (!ip46_parse_cidr(route->ip_prefix, &prefix, &plen)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+        VLOG_WARN_RL(&rl, "bad 'ip_prefix' %s in static route"
+                     UUID_FMT, route->ip_prefix,
+                     UUID_ARGS(&route->header_.uuid));
+        return NULL;
+    }
 
-    if (gateway) {
-        ds_put_cstr(&actions, gateway);
-    } else {
-        ds_put_format(&actions, "ip%s.dst", is_ipv4 ? "4" : "6");
+    /* Verify that ip_prefix and nexthop have same address familiy. */
+    if (prefix.family != nexthop.family) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+        VLOG_WARN_RL(&rl, "Address family doesn't match between 'ip_prefix' %s"
+                     " and 'nexthop' %s in static route"UUID_FMT,
+                     route->ip_prefix, route->nexthop,
+                     UUID_ARGS(&route->header_.uuid));
+        return NULL;
     }
-    ds_put_format(&actions, "; "
-                  "%sreg1 = %s; "
-                  "eth.src = %s; "
-                  "outport = %s; "
-                  "flags.loopback = 1; "
-                  "next;",
-                  is_ipv4 ? "" : "xx",
-                  lrp_addr_s,
-                  op->lrp_networks.ea_s,
-                  op->json_key);
 
-    /* The priority here is calculated to implement longest-prefix-match
-     * routing. */
-    ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_ROUTING, priority,
-                  ds_cstr(&match), ds_cstr(&actions));
-    ds_destroy(&match);
-    ds_destroy(&actions);
+    struct parsed_route *pr = xzalloc(sizeof *pr);
+    pr->prefix = prefix;
+    pr->plen = plen;
+    pr->is_src_route = (route->policy && !strcmp(route->policy,
+                                                 "src-ip"));
+    pr->hash = route_hash(pr);
+    pr->route = route;
+    ovs_list_insert(routes, &pr->list_node);
+    return pr;
 }
 
 static void
-build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
-                        struct hmap *ports,
-                        const struct nbrec_logical_router_static_route *route)
+parsed_routes_destroy(struct ovs_list *routes)
 {
-    ovs_be32 nexthop;
-    const char *lrp_addr_s = NULL;
+    struct parsed_route *pr, *next;
+    LIST_FOR_EACH_SAFE (pr, next, list_node, routes) {
+        ovs_list_remove(&pr->list_node);
+        free(pr);
+    }
+}
+
+struct ecmp_route_list_node {
+    struct ovs_list list_node;
+    uint16_t id; /* starts from 1 */
+    const struct parsed_route *route;
+};
+
+struct ecmp_groups_node {
+    struct hmap_node hmap_node; /* In ecmp_groups */
+    uint16_t id; /* starts from 1 */
+    struct v46_ip prefix;
     unsigned int plen;
-    bool is_ipv4;
+    bool is_src_route;
+    uint16_t route_count;
+    struct ovs_list route_list; /* Contains ecmp_route_list_node */
+};
 
-    /* Verify that the next hop is an IP address with an all-ones mask. */
-    char *error = ip_parse_cidr(route->nexthop, &nexthop, &plen);
-    if (!error) {
-        if (plen != 32) {
-            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
-            VLOG_WARN_RL(&rl, "bad next hop mask %s", route->nexthop);
-            return;
+static void
+ecmp_groups_add_route(struct ecmp_groups_node *group,
+                      const struct parsed_route *route)
+{
+    struct ecmp_route_list_node *er = xmalloc(sizeof *er);
+    er->route = route;
+    er->id = ++group->route_count;
+    if (er->id == 0) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+        VLOG_WARN_RL(&rl, "too many routes in a single ecmp group.");
+        return;
+    }
+
+    ovs_list_insert(&group->route_list, &er->list_node);
+}
+
+static struct ecmp_groups_node *
+ecmp_groups_add(struct hmap *ecmp_groups,
+                const struct parsed_route *route)
+{
+    if (hmap_count(ecmp_groups) == UINT16_MAX) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+        VLOG_WARN_RL(&rl, "too many ecmp groups.");
+        return NULL;
+    }
+
+    struct ecmp_groups_node *eg = xzalloc(sizeof *eg);
+    hmap_insert(ecmp_groups, &eg->hmap_node, route->hash);
+
+    eg->id = hmap_count(ecmp_groups);
+    eg->prefix = route->prefix;
+    eg->plen = route->plen;
+    eg->is_src_route = route->is_src_route;
+    ovs_list_init(&eg->route_list);
+    ecmp_groups_add_route(eg, route);
+
+    return eg;
+}
+
+static struct ecmp_groups_node *
+ecmp_groups_find(struct hmap *ecmp_groups, struct parsed_route *route)
+{
+    struct ecmp_groups_node *eg;
+    HMAP_FOR_EACH_WITH_HASH (eg, hmap_node, route->hash, ecmp_groups) {
+        if (ip46_equals(&eg->prefix, &route->prefix) &&
+            eg->plen == route->plen &&
+            eg->is_src_route == route->is_src_route) {
+            return eg;
         }
-        is_ipv4 = true;
-    } else {
-        free(error);
+    }
+    return NULL;
+}
 
-        struct in6_addr ip6;
-        error = ipv6_parse_cidr(route->nexthop, &ip6, &plen);
-        if (!error) {
-            if (plen != 128) {
-                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
-                VLOG_WARN_RL(&rl, "bad next hop mask %s", route->nexthop);
-                return;
-            }
-            is_ipv4 = false;
-        } else {
-            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
-            VLOG_WARN_RL(&rl, "bad next hop ip address %s", route->nexthop);
-            free(error);
-            return;
+static void
+ecmp_groups_destroy(struct hmap *ecmp_groups)
+{
+    struct ecmp_groups_node *eg, *next;
+    HMAP_FOR_EACH_SAFE (eg, next, hmap_node, ecmp_groups) {
+        struct ecmp_route_list_node *er, *er_next;
+        LIST_FOR_EACH_SAFE (er, er_next, list_node, &eg->route_list) {
+            ovs_list_remove(&er->list_node);
+            free(er);
         }
+        hmap_remove(ecmp_groups, &eg->hmap_node);
+        free(eg);
     }
+    hmap_destroy(ecmp_groups);
+}
 
-    char *prefix_s;
-    if (is_ipv4) {
-        ovs_be32 prefix;
-        /* Verify that ip prefix is a valid IPv4 address. */
-        error = ip_parse_cidr(route->ip_prefix, &prefix, &plen);
-        if (error) {
-            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
-            VLOG_WARN_RL(&rl, "bad 'ip_prefix' in static routes %s",
-                         route->ip_prefix);
-            free(error);
-            return;
+struct unique_routes_node {
+    struct hmap_node hmap_node;
+    const struct parsed_route *route;
+};
+
+static void
+unique_routes_add(struct hmap *unique_routes,
+                  const struct parsed_route *route)
+{
+    struct unique_routes_node *ur = xmalloc(sizeof *ur);
+    ur->route = route;
+    hmap_insert(unique_routes, &ur->hmap_node, route->hash);
+}
+
+/* Remove the unique_routes_node from the hmap, and return the parsed_route
+ * pointed by the removed node. */
+static const struct parsed_route *
+unique_routes_remove(struct hmap *unique_routes,
+                     const struct parsed_route *route)
+{
+    struct unique_routes_node *ur;
+    HMAP_FOR_EACH_WITH_HASH (ur, hmap_node, route->hash, unique_routes) {
+        if (ip46_equals(&route->prefix, &ur->route->prefix) &&
+            route->plen == ur->route->plen &&
+            route->is_src_route == ur->route->is_src_route) {
+            hmap_remove(unique_routes, &ur->hmap_node);
+            const struct parsed_route *existed_route = ur->route;
+            free(ur);
+            return existed_route;
         }
-        prefix_s = xasprintf(IP_FMT, IP_ARGS(prefix & be32_prefix_mask(plen)));
+    }
+    return NULL;
+}
+
+static void
+unique_routes_destroy(struct hmap *unique_routes)
+{
+    struct unique_routes_node *ur, *next;
+    HMAP_FOR_EACH_SAFE (ur, next, hmap_node, unique_routes) {
+        hmap_remove(unique_routes, &ur->hmap_node);
+        free(ur);
+    }
+    hmap_destroy(unique_routes);
+}
+
+static char *
+build_route_prefix_s(const struct v46_ip *prefix, unsigned int plen)
+{
+    char *prefix_s;
+    if (prefix->family == AF_INET) {
+        prefix_s = xasprintf(IP_FMT, IP_ARGS(prefix->ipv4 &
+                                             be32_prefix_mask(plen)));
     } else {
-        /* Verify that ip prefix is a valid IPv6 address. */
-        struct in6_addr prefix;
-        error = ipv6_parse_cidr(route->ip_prefix, &prefix, &plen);
-        if (error) {
-            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
-            VLOG_WARN_RL(&rl, "bad 'ip_prefix' in static routes %s",
-                         route->ip_prefix);
-            free(error);
-            return;
-        }
         struct in6_addr mask = ipv6_create_mask(plen);
-        struct in6_addr network = ipv6_addr_bitand(&prefix, &mask);
+        struct in6_addr network = ipv6_addr_bitand(&prefix->ipv6, &mask);
         prefix_s = xmalloc(INET6_ADDRSTRLEN);
         inet_ntop(AF_INET6, &network, prefix_s, INET6_ADDRSTRLEN);
     }
+    return prefix_s;
+}
 
-    /* Find the outgoing port. */
+static void
+build_route_match(const struct ovn_port *op_inport, const char *network_s,
+                  int plen, bool is_src_route, bool is_ipv4, struct ds *match,
+                  uint16_t *priority)
+{
+    const char *dir;
+    /* The priority here is calculated to implement longest-prefix-match
+     * routing. */
+    if (is_src_route) {
+        dir = "src";
+        *priority = plen * 2;
+    } else {
+        dir = "dst";
+        *priority = (plen * 2) + 1;
+    }
+
+    if (op_inport) {
+        ds_put_format(match, "inport == %s && ", op_inport->json_key);
+    }
+    ds_put_format(match, "ip%s.%s == %s/%d", is_ipv4 ? "4" : "6", dir,
+                  network_s, plen);
+}
+
+/* Output: p_lrp_addr_s and p_out_port. */
+static bool
+find_static_route_outport(struct ovn_datapath *od, struct hmap *ports,
+    const struct nbrec_logical_router_static_route *route, bool is_ipv4,
+    const char **p_lrp_addr_s, struct ovn_port **p_out_port)
+{
+    const char *lrp_addr_s = NULL;
     struct ovn_port *out_port = NULL;
     if (route->output_port) {
         out_port = ovn_port_find(ports, route->output_port);
@@ -6850,7 +7012,7 @@  build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
             VLOG_WARN_RL(&rl, "Bad out port %s for static route %s",
                          route->output_port, route->ip_prefix);
-            goto free_prefix_s;
+            return false;
         }
         lrp_addr_s = find_lrp_member_ip(out_port, route->nexthop);
         if (!lrp_addr_s) {
@@ -6888,20 +7050,157 @@  build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
             }
         }
     }
-
     if (!out_port || !lrp_addr_s) {
         /* There is no matched out port. */
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
         VLOG_WARN_RL(&rl, "No path for static route %s; next hop %s",
                      route->ip_prefix, route->nexthop);
-        goto free_prefix_s;
+        return false;
+    }
+    *p_out_port = out_port;
+    *p_lrp_addr_s = lrp_addr_s;
+
+    return true;
+}
+
+static void
+build_ecmp_route_flow(struct hmap *lflows, struct ovn_datapath *od,
+                      struct hmap *ports, struct ecmp_groups_node *eg)
+
+{
+    bool is_ipv4 = (eg->prefix.family == AF_INET);
+    struct ds match = DS_EMPTY_INITIALIZER;
+    uint16_t priority;
+
+    char *prefix_s = build_route_prefix_s(&eg->prefix, eg->plen);
+    build_route_match(NULL, prefix_s, eg->plen, eg->is_src_route, is_ipv4,
+                      &match, &priority);
+    free(prefix_s);
+
+    struct ds actions = DS_EMPTY_INITIALIZER;
+    ds_put_format(&actions, "ip.ttl--; flags.loopback = 1; %s = %"PRIu16
+                  "; %s = select(", REG_ECMP_GROUP_ID, eg->id,
+                  REG_ECMP_MEMBER_ID);
+
+    struct ecmp_route_list_node *er;
+    bool is_first = true;
+    LIST_FOR_EACH (er, list_node, &eg->route_list) {
+        if (is_first) {
+            is_first = false;
+        } else {
+            ds_put_cstr(&actions, ", ");
+        }
+        ds_put_format(&actions, "%"PRIu16, er->id);
+    }
+
+    ds_put_cstr(&actions, ");");
+
+    ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, priority,
+                  ds_cstr(&match), ds_cstr(&actions));
+
+    /* Add per member flow */
+    LIST_FOR_EACH (er, list_node, &eg->route_list) {
+
+        const struct parsed_route *route_ = er->route;
+        const struct nbrec_logical_router_static_route *route = route_->route;
+        /* Find the outgoing port. */
+        const char *lrp_addr_s = NULL;
+        struct ovn_port *out_port = NULL;
+        if (!find_static_route_outport(od, ports, route, is_ipv4, &lrp_addr_s,
+                                       &out_port)) {
+            continue;
+        }
+        ds_clear(&match);
+        ds_put_format(&match, REG_ECMP_GROUP_ID" == %"PRIu16" && "
+                      REG_ECMP_MEMBER_ID" == %"PRIu16,
+                      eg->id, er->id);
+        ds_clear(&actions);
+        ds_put_format(&actions, "%sreg0 = %s; "
+                      "%sreg1 = %s; "
+                      "eth.src = %s; "
+                      "outport = %s; "
+                      "next;",
+                      is_ipv4 ? "" : "xx",
+                      route->nexthop,
+                      is_ipv4 ? "" : "xx",
+                      lrp_addr_s,
+                      out_port->lrp_networks.ea_s,
+                      out_port->json_key);
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING_ECMP, 100,
+                      ds_cstr(&match), ds_cstr(&actions));
+    }
+    ds_destroy(&match);
+    ds_destroy(&actions);
+}
+
+static void
+add_route(struct hmap *lflows, const struct ovn_port *op,
+          const char *lrp_addr_s, const char *network_s, int plen,
+          const char *gateway, bool is_src_route)
+{
+    bool is_ipv4 = strchr(network_s, '.') ? true : false;
+    struct ds match = DS_EMPTY_INITIALIZER;
+    uint16_t priority;
+    const struct ovn_port *op_inport = NULL;
+
+    /* IPv6 link-local addresses must be scoped to the local router port. */
+    if (!is_ipv4) {
+        struct in6_addr network;
+        ovs_assert(ipv6_parse(network_s, &network));
+        if (in6_is_lla(&network)) {
+            op_inport = op;
+        }
     }
+    build_route_match(op_inport, network_s, plen, is_src_route, is_ipv4,
+                      &match, &priority);
 
-    char *policy = route->policy ? route->policy : "dst-ip";
-    add_route(lflows, out_port, lrp_addr_s, prefix_s, plen, route->nexthop,
-              policy);
+    struct ds actions = DS_EMPTY_INITIALIZER;
+    ds_put_format(&actions, "ip.ttl--; "REG_ECMP_GROUP_ID" = 0; %sreg0 = ",
+                  is_ipv4 ? "" : "xx");
+
+    if (gateway) {
+        ds_put_cstr(&actions, gateway);
+    } else {
+        ds_put_format(&actions, "ip%s.dst", is_ipv4 ? "4" : "6");
+    }
+    ds_put_format(&actions, "; "
+                  "%sreg1 = %s; "
+                  "eth.src = %s; "
+                  "outport = %s; "
+                  "flags.loopback = 1; "
+                  "next;",
+                  is_ipv4 ? "" : "xx",
+                  lrp_addr_s,
+                  op->lrp_networks.ea_s,
+                  op->json_key);
+
+    ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_ROUTING, priority,
+                  ds_cstr(&match), ds_cstr(&actions));
+    ds_destroy(&match);
+    ds_destroy(&actions);
+}
+
+static void
+build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
+                        struct hmap *ports,
+                        const struct parsed_route *route_)
+{
+    const char *lrp_addr_s = NULL;
+    struct ovn_port *out_port = NULL;
+    bool is_ipv4 = (route_->prefix.family == AF_INET);
+
+    const struct nbrec_logical_router_static_route *route = route_->route;
+
+    /* Find the outgoing port. */
+    if (!find_static_route_outport(od, ports, route, is_ipv4, &lrp_addr_s,
+                                   &out_port)) {
+        return;
+    }
+
+    char *prefix_s = build_route_prefix_s(&route_->prefix, route_->plen);
+    add_route(lflows, out_port, lrp_addr_s, prefix_s, route_->plen,
+              route->nexthop, route_->is_src_route);
 
-free_prefix_s:
     free(prefix_s);
 }
 
@@ -8744,14 +9043,21 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
         ovn_lflow_add(lflows, od, S_ROUTER_IN_ND_RA_RESPONSE, 0, "1", "next;");
     }
 
-    /* Logical router ingress table IP_ROUTING: IP Routing.
+    /* Logical router ingress table IP_ROUTING & IP_ROUTING_ECMP: IP Routing.
      *
      * A packet that arrives at this table is an IP packet that should be
-     * routed to the address in 'ip[46].dst'. This table sets outport to
-     * the correct output port, eth.src to the output port's MAC
-     * address, and '[xx]reg0' to the next-hop IP address (leaving
-     * 'ip[46].dst', the packet’s final destination, unchanged), and
-     * advances to the next table for ARP/ND resolution. */
+     * routed to the address in 'ip[46].dst'.
+     *
+     * For regular routes without ECMP, table IP_ROUTING sets outport to the
+     * correct output port, eth.src to the output port's MAC address, and
+     * '[xx]reg0' to the next-hop IP address (leaving 'ip[46].dst', the
+     * packet’s final destination, unchanged), and advances to the next table.
+     *
+     * For ECMP routes, i.e. multiple routes with same policy and prefix, table
+     * IP_ROUTING remembers ECMP group id and selects a member id, and advances
+     * to table IP_ROUTING_ECMP, which sets outport, eth.src and '[xx]reg0' for
+     * the selected ECMP member.
+     * */
     HMAP_FOR_EACH (op, key_node, ports) {
         if (!op->nbrp) {
             continue;
@@ -8763,13 +9069,13 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
         for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
             add_route(lflows, op, op->lrp_networks.ipv4_addrs[i].addr_s,
                       op->lrp_networks.ipv4_addrs[i].network_s,
-                      op->lrp_networks.ipv4_addrs[i].plen, NULL, NULL);
+                      op->lrp_networks.ipv4_addrs[i].plen, NULL, false);
         }
 
         for (int i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
             add_route(lflows, op, op->lrp_networks.ipv6_addrs[i].addr_s,
                       op->lrp_networks.ipv6_addrs[i].network_s,
-                      op->lrp_networks.ipv6_addrs[i].plen, NULL, NULL);
+                      op->lrp_networks.ipv6_addrs[i].plen, NULL, false);
         }
     }
 
@@ -8778,13 +9084,47 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
         if (!od->nbr) {
             continue;
         }
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING_ECMP, 150,
+                      REG_ECMP_GROUP_ID" == 0", "next;");
 
+        struct hmap ecmp_groups = HMAP_INITIALIZER(&ecmp_groups);
+        struct hmap unique_routes = HMAP_INITIALIZER(&unique_routes);
+        struct ovs_list parsed_routes = OVS_LIST_INITIALIZER(&parsed_routes);
+        struct ecmp_groups_node *group;
         for (int i = 0; i < od->nbr->n_static_routes; i++) {
-            const struct nbrec_logical_router_static_route *route;
-
-            route = od->nbr->static_routes[i];
-            build_static_route_flow(lflows, od, ports, route);
+            struct parsed_route *route =
+                parsed_routes_add(&parsed_routes, od->nbr->static_routes[i]);
+            if (!route) {
+                continue;
+            }
+            group = ecmp_groups_find(&ecmp_groups, route);
+            if (group) {
+                ecmp_groups_add_route(group, route);
+            } else {
+                const struct parsed_route *existed_route =
+                    unique_routes_remove(&unique_routes, route);
+                if (existed_route) {
+                    group = ecmp_groups_add(&ecmp_groups, existed_route);
+                    if (group) {
+                        ecmp_groups_add_route(group, route);
+                    }
+                } else {
+                    unique_routes_add(&unique_routes, route);
+                }
+            }
+        }
+        HMAP_FOR_EACH (group, hmap_node, &ecmp_groups) {
+            /* add a flow in IP_ROUTING, and one flow for each member in
+             * IP_ROUTING_ECMP. */
+            build_ecmp_route_flow(lflows, od, ports, group);
+        }
+        const struct unique_routes_node *ur;
+        HMAP_FOR_EACH (ur, hmap_node, &unique_routes) {
+            build_static_route_flow(lflows, od, ports, ur->route);
         }
+        ecmp_groups_destroy(&ecmp_groups);
+        unique_routes_destroy(&unique_routes);
+        parsed_routes_destroy(&parsed_routes);
     }
 
     /* IP Multicast lookup. Here we set the output port, adjust TTL and
diff --git a/ovn-nb.xml b/ovn-nb.xml
index 5ae52bb..4515538 100644
--- a/ovn-nb.xml
+++ b/ovn-nb.xml
@@ -2199,6 +2199,12 @@ 
       a <code>src-ip</code> route.
     </p>
 
+    <p>
+      When there are ECMP routes, i.e. multiple routes with same prefix and
+      policy, one of them will be selected based on the 5-tuple hashing of the
+      packet header.
+    </p>
+
     <column name="ip_prefix">
       <p>
         IP prefix of this route (e.g. 192.168.100.0/24).
diff --git a/tests/ovn-nbctl.at b/tests/ovn-nbctl.at
index 22bc38b..13fa7c3 100644
--- a/tests/ovn-nbctl.at
+++ b/tests/ovn-nbctl.at
@@ -1379,7 +1379,13 @@  IPv4 Routes
 
 dnl Delete non-existent prefix
 AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.2.1/24], [1], [],
-  [ovn-nbctl: no matching prefix: 10.0.2.0/24 (policy: any)
+  [ovn-nbctl: no matching route: policy 'any', prefix '10.0.2.0/24', nexthop 'any', output_port 'any'.
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.1.0/24 11.0.1.2], [1], [],
+  [ovn-nbctl: no matching route: policy 'any', prefix '10.0.1.0/24', nexthop '11.0.1.2', output_port 'any'.
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.1.0/24 11.0.1.1 lp1], [1], [],
+  [ovn-nbctl: no matching route: policy 'any', prefix '10.0.1.0/24', nexthop '11.0.1.1', output_port 'lp1'.
 ])
 AT_CHECK([ovn-nbctl --if-exists lr-route-del lr0 10.0.2.1/24])
 
@@ -1415,6 +1421,45 @@  AT_CHECK([ovn-nbctl lr-route-del lr0])
 AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
 ])
 
+dnl Add ecmp routes
+AT_CHECK([ovn-nbctl lr-route-add lr0 10.0.0.0/24 11.0.0.1])
+AT_CHECK([ovn-nbctl --ecmp lr-route-add lr0 10.0.0.0/24 11.0.0.2])
+AT_CHECK([ovn-nbctl --ecmp lr-route-add lr0 10.0.0.0/24 11.0.0.2])
+AT_CHECK([ovn-nbctl --ecmp lr-route-add lr0 10.0.0.0/24 11.0.0.3])
+AT_CHECK([ovn-nbctl --ecmp lr-route-add lr0 10.0.0.0/24 11.0.0.3 lp0])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+IPv4 Routes
+              10.0.0.0/24                  11.0.0.1 dst-ip
+              10.0.0.0/24                  11.0.0.2 dst-ip
+              10.0.0.0/24                  11.0.0.2 dst-ip
+              10.0.0.0/24                  11.0.0.3 dst-ip
+              10.0.0.0/24                  11.0.0.3 dst-ip lp0
+])
+
+dnl Delete ecmp routes
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.0.0/24 11.0.0.1])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+IPv4 Routes
+              10.0.0.0/24                  11.0.0.2 dst-ip
+              10.0.0.0/24                  11.0.0.2 dst-ip
+              10.0.0.0/24                  11.0.0.3 dst-ip
+              10.0.0.0/24                  11.0.0.3 dst-ip lp0
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.0.0/24 11.0.0.2])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+IPv4 Routes
+              10.0.0.0/24                  11.0.0.3 dst-ip
+              10.0.0.0/24                  11.0.0.3 dst-ip lp0
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.0.0/24 11.0.0.3 lp0])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+IPv4 Routes
+              10.0.0.0/24                  11.0.0.3 dst-ip
+])
+AT_CHECK([ovn-nbctl lr-route-del lr0 10.0.0.0/24 11.0.0.3])
+AT_CHECK([ovn-nbctl lr-route-list lr0], [0], [dnl
+])
+
 dnl Check IPv6 routes
 AT_CHECK([ovn-nbctl lr-route-add lr0 0:0:0:0:0:0:0:0/0 2001:0db8:0:f101::1])
 AT_CHECK([ovn-nbctl lr-route-add lr0 2001:0db8:0::/64 2001:0db8:0:f102::1 lp0])
diff --git a/tests/ovn.at b/tests/ovn.at
index de80bce..25ce34d 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -14873,7 +14873,7 @@  ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
 # Since the sw0-vir is not claimed by any chassis, eth.dst should be set to
 # zero if the ip4.dst is the virtual ip in the router pipeline.
 AT_CHECK([cat lflows.txt], [0], [dnl
-  table=11(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 00:00:00:00:00:00; next;)
+  table=12(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 00:00:00:00:00:00; next;)
 ])
 
 ip_to_hex() {
@@ -14909,7 +14909,7 @@  ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
 # There should be an arp resolve flow to resolve the virtual_ip with the
 # sw0-p1's MAC.
 AT_CHECK([cat lflows.txt], [0], [dnl
-  table=11(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:03; next;)
+  table=12(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:03; next;)
 ])
 
 # From sw0-p3 send GARP for 10.0.0.10. hv1 should claim sw0-vir
@@ -14932,7 +14932,7 @@  ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
 # There should be an arp resolve flow to resolve the virtual_ip with the
 # sw0-p2's MAC.
 AT_CHECK([cat lflows.txt], [0], [dnl
-  table=11(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:05; next;)
+  table=12(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:05; next;)
 ])
 
 # send the garp from sw0-p2 (in hv2). hv2 should claim sw0-vir
@@ -14955,7 +14955,7 @@  ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
 # There should be an arp resolve flow to resolve the virtual_ip with the
 # sw0-p3's MAC.
 AT_CHECK([cat lflows.txt], [0], [dnl
-  table=11(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:04; next;)
+  table=12(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:04; next;)
 ])
 
 # Now send arp reply from sw0-p1. hv1 should claim sw0-vir
@@ -14976,7 +14976,7 @@  ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
 > lflows.txt
 
 AT_CHECK([cat lflows.txt], [0], [dnl
-  table=11(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:03; next;)
+  table=12(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:03; next;)
 ])
 
 # Delete hv1-vif1 port. hv1 should release sw0-vir
@@ -14994,7 +14994,7 @@  ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
 > lflows.txt
 
 AT_CHECK([cat lflows.txt], [0], [dnl
-  table=11(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 00:00:00:00:00:00; next;)
+  table=12(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 00:00:00:00:00:00; next;)
 ])
 
 # Now send arp reply from sw0-p2. hv2 should claim sw0-vir
@@ -15015,7 +15015,7 @@  ovn-sbctl dump-flows lr0 | grep lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
 > lflows.txt
 
 AT_CHECK([cat lflows.txt], [0], [dnl
-  table=11(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:04; next;)
+  table=12(lr_in_arp_resolve  ), priority=100  , match=(outport == "lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst = 50:54:00:00:00:04; next;)
 ])
 
 # Delete sw0-p2 logical port
@@ -17361,3 +17361,103 @@  OVS_WAIT_UNTIL([
 
 OVN_CLEANUP([hv1])
 AT_CLEANUP
+
+AT_SETUP([ovn -- ECMP static routes])
+ovn_start
+
+# Logical network:
+# ls1 (192.168.1.0/24) - lr1 - ls2 (192.168.2.0/24)
+# lsl has lsp11 (192.168.1.11) and ls2 has lsp21 (192.168.2.21) and lsp22
+# (192.168.2.22)
+#
+# Static routes on lr1:
+# 10.0.0.0/24 nexthop 192.168.2.21
+# 10.0.0.0/24 nexthop 192.168.2.22
+#
+# Test:
+# lsp11 send packets to 10.0.0.*
+#
+# Expected result:
+# Both lsp21 and lsp22 should received some of the packets.
+
+ovn-nbctl lr-add lr1
+
+ovn-nbctl ls-add ls1
+ovn-nbctl ls-add ls2
+
+for i in 1 2; do
+    ovn-nbctl lrp-add lr1 lrp-lr1-ls${i} 00:00:00:01:0${i}:01 192.168.${i}.1/24
+    ovn-nbctl lsp-add ls${i} lsp-ls${i}-lr1 -- lsp-set-type lsp-ls${i}-lr1 router \
+        -- lsp-set-options lsp-ls${i}-lr1 router-port=lrp-lr1-ls${i} \
+        -- lsp-set-addresses lsp-ls${i}-lr1 router
+done
+
+#install static routes
+ovn-nbctl lr-route-add lr1 10.0.0.0/24 192.168.2.21
+ovn-nbctl --ecmp lr-route-add lr1 10.0.0.0/24 192.168.2.22
+
+# Create logical ports
+ovn-nbctl lsp-add ls1 lsp11 -- \
+    lsp-set-addresses lsp11 "f0:00:00:00:01:11 192.168.1.11"
+ovn-nbctl lsp-add ls2 lsp21 -- \
+    lsp-set-addresses lsp21 "f0:00:00:00:02:21 192.168.2.21"
+ovn-nbctl lsp-add ls2 lsp22 -- \
+    lsp-set-addresses lsp22 "f0:00:00:00:02:22 192.168.2.22"
+
+net_add n1
+sim_add hv1
+as hv1
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+ovs-vsctl -- add-port br-int hv1-vif1 -- \
+    set interface hv1-vif1 external-ids:iface-id=lsp11 \
+    options:tx_pcap=hv1/vif1-tx.pcap \
+    options:rxq_pcap=hv1/vif1-rx.pcap \
+    ofport-request=1
+
+ovs-vsctl -- add-port br-int hv1-vif2 -- \
+    set interface hv1-vif2 external-ids:iface-id=lsp21 \
+    options:tx_pcap=hv1/vif2-tx.pcap \
+    options:rxq_pcap=hv1/vif2-rx.pcap \
+    ofport-request=2
+
+ovs-vsctl -- add-port br-int hv1-vif3 -- \
+    set interface hv1-vif3 external-ids:iface-id=lsp22 \
+    options:tx_pcap=hv1/vif3-tx.pcap \
+    options:rxq_pcap=hv1/vif3-rx.pcap \
+    ofport-request=3
+
+# wait for earlier changes to take effect
+AT_CHECK([ovn-nbctl --timeout=3 --wait=hv sync], [0], [ignore])
+
+for i in $(seq 5001 5010); do
+    packet="inport==\"lsp11\" && eth.src==f0:00:00:00:01:11 && eth.dst==00:00:00:01:01:01 &&
+            ip4 && ip.ttl==64 && ip4.src==192.168.1.11 && ip4.dst==10.0.0.123 &&
+            tcp && tcp.src==$i && tcp.dst==80"
+    AT_CHECK([as hv1 ovs-appctl -t ovn-controller inject-pkt "$packet"])
+
+    for j in 1 2; do
+        # Assume all packets go to lsp2${j}.
+        exp_packet="eth.src==00:00:00:01:02:01 && eth.dst==f0:00:00:00:02:2${j} &&
+                ip4 && ip.ttl==63 && ip4.src==192.168.1.11 && ip4.dst==10.0.0.123 &&
+                tcp && tcp.src==$i && tcp.dst==80"
+        echo $exp_packet | ovstest test-ovn expr-to-packets >> expected_lsp2${j}
+    done
+done
+
+# Each port should receive some packets and the total number should be 10
+OVS_WAIT_UNTIL([
+    rcv_n1=`$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/vif2-tx.pcap > lsp21.packets && cat lsp21.packets | wc -l`
+    rcv_n2=`$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/vif3-tx.pcap > lsp22.packets && cat lsp22.packets | wc -l`
+    echo $rcv_n1 $rcv_n2
+    test $rcv_n1 -ge 1 -a $rcv_n2 -ge 1 -a $(($rcv_n1 + $rcv_n2)) -ge 10])
+
+# Each port should receive a subset of expected packets
+for i in 1 2; do
+    sort expected_lsp2$i > expout
+    AT_CHECK([cat lsp2${i}.packets expected_lsp2$i | sort | uniq], [0], [expout])
+done
+
+OVN_CLEANUP([hv1])
+
+AT_CLEANUP
diff --git a/utilities/ovn-nbctl.8.xml b/utilities/ovn-nbctl.8.xml
index ec7c28c..f20f992 100644
--- a/utilities/ovn-nbctl.8.xml
+++ b/utilities/ovn-nbctl.8.xml
@@ -612,7 +612,9 @@ 
     <h1>Logical Router Static Route Commands</h1>
 
     <dl>
-      <dt>[<code>--may-exist</code>] [<code>--policy</code>=<var>POLICY</var>] <code>lr-route-add</code> <var>router</var> <var>prefix</var> <var>nexthop</var> [<var>port</var>]</dt>
+      <dt>[<code>--may-exist</code>] [<code>--policy</code>=<var>POLICY</var>]
+        [<code>--ecmp</code>] <code>lr-route-add</code> <var>router</var>
+        <var>prefix</var> <var>nexthop</var> [<var>port</var>]</dt>
       <dd>
         <p>
           Adds the specified route to <var>router</var>.
@@ -635,19 +637,27 @@ 
 
         <p>
           It is an error if a route with <var>prefix</var> and
-          <var>POLICY</var> already exists, unless <code>--may-exist</code> is
-          specified.
+          <var>POLICY</var> already exists, unless <code>--may-exist</code> or
+          <code>--ecmp</code> is specified.  If <code>--may-exist</code> is
+          specified but not <code>--ecmp</code>, the existed route will be
+          updated with the new nexthop and port.  If <code>--ecmp</code> is
+          specified, a new route will be added, regardless of the existed
+          route, which is useful when adding ECMP routes, i.e. routes with same
+          <var>POLICY</var> and <var>prefix</var> but different
+          <var>nexthop</var> and <var>port</var>.
         </p>
       </dd>
 
-      <dt>[<code>--if-exists</code>] [<code>--policy</code>=<var>POLICY</var>] <code>lr-route-del</code> <var>router</var> [<var>prefix</var>]</dt>
+      <dt>[<code>--if-exists</code>] [<code>--policy</code>=<var>POLICY</var>]
+        <code>lr-route-del</code> <var>router</var> [<var>prefix</var>
+        [<var>nexthop</var> [<var>port</var>]]]</dt>
       <dd>
         <p>
           Deletes routes from <var>router</var>.  If only <var>router</var>
-          is supplied, all the routes from the logical router are
-          deleted.  If <var>POLICY</var> and/or <var>prefix</var> are also
-          specified, then all the routes that match the conditions will be
-          deleted from the logical router.
+          is supplied, all the routes from the logical router are deleted.  If
+          <var>POLICY</var>, <var>prefix</var>, <var>nexthop</var> and/or
+          <var>port</var> are also specified, then all the routes that match
+          the conditions will be deleted from the logical router.
         </p>
 
         <p>
diff --git a/utilities/ovn-nbctl.c b/utilities/ovn-nbctl.c
index 10c458f..0bcb975 100644
--- a/utilities/ovn-nbctl.c
+++ b/utilities/ovn-nbctl.c
@@ -680,9 +680,9 @@  Logical router port commands:\n\
                             ('overlay' or 'vlan')\n\
 \n\
 Route commands:\n\
-  [--policy=POLICY] lr-route-add ROUTER PREFIX NEXTHOP [PORT]\n\
+  [--policy=POLICY] [--ecmp] lr-route-add ROUTER PREFIX NEXTHOP [PORT]\n\
                             add a route to ROUTER\n\
-  [--policy=POLICY] lr-route-del ROUTER [PREFIX]\n\
+  [--policy=POLICY] lr-route-del ROUTER [PREFIX [NEXTHOP [PORT]]]\n\
                             remove routes from ROUTER\n\
   lr-route-list ROUTER      print routes for ROUTER\n\
 \n\
@@ -3739,59 +3739,62 @@  nbctl_lr_route_add(struct ctl_context *ctx)
     }
 
     bool may_exist = shash_find(&ctx->options, "--may-exist") != NULL;
-    for (int i = 0; i < lr->n_static_routes; i++) {
-        const struct nbrec_logical_router_static_route *route
-            = lr->static_routes[i];
-        char *rt_prefix;
+    bool ecmp = shash_find(&ctx->options, "--ecmp") != NULL;
+    if (!ecmp) {
+        for (int i = 0; i < lr->n_static_routes; i++) {
+            const struct nbrec_logical_router_static_route *route
+                = lr->static_routes[i];
+            char *rt_prefix;
+
+            /* Compare route policy. */
+            char *nb_policy = lr->static_routes[i]->policy;
+            bool nb_is_src_route = false;
+            if (nb_policy && !strcmp(nb_policy, "src-ip")) {
+                    nb_is_src_route = true;
+            }
+            if (is_src_route != nb_is_src_route) {
+                continue;
+            }
 
-        /* Compare route policy. */
-        char *nb_policy = lr->static_routes[i]->policy;
-        bool nb_is_src_route = false;
-        if (nb_policy && !strcmp(nb_policy, "src-ip")) {
-                nb_is_src_route = true;
-        }
-        if (is_src_route != nb_is_src_route) {
-            continue;
-        }
+            /* Compare route prefix. */
+            rt_prefix = normalize_prefix_str(lr->static_routes[i]->ip_prefix);
+            if (!rt_prefix) {
+                /* Ignore existing prefix we couldn't parse. */
+                continue;
+            }
 
-        /* Compare route prefix. */
-        rt_prefix = normalize_prefix_str(lr->static_routes[i]->ip_prefix);
-        if (!rt_prefix) {
-            /* Ignore existing prefix we couldn't parse. */
-            continue;
-        }
+            if (strcmp(rt_prefix, prefix)) {
+                free(rt_prefix);
+                continue;
+            }
 
-        if (strcmp(rt_prefix, prefix)) {
-            free(rt_prefix);
-            continue;
-        }
+            if (!may_exist) {
+                ctl_error(ctx, "duplicate prefix: %s (policy: %s)",
+                          prefix, is_src_route ? "src-ip" : "dst-ip");
+                free(next_hop);
+                free(rt_prefix);
+                free(prefix);
+                return;
+            }
 
-        if (!may_exist) {
-            ctl_error(ctx, "duplicate prefix: %s (policy: %s)",
-                      prefix, is_src_route ? "src-ip" : "dst-ip");
-            free(next_hop);
+            /* Update the next hop for an existing route. */
+            nbrec_logical_router_verify_static_routes(lr);
+            nbrec_logical_router_static_route_verify_ip_prefix(route);
+            nbrec_logical_router_static_route_verify_nexthop(route);
+            nbrec_logical_router_static_route_set_ip_prefix(route, prefix);
+            nbrec_logical_router_static_route_set_nexthop(route, next_hop);
+            if (ctx->argc == 5) {
+                nbrec_logical_router_static_route_set_output_port(
+                    route, ctx->argv[4]);
+            }
+            if (policy) {
+                 nbrec_logical_router_static_route_set_policy(route, policy);
+            }
             free(rt_prefix);
+            free(next_hop);
             free(prefix);
             return;
         }
-
-        /* Update the next hop for an existing route. */
-        nbrec_logical_router_verify_static_routes(lr);
-        nbrec_logical_router_static_route_verify_ip_prefix(route);
-        nbrec_logical_router_static_route_verify_nexthop(route);
-        nbrec_logical_router_static_route_set_ip_prefix(route, prefix);
-        nbrec_logical_router_static_route_set_nexthop(route, next_hop);
-        if (ctx->argc == 5) {
-            nbrec_logical_router_static_route_set_output_port(route,
-                                                              ctx->argv[4]);
-        }
-        if (policy) {
-             nbrec_logical_router_static_route_set_policy(route, policy);
-        }
-        free(rt_prefix);
-        free(next_hop);
-        free(prefix);
-        return;
     }
 
     struct nbrec_logical_router_static_route *route;
@@ -3854,6 +3857,20 @@  nbctl_lr_route_del(struct ctl_context *ctx)
         }
     }
 
+    char *nexthop = NULL;
+    if (ctx->argc >= 4) {
+        nexthop = normalize_prefix_str(ctx->argv[3]);
+        if (!nexthop) {
+            ctl_error(ctx, "bad nexthop argument: %s", ctx->argv[3]);
+            return;
+        }
+    }
+
+    char *output_port = NULL;
+    if (ctx->argc == 5) {
+        output_port = ctx->argv[4];
+    }
+
     struct nbrec_logical_router_static_route **new_routes
         = xmemdup(lr->static_routes,
                   sizeof *new_routes * lr->n_static_routes);
@@ -3882,27 +3899,59 @@  nbctl_lr_route_del(struct ctl_context *ctx)
                 continue;
             }
 
-            if (strcmp(prefix, rt_prefix)) {
+            int ret = strcmp(prefix, rt_prefix);
+            free(rt_prefix);
+            if (ret) {
+                new_routes[n_new++] = lr->static_routes[i];
+                continue;
+            }
+        }
+
+        /* Compare nexthop, if specified. */
+        if (nexthop) {
+            char *rt_nexthop =
+                normalize_prefix_str(lr->static_routes[i]->nexthop);
+            if (!rt_nexthop) {
+                /* Ignore existing nexthop we couldn't parse. */
+                new_routes[n_new++] = lr->static_routes[i];
+                continue;
+            }
+            int ret = strcmp(nexthop, rt_nexthop);
+            free(rt_nexthop);
+            if (ret) {
+                new_routes[n_new++] = lr->static_routes[i];
+                continue;
+            }
+        }
+
+        /* Compare output_port, if specified. */
+        if (output_port) {
+            char *rt_output_port = lr->static_routes[i]->output_port;
+            if (!rt_output_port || strcmp(output_port, rt_output_port)) {
                 new_routes[n_new++] = lr->static_routes[i];
             }
-            free(rt_prefix);
         }
     }
 
     if (n_new < lr->n_static_routes) {
         nbrec_logical_router_verify_static_routes(lr);
         nbrec_logical_router_set_static_routes(lr, new_routes, n_new);
-        free(new_routes);
-        free(prefix);
-        return;
+        goto out;
     }
 
     if (!shash_find(&ctx->options, "--if-exists")) {
-        ctl_error(ctx, "no matching prefix: %s (policy: %s)",
-                  prefix, policy ? policy : "any");
+        ctl_error(ctx, "no matching route: policy '%s', prefix '%s', nexthop "
+                  "'%s', output_port '%s'.",
+                  policy ? policy : "any",
+                  prefix,
+                  nexthop ? nexthop : "any",
+                  output_port ? output_port : "any");
     }
+
+out:
     free(new_routes);
     free(prefix);
+    free(nexthop);
 }
 
 static void
@@ -4773,6 +4822,26 @@  nbctl_lrp_get_redirect_type(struct ctl_context *ctx)
 }
 
 
+static int
+route_cmp_details(const struct nbrec_logical_router_static_route *r1,
+                  const struct nbrec_logical_router_static_route *r2)
+{
+    int ret = strcmp(r1->nexthop, r2->nexthop);
+    if (ret) {
+        return ret;
+    }
+    if (r1->output_port && r2->output_port) {
+        ret = strcmp(r1->output_port, r2->output_port);
+        if (ret) {
+            return ret;
+        }
+        return 0;
+    }
+    if (!r1->output_port && !r2->output_port) {
+        return 0;
+    }
+    return r1->output_port ? 1 : -1;
+}
 struct ipv4_route {
     int priority;
     ovs_be32 addr;
@@ -4787,11 +4856,11 @@  ipv4_route_cmp(const void *route1_, const void *route2_)
 
     if (route1p->priority != route2p->priority) {
         return route1p->priority > route2p->priority ? -1 : 1;
-    } else if (route1p->addr != route2p->addr) {
+    }
+    if (route1p->addr != route2p->addr) {
         return ntohl(route1p->addr) < ntohl(route2p->addr) ? -1 : 1;
-    } else {
-        return 0;
     }
+    return route_cmp_details(route1p->route, route2p->route);
 }
 
 struct ipv6_route {
@@ -4809,7 +4878,11 @@  ipv6_route_cmp(const void *route1_, const void *route2_)
     if (route1p->priority != route2p->priority) {
         return route1p->priority > route2p->priority ? -1 : 1;
     }
-    return memcmp(&route1p->addr, &route2p->addr, sizeof(route1p->addr));
+    int ret = memcmp(&route1p->addr, &route2p->addr, sizeof(route1p->addr));
+    if (ret) {
+        return ret;
+    }
+    return route_cmp_details(route1p->route, route2p->route);
 }
 
 static void
@@ -5786,9 +5859,9 @@  static const struct ctl_command_syntax nbctl_commands[] = {
 
     /* logical router route commands. */
     { "lr-route-add", 3, 4, "ROUTER PREFIX NEXTHOP [PORT]", NULL,
-      nbctl_lr_route_add, NULL, "--may-exist,--policy=", RW },
-    { "lr-route-del", 1, 2, "ROUTER [PREFIX]", NULL, nbctl_lr_route_del,
-      NULL, "--if-exists,--policy=", RW },
+      nbctl_lr_route_add, NULL, "--may-exist,--ecmp,--policy=", RW },
+    { "lr-route-del", 1, 4, "ROUTER [PREFIX [NEXTHOP [PORT]]]", NULL,
+      nbctl_lr_route_del, NULL, "--if-exists,--policy=", RW },
     { "lr-route-list", 1, 1, "ROUTER", NULL, nbctl_lr_route_list, NULL,
       "", RO },