diff mbox series

[ovs-dev,v3,ovn,5/5] ovn-northd: Minimize number of ARP/NS responder flows for DNAT.

Message ID 20200702145337.10223.97947.stgit@dceara.remote.csb
State Superseded
Headers show
Series Reduce number of flows in IN_IP_INPUT table for DNAT. | expand

Commit Message

Dumitru Ceara July 2, 2020, 2:53 p.m. UTC
Most ARP/NS responder flows can be configured per datapath instead of per
router port.

The only exception is with distributed gateway router ports which need
special treatment. This patch changes the ARP/NS responder behavior and adds:
- Priority 92 flows to reply to ARP requests on distributed gateway router
  ports, on the chassis where the DNAT entry is bound.
- Priority 91 flows to drop ARP requests on distributed gateway router ports,
  on chassis where the DNAT entry is not bound.
- Priority 90 flows to reply to ARP requests on all other router ports. This
  last type of flows is programmed exactly once per logical router limiting
  the total number of required logical flows.

Suggested-by: Han Zhou <hzhou@ovn.org>
Reported-by: Girish Moodalbail <gmoodalbail@gmail.com>
Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2020-June/050186.html
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 northd/ovn-northd.8.xml |   16 +++-
 northd/ovn-northd.c     |  203 ++++++++++++++++++++++++++++++++---------------
 tests/ovn-northd.at     |   65 +++++++++------
 tests/ovn.at            |    8 +-
 4 files changed, 190 insertions(+), 102 deletions(-)

Comments

Han Zhou July 3, 2020, 5:46 a.m. UTC | #1
Thanks Dumitru. I have 2 comments below.

On Thu, Jul 2, 2020 at 7:53 AM Dumitru Ceara <dceara@redhat.com> wrote:
>
> Most ARP/NS responder flows can be configured per datapath instead of per
> router port.
>
> The only exception is with distributed gateway router ports which need
> special treatment. This patch changes the ARP/NS responder behavior and
adds:
> - Priority 92 flows to reply to ARP requests on distributed gateway router
>   ports, on the chassis where the DNAT entry is bound.
> - Priority 91 flows to drop ARP requests on distributed gateway router
ports,
>   on chassis where the DNAT entry is not bound.
> - Priority 90 flows to reply to ARP requests on all other router ports.
This
>   last type of flows is programmed exactly once per logical router
limiting
>   the total number of required logical flows.
>
> Suggested-by: Han Zhou <hzhou@ovn.org>
> Reported-by: Girish Moodalbail <gmoodalbail@gmail.com>
> Reported-at:
https://mail.openvswitch.org/pipermail/ovs-discuss/2020-June/050186.html
> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
> ---
>  northd/ovn-northd.8.xml |   16 +++-
>  northd/ovn-northd.c     |  203
++++++++++++++++++++++++++++++++---------------
>  tests/ovn-northd.at     |   65 +++++++++------
>  tests/ovn.at            |    8 +-
>  4 files changed, 190 insertions(+), 102 deletions(-)
>
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index 84224ff..11607c0 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -1857,9 +1857,8 @@ nd_na_router {
>            IPv4: For a configured DNAT IP address or a load balancer
>            IPv4 VIP <var>A</var>, for each router port <var>P</var> with
>            Ethernet address <var>E</var>, a priority-90 flow matches
> -          <code>inport == <var>P</var> &amp;&amp; arp.op == 1 &amp;&amp;
> -          arp.tpa == <var>A</var></code> (ARP request)
> -          with the following actions:
> +          <code>arp.op == 1 &amp;&amp; arp.tpa == <var>A</var></code>
> +          (ARP request) with the following actions:
>          </p>
>
>          <pre>
> @@ -1876,6 +1875,11 @@ output;
>          </pre>
>
>          <p>
> +          IPv4: For a configured load balancer IPv4 VIP, a similar flow
is
> +          added with the additional match <code>inport ==
<var>P</var></code>.
> +        </p>
> +
> +        <p>
>            If the router port <var>P</var> is a distributed gateway router
>            port, then the <code>is_chassis_resident(<var>P</var>)</code>
is
>            also added in the match condition for the load balancer IPv4
> @@ -1922,9 +1926,11 @@ nd_na {
>          <ul>
>            <li>
>              If the corresponding NAT rule cannot be handled in a
> -            distributed manner, then this flow is only programmed on
> +            distributed manner, then a priority-92 flow is programmed on
>              the gateway port instance on the
> -            <code>redirect-chassis</code>.  This behavior avoids
> +            <code>redirect-chassis</code>.  A priority-91 drop flow is
> +            programmed on the other chassis when ARP requests/NS packets
> +            are received on the gateway port. This behavior avoids
>              generation of multiple ARP responses from different chassis,
>              and allows upstream MAC learning to point to the
>              <code>redirect-chassis</code>.
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index a1ba56f..10fc8cf 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -8048,7 +8048,7 @@ lrouter_nat_is_stateless(const struct nbrec_nat
*nat)
>  static void
>  build_lrouter_arp_flow(struct ovn_datapath *od, struct ovn_port *op,
>                         const char *ip_address, const char *eth_addr,
> -                       struct ds *extra_match, uint16_t priority,
> +                       struct ds *extra_match, bool drop, uint16_t
priority,
>                         struct hmap *lflows, const struct ovsdb_idl_row
*hint)
>  {
>      struct ds match = DS_EMPTY_INITIALIZER;
> @@ -8063,20 +8063,24 @@ build_lrouter_arp_flow(struct ovn_datapath *od,
struct ovn_port *op,
>      if (extra_match && ds_last(extra_match) != EOF) {
>          ds_put_format(&match, " && %s", ds_cstr(extra_match));
>      }
> -    ds_put_format(&actions,
> -                  "eth.dst = eth.src; "
> -                  "eth.src = %s; "
> -                  "arp.op = 2; /* ARP reply */ "
> -                  "arp.tha = arp.sha; "
> -                  "arp.sha = %s; "
> -                  "arp.tpa = arp.spa; "
> -                  "arp.spa = %s; "
> -                  "outport = inport; "
> -                  "flags.loopback = 1; "
> -                  "output;",
> -                  eth_addr,
> -                  eth_addr,
> -                  ip_address);
> +    if (drop) {
> +        ds_put_format(&actions, "drop;");
> +    } else {
> +        ds_put_format(&actions,
> +                      "eth.dst = eth.src; "
> +                      "eth.src = %s; "
> +                      "arp.op = 2; /* ARP reply */ "
> +                      "arp.tha = arp.sha; "
> +                      "arp.sha = %s; "
> +                      "arp.tpa = arp.spa; "
> +                      "arp.spa = %s; "
> +                      "outport = inport; "
> +                      "flags.loopback = 1; "
> +                      "output;",
> +                      eth_addr,
> +                      eth_addr,
> +                      ip_address);
> +    }
>
>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT, priority,
>                              ds_cstr(&match), ds_cstr(&actions), hint);
> @@ -8095,7 +8099,7 @@ static void
>  build_lrouter_nd_flow(struct ovn_datapath *od, struct ovn_port *op,
>                        const char *action, const char *ip_address,
>                        const char *sn_ip_address, const char *eth_addr,
> -                      struct ds *extra_match, uint16_t priority,
> +                      struct ds *extra_match, bool drop, uint16_t
priority,
>                        struct hmap *lflows,
>                        const struct ovsdb_idl_row *hint)
>  {
> @@ -8117,21 +8121,25 @@ build_lrouter_nd_flow(struct ovn_datapath *od,
struct ovn_port *op,
>          ds_put_format(&match, " && %s", ds_cstr(extra_match));
>      }
>
> -    ds_put_format(&actions,
> -                  "%s { "
> -                    "eth.src = %s; "
> -                    "ip6.src = %s; "
> -                    "nd.target = %s; "
> -                    "nd.tll = %s; "
> -                    "outport = inport; "
> -                    "flags.loopback = 1; "
> -                    "output; "
> -                  "};",
> -                  action,
> -                  eth_addr,
> -                  ip_address,
> -                  ip_address,
> -                  eth_addr);
> +    if (drop) {
> +        ds_put_format(&actions, "drop;");
> +    } else {
> +        ds_put_format(&actions,
> +                      "%s { "
> +                        "eth.src = %s; "
> +                        "ip6.src = %s; "
> +                        "nd.target = %s; "
> +                        "nd.tll = %s; "
> +                        "outport = inport; "
> +                        "flags.loopback = 1; "
> +                        "output; "
> +                      "};",
> +                      action,
> +                      eth_addr,
> +                      ip_address,
> +                      ip_address,
> +                      eth_addr);
> +    }
>
>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT, priority,
>                              ds_cstr(&match), ds_cstr(&actions), hint);
> @@ -8321,7 +8329,41 @@ build_lrouter_flows(struct hmap *datapaths, struct
hmap *ports,
>                        "ip4.dst == 0.0.0.0/8",
>                        "drop;");
>
> -        /* Priority-90 flows reply to ARP requests and ND packets. */
> +        /* Priority-90-92 flows handle ARP requests and ND packets. Most
are
> +         * per logical port but DNAT addresses can be handled per
datapath
> +         * for non gateway router ports.
> +         */
> +        for (int i = 0; i < od->nbr->n_nat; i++) {
> +            struct ovn_nat *nat_entry = &od->nat_entries[i];
> +            const struct nbrec_nat *nat = nat_entry->nb;
> +
> +            /* Skip entries we failed to parse. */
> +            if (!nat_entry_is_valid(nat_entry)) {
> +                continue;
> +            }
> +
> +            if (!strcmp(nat->type, "snat")) {
> +                continue;
> +            }
> +
> +            /* Priority 91 and 92 flows are added for each gateway router
> +             * port to handle the special cases. In case we get the
packet
> +             * on a regular port, just reply with the port's ETH address.
> +             */
> +            struct lport_addresses *ext_addrs = &nat_entry->ext_addrs;
> +            if (nat_entry_is_v6(nat_entry)) {
> +                build_lrouter_nd_flow(od, NULL, "nd_na",
> +                                      ext_addrs->ipv6_addrs[0].addr_s,
> +                                      ext_addrs->ipv6_addrs[0].sn_addr_s,
> +                                      REG_INPORT_ETH_ADDR, NULL, false,
90,
> +                                      lflows, &nat->header_);
> +            } else {
> +                build_lrouter_arp_flow(od, NULL,
> +                                       ext_addrs->ipv4_addrs[0].addr_s,
> +                                       REG_INPORT_ETH_ADDR, NULL, false,
90,
> +                                       lflows, &nat->header_);
> +            }
> +        }
>
>          /* Drop ARP packets (priority 85). ARP request packets for
router's own
>           * IPs are handled with priority-90 flows.
> @@ -8471,8 +8513,8 @@ build_lrouter_flows(struct hmap *datapaths, struct
hmap *ports,
>
>              build_lrouter_arp_flow(op->od, op,
>                                     op->lrp_networks.ipv4_addrs[i].addr_s,
> -                                   REG_INPORT_ETH_ADDR, &match, 90,
lflows,
> -                                   &op->nbrp->header_);
> +                                   REG_INPORT_ETH_ADDR, &match, false,
90,
> +                                   lflows, &op->nbrp->header_);
>          }
>
>          /* A set to hold all load-balancer vips that need ARP responses.
*/
> @@ -8490,7 +8532,7 @@ build_lrouter_flows(struct hmap *datapaths, struct
hmap *ports,
>
>              build_lrouter_arp_flow(op->od, op,
>                                     ip_address, REG_INPORT_ETH_ADDR,
> -                                   &match, 90, lflows, NULL);
> +                                   &match, false, 90, lflows, NULL);
>          }
>
>          SSET_FOR_EACH (ip_address, &all_ips_v6) {
> @@ -8502,7 +8544,7 @@ build_lrouter_flows(struct hmap *datapaths, struct
hmap *ports,
>
>              build_lrouter_nd_flow(op->od, op, "nd_na",
>                                    ip_address, NULL, REG_INPORT_ETH_ADDR,
> -                                  &match, 90, lflows, NULL);
> +                                  &match, false, 90, lflows, NULL);
>          }
>
>          sset_destroy(&all_ips_v4);
> @@ -8555,53 +8597,84 @@ build_lrouter_flows(struct hmap *datapaths,
struct hmap *ports,
>              /* Mac address to use when replying to ARP/NS. */
>              const char *mac_s = REG_INPORT_ETH_ADDR;
>
> +            /* ARP/NS packets are taken care of per router. The only
exception
> +             * is on the l3dgw_port where we might need to use a
different
> +             * ETH address.
> +             */
> +            if (op != op->od->l3dgw_port) {

This check should instead be put before the for loop, since this condition
is the same for the whole loop.

> +                continue;
> +            }
> +
>              /* ARP / ND handling for external IP addresses.
>               *
>               * DNAT IP addresses are external IP addresses that need ARP
>               * handling. */
>              ds_clear(&match);
>
> -            if (op->od->l3dgw_port && op == op->od->l3dgw_port) {
> -                struct eth_addr mac;
> -                if (nat->external_mac &&
> -                    eth_addr_from_string(nat->external_mac, &mac)
> -                    && nat->logical_port) {
> -                    /* distributed NAT case, use nat->external_mac */
> -                    mac_s = nat->external_mac;
> -                    /* Traffic with eth.src = nat->external_mac should
only be
> -                     * sent from the chassis where nat->logical_port is
> -                     * resident, so that upstream MAC learning points to
the
> -                     * correct chassis.  Also need to avoid generation of
> -                     * multiple ARP responses from different chassis. */
> -                    ds_put_format(&match, "is_chassis_resident(\"%s\")",
> -                                  nat->logical_port);
> -                } else {
> -                    mac_s = REG_INPORT_ETH_ADDR;
> -                    /* Traffic with eth.src =
l3dgw_port->lrp_networks.ea_s
> -                     * should only be sent from the "redirect-chassis",
so that
> -                     * upstream MAC learning points to the
"redirect-chassis".
> -                     * Also need to avoid generation of multiple ARP
responses
> -                     * from different chassis. */
> -                    if (op->od->l3redirect_port) {
> -                        ds_put_format(&match, "is_chassis_resident(%s)",
> -                                      op->od->l3redirect_port->json_key);
> -                    }
> +            struct ds match_cr_port = DS_EMPTY_INITIALIZER;
> +            struct ds match_non_cr_port = DS_EMPTY_INITIALIZER;
> +
> +            struct eth_addr mac;
> +            if (nat->external_mac &&
> +                eth_addr_from_string(nat->external_mac, &mac)
> +                && nat->logical_port) {
> +                /* distributed NAT case, use nat->external_mac */
> +                mac_s = nat->external_mac;
> +                /* Traffic with eth.src = nat->external_mac should only
be
> +                 * sent from the chassis where nat->logical_port is
> +                 * resident, so that upstream MAC learning points to the
> +                 * correct chassis.  Also need to avoid generation of
> +                 * multiple ARP responses from different chassis. */
> +                ds_put_format(&match_cr_port,
> +                              "is_chassis_resident(\"%s\")",
> +                              nat->logical_port);
> +                ds_put_format(&match_non_cr_port,
> +                              "!is_chassis_resident(\"%s\")",
> +                              nat->logical_port);
> +            } else {
> +                mac_s = REG_INPORT_ETH_ADDR;
> +                /* Traffic with eth.src = l3dgw_port->lrp_networks.ea_s
> +                 * should only be sent from the "redirect-chassis", so
that
> +                 * upstream MAC learning points to the
"redirect-chassis".
> +                 * Also need to avoid generation of multiple ARP
responses
> +                 * from different chassis. */
> +                if (op->od->l3redirect_port) {
> +                    ds_put_format(&match_cr_port,
> +                                  "is_chassis_resident(\"%s\")",
> +                                  op->od->l3redirect_port->json_key);
> +                    ds_put_format(&match_non_cr_port,
> +                                  "!is_chassis_resident(\"%s\")",
> +                                  op->od->l3redirect_port->json_key);
>                  }
>              }
>
> +            /* Respond to ARP/NS requests on the chassis that binds the
gw
> +             * port. Drop the ARP/NS requests on other chassis.
> +             */
>              struct lport_addresses *ext_addrs = &nat_entry->ext_addrs;
>              if (nat_entry_is_v6(nat_entry)) {
>                  build_lrouter_nd_flow(op->od, op, "nd_na",
>                                        ext_addrs->ipv6_addrs[0].addr_s,
>                                        ext_addrs->ipv6_addrs[0].sn_addr_s,
> -                                      mac_s, &match, 90,
> +                                      mac_s, &match_cr_port, false, 92,
> +                                      lflows, &nat->header_);
> +                build_lrouter_nd_flow(op->od, op, "nd_na",
> +                                      ext_addrs->ipv6_addrs[0].addr_s,
> +                                      ext_addrs->ipv6_addrs[0].sn_addr_s,
> +                                      mac_s, &match_non_cr_port, true,
91,
>                                        lflows, &nat->header_);

I think "match_non_cr_port" is not needed. If the priority-92 flow (with
"match_cr_port") doesn't match, then it can just let the priority-91 flow
to drop the packet. Alternatively, if "match_non_cr_port" is added, then
the two lflows can use the same priority.

>              } else {
>                  build_lrouter_arp_flow(op->od, op,
>                                         ext_addrs->ipv4_addrs[0].addr_s,
> -                                       mac_s, &match, 90,
> +                                       mac_s, &match_cr_port, false, 92,
> +                                       lflows, &nat->header_);
> +                build_lrouter_arp_flow(op->od, op,
> +                                       ext_addrs->ipv4_addrs[0].addr_s,
> +                                       mac_s, &match_non_cr_port, true,
91,
>                                         lflows, &nat->header_);
>              }
> +            ds_destroy(&match_cr_port);
> +            ds_destroy(&match_non_cr_port);
>          }
>
>          if (!smap_get(&op->od->nbr->options, "chassis")
> @@ -8787,8 +8860,8 @@ build_lrouter_flows(struct hmap *datapaths, struct
hmap *ports,
>              build_lrouter_nd_flow(op->od, op, "nd_na_router",
>                                    op->lrp_networks.ipv6_addrs[i].addr_s,
>
 op->lrp_networks.ipv6_addrs[i].sn_addr_s,
> -                                  REG_INPORT_ETH_ADDR, &match, 90,
lflows,
> -                                  &op->nbrp->header_);
> +                                  REG_INPORT_ETH_ADDR, &match, false, 90,
> +                                  lflows, &op->nbrp->header_);
>          }
>
>          /* UDP/TCP port unreachable */
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index a3fb7ef..4a13456 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -1594,33 +1594,24 @@ action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
>  # Ingress router port ETH address is used for ARP reply/NA in
lr_in_ip_input.
>  AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" |
grep "arp\|nd" | sort], [0], [dnl
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
arp.spa == 42.42.42.0/24), dnl
> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1; output;)
> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.2), dnl
> +match=(arp.op == 1 && arp.tpa == 43.43.43.2), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.3), dnl
> +match=(arp.op == 1 && arp.tpa == 43.43.43.3), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.4), dnl
> +match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
arp.spa == 42.42.42.0/24), dnl
> +action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1; output;)
> +  table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1,
ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
fe80::200:ff:fe00:1; nd.target = fe80::200:ff:fe00:1; nd.tll =
xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 &&
arp.spa == 43.43.43.0/24), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.1; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2),
dnl
> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1; output;)
> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3),
dnl
> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1; output;)
> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4),
dnl
> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1; output;)
> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100,
ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100), dnl
>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll =
xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>  ])
> @@ -1654,37 +1645,55 @@ action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
>
>  # Ingress router port is used for ARP reply/NA in lr_in_ip_input.
>  # xxreg0[0..47] is used unless external_mac is set.
> +# Priority 90 flows (per router).
>  AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" |
grep "arp\|nd" | sort], [0], [dnl
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
arp.spa == 42.42.42.0/24), dnl
> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1; output;)
> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.2), dnl
> +match=(arp.op == 1 && arp.tpa == 43.43.43.2), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.3), dnl
> +match=(arp.op == 1 && arp.tpa == 43.43.43.3), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.4), dnl
> +match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
arp.spa == 42.42.42.0/24), dnl
> +action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1; output;)
> +  table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1,
ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
fe80::200:ff:fe00:1; nd.target = fe80::200:ff:fe00:1; nd.tll =
xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 &&
arp.spa == 43.43.43.0/24), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.1; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2 &&
is_chassis_resident("cr-lrp-public")), dnl
> +match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100,
ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 &&
is_chassis_resident("cr-lrp-public")), dnl
> +action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll =
xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
> +])
> +
> +# Priority 91 drop flows (per distributed gw port), if port is not
resident.
> +AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=91" |
grep "arp\|nd" | sort], [0], [dnl
> +  table=3 (lr_in_ip_input     ), priority=91   , dnl
> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2 &&
!is_chassis_resident(""cr-lrp-public"")), dnl
> +action=(drop;)
> +  table=3 (lr_in_ip_input     ), priority=91   , dnl
> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3 &&
!is_chassis_resident(""cr-lrp-public"")), dnl
> +action=(drop;)
> +  table=3 (lr_in_ip_input     ), priority=91   , dnl
> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4 &&
!is_chassis_resident("ls-vm")), dnl
> +action=(drop;)
> +])
> +
> +# Priority 92 ARP/NS responders (per distributed gw port), if port is
resident.
> +AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=92" |
grep "arp\|nd" | sort], [0], [dnl
> +  table=3 (lr_in_ip_input     ), priority=92   , dnl
> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2 &&
is_chassis_resident(""cr-lrp-public"")), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1; output;)
> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3 &&
is_chassis_resident("cr-lrp-public")), dnl
> +  table=3 (lr_in_ip_input     ), priority=92   , dnl
> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3 &&
is_chassis_resident(""cr-lrp-public"")), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP
reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa;
arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1; output;)
> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> +  table=3 (lr_in_ip_input     ), priority=92   , dnl
>  match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4 &&
is_chassis_resident("ls-vm")), dnl
>  action=(eth.dst = eth.src; eth.src = 00:00:00:00:00:02; arp.op = 2; /*
ARP reply */ arp.tha = arp.sha; arp.sha = 00:00:00:00:00:02; arp.tpa =
arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1;
output;)
> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100,
ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 &&
is_chassis_resident("cr-lrp-public")), dnl
> -action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll =
xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>  ])
>
>  # xreg0[0..47] isn't used anywhere else.
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 1a1cfbf..9522b55 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -19336,7 +19336,7 @@ OVS_WAIT_UNTIL([
>  send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex 10 0
0 121)
>
>  # Verify that the ARP request is replied to from hv1 and not hv2.
>
-match_arp_req="priority=90.*${match_r1_metadata}.*arp_tpa=10.0.0.121,arp_op=1"
>
+match_arp_req="priority=92.*${match_r1_metadata}.*arp_tpa=10.0.0.121,arp_op=1"
>
>  as hv1
>  OVS_WAIT_UNTIL([
> @@ -19356,7 +19356,7 @@ OVS_WAIT_UNTIL([
>  send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex 10 0
0 122)
>
>  # Verify that the ARP request is replied to from hv2 and not hv1.
>
-match_arp_req="priority=90.*${match_r1_metadata}.*arp_tpa=10.0.0.122,arp_op=1"
>
+match_arp_req="priority=92.*${match_r1_metadata}.*arp_tpa=10.0.0.122,arp_op=1"
>
>  as hv2
>  OVS_WAIT_UNTIL([
> @@ -19400,7 +19400,7 @@ dst_ipv6=00100000000000000000000000000121
>  send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 72dd
>
>  # Verify that the ND_NS is replied to from hv1 and not hv2.
>
-match_nd_ns="priority=90.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::121"
>
+match_nd_ns="priority=92.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::121"
>
>  as hv1
>  OVS_WAIT_UNTIL([
> @@ -19422,7 +19422,7 @@ dst_ipv6=00100000000000000000000000000122
>  send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 72db
>
>  # Verify that the ND_NS is replied to from hv2 and not hv1.
>
-match_nd_ns="priority=90.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::122"
>
+match_nd_ns="priority=92.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::122"
>
>  as hv2
>  OVS_WAIT_UNTIL([
>
Dumitru Ceara July 3, 2020, 8:05 a.m. UTC | #2
On 7/3/20 7:46 AM, Han Zhou wrote:
> Thanks Dumitru. I have 2 comments below.
> 
> On Thu, Jul 2, 2020 at 7:53 AM Dumitru Ceara <dceara@redhat.com
> <mailto:dceara@redhat.com>> wrote:
>>
>> Most ARP/NS responder flows can be configured per datapath instead of per
>> router port.
>>
>> The only exception is with distributed gateway router ports which need
>> special treatment. This patch changes the ARP/NS responder behavior
> and adds:
>> - Priority 92 flows to reply to ARP requests on distributed gateway router
>>   ports, on the chassis where the DNAT entry is bound.
>> - Priority 91 flows to drop ARP requests on distributed gateway router
> ports,
>>   on chassis where the DNAT entry is not bound.
>> - Priority 90 flows to reply to ARP requests on all other router
> ports. This
>>   last type of flows is programmed exactly once per logical router
> limiting
>>   the total number of required logical flows.
>>
>> Suggested-by: Han Zhou <hzhou@ovn.org <mailto:hzhou@ovn.org>>
>> Reported-by: Girish Moodalbail <gmoodalbail@gmail.com
> <mailto:gmoodalbail@gmail.com>>
>> Reported-at:
> https://mail.openvswitch.org/pipermail/ovs-discuss/2020-June/050186.html
>> Signed-off-by: Dumitru Ceara <dceara@redhat.com
> <mailto:dceara@redhat.com>>
>> ---
>>  northd/ovn-northd.8.xml |   16 +++-
>>  northd/ovn-northd.c     |  203
> ++++++++++++++++++++++++++++++++---------------
>>  tests/ovn-northd.at <http://ovn-northd.at>     |   65 +++++++++------
>>  tests/ovn.at <http://ovn.at>            |    8 +-
>>  4 files changed, 190 insertions(+), 102 deletions(-)
>>
>> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
>> index 84224ff..11607c0 100644
>> --- a/northd/ovn-northd.8.xml
>> +++ b/northd/ovn-northd.8.xml
>> @@ -1857,9 +1857,8 @@ nd_na_router {
>>            IPv4: For a configured DNAT IP address or a load balancer
>>            IPv4 VIP <var>A</var>, for each router port <var>P</var> with
>>            Ethernet address <var>E</var>, a priority-90 flow matches
>> -          <code>inport == <var>P</var> &amp;&amp; arp.op == 1 &amp;&amp;
>> -          arp.tpa == <var>A</var></code> (ARP request)
>> -          with the following actions:
>> +          <code>arp.op == 1 &amp;&amp; arp.tpa == <var>A</var></code>
>> +          (ARP request) with the following actions:
>>          </p>
>>
>>          <pre>
>> @@ -1876,6 +1875,11 @@ output;
>>          </pre>
>>
>>          <p>
>> +          IPv4: For a configured load balancer IPv4 VIP, a similar
> flow is
>> +          added with the additional match <code>inport ==
> <var>P</var></code>.
>> +        </p>
>> +
>> +        <p>
>>            If the router port <var>P</var> is a distributed gateway router
>>            port, then the
> <code>is_chassis_resident(<var>P</var>)</code> is
>>            also added in the match condition for the load balancer IPv4
>> @@ -1922,9 +1926,11 @@ nd_na {
>>          <ul>
>>            <li>
>>              If the corresponding NAT rule cannot be handled in a
>> -            distributed manner, then this flow is only programmed on
>> +            distributed manner, then a priority-92 flow is programmed on
>>              the gateway port instance on the
>> -            <code>redirect-chassis</code>.  This behavior avoids
>> +            <code>redirect-chassis</code>.  A priority-91 drop flow is
>> +            programmed on the other chassis when ARP requests/NS packets
>> +            are received on the gateway port. This behavior avoids
>>              generation of multiple ARP responses from different chassis,
>>              and allows upstream MAC learning to point to the
>>              <code>redirect-chassis</code>.
>> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
>> index a1ba56f..10fc8cf 100644
>> --- a/northd/ovn-northd.c
>> +++ b/northd/ovn-northd.c
>> @@ -8048,7 +8048,7 @@ lrouter_nat_is_stateless(const struct nbrec_nat
> *nat)
>>  static void
>>  build_lrouter_arp_flow(struct ovn_datapath *od, struct ovn_port *op,
>>                         const char *ip_address, const char *eth_addr,
>> -                       struct ds *extra_match, uint16_t priority,
>> +                       struct ds *extra_match, bool drop, uint16_t
> priority,
>>                         struct hmap *lflows, const struct
> ovsdb_idl_row *hint)
>>  {
>>      struct ds match = DS_EMPTY_INITIALIZER;
>> @@ -8063,20 +8063,24 @@ build_lrouter_arp_flow(struct ovn_datapath
> *od, struct ovn_port *op,
>>      if (extra_match && ds_last(extra_match) != EOF) {
>>          ds_put_format(&match, " && %s", ds_cstr(extra_match));
>>      }
>> -    ds_put_format(&actions,
>> -                  "eth.dst = eth.src; "
>> -                  "eth.src = %s; "
>> -                  "arp.op = 2; /* ARP reply */ "
>> -                  "arp.tha = arp.sha; "
>> -                  "arp.sha = %s; "
>> -                  "arp.tpa = arp.spa; "
>> -                  "arp.spa = %s; "
>> -                  "outport = inport; "
>> -                  "flags.loopback = 1; "
>> -                  "output;",
>> -                  eth_addr,
>> -                  eth_addr,
>> -                  ip_address);
>> +    if (drop) {
>> +        ds_put_format(&actions, "drop;");
>> +    } else {
>> +        ds_put_format(&actions,
>> +                      "eth.dst = eth.src; "
>> +                      "eth.src = %s; "
>> +                      "arp.op = 2; /* ARP reply */ "
>> +                      "arp.tha = arp.sha; "
>> +                      "arp.sha = %s; "
>> +                      "arp.tpa = arp.spa; "
>> +                      "arp.spa = %s; "
>> +                      "outport = inport; "
>> +                      "flags.loopback = 1; "
>> +                      "output;",
>> +                      eth_addr,
>> +                      eth_addr,
>> +                      ip_address);
>> +    }
>>
>>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT, priority,
>>                              ds_cstr(&match), ds_cstr(&actions), hint);
>> @@ -8095,7 +8099,7 @@ static void
>>  build_lrouter_nd_flow(struct ovn_datapath *od, struct ovn_port *op,
>>                        const char *action, const char *ip_address,
>>                        const char *sn_ip_address, const char *eth_addr,
>> -                      struct ds *extra_match, uint16_t priority,
>> +                      struct ds *extra_match, bool drop, uint16_t
> priority,
>>                        struct hmap *lflows,
>>                        const struct ovsdb_idl_row *hint)
>>  {
>> @@ -8117,21 +8121,25 @@ build_lrouter_nd_flow(struct ovn_datapath *od,
> struct ovn_port *op,
>>          ds_put_format(&match, " && %s", ds_cstr(extra_match));
>>      }
>>
>> -    ds_put_format(&actions,
>> -                  "%s { "
>> -                    "eth.src = %s; "
>> -                    "ip6.src = %s; "
>> -                    "nd.target = %s; "
>> -                    "nd.tll = %s; "
>> -                    "outport = inport; "
>> -                    "flags.loopback = 1; "
>> -                    "output; "
>> -                  "};",
>> -                  action,
>> -                  eth_addr,
>> -                  ip_address,
>> -                  ip_address,
>> -                  eth_addr);
>> +    if (drop) {
>> +        ds_put_format(&actions, "drop;");
>> +    } else {
>> +        ds_put_format(&actions,
>> +                      "%s { "
>> +                        "eth.src = %s; "
>> +                        "ip6.src = %s; "
>> +                        "nd.target = %s; "
>> +                        "nd.tll = %s; "
>> +                        "outport = inport; "
>> +                        "flags.loopback = 1; "
>> +                        "output; "
>> +                      "};",
>> +                      action,
>> +                      eth_addr,
>> +                      ip_address,
>> +                      ip_address,
>> +                      eth_addr);
>> +    }
>>
>>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT, priority,
>>                              ds_cstr(&match), ds_cstr(&actions), hint);
>> @@ -8321,7 +8329,41 @@ build_lrouter_flows(struct hmap *datapaths,
> struct hmap *ports,
>>                        "ip4.dst == 0.0.0.0/8 <http://0.0.0.0/8>",
>>                        "drop;");
>>
>> -        /* Priority-90 flows reply to ARP requests and ND packets. */
>> +        /* Priority-90-92 flows handle ARP requests and ND packets.
> Most are
>> +         * per logical port but DNAT addresses can be handled per
> datapath
>> +         * for non gateway router ports.
>> +         */
>> +        for (int i = 0; i < od->nbr->n_nat; i++) {
>> +            struct ovn_nat *nat_entry = &od->nat_entries[i];
>> +            const struct nbrec_nat *nat = nat_entry->nb;
>> +
>> +            /* Skip entries we failed to parse. */
>> +            if (!nat_entry_is_valid(nat_entry)) {
>> +                continue;
>> +            }
>> +
>> +            if (!strcmp(nat->type, "snat")) {
>> +                continue;
>> +            }
>> +
>> +            /* Priority 91 and 92 flows are added for each gateway router
>> +             * port to handle the special cases. In case we get the
> packet
>> +             * on a regular port, just reply with the port's ETH address.
>> +             */
>> +            struct lport_addresses *ext_addrs = &nat_entry->ext_addrs;
>> +            if (nat_entry_is_v6(nat_entry)) {
>> +                build_lrouter_nd_flow(od, NULL, "nd_na",
>> +                                      ext_addrs->ipv6_addrs[0].addr_s,
>> +                                      ext_addrs->ipv6_addrs[0].sn_addr_s,
>> +                                      REG_INPORT_ETH_ADDR, NULL,
> false, 90,
>> +                                      lflows, &nat->header_);
>> +            } else {
>> +                build_lrouter_arp_flow(od, NULL,
>> +                                       ext_addrs->ipv4_addrs[0].addr_s,
>> +                                       REG_INPORT_ETH_ADDR, NULL,
> false, 90,
>> +                                       lflows, &nat->header_);
>> +            }
>> +        }
>>
>>          /* Drop ARP packets (priority 85). ARP request packets for
> router's own
>>           * IPs are handled with priority-90 flows.
>> @@ -8471,8 +8513,8 @@ build_lrouter_flows(struct hmap *datapaths,
> struct hmap *ports,
>>
>>              build_lrouter_arp_flow(op->od, op,
>>                                     op->lrp_networks.ipv4_addrs[i].addr_s,
>> -                                   REG_INPORT_ETH_ADDR, &match, 90,
> lflows,
>> -                                   &op->nbrp->header_);
>> +                                   REG_INPORT_ETH_ADDR, &match,
> false, 90,
>> +                                   lflows, &op->nbrp->header_);
>>          }
>>
>>          /* A set to hold all load-balancer vips that need ARP
> responses. */
>> @@ -8490,7 +8532,7 @@ build_lrouter_flows(struct hmap *datapaths,
> struct hmap *ports,
>>
>>              build_lrouter_arp_flow(op->od, op,
>>                                     ip_address, REG_INPORT_ETH_ADDR,
>> -                                   &match, 90, lflows, NULL);
>> +                                   &match, false, 90, lflows, NULL);
>>          }
>>
>>          SSET_FOR_EACH (ip_address, &all_ips_v6) {
>> @@ -8502,7 +8544,7 @@ build_lrouter_flows(struct hmap *datapaths,
> struct hmap *ports,
>>
>>              build_lrouter_nd_flow(op->od, op, "nd_na",
>>                                    ip_address, NULL, REG_INPORT_ETH_ADDR,
>> -                                  &match, 90, lflows, NULL);
>> +                                  &match, false, 90, lflows, NULL);
>>          }
>>
>>          sset_destroy(&all_ips_v4);
>> @@ -8555,53 +8597,84 @@ build_lrouter_flows(struct hmap *datapaths,
> struct hmap *ports,
>>              /* Mac address to use when replying to ARP/NS. */
>>              const char *mac_s = REG_INPORT_ETH_ADDR;
>>
>> +            /* ARP/NS packets are taken care of per router. The only
> exception
>> +             * is on the l3dgw_port where we might need to use a
> different
>> +             * ETH address.
>> +             */
>> +            if (op != op->od->l3dgw_port) {
> 
> This check should instead be put before the for loop, since this
> condition is the same for the whole loop.
> 

Hi Han,

Thanks for the review. I don't think that would work because we also
build snat_ips in the same loop, regardless of op->od->l3dgw_port.

It might be cleaner to run the loop twice, once for SNAT and once for
DNAT (if op == op->od->l3dgw_port).

What do you think?

>> +                continue;
>> +            }
>> +
>>              /* ARP / ND handling for external IP addresses.
>>               *
>>               * DNAT IP addresses are external IP addresses that need ARP
>>               * handling. */
>>              ds_clear(&match);
>>
>> -            if (op->od->l3dgw_port && op == op->od->l3dgw_port) {
>> -                struct eth_addr mac;
>> -                if (nat->external_mac &&
>> -                    eth_addr_from_string(nat->external_mac, &mac)
>> -                    && nat->logical_port) {
>> -                    /* distributed NAT case, use nat->external_mac */
>> -                    mac_s = nat->external_mac;
>> -                    /* Traffic with eth.src = nat->external_mac
> should only be
>> -                     * sent from the chassis where nat->logical_port is
>> -                     * resident, so that upstream MAC learning points
> to the
>> -                     * correct chassis.  Also need to avoid generation of
>> -                     * multiple ARP responses from different chassis. */
>> -                    ds_put_format(&match, "is_chassis_resident(\"%s\")",
>> -                                  nat->logical_port);
>> -                } else {
>> -                    mac_s = REG_INPORT_ETH_ADDR;
>> -                    /* Traffic with eth.src =
> l3dgw_port->lrp_networks.ea_s
>> -                     * should only be sent from the
> "redirect-chassis", so that
>> -                     * upstream MAC learning points to the
> "redirect-chassis".
>> -                     * Also need to avoid generation of multiple ARP
> responses
>> -                     * from different chassis. */
>> -                    if (op->od->l3redirect_port) {
>> -                        ds_put_format(&match, "is_chassis_resident(%s)",
>> -                                      op->od->l3redirect_port->json_key);
>> -                    }
>> +            struct ds match_cr_port = DS_EMPTY_INITIALIZER;
>> +            struct ds match_non_cr_port = DS_EMPTY_INITIALIZER;
>> +
>> +            struct eth_addr mac;
>> +            if (nat->external_mac &&
>> +                eth_addr_from_string(nat->external_mac, &mac)
>> +                && nat->logical_port) {
>> +                /* distributed NAT case, use nat->external_mac */
>> +                mac_s = nat->external_mac;
>> +                /* Traffic with eth.src = nat->external_mac should
> only be
>> +                 * sent from the chassis where nat->logical_port is
>> +                 * resident, so that upstream MAC learning points to the
>> +                 * correct chassis.  Also need to avoid generation of
>> +                 * multiple ARP responses from different chassis. */
>> +                ds_put_format(&match_cr_port,
>> +                              "is_chassis_resident(\"%s\")",
>> +                              nat->logical_port);
>> +                ds_put_format(&match_non_cr_port,
>> +                              "!is_chassis_resident(\"%s\")",
>> +                              nat->logical_port);
>> +            } else {
>> +                mac_s = REG_INPORT_ETH_ADDR;
>> +                /* Traffic with eth.src = l3dgw_port->lrp_networks.ea_s
>> +                 * should only be sent from the "redirect-chassis",
> so that
>> +                 * upstream MAC learning points to the
> "redirect-chassis".
>> +                 * Also need to avoid generation of multiple ARP
> responses
>> +                 * from different chassis. */
>> +                if (op->od->l3redirect_port) {
>> +                    ds_put_format(&match_cr_port,
>> +                                  "is_chassis_resident(\"%s\")",
>> +                                  op->od->l3redirect_port->json_key);
>> +                    ds_put_format(&match_non_cr_port,
>> +                                  "!is_chassis_resident(\"%s\")",
>> +                                  op->od->l3redirect_port->json_key);
>>                  }
>>              }
>>
>> +            /* Respond to ARP/NS requests on the chassis that binds
> the gw
>> +             * port. Drop the ARP/NS requests on other chassis.
>> +             */
>>              struct lport_addresses *ext_addrs = &nat_entry->ext_addrs;
>>              if (nat_entry_is_v6(nat_entry)) {
>>                  build_lrouter_nd_flow(op->od, op, "nd_na",
>>                                        ext_addrs->ipv6_addrs[0].addr_s,
>>                                        ext_addrs->ipv6_addrs[0].sn_addr_s,
>> -                                      mac_s, &match, 90,
>> +                                      mac_s, &match_cr_port, false, 92,
>> +                                      lflows, &nat->header_);
>> +                build_lrouter_nd_flow(op->od, op, "nd_na",
>> +                                      ext_addrs->ipv6_addrs[0].addr_s,
>> +                                      ext_addrs->ipv6_addrs[0].sn_addr_s,
>> +                                      mac_s, &match_non_cr_port,
> true, 91,
>>                                        lflows, &nat->header_);
> 
> I think "match_non_cr_port" is not needed. If the priority-92 flow (with
> "match_cr_port") doesn't match, then it can just let the priority-91
> flow to drop the packet. Alternatively, if "match_non_cr_port" is added,
> then the two lflows can use the same priority.
> 

You're right, we don't need match_non_cr_port, I'll remove it.

Thanks,
Dumitru

>>              } else {
>>                  build_lrouter_arp_flow(op->od, op,
>>                                         ext_addrs->ipv4_addrs[0].addr_s,
>> -                                       mac_s, &match, 90,
>> +                                       mac_s, &match_cr_port, false, 92,
>> +                                       lflows, &nat->header_);
>> +                build_lrouter_arp_flow(op->od, op,
>> +                                       ext_addrs->ipv4_addrs[0].addr_s,
>> +                                       mac_s, &match_non_cr_port,
> true, 91,
>>                                         lflows, &nat->header_);
>>              }
>> +            ds_destroy(&match_cr_port);
>> +            ds_destroy(&match_non_cr_port);
>>          }
>>
>>          if (!smap_get(&op->od->nbr->options, "chassis")
>> @@ -8787,8 +8860,8 @@ build_lrouter_flows(struct hmap *datapaths,
> struct hmap *ports,
>>              build_lrouter_nd_flow(op->od, op, "nd_na_router",
>>                                    op->lrp_networks.ipv6_addrs[i].addr_s,
>>                                  
>  op->lrp_networks.ipv6_addrs[i].sn_addr_s,
>> -                                  REG_INPORT_ETH_ADDR, &match, 90,
> lflows,
>> -                                  &op->nbrp->header_);
>> +                                  REG_INPORT_ETH_ADDR, &match, false, 90,
>> +                                  lflows, &op->nbrp->header_);
>>          }
>>
>>          /* UDP/TCP port unreachable */
>> diff --git a/tests/ovn-northd.at <http://ovn-northd.at>
> b/tests/ovn-northd.at <http://ovn-northd.at>
>> index a3fb7ef..4a13456 100644
>> --- a/tests/ovn-northd.at <http://ovn-northd.at>
>> +++ b/tests/ovn-northd.at <http://ovn-northd.at>
>> @@ -1594,33 +1594,24 @@ action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
>>  # Ingress router port ETH address is used for ARP reply/NA in
> lr_in_ip_input.
>>  AT_CHECK([ovn-sbctl lflow-list | grep -E
> "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
> arp.spa == 42.42.42.0/24 <http://42.42.42.0/24>), dnl
>> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1;
> output;)
>> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.2), dnl
>> +match=(arp.op == 1 && arp.tpa == 43.43.43.2), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1;
> output;)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.3), dnl
>> +match=(arp.op == 1 && arp.tpa == 43.43.43.3), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1;
> output;)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.4), dnl
>> +match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1;
> output;)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
> arp.spa == 42.42.42.0/24 <http://42.42.42.0/24>), dnl
>> +action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1;
> output;)
>> +  table=3 (lr_in_ip_input     ), priority=90   , dnl
>>  match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1,
> ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
>>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> fe80::200:ff:fe00:1; nd.target = fe80::200:ff:fe00:1; nd.tll =
> xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>>  match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1
> && arp.spa == 43.43.43.0/24 <http://43.43.43.0/24>), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.1; outport = inport; flags.loopback = 1;
> output;)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa ==
> 43.43.43.2), dnl
>> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1;
> output;)
>> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa ==
> 43.43.43.3), dnl
>> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1;
> output;)
>> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa ==
> 43.43.43.4), dnl
>> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1;
> output;)
>> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
>>  match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100,
> ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100), dnl
>>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll =
> xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>>  ])
>> @@ -1654,37 +1645,55 @@ action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
>>
>>  # Ingress router port is used for ARP reply/NA in lr_in_ip_input.
>>  # xxreg0[0..47] is used unless external_mac is set.
>> +# Priority 90 flows (per router).
>>  AT_CHECK([ovn-sbctl lflow-list | grep -E
> "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
> arp.spa == 42.42.42.0/24 <http://42.42.42.0/24>), dnl
>> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1;
> output;)
>> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.2), dnl
>> +match=(arp.op == 1 && arp.tpa == 43.43.43.2), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1;
> output;)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.3), dnl
>> +match=(arp.op == 1 && arp.tpa == 43.43.43.3), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1;
> output;)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.4), dnl
>> +match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1;
> output;)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
> arp.spa == 42.42.42.0/24 <http://42.42.42.0/24>), dnl
>> +action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1;
> output;)
>> +  table=3 (lr_in_ip_input     ), priority=90   , dnl
>>  match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1,
> ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
>>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> fe80::200:ff:fe00:1; nd.target = fe80::200:ff:fe00:1; nd.tll =
> xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>>  match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1
> && arp.spa == 43.43.43.0/24 <http://43.43.43.0/24>), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.1; outport = inport; flags.loopback = 1;
> output;)
>>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2
> && is_chassis_resident("cr-lrp-public")), dnl
>> +match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100,
> ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 &&
> is_chassis_resident("cr-lrp-public")), dnl
>> +action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll =
> xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>> +])
>> +
>> +# Priority 91 drop flows (per distributed gw port), if port is not
> resident.
>> +AT_CHECK([ovn-sbctl lflow-list | grep -E
> "lr_in_ip_input.*priority=91" | grep "arp\|nd" | sort], [0], [dnl
>> +  table=3 (lr_in_ip_input     ), priority=91   , dnl
>> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2
> && !is_chassis_resident(""cr-lrp-public"")), dnl
>> +action=(drop;)
>> +  table=3 (lr_in_ip_input     ), priority=91   , dnl
>> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3
> && !is_chassis_resident(""cr-lrp-public"")), dnl
>> +action=(drop;)
>> +  table=3 (lr_in_ip_input     ), priority=91   , dnl
>> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4
> && !is_chassis_resident("ls-vm")), dnl
>> +action=(drop;)
>> +])
>> +
>> +# Priority 92 ARP/NS responders (per distributed gw port), if port is
> resident.
>> +AT_CHECK([ovn-sbctl lflow-list | grep -E
> "lr_in_ip_input.*priority=92" | grep "arp\|nd" | sort], [0], [dnl
>> +  table=3 (lr_in_ip_input     ), priority=92   , dnl
>> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2
> && is_chassis_resident(""cr-lrp-public"")), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1;
> output;)
>> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3
> && is_chassis_resident("cr-lrp-public")), dnl
>> +  table=3 (lr_in_ip_input     ), priority=92   , dnl
>> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3
> && is_chassis_resident(""cr-lrp-public"")), dnl
>>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1;
> output;)
>> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
>> +  table=3 (lr_in_ip_input     ), priority=92   , dnl
>>  match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4
> && is_chassis_resident("ls-vm")), dnl
>>  action=(eth.dst = eth.src; eth.src = 00:00:00:00:00:02; arp.op = 2;
> /* ARP reply */ arp.tha = arp.sha; arp.sha = 00:00:00:00:00:02; arp.tpa
> = arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1;
> output;)
>> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
>> -match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100,
> ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 &&
> is_chassis_resident("cr-lrp-public")), dnl
>> -action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll =
> xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>>  ])
>>
>>  # xreg0[0..47] isn't used anywhere else.
>> diff --git a/tests/ovn.at <http://ovn.at> b/tests/ovn.at <http://ovn.at>
>> index 1a1cfbf..9522b55 100644
>> --- a/tests/ovn.at <http://ovn.at>
>> +++ b/tests/ovn.at <http://ovn.at>
>> @@ -19336,7 +19336,7 @@ OVS_WAIT_UNTIL([
>>  send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex
> 10 0 0 121)
>>
>>  # Verify that the ARP request is replied to from hv1 and not hv2.
>>
> -match_arp_req="priority=90.*${match_r1_metadata}.*arp_tpa=10.0.0.121,arp_op=1"
>>
> +match_arp_req="priority=92.*${match_r1_metadata}.*arp_tpa=10.0.0.121,arp_op=1"
>>
>>  as hv1
>>  OVS_WAIT_UNTIL([
>> @@ -19356,7 +19356,7 @@ OVS_WAIT_UNTIL([
>>  send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex
> 10 0 0 122)
>>
>>  # Verify that the ARP request is replied to from hv2 and not hv1.
>>
> -match_arp_req="priority=90.*${match_r1_metadata}.*arp_tpa=10.0.0.122,arp_op=1"
>>
> +match_arp_req="priority=92.*${match_r1_metadata}.*arp_tpa=10.0.0.122,arp_op=1"
>>
>>  as hv2
>>  OVS_WAIT_UNTIL([
>> @@ -19400,7 +19400,7 @@ dst_ipv6=00100000000000000000000000000121
>>  send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 72dd
>>
>>  # Verify that the ND_NS is replied to from hv1 and not hv2.
>>
> -match_nd_ns="priority=90.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::121"
>>
> +match_nd_ns="priority=92.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::121"
>>
>>  as hv1
>>  OVS_WAIT_UNTIL([
>> @@ -19422,7 +19422,7 @@ dst_ipv6=00100000000000000000000000000122
>>  send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 72db
>>
>>  # Verify that the ND_NS is replied to from hv2 and not hv1.
>>
> -match_nd_ns="priority=90.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::122"
>>
> +match_nd_ns="priority=92.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::122"
>>
>>  as hv2
>>  OVS_WAIT_UNTIL([
>>
Han Zhou July 3, 2020, 5:04 p.m. UTC | #3
On Fri, Jul 3, 2020 at 1:06 AM Dumitru Ceara <dceara@redhat.com> wrote:
>
> On 7/3/20 7:46 AM, Han Zhou wrote:
> > Thanks Dumitru. I have 2 comments below.
> >
> > On Thu, Jul 2, 2020 at 7:53 AM Dumitru Ceara <dceara@redhat.com
> > <mailto:dceara@redhat.com>> wrote:
> >>
> >> Most ARP/NS responder flows can be configured per datapath instead of
per
> >> router port.
> >>
> >> The only exception is with distributed gateway router ports which need
> >> special treatment. This patch changes the ARP/NS responder behavior
> > and adds:
> >> - Priority 92 flows to reply to ARP requests on distributed gateway
router
> >>   ports, on the chassis where the DNAT entry is bound.
> >> - Priority 91 flows to drop ARP requests on distributed gateway router
> > ports,
> >>   on chassis where the DNAT entry is not bound.
> >> - Priority 90 flows to reply to ARP requests on all other router
> > ports. This
> >>   last type of flows is programmed exactly once per logical router
> > limiting
> >>   the total number of required logical flows.
> >>
> >> Suggested-by: Han Zhou <hzhou@ovn.org <mailto:hzhou@ovn.org>>
> >> Reported-by: Girish Moodalbail <gmoodalbail@gmail.com
> > <mailto:gmoodalbail@gmail.com>>
> >> Reported-at:
> > https://mail.openvswitch.org/pipermail/ovs-discuss/2020-June/050186.html
> >> Signed-off-by: Dumitru Ceara <dceara@redhat.com
> > <mailto:dceara@redhat.com>>
> >> ---
> >>  northd/ovn-northd.8.xml |   16 +++-
> >>  northd/ovn-northd.c     |  203
> > ++++++++++++++++++++++++++++++++---------------
> >>  tests/ovn-northd.at <http://ovn-northd.at>     |   65 +++++++++------
> >>  tests/ovn.at <http://ovn.at>            |    8 +-
> >>  4 files changed, 190 insertions(+), 102 deletions(-)
> >>
> >> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> >> index 84224ff..11607c0 100644
> >> --- a/northd/ovn-northd.8.xml
> >> +++ b/northd/ovn-northd.8.xml
> >> @@ -1857,9 +1857,8 @@ nd_na_router {
> >>            IPv4: For a configured DNAT IP address or a load balancer
> >>            IPv4 VIP <var>A</var>, for each router port <var>P</var>
with
> >>            Ethernet address <var>E</var>, a priority-90 flow matches
> >> -          <code>inport == <var>P</var> &amp;&amp; arp.op == 1
&amp;&amp;
> >> -          arp.tpa == <var>A</var></code> (ARP request)
> >> -          with the following actions:
> >> +          <code>arp.op == 1 &amp;&amp; arp.tpa == <var>A</var></code>
> >> +          (ARP request) with the following actions:
> >>          </p>
> >>
> >>          <pre>
> >> @@ -1876,6 +1875,11 @@ output;
> >>          </pre>
> >>
> >>          <p>
> >> +          IPv4: For a configured load balancer IPv4 VIP, a similar
> > flow is
> >> +          added with the additional match <code>inport ==
> > <var>P</var></code>.
> >> +        </p>
> >> +
> >> +        <p>
> >>            If the router port <var>P</var> is a distributed gateway
router
> >>            port, then the
> > <code>is_chassis_resident(<var>P</var>)</code> is
> >>            also added in the match condition for the load balancer IPv4
> >> @@ -1922,9 +1926,11 @@ nd_na {
> >>          <ul>
> >>            <li>
> >>              If the corresponding NAT rule cannot be handled in a
> >> -            distributed manner, then this flow is only programmed on
> >> +            distributed manner, then a priority-92 flow is programmed
on
> >>              the gateway port instance on the
> >> -            <code>redirect-chassis</code>.  This behavior avoids
> >> +            <code>redirect-chassis</code>.  A priority-91 drop flow is
> >> +            programmed on the other chassis when ARP requests/NS
packets
> >> +            are received on the gateway port. This behavior avoids
> >>              generation of multiple ARP responses from different
chassis,
> >>              and allows upstream MAC learning to point to the
> >>              <code>redirect-chassis</code>.
> >> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> >> index a1ba56f..10fc8cf 100644
> >> --- a/northd/ovn-northd.c
> >> +++ b/northd/ovn-northd.c
> >> @@ -8048,7 +8048,7 @@ lrouter_nat_is_stateless(const struct nbrec_nat
> > *nat)
> >>  static void
> >>  build_lrouter_arp_flow(struct ovn_datapath *od, struct ovn_port *op,
> >>                         const char *ip_address, const char *eth_addr,
> >> -                       struct ds *extra_match, uint16_t priority,
> >> +                       struct ds *extra_match, bool drop, uint16_t
> > priority,
> >>                         struct hmap *lflows, const struct
> > ovsdb_idl_row *hint)
> >>  {
> >>      struct ds match = DS_EMPTY_INITIALIZER;
> >> @@ -8063,20 +8063,24 @@ build_lrouter_arp_flow(struct ovn_datapath
> > *od, struct ovn_port *op,
> >>      if (extra_match && ds_last(extra_match) != EOF) {
> >>          ds_put_format(&match, " && %s", ds_cstr(extra_match));
> >>      }
> >> -    ds_put_format(&actions,
> >> -                  "eth.dst = eth.src; "
> >> -                  "eth.src = %s; "
> >> -                  "arp.op = 2; /* ARP reply */ "
> >> -                  "arp.tha = arp.sha; "
> >> -                  "arp.sha = %s; "
> >> -                  "arp.tpa = arp.spa; "
> >> -                  "arp.spa = %s; "
> >> -                  "outport = inport; "
> >> -                  "flags.loopback = 1; "
> >> -                  "output;",
> >> -                  eth_addr,
> >> -                  eth_addr,
> >> -                  ip_address);
> >> +    if (drop) {
> >> +        ds_put_format(&actions, "drop;");
> >> +    } else {
> >> +        ds_put_format(&actions,
> >> +                      "eth.dst = eth.src; "
> >> +                      "eth.src = %s; "
> >> +                      "arp.op = 2; /* ARP reply */ "
> >> +                      "arp.tha = arp.sha; "
> >> +                      "arp.sha = %s; "
> >> +                      "arp.tpa = arp.spa; "
> >> +                      "arp.spa = %s; "
> >> +                      "outport = inport; "
> >> +                      "flags.loopback = 1; "
> >> +                      "output;",
> >> +                      eth_addr,
> >> +                      eth_addr,
> >> +                      ip_address);
> >> +    }
> >>
> >>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT,
priority,
> >>                              ds_cstr(&match), ds_cstr(&actions), hint);
> >> @@ -8095,7 +8099,7 @@ static void
> >>  build_lrouter_nd_flow(struct ovn_datapath *od, struct ovn_port *op,
> >>                        const char *action, const char *ip_address,
> >>                        const char *sn_ip_address, const char *eth_addr,
> >> -                      struct ds *extra_match, uint16_t priority,
> >> +                      struct ds *extra_match, bool drop, uint16_t
> > priority,
> >>                        struct hmap *lflows,
> >>                        const struct ovsdb_idl_row *hint)
> >>  {
> >> @@ -8117,21 +8121,25 @@ build_lrouter_nd_flow(struct ovn_datapath *od,
> > struct ovn_port *op,
> >>          ds_put_format(&match, " && %s", ds_cstr(extra_match));
> >>      }
> >>
> >> -    ds_put_format(&actions,
> >> -                  "%s { "
> >> -                    "eth.src = %s; "
> >> -                    "ip6.src = %s; "
> >> -                    "nd.target = %s; "
> >> -                    "nd.tll = %s; "
> >> -                    "outport = inport; "
> >> -                    "flags.loopback = 1; "
> >> -                    "output; "
> >> -                  "};",
> >> -                  action,
> >> -                  eth_addr,
> >> -                  ip_address,
> >> -                  ip_address,
> >> -                  eth_addr);
> >> +    if (drop) {
> >> +        ds_put_format(&actions, "drop;");
> >> +    } else {
> >> +        ds_put_format(&actions,
> >> +                      "%s { "
> >> +                        "eth.src = %s; "
> >> +                        "ip6.src = %s; "
> >> +                        "nd.target = %s; "
> >> +                        "nd.tll = %s; "
> >> +                        "outport = inport; "
> >> +                        "flags.loopback = 1; "
> >> +                        "output; "
> >> +                      "};",
> >> +                      action,
> >> +                      eth_addr,
> >> +                      ip_address,
> >> +                      ip_address,
> >> +                      eth_addr);
> >> +    }
> >>
> >>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT,
priority,
> >>                              ds_cstr(&match), ds_cstr(&actions), hint);
> >> @@ -8321,7 +8329,41 @@ build_lrouter_flows(struct hmap *datapaths,
> > struct hmap *ports,
> >>                        "ip4.dst == 0.0.0.0/8 <http://0.0.0.0/8>",
> >>                        "drop;");
> >>
> >> -        /* Priority-90 flows reply to ARP requests and ND packets. */
> >> +        /* Priority-90-92 flows handle ARP requests and ND packets.
> > Most are
> >> +         * per logical port but DNAT addresses can be handled per
> > datapath
> >> +         * for non gateway router ports.
> >> +         */
> >> +        for (int i = 0; i < od->nbr->n_nat; i++) {
> >> +            struct ovn_nat *nat_entry = &od->nat_entries[i];
> >> +            const struct nbrec_nat *nat = nat_entry->nb;
> >> +
> >> +            /* Skip entries we failed to parse. */
> >> +            if (!nat_entry_is_valid(nat_entry)) {
> >> +                continue;
> >> +            }
> >> +
> >> +            if (!strcmp(nat->type, "snat")) {
> >> +                continue;
> >> +            }
> >> +
> >> +            /* Priority 91 and 92 flows are added for each gateway
router
> >> +             * port to handle the special cases. In case we get the
> > packet
> >> +             * on a regular port, just reply with the port's ETH
address.
> >> +             */
> >> +            struct lport_addresses *ext_addrs = &nat_entry->ext_addrs;
> >> +            if (nat_entry_is_v6(nat_entry)) {
> >> +                build_lrouter_nd_flow(od, NULL, "nd_na",
> >> +                                      ext_addrs->ipv6_addrs[0].addr_s,
> >> +
 ext_addrs->ipv6_addrs[0].sn_addr_s,
> >> +                                      REG_INPORT_ETH_ADDR, NULL,
> > false, 90,
> >> +                                      lflows, &nat->header_);
> >> +            } else {
> >> +                build_lrouter_arp_flow(od, NULL,
> >> +
ext_addrs->ipv4_addrs[0].addr_s,
> >> +                                       REG_INPORT_ETH_ADDR, NULL,
> > false, 90,
> >> +                                       lflows, &nat->header_);
> >> +            }
> >> +        }
> >>
> >>          /* Drop ARP packets (priority 85). ARP request packets for
> > router's own
> >>           * IPs are handled with priority-90 flows.
> >> @@ -8471,8 +8513,8 @@ build_lrouter_flows(struct hmap *datapaths,
> > struct hmap *ports,
> >>
> >>              build_lrouter_arp_flow(op->od, op,
> >>
op->lrp_networks.ipv4_addrs[i].addr_s,
> >> -                                   REG_INPORT_ETH_ADDR, &match, 90,
> > lflows,
> >> -                                   &op->nbrp->header_);
> >> +                                   REG_INPORT_ETH_ADDR, &match,
> > false, 90,
> >> +                                   lflows, &op->nbrp->header_);
> >>          }
> >>
> >>          /* A set to hold all load-balancer vips that need ARP
> > responses. */
> >> @@ -8490,7 +8532,7 @@ build_lrouter_flows(struct hmap *datapaths,
> > struct hmap *ports,
> >>
> >>              build_lrouter_arp_flow(op->od, op,
> >>                                     ip_address, REG_INPORT_ETH_ADDR,
> >> -                                   &match, 90, lflows, NULL);
> >> +                                   &match, false, 90, lflows, NULL);
> >>          }
> >>
> >>          SSET_FOR_EACH (ip_address, &all_ips_v6) {
> >> @@ -8502,7 +8544,7 @@ build_lrouter_flows(struct hmap *datapaths,
> > struct hmap *ports,
> >>
> >>              build_lrouter_nd_flow(op->od, op, "nd_na",
> >>                                    ip_address, NULL,
REG_INPORT_ETH_ADDR,
> >> -                                  &match, 90, lflows, NULL);
> >> +                                  &match, false, 90, lflows, NULL);
> >>          }
> >>
> >>          sset_destroy(&all_ips_v4);
> >> @@ -8555,53 +8597,84 @@ build_lrouter_flows(struct hmap *datapaths,
> > struct hmap *ports,
> >>              /* Mac address to use when replying to ARP/NS. */
> >>              const char *mac_s = REG_INPORT_ETH_ADDR;
> >>
> >> +            /* ARP/NS packets are taken care of per router. The only
> > exception
> >> +             * is on the l3dgw_port where we might need to use a
> > different
> >> +             * ETH address.
> >> +             */
> >> +            if (op != op->od->l3dgw_port) {
> >
> > This check should instead be put before the for loop, since this
> > condition is the same for the whole loop.
> >
>
> Hi Han,
>
> Thanks for the review. I don't think that would work because we also
> build snat_ips in the same loop, regardless of op->od->l3dgw_port.
>
> It might be cleaner to run the loop twice, once for SNAT and once for
> DNAT (if op == op->od->l3dgw_port).
>
> What do you think?
>

Sorry, I missed that part. Yes, it would be cleaner to separate the loop.
The first loop for collecting SNAT, and then we can check if (op ==
op->od->l3dgw_port) before the second loop.

> >> +                continue;
> >> +            }
> >> +
> >>              /* ARP / ND handling for external IP addresses.
> >>               *
> >>               * DNAT IP addresses are external IP addresses that need
ARP
> >>               * handling. */
> >>              ds_clear(&match);
> >>
> >> -            if (op->od->l3dgw_port && op == op->od->l3dgw_port) {
> >> -                struct eth_addr mac;
> >> -                if (nat->external_mac &&
> >> -                    eth_addr_from_string(nat->external_mac, &mac)
> >> -                    && nat->logical_port) {
> >> -                    /* distributed NAT case, use nat->external_mac */
> >> -                    mac_s = nat->external_mac;
> >> -                    /* Traffic with eth.src = nat->external_mac
> > should only be
> >> -                     * sent from the chassis where nat->logical_port
is
> >> -                     * resident, so that upstream MAC learning points
> > to the
> >> -                     * correct chassis.  Also need to avoid
generation of
> >> -                     * multiple ARP responses from different chassis.
*/
> >> -                    ds_put_format(&match,
"is_chassis_resident(\"%s\")",
> >> -                                  nat->logical_port);
> >> -                } else {
> >> -                    mac_s = REG_INPORT_ETH_ADDR;
> >> -                    /* Traffic with eth.src =
> > l3dgw_port->lrp_networks.ea_s
> >> -                     * should only be sent from the
> > "redirect-chassis", so that
> >> -                     * upstream MAC learning points to the
> > "redirect-chassis".
> >> -                     * Also need to avoid generation of multiple ARP
> > responses
> >> -                     * from different chassis. */
> >> -                    if (op->od->l3redirect_port) {
> >> -                        ds_put_format(&match,
"is_chassis_resident(%s)",
> >> -
 op->od->l3redirect_port->json_key);
> >> -                    }
> >> +            struct ds match_cr_port = DS_EMPTY_INITIALIZER;
> >> +            struct ds match_non_cr_port = DS_EMPTY_INITIALIZER;
> >> +
> >> +            struct eth_addr mac;
> >> +            if (nat->external_mac &&
> >> +                eth_addr_from_string(nat->external_mac, &mac)
> >> +                && nat->logical_port) {
> >> +                /* distributed NAT case, use nat->external_mac */
> >> +                mac_s = nat->external_mac;
> >> +                /* Traffic with eth.src = nat->external_mac should
> > only be
> >> +                 * sent from the chassis where nat->logical_port is
> >> +                 * resident, so that upstream MAC learning points to
the
> >> +                 * correct chassis.  Also need to avoid generation of
> >> +                 * multiple ARP responses from different chassis. */
> >> +                ds_put_format(&match_cr_port,
> >> +                              "is_chassis_resident(\"%s\")",
> >> +                              nat->logical_port);
> >> +                ds_put_format(&match_non_cr_port,
> >> +                              "!is_chassis_resident(\"%s\")",
> >> +                              nat->logical_port);
> >> +            } else {
> >> +                mac_s = REG_INPORT_ETH_ADDR;
> >> +                /* Traffic with eth.src =
l3dgw_port->lrp_networks.ea_s
> >> +                 * should only be sent from the "redirect-chassis",
> > so that
> >> +                 * upstream MAC learning points to the
> > "redirect-chassis".
> >> +                 * Also need to avoid generation of multiple ARP
> > responses
> >> +                 * from different chassis. */
> >> +                if (op->od->l3redirect_port) {
> >> +                    ds_put_format(&match_cr_port,
> >> +                                  "is_chassis_resident(\"%s\")",
> >> +                                  op->od->l3redirect_port->json_key);
> >> +                    ds_put_format(&match_non_cr_port,
> >> +                                  "!is_chassis_resident(\"%s\")",
> >> +                                  op->od->l3redirect_port->json_key);
> >>                  }
> >>              }
> >>
> >> +            /* Respond to ARP/NS requests on the chassis that binds
> > the gw
> >> +             * port. Drop the ARP/NS requests on other chassis.
> >> +             */
> >>              struct lport_addresses *ext_addrs = &nat_entry->ext_addrs;
> >>              if (nat_entry_is_v6(nat_entry)) {
> >>                  build_lrouter_nd_flow(op->od, op, "nd_na",
> >>                                        ext_addrs->ipv6_addrs[0].addr_s,
> >>
 ext_addrs->ipv6_addrs[0].sn_addr_s,
> >> -                                      mac_s, &match, 90,
> >> +                                      mac_s, &match_cr_port, false,
92,
> >> +                                      lflows, &nat->header_);
> >> +                build_lrouter_nd_flow(op->od, op, "nd_na",
> >> +                                      ext_addrs->ipv6_addrs[0].addr_s,
> >> +
 ext_addrs->ipv6_addrs[0].sn_addr_s,
> >> +                                      mac_s, &match_non_cr_port,
> > true, 91,
> >>                                        lflows, &nat->header_);
> >
> > I think "match_non_cr_port" is not needed. If the priority-92 flow (with
> > "match_cr_port") doesn't match, then it can just let the priority-91
> > flow to drop the packet. Alternatively, if "match_non_cr_port" is added,
> > then the two lflows can use the same priority.
> >
>
> You're right, we don't need match_non_cr_port, I'll remove it.
>
> Thanks,
> Dumitru
>
> >>              } else {
> >>                  build_lrouter_arp_flow(op->od, op,
> >>
ext_addrs->ipv4_addrs[0].addr_s,
> >> -                                       mac_s, &match, 90,
> >> +                                       mac_s, &match_cr_port, false,
92,
> >> +                                       lflows, &nat->header_);
> >> +                build_lrouter_arp_flow(op->od, op,
> >> +
ext_addrs->ipv4_addrs[0].addr_s,
> >> +                                       mac_s, &match_non_cr_port,
> > true, 91,
> >>                                         lflows, &nat->header_);
> >>              }
> >> +            ds_destroy(&match_cr_port);
> >> +            ds_destroy(&match_non_cr_port);
> >>          }
> >>
> >>          if (!smap_get(&op->od->nbr->options, "chassis")
> >> @@ -8787,8 +8860,8 @@ build_lrouter_flows(struct hmap *datapaths,
> > struct hmap *ports,
> >>              build_lrouter_nd_flow(op->od, op, "nd_na_router",
> >>
 op->lrp_networks.ipv6_addrs[i].addr_s,
> >>
> >  op->lrp_networks.ipv6_addrs[i].sn_addr_s,
> >> -                                  REG_INPORT_ETH_ADDR, &match, 90,
> > lflows,
> >> -                                  &op->nbrp->header_);
> >> +                                  REG_INPORT_ETH_ADDR, &match, false,
90,
> >> +                                  lflows, &op->nbrp->header_);
> >>          }
> >>
> >>          /* UDP/TCP port unreachable */
> >> diff --git a/tests/ovn-northd.at <http://ovn-northd.at>
> > b/tests/ovn-northd.at <http://ovn-northd.at>
> >> index a3fb7ef..4a13456 100644
> >> --- a/tests/ovn-northd.at <http://ovn-northd.at>
> >> +++ b/tests/ovn-northd.at <http://ovn-northd.at>
> >> @@ -1594,33 +1594,24 @@ action=(xreg0[[0..47]] = 00:00:00:00:01:00;
next;)
> >>  # Ingress router port ETH address is used for ARP reply/NA in
> > lr_in_ip_input.
> >>  AT_CHECK([ovn-sbctl lflow-list | grep -E
> > "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
> > arp.spa == 42.42.42.0/24 <http://42.42.42.0/24>), dnl
> >> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1;
> > output;)
> >> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.2), dnl
> >> +match=(arp.op == 1 && arp.tpa == 43.43.43.2), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1;
> > output;)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.3), dnl
> >> +match=(arp.op == 1 && arp.tpa == 43.43.43.3), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1;
> > output;)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.4), dnl
> >> +match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1;
> > output;)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
> > arp.spa == 42.42.42.0/24 <http://42.42.42.0/24>), dnl
> >> +action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1;
> > output;)
> >> +  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >>  match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1,
> > ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
> >>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> > fe80::200:ff:fe00:1; nd.target = fe80::200:ff:fe00:1; nd.tll =
> > xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >>  match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1
> > && arp.spa == 43.43.43.0/24 <http://43.43.43.0/24>), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.1; outport = inport; flags.loopback = 1;
> > output;)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa ==
> > 43.43.43.2), dnl
> >> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1;
> > output;)
> >> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa ==
> > 43.43.43.3), dnl
> >> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1;
> > output;)
> >> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa ==
> > 43.43.43.4), dnl
> >> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1;
> > output;)
> >> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >>  match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100,
> > ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100), dnl
> >>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> > fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll =
> > xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
> >>  ])
> >> @@ -1654,37 +1645,55 @@ action=(xreg0[[0..47]] = 00:00:00:00:01:00;
next;)
> >>
> >>  # Ingress router port is used for ARP reply/NA in lr_in_ip_input.
> >>  # xxreg0[0..47] is used unless external_mac is set.
> >> +# Priority 90 flows (per router).
> >>  AT_CHECK([ovn-sbctl lflow-list | grep -E
> > "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
> > arp.spa == 42.42.42.0/24 <http://42.42.42.0/24>), dnl
> >> -action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1;
> > output;)
> >> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.2), dnl
> >> +match=(arp.op == 1 && arp.tpa == 43.43.43.2), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1;
> > output;)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.3), dnl
> >> +match=(arp.op == 1 && arp.tpa == 43.43.43.3), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1;
> > output;)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.4), dnl
> >> +match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1;
> > output;)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 &&
> > arp.spa == 42.42.42.0/24 <http://42.42.42.0/24>), dnl
> >> +action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1;
> > output;)
> >> +  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >>  match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1,
> > ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
> >>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> > fe80::200:ff:fe00:1; nd.target = fe80::200:ff:fe00:1; nd.tll =
> > xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >>  match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1
> > && arp.spa == 43.43.43.0/24 <http://43.43.43.0/24>), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.1; outport = inport; flags.loopback = 1;
> > output;)
> >>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2
> > && is_chassis_resident("cr-lrp-public")), dnl
> >> +match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100,
> > ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 &&
> > is_chassis_resident("cr-lrp-public")), dnl
> >> +action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> > fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll =
> > xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
> >> +])
> >> +
> >> +# Priority 91 drop flows (per distributed gw port), if port is not
> > resident.
> >> +AT_CHECK([ovn-sbctl lflow-list | grep -E
> > "lr_in_ip_input.*priority=91" | grep "arp\|nd" | sort], [0], [dnl
> >> +  table=3 (lr_in_ip_input     ), priority=91   , dnl
> >> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2
> > && !is_chassis_resident(""cr-lrp-public"")), dnl
> >> +action=(drop;)
> >> +  table=3 (lr_in_ip_input     ), priority=91   , dnl
> >> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3
> > && !is_chassis_resident(""cr-lrp-public"")), dnl
> >> +action=(drop;)
> >> +  table=3 (lr_in_ip_input     ), priority=91   , dnl
> >> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4
> > && !is_chassis_resident("ls-vm")), dnl
> >> +action=(drop;)
> >> +])
> >> +
> >> +# Priority 92 ARP/NS responders (per distributed gw port), if port is
> > resident.
> >> +AT_CHECK([ovn-sbctl lflow-list | grep -E
> > "lr_in_ip_input.*priority=92" | grep "arp\|nd" | sort], [0], [dnl
> >> +  table=3 (lr_in_ip_input     ), priority=92   , dnl
> >> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2
> > && is_chassis_resident(""cr-lrp-public"")), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1;
> > output;)
> >> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3
> > && is_chassis_resident("cr-lrp-public")), dnl
> >> +  table=3 (lr_in_ip_input     ), priority=92   , dnl
> >> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3
> > && is_chassis_resident(""cr-lrp-public"")), dnl
> >>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /*
> > ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa =
> > arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1;
> > output;)
> >> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> +  table=3 (lr_in_ip_input     ), priority=92   , dnl
> >>  match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4
> > && is_chassis_resident("ls-vm")), dnl
> >>  action=(eth.dst = eth.src; eth.src = 00:00:00:00:00:02; arp.op = 2;
> > /* ARP reply */ arp.tha = arp.sha; arp.sha = 00:00:00:00:00:02; arp.tpa
> > = arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1;
> > output;)
> >> -  table=3 (lr_in_ip_input     ), priority=90   , dnl
> >> -match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100,
> > ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 &&
> > is_chassis_resident("cr-lrp-public")), dnl
> >> -action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src =
> > fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll =
> > xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
> >>  ])
> >>
> >>  # xreg0[0..47] isn't used anywhere else.
> >> diff --git a/tests/ovn.at <http://ovn.at> b/tests/ovn.at <http://ovn.at
>
> >> index 1a1cfbf..9522b55 100644
> >> --- a/tests/ovn.at <http://ovn.at>
> >> +++ b/tests/ovn.at <http://ovn.at>
> >> @@ -19336,7 +19336,7 @@ OVS_WAIT_UNTIL([
> >>  send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex
> > 10 0 0 121)
> >>
> >>  # Verify that the ARP request is replied to from hv1 and not hv2.
> >>
> >
-match_arp_req="priority=90.*${match_r1_metadata}.*arp_tpa=10.0.0.121,arp_op=1"
> >>
> >
+match_arp_req="priority=92.*${match_r1_metadata}.*arp_tpa=10.0.0.121,arp_op=1"
> >>
> >>  as hv1
> >>  OVS_WAIT_UNTIL([
> >> @@ -19356,7 +19356,7 @@ OVS_WAIT_UNTIL([
> >>  send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex
> > 10 0 0 122)
> >>
> >>  # Verify that the ARP request is replied to from hv2 and not hv1.
> >>
> >
-match_arp_req="priority=90.*${match_r1_metadata}.*arp_tpa=10.0.0.122,arp_op=1"
> >>
> >
+match_arp_req="priority=92.*${match_r1_metadata}.*arp_tpa=10.0.0.122,arp_op=1"
> >>
> >>  as hv2
> >>  OVS_WAIT_UNTIL([
> >> @@ -19400,7 +19400,7 @@ dst_ipv6=00100000000000000000000000000121
> >>  send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 72dd
> >>
> >>  # Verify that the ND_NS is replied to from hv1 and not hv2.
> >>
> >
-match_nd_ns="priority=90.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::121"
> >>
> >
+match_nd_ns="priority=92.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::121"
> >>
> >>  as hv1
> >>  OVS_WAIT_UNTIL([
> >> @@ -19422,7 +19422,7 @@ dst_ipv6=00100000000000000000000000000122
> >>  send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 72db
> >>
> >>  # Verify that the ND_NS is replied to from hv2 and not hv1.
> >>
> >
-match_nd_ns="priority=90.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::122"
> >>
> >
+match_nd_ns="priority=92.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::122"
> >>
> >>  as hv2
> >>  OVS_WAIT_UNTIL([
> >>
>
Dumitru Ceara July 3, 2020, 6:37 p.m. UTC | #4
On 7/3/20 7:04 PM, Han Zhou wrote:
> 
> 
> On Fri, Jul 3, 2020 at 1:06 AM Dumitru Ceara <dceara@redhat.com
> <mailto:dceara@redhat.com>> wrote:
>>
>> On 7/3/20 7:46 AM, Han Zhou wrote:
>> > Thanks Dumitru. I have 2 comments below.
>> >
>> > On Thu, Jul 2, 2020 at 7:53 AM Dumitru Ceara <dceara@redhat.com
> <mailto:dceara@redhat.com>
>> > <mailto:dceara@redhat.com <mailto:dceara@redhat.com>>> wrote:
>> >>
>> >> Most ARP/NS responder flows can be configured per datapath instead
> of per
>> >> router port.
>> >>
>> >> The only exception is with distributed gateway router ports which need
>> >> special treatment. This patch changes the ARP/NS responder behavior
>> > and adds:
>> >> - Priority 92 flows to reply to ARP requests on distributed gateway
> router
>> >>   ports, on the chassis where the DNAT entry is bound.
>> >> - Priority 91 flows to drop ARP requests on distributed gateway router
>> > ports,
>> >>   on chassis where the DNAT entry is not bound.
>> >> - Priority 90 flows to reply to ARP requests on all other router
>> > ports. This
>> >>   last type of flows is programmed exactly once per logical router
>> > limiting
>> >>   the total number of required logical flows.
>> >>
>> >> Suggested-by: Han Zhou <hzhou@ovn.org <mailto:hzhou@ovn.org>
> <mailto:hzhou@ovn.org <mailto:hzhou@ovn.org>>>
>> >> Reported-by: Girish Moodalbail <gmoodalbail@gmail.com
> <mailto:gmoodalbail@gmail.com>
>> > <mailto:gmoodalbail@gmail.com <mailto:gmoodalbail@gmail.com>>>
>> >> Reported-at:
>> > https://mail.openvswitch.org/pipermail/ovs-discuss/2020-June/050186.html
>> >> Signed-off-by: Dumitru Ceara <dceara@redhat.com
> <mailto:dceara@redhat.com>
>> > <mailto:dceara@redhat.com <mailto:dceara@redhat.com>>>
>> >> ---
>> >>  northd/ovn-northd.8.xml |   16 +++-
>> >>  northd/ovn-northd.c     |  203
>> > ++++++++++++++++++++++++++++++++---------------
>> >>  tests/ovn-northd.at <http://ovn-northd.at> <http://ovn-northd.at>
>     |   65 +++++++++------
>> >>  tests/ovn.at <http://ovn.at> <http://ovn.at>            |    8 +-
>> >>  4 files changed, 190 insertions(+), 102 deletions(-)
>> >>
>> >> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
>> >> index 84224ff..11607c0 100644
>> >> --- a/northd/ovn-northd.8.xml
>> >> +++ b/northd/ovn-northd.8.xml
>> >> @@ -1857,9 +1857,8 @@ nd_na_router {
>> >>            IPv4: For a configured DNAT IP address or a load balancer
>> >>            IPv4 VIP <var>A</var>, for each router port <var>P</var>
> with
>> >>            Ethernet address <var>E</var>, a priority-90 flow matches
>> >> -          <code>inport == <var>P</var> &amp;&amp; arp.op == 1
> &amp;&amp;
>> >> -          arp.tpa == <var>A</var></code> (ARP request)
>> >> -          with the following actions:
>> >> +          <code>arp.op == 1 &amp;&amp; arp.tpa == <var>A</var></code>
>> >> +          (ARP request) with the following actions:
>> >>          </p>
>> >>
>> >>          <pre>
>> >> @@ -1876,6 +1875,11 @@ output;
>> >>          </pre>
>> >>
>> >>          <p>
>> >> +          IPv4: For a configured load balancer IPv4 VIP, a similar
>> > flow is
>> >> +          added with the additional match <code>inport ==
>> > <var>P</var></code>.
>> >> +        </p>
>> >> +
>> >> +        <p>
>> >>            If the router port <var>P</var> is a distributed gateway
> router
>> >>            port, then the
>> > <code>is_chassis_resident(<var>P</var>)</code> is
>> >>            also added in the match condition for the load balancer IPv4
>> >> @@ -1922,9 +1926,11 @@ nd_na {
>> >>          <ul>
>> >>            <li>
>> >>              If the corresponding NAT rule cannot be handled in a
>> >> -            distributed manner, then this flow is only programmed on
>> >> +            distributed manner, then a priority-92 flow is
> programmed on
>> >>              the gateway port instance on the
>> >> -            <code>redirect-chassis</code>.  This behavior avoids
>> >> +            <code>redirect-chassis</code>.  A priority-91 drop flow is
>> >> +            programmed on the other chassis when ARP requests/NS
> packets
>> >> +            are received on the gateway port. This behavior avoids
>> >>              generation of multiple ARP responses from different
> chassis,
>> >>              and allows upstream MAC learning to point to the
>> >>              <code>redirect-chassis</code>.
>> >> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
>> >> index a1ba56f..10fc8cf 100644
>> >> --- a/northd/ovn-northd.c
>> >> +++ b/northd/ovn-northd.c
>> >> @@ -8048,7 +8048,7 @@ lrouter_nat_is_stateless(const struct nbrec_nat
>> > *nat)
>> >>  static void
>> >>  build_lrouter_arp_flow(struct ovn_datapath *od, struct ovn_port *op,
>> >>                         const char *ip_address, const char *eth_addr,
>> >> -                       struct ds *extra_match, uint16_t priority,
>> >> +                       struct ds *extra_match, bool drop, uint16_t
>> > priority,
>> >>                         struct hmap *lflows, const struct
>> > ovsdb_idl_row *hint)
>> >>  {
>> >>      struct ds match = DS_EMPTY_INITIALIZER;
>> >> @@ -8063,20 +8063,24 @@ build_lrouter_arp_flow(struct ovn_datapath
>> > *od, struct ovn_port *op,
>> >>      if (extra_match && ds_last(extra_match) != EOF) {
>> >>          ds_put_format(&match, " && %s", ds_cstr(extra_match));
>> >>      }
>> >> -    ds_put_format(&actions,
>> >> -                  "eth.dst = eth.src; "
>> >> -                  "eth.src = %s; "
>> >> -                  "arp.op = 2; /* ARP reply */ "
>> >> -                  "arp.tha = arp.sha; "
>> >> -                  "arp.sha = %s; "
>> >> -                  "arp.tpa = arp.spa; "
>> >> -                  "arp.spa = %s; "
>> >> -                  "outport = inport; "
>> >> -                  "flags.loopback = 1; "
>> >> -                  "output;",
>> >> -                  eth_addr,
>> >> -                  eth_addr,
>> >> -                  ip_address);
>> >> +    if (drop) {
>> >> +        ds_put_format(&actions, "drop;");
>> >> +    } else {
>> >> +        ds_put_format(&actions,
>> >> +                      "eth.dst = eth.src; "
>> >> +                      "eth.src = %s; "
>> >> +                      "arp.op = 2; /* ARP reply */ "
>> >> +                      "arp.tha = arp.sha; "
>> >> +                      "arp.sha = %s; "
>> >> +                      "arp.tpa = arp.spa; "
>> >> +                      "arp.spa = %s; "
>> >> +                      "outport = inport; "
>> >> +                      "flags.loopback = 1; "
>> >> +                      "output;",
>> >> +                      eth_addr,
>> >> +                      eth_addr,
>> >> +                      ip_address);
>> >> +    }
>> >>
>> >>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT,
> priority,
>> >>                              ds_cstr(&match), ds_cstr(&actions), hint);
>> >> @@ -8095,7 +8099,7 @@ static void
>> >>  build_lrouter_nd_flow(struct ovn_datapath *od, struct ovn_port *op,
>> >>                        const char *action, const char *ip_address,
>> >>                        const char *sn_ip_address, const char *eth_addr,
>> >> -                      struct ds *extra_match, uint16_t priority,
>> >> +                      struct ds *extra_match, bool drop, uint16_t
>> > priority,
>> >>                        struct hmap *lflows,
>> >>                        const struct ovsdb_idl_row *hint)
>> >>  {
>> >> @@ -8117,21 +8121,25 @@ build_lrouter_nd_flow(struct ovn_datapath *od,
>> > struct ovn_port *op,
>> >>          ds_put_format(&match, " && %s", ds_cstr(extra_match));
>> >>      }
>> >>
>> >> -    ds_put_format(&actions,
>> >> -                  "%s { "
>> >> -                    "eth.src = %s; "
>> >> -                    "ip6.src = %s; "
>> >> -                    "nd.target = %s; "
>> >> -                    "nd.tll = %s; "
>> >> -                    "outport = inport; "
>> >> -                    "flags.loopback = 1; "
>> >> -                    "output; "
>> >> -                  "};",
>> >> -                  action,
>> >> -                  eth_addr,
>> >> -                  ip_address,
>> >> -                  ip_address,
>> >> -                  eth_addr);
>> >> +    if (drop) {
>> >> +        ds_put_format(&actions, "drop;");
>> >> +    } else {
>> >> +        ds_put_format(&actions,
>> >> +                      "%s { "
>> >> +                        "eth.src = %s; "
>> >> +                        "ip6.src = %s; "
>> >> +                        "nd.target = %s; "
>> >> +                        "nd.tll = %s; "
>> >> +                        "outport = inport; "
>> >> +                        "flags.loopback = 1; "
>> >> +                        "output; "
>> >> +                      "};",
>> >> +                      action,
>> >> +                      eth_addr,
>> >> +                      ip_address,
>> >> +                      ip_address,
>> >> +                      eth_addr);
>> >> +    }
>> >>
>> >>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT,
> priority,
>> >>                              ds_cstr(&match), ds_cstr(&actions), hint);
>> >> @@ -8321,7 +8329,41 @@ build_lrouter_flows(struct hmap *datapaths,
>> > struct hmap *ports,
>> >>                        "ip4.dst == 0.0.0.0/8 <http://0.0.0.0/8>
> <http://0.0.0.0/8>",
>> >>                        "drop;");
>> >>
>> >> -        /* Priority-90 flows reply to ARP requests and ND packets. */
>> >> +        /* Priority-90-92 flows handle ARP requests and ND packets.
>> > Most are
>> >> +         * per logical port but DNAT addresses can be handled per
>> > datapath
>> >> +         * for non gateway router ports.
>> >> +         */
>> >> +        for (int i = 0; i < od->nbr->n_nat; i++) {
>> >> +            struct ovn_nat *nat_entry = &od->nat_entries[i];
>> >> +            const struct nbrec_nat *nat = nat_entry->nb;
>> >> +
>> >> +            /* Skip entries we failed to parse. */
>> >> +            if (!nat_entry_is_valid(nat_entry)) {
>> >> +                continue;
>> >> +            }
>> >> +
>> >> +            if (!strcmp(nat->type, "snat")) {
>> >> +                continue;
>> >> +            }
>> >> +
>> >> +            /* Priority 91 and 92 flows are added for each gateway
> router
>> >> +             * port to handle the special cases. In case we get the
>> > packet
>> >> +             * on a regular port, just reply with the port's ETH
> address.
>> >> +             */
>> >> +            struct lport_addresses *ext_addrs = &nat_entry->ext_addrs;
>> >> +            if (nat_entry_is_v6(nat_entry)) {
>> >> +                build_lrouter_nd_flow(od, NULL, "nd_na",
>> >> +                                      ext_addrs->ipv6_addrs[0].addr_s,
>> >> +                                    
>  ext_addrs->ipv6_addrs[0].sn_addr_s,
>> >> +                                      REG_INPORT_ETH_ADDR, NULL,
>> > false, 90,
>> >> +                                      lflows, &nat->header_);
>> >> +            } else {
>> >> +                build_lrouter_arp_flow(od, NULL,
>> >> +                                      
> ext_addrs->ipv4_addrs[0].addr_s,
>> >> +                                       REG_INPORT_ETH_ADDR, NULL,
>> > false, 90,
>> >> +                                       lflows, &nat->header_);
>> >> +            }
>> >> +        }
>> >>
>> >>          /* Drop ARP packets (priority 85). ARP request packets for
>> > router's own
>> >>           * IPs are handled with priority-90 flows.
>> >> @@ -8471,8 +8513,8 @@ build_lrouter_flows(struct hmap *datapaths,
>> > struct hmap *ports,
>> >>
>> >>              build_lrouter_arp_flow(op->od, op,
>> >>                                    
> op->lrp_networks.ipv4_addrs[i].addr_s,
>> >> -                                   REG_INPORT_ETH_ADDR, &match, 90,
>> > lflows,
>> >> -                                   &op->nbrp->header_);
>> >> +                                   REG_INPORT_ETH_ADDR, &match,
>> > false, 90,
>> >> +                                   lflows, &op->nbrp->header_);
>> >>          }
>> >>
>> >>          /* A set to hold all load-balancer vips that need ARP
>> > responses. */
>> >> @@ -8490,7 +8532,7 @@ build_lrouter_flows(struct hmap *datapaths,
>> > struct hmap *ports,
>> >>
>> >>              build_lrouter_arp_flow(op->od, op,
>> >>                                     ip_address, REG_INPORT_ETH_ADDR,
>> >> -                                   &match, 90, lflows, NULL);
>> >> +                                   &match, false, 90, lflows, NULL);
>> >>          }
>> >>
>> >>          SSET_FOR_EACH (ip_address, &all_ips_v6) {
>> >> @@ -8502,7 +8544,7 @@ build_lrouter_flows(struct hmap *datapaths,
>> > struct hmap *ports,
>> >>
>> >>              build_lrouter_nd_flow(op->od, op, "nd_na",
>> >>                                    ip_address, NULL,
> REG_INPORT_ETH_ADDR,
>> >> -                                  &match, 90, lflows, NULL);
>> >> +                                  &match, false, 90, lflows, NULL);
>> >>          }
>> >>
>> >>          sset_destroy(&all_ips_v4);
>> >> @@ -8555,53 +8597,84 @@ build_lrouter_flows(struct hmap *datapaths,
>> > struct hmap *ports,
>> >>              /* Mac address to use when replying to ARP/NS. */
>> >>              const char *mac_s = REG_INPORT_ETH_ADDR;
>> >>
>> >> +            /* ARP/NS packets are taken care of per router. The only
>> > exception
>> >> +             * is on the l3dgw_port where we might need to use a
>> > different
>> >> +             * ETH address.
>> >> +             */
>> >> +            if (op != op->od->l3dgw_port) {
>> >
>> > This check should instead be put before the for loop, since this
>> > condition is the same for the whole loop.
>> >
>>
>> Hi Han,
>>
>> Thanks for the review. I don't think that would work because we also
>> build snat_ips in the same loop, regardless of op->od->l3dgw_port.
>>
>> It might be cleaner to run the loop twice, once for SNAT and once for
>> DNAT (if op == op->od->l3dgw_port).
>>
>> What do you think?
>>
> 
> Sorry, I missed that part. Yes, it would be cleaner to separate the
> loop.  The first loop for collecting SNAT, and then we can check if (op
> == op->od->l3dgw_port) before the second loop.
> 

Thanks, I sent v4:

https://patchwork.ozlabs.org/project/openvswitch/list/?series=187493

Regards,
Dumitru
diff mbox series

Patch

diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 84224ff..11607c0 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -1857,9 +1857,8 @@  nd_na_router {
           IPv4: For a configured DNAT IP address or a load balancer
           IPv4 VIP <var>A</var>, for each router port <var>P</var> with
           Ethernet address <var>E</var>, a priority-90 flow matches
-          <code>inport == <var>P</var> &amp;&amp; arp.op == 1 &amp;&amp;
-          arp.tpa == <var>A</var></code> (ARP request)
-          with the following actions:
+          <code>arp.op == 1 &amp;&amp; arp.tpa == <var>A</var></code>
+          (ARP request) with the following actions:
         </p>
 
         <pre>
@@ -1876,6 +1875,11 @@  output;
         </pre>
 
         <p>
+          IPv4: For a configured load balancer IPv4 VIP, a similar flow is
+          added with the additional match <code>inport == <var>P</var></code>.
+        </p>
+
+        <p>
           If the router port <var>P</var> is a distributed gateway router
           port, then the <code>is_chassis_resident(<var>P</var>)</code> is
           also added in the match condition for the load balancer IPv4
@@ -1922,9 +1926,11 @@  nd_na {
         <ul>
           <li>
             If the corresponding NAT rule cannot be handled in a
-            distributed manner, then this flow is only programmed on
+            distributed manner, then a priority-92 flow is programmed on
             the gateway port instance on the
-            <code>redirect-chassis</code>.  This behavior avoids
+            <code>redirect-chassis</code>.  A priority-91 drop flow is
+            programmed on the other chassis when ARP requests/NS packets
+            are received on the gateway port. This behavior avoids
             generation of multiple ARP responses from different chassis,
             and allows upstream MAC learning to point to the
             <code>redirect-chassis</code>.
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index a1ba56f..10fc8cf 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -8048,7 +8048,7 @@  lrouter_nat_is_stateless(const struct nbrec_nat *nat)
 static void
 build_lrouter_arp_flow(struct ovn_datapath *od, struct ovn_port *op,
                        const char *ip_address, const char *eth_addr,
-                       struct ds *extra_match, uint16_t priority,
+                       struct ds *extra_match, bool drop, uint16_t priority,
                        struct hmap *lflows, const struct ovsdb_idl_row *hint)
 {
     struct ds match = DS_EMPTY_INITIALIZER;
@@ -8063,20 +8063,24 @@  build_lrouter_arp_flow(struct ovn_datapath *od, struct ovn_port *op,
     if (extra_match && ds_last(extra_match) != EOF) {
         ds_put_format(&match, " && %s", ds_cstr(extra_match));
     }
-    ds_put_format(&actions,
-                  "eth.dst = eth.src; "
-                  "eth.src = %s; "
-                  "arp.op = 2; /* ARP reply */ "
-                  "arp.tha = arp.sha; "
-                  "arp.sha = %s; "
-                  "arp.tpa = arp.spa; "
-                  "arp.spa = %s; "
-                  "outport = inport; "
-                  "flags.loopback = 1; "
-                  "output;",
-                  eth_addr,
-                  eth_addr,
-                  ip_address);
+    if (drop) {
+        ds_put_format(&actions, "drop;");
+    } else {
+        ds_put_format(&actions,
+                      "eth.dst = eth.src; "
+                      "eth.src = %s; "
+                      "arp.op = 2; /* ARP reply */ "
+                      "arp.tha = arp.sha; "
+                      "arp.sha = %s; "
+                      "arp.tpa = arp.spa; "
+                      "arp.spa = %s; "
+                      "outport = inport; "
+                      "flags.loopback = 1; "
+                      "output;",
+                      eth_addr,
+                      eth_addr,
+                      ip_address);
+    }
 
     ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT, priority,
                             ds_cstr(&match), ds_cstr(&actions), hint);
@@ -8095,7 +8099,7 @@  static void
 build_lrouter_nd_flow(struct ovn_datapath *od, struct ovn_port *op,
                       const char *action, const char *ip_address,
                       const char *sn_ip_address, const char *eth_addr,
-                      struct ds *extra_match, uint16_t priority,
+                      struct ds *extra_match, bool drop, uint16_t priority,
                       struct hmap *lflows,
                       const struct ovsdb_idl_row *hint)
 {
@@ -8117,21 +8121,25 @@  build_lrouter_nd_flow(struct ovn_datapath *od, struct ovn_port *op,
         ds_put_format(&match, " && %s", ds_cstr(extra_match));
     }
 
-    ds_put_format(&actions,
-                  "%s { "
-                    "eth.src = %s; "
-                    "ip6.src = %s; "
-                    "nd.target = %s; "
-                    "nd.tll = %s; "
-                    "outport = inport; "
-                    "flags.loopback = 1; "
-                    "output; "
-                  "};",
-                  action,
-                  eth_addr,
-                  ip_address,
-                  ip_address,
-                  eth_addr);
+    if (drop) {
+        ds_put_format(&actions, "drop;");
+    } else {
+        ds_put_format(&actions,
+                      "%s { "
+                        "eth.src = %s; "
+                        "ip6.src = %s; "
+                        "nd.target = %s; "
+                        "nd.tll = %s; "
+                        "outport = inport; "
+                        "flags.loopback = 1; "
+                        "output; "
+                      "};",
+                      action,
+                      eth_addr,
+                      ip_address,
+                      ip_address,
+                      eth_addr);
+    }
 
     ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT, priority,
                             ds_cstr(&match), ds_cstr(&actions), hint);
@@ -8321,7 +8329,41 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
                       "ip4.dst == 0.0.0.0/8",
                       "drop;");
 
-        /* Priority-90 flows reply to ARP requests and ND packets. */
+        /* Priority-90-92 flows handle ARP requests and ND packets. Most are
+         * per logical port but DNAT addresses can be handled per datapath
+         * for non gateway router ports.
+         */
+        for (int i = 0; i < od->nbr->n_nat; i++) {
+            struct ovn_nat *nat_entry = &od->nat_entries[i];
+            const struct nbrec_nat *nat = nat_entry->nb;
+
+            /* Skip entries we failed to parse. */
+            if (!nat_entry_is_valid(nat_entry)) {
+                continue;
+            }
+
+            if (!strcmp(nat->type, "snat")) {
+                continue;
+            }
+
+            /* Priority 91 and 92 flows are added for each gateway router
+             * port to handle the special cases. In case we get the packet
+             * on a regular port, just reply with the port's ETH address.
+             */
+            struct lport_addresses *ext_addrs = &nat_entry->ext_addrs;
+            if (nat_entry_is_v6(nat_entry)) {
+                build_lrouter_nd_flow(od, NULL, "nd_na",
+                                      ext_addrs->ipv6_addrs[0].addr_s,
+                                      ext_addrs->ipv6_addrs[0].sn_addr_s,
+                                      REG_INPORT_ETH_ADDR, NULL, false, 90,
+                                      lflows, &nat->header_);
+            } else {
+                build_lrouter_arp_flow(od, NULL,
+                                       ext_addrs->ipv4_addrs[0].addr_s,
+                                       REG_INPORT_ETH_ADDR, NULL, false, 90,
+                                       lflows, &nat->header_);
+            }
+        }
 
         /* Drop ARP packets (priority 85). ARP request packets for router's own
          * IPs are handled with priority-90 flows.
@@ -8471,8 +8513,8 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
 
             build_lrouter_arp_flow(op->od, op,
                                    op->lrp_networks.ipv4_addrs[i].addr_s,
-                                   REG_INPORT_ETH_ADDR, &match, 90, lflows,
-                                   &op->nbrp->header_);
+                                   REG_INPORT_ETH_ADDR, &match, false, 90,
+                                   lflows, &op->nbrp->header_);
         }
 
         /* A set to hold all load-balancer vips that need ARP responses. */
@@ -8490,7 +8532,7 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
 
             build_lrouter_arp_flow(op->od, op,
                                    ip_address, REG_INPORT_ETH_ADDR,
-                                   &match, 90, lflows, NULL);
+                                   &match, false, 90, lflows, NULL);
         }
 
         SSET_FOR_EACH (ip_address, &all_ips_v6) {
@@ -8502,7 +8544,7 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
 
             build_lrouter_nd_flow(op->od, op, "nd_na",
                                   ip_address, NULL, REG_INPORT_ETH_ADDR,
-                                  &match, 90, lflows, NULL);
+                                  &match, false, 90, lflows, NULL);
         }
 
         sset_destroy(&all_ips_v4);
@@ -8555,53 +8597,84 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
             /* Mac address to use when replying to ARP/NS. */
             const char *mac_s = REG_INPORT_ETH_ADDR;
 
+            /* ARP/NS packets are taken care of per router. The only exception
+             * is on the l3dgw_port where we might need to use a different
+             * ETH address.
+             */
+            if (op != op->od->l3dgw_port) {
+                continue;
+            }
+
             /* ARP / ND handling for external IP addresses.
              *
              * DNAT IP addresses are external IP addresses that need ARP
              * handling. */
             ds_clear(&match);
 
-            if (op->od->l3dgw_port && op == op->od->l3dgw_port) {
-                struct eth_addr mac;
-                if (nat->external_mac &&
-                    eth_addr_from_string(nat->external_mac, &mac)
-                    && nat->logical_port) {
-                    /* distributed NAT case, use nat->external_mac */
-                    mac_s = nat->external_mac;
-                    /* Traffic with eth.src = nat->external_mac should only be
-                     * sent from the chassis where nat->logical_port is
-                     * resident, so that upstream MAC learning points to the
-                     * correct chassis.  Also need to avoid generation of
-                     * multiple ARP responses from different chassis. */
-                    ds_put_format(&match, "is_chassis_resident(\"%s\")",
-                                  nat->logical_port);
-                } else {
-                    mac_s = REG_INPORT_ETH_ADDR;
-                    /* Traffic with eth.src = l3dgw_port->lrp_networks.ea_s
-                     * should only be sent from the "redirect-chassis", so that
-                     * upstream MAC learning points to the "redirect-chassis".
-                     * Also need to avoid generation of multiple ARP responses
-                     * from different chassis. */
-                    if (op->od->l3redirect_port) {
-                        ds_put_format(&match, "is_chassis_resident(%s)",
-                                      op->od->l3redirect_port->json_key);
-                    }
+            struct ds match_cr_port = DS_EMPTY_INITIALIZER;
+            struct ds match_non_cr_port = DS_EMPTY_INITIALIZER;
+
+            struct eth_addr mac;
+            if (nat->external_mac &&
+                eth_addr_from_string(nat->external_mac, &mac)
+                && nat->logical_port) {
+                /* distributed NAT case, use nat->external_mac */
+                mac_s = nat->external_mac;
+                /* Traffic with eth.src = nat->external_mac should only be
+                 * sent from the chassis where nat->logical_port is
+                 * resident, so that upstream MAC learning points to the
+                 * correct chassis.  Also need to avoid generation of
+                 * multiple ARP responses from different chassis. */
+                ds_put_format(&match_cr_port,
+                              "is_chassis_resident(\"%s\")",
+                              nat->logical_port);
+                ds_put_format(&match_non_cr_port,
+                              "!is_chassis_resident(\"%s\")",
+                              nat->logical_port);
+            } else {
+                mac_s = REG_INPORT_ETH_ADDR;
+                /* Traffic with eth.src = l3dgw_port->lrp_networks.ea_s
+                 * should only be sent from the "redirect-chassis", so that
+                 * upstream MAC learning points to the "redirect-chassis".
+                 * Also need to avoid generation of multiple ARP responses
+                 * from different chassis. */
+                if (op->od->l3redirect_port) {
+                    ds_put_format(&match_cr_port,
+                                  "is_chassis_resident(\"%s\")",
+                                  op->od->l3redirect_port->json_key);
+                    ds_put_format(&match_non_cr_port,
+                                  "!is_chassis_resident(\"%s\")",
+                                  op->od->l3redirect_port->json_key);
                 }
             }
 
+            /* Respond to ARP/NS requests on the chassis that binds the gw
+             * port. Drop the ARP/NS requests on other chassis.
+             */
             struct lport_addresses *ext_addrs = &nat_entry->ext_addrs;
             if (nat_entry_is_v6(nat_entry)) {
                 build_lrouter_nd_flow(op->od, op, "nd_na",
                                       ext_addrs->ipv6_addrs[0].addr_s,
                                       ext_addrs->ipv6_addrs[0].sn_addr_s,
-                                      mac_s, &match, 90,
+                                      mac_s, &match_cr_port, false, 92,
+                                      lflows, &nat->header_);
+                build_lrouter_nd_flow(op->od, op, "nd_na",
+                                      ext_addrs->ipv6_addrs[0].addr_s,
+                                      ext_addrs->ipv6_addrs[0].sn_addr_s,
+                                      mac_s, &match_non_cr_port, true, 91,
                                       lflows, &nat->header_);
             } else {
                 build_lrouter_arp_flow(op->od, op,
                                        ext_addrs->ipv4_addrs[0].addr_s,
-                                       mac_s, &match, 90,
+                                       mac_s, &match_cr_port, false, 92,
+                                       lflows, &nat->header_);
+                build_lrouter_arp_flow(op->od, op,
+                                       ext_addrs->ipv4_addrs[0].addr_s,
+                                       mac_s, &match_non_cr_port, true, 91,
                                        lflows, &nat->header_);
             }
+            ds_destroy(&match_cr_port);
+            ds_destroy(&match_non_cr_port);
         }
 
         if (!smap_get(&op->od->nbr->options, "chassis")
@@ -8787,8 +8860,8 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
             build_lrouter_nd_flow(op->od, op, "nd_na_router",
                                   op->lrp_networks.ipv6_addrs[i].addr_s,
                                   op->lrp_networks.ipv6_addrs[i].sn_addr_s,
-                                  REG_INPORT_ETH_ADDR, &match, 90, lflows,
-                                  &op->nbrp->header_);
+                                  REG_INPORT_ETH_ADDR, &match, false, 90,
+                                  lflows, &op->nbrp->header_);
         }
 
         /* UDP/TCP port unreachable */
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index a3fb7ef..4a13456 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -1594,33 +1594,24 @@  action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
 # Ingress router port ETH address is used for ARP reply/NA in lr_in_ip_input.
 AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
-action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1; output;)
-  table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.2), dnl
+match=(arp.op == 1 && arp.tpa == 43.43.43.2), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.3), dnl
+match=(arp.op == 1 && arp.tpa == 43.43.43.3), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.4), dnl
+match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
+match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
+action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1; output;)
+  table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1, ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
 action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = fe80::200:ff:fe00:1; nd.target = fe80::200:ff:fe00:1; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.1; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2), dnl
-action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1; output;)
-  table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3), dnl
-action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1; output;)
-  table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4), dnl
-action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1; output;)
-  table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100, ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100), dnl
 action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
 ])
@@ -1654,37 +1645,55 @@  action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
 
 # Ingress router port is used for ARP reply/NA in lr_in_ip_input.
 # xxreg0[0..47] is used unless external_mac is set.
+# Priority 90 flows (per router).
 AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
-action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1; output;)
-  table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.2), dnl
+match=(arp.op == 1 && arp.tpa == 43.43.43.2), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.3), dnl
+match=(arp.op == 1 && arp.tpa == 43.43.43.3), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 43.43.43.4), dnl
+match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
+match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
+action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 42.42.42.1; outport = inport; flags.loopback = 1; output;)
+  table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1, ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
 action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = fe80::200:ff:fe00:1; nd.target = fe80::200:ff:fe00:1; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.1; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2 && is_chassis_resident("cr-lrp-public")), dnl
+match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100, ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 && is_chassis_resident("cr-lrp-public")), dnl
+action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
+])
+
+# Priority 91 drop flows (per distributed gw port), if port is not resident.
+AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=91" | grep "arp\|nd" | sort], [0], [dnl
+  table=3 (lr_in_ip_input     ), priority=91   , dnl
+match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2 && !is_chassis_resident(""cr-lrp-public"")), dnl
+action=(drop;)
+  table=3 (lr_in_ip_input     ), priority=91   , dnl
+match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3 && !is_chassis_resident(""cr-lrp-public"")), dnl
+action=(drop;)
+  table=3 (lr_in_ip_input     ), priority=91   , dnl
+match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4 && !is_chassis_resident("ls-vm")), dnl
+action=(drop;)
+])
+
+# Priority 92 ARP/NS responders (per distributed gw port), if port is resident.
+AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=92" | grep "arp\|nd" | sort], [0], [dnl
+  table=3 (lr_in_ip_input     ), priority=92   , dnl
+match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.2 && is_chassis_resident(""cr-lrp-public"")), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.2; outport = inport; flags.loopback = 1; output;)
-  table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3 && is_chassis_resident("cr-lrp-public")), dnl
+  table=3 (lr_in_ip_input     ), priority=92   , dnl
+match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.3 && is_chassis_resident(""cr-lrp-public"")), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa = arp.spa; arp.spa = 43.43.43.3; outport = inport; flags.loopback = 1; output;)
-  table=3 (lr_in_ip_input     ), priority=90   , dnl
+  table=3 (lr_in_ip_input     ), priority=92   , dnl
 match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.4 && is_chassis_resident("ls-vm")), dnl
 action=(eth.dst = eth.src; eth.src = 00:00:00:00:00:02; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = 00:00:00:00:00:02; arp.tpa = arp.spa; arp.spa = 43.43.43.4; outport = inport; flags.loopback = 1; output;)
-  table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100, ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 && is_chassis_resident("cr-lrp-public")), dnl
-action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = fe80::200:ff:fe00:100; nd.target = fe80::200:ff:fe00:100; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
 ])
 
 # xreg0[0..47] isn't used anywhere else.
diff --git a/tests/ovn.at b/tests/ovn.at
index 1a1cfbf..9522b55 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -19336,7 +19336,7 @@  OVS_WAIT_UNTIL([
 send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex 10 0 0 121)
 
 # Verify that the ARP request is replied to from hv1 and not hv2.
-match_arp_req="priority=90.*${match_r1_metadata}.*arp_tpa=10.0.0.121,arp_op=1"
+match_arp_req="priority=92.*${match_r1_metadata}.*arp_tpa=10.0.0.121,arp_op=1"
 
 as hv1
 OVS_WAIT_UNTIL([
@@ -19356,7 +19356,7 @@  OVS_WAIT_UNTIL([
 send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex 10 0 0 122)
 
 # Verify that the ARP request is replied to from hv2 and not hv1.
-match_arp_req="priority=90.*${match_r1_metadata}.*arp_tpa=10.0.0.122,arp_op=1"
+match_arp_req="priority=92.*${match_r1_metadata}.*arp_tpa=10.0.0.122,arp_op=1"
 
 as hv2
 OVS_WAIT_UNTIL([
@@ -19400,7 +19400,7 @@  dst_ipv6=00100000000000000000000000000121
 send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 72dd
 
 # Verify that the ND_NS is replied to from hv1 and not hv2.
-match_nd_ns="priority=90.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::121"
+match_nd_ns="priority=92.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::121"
 
 as hv1
 OVS_WAIT_UNTIL([
@@ -19422,7 +19422,7 @@  dst_ipv6=00100000000000000000000000000122
 send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 72db
 
 # Verify that the ND_NS is replied to from hv2 and not hv1.
-match_nd_ns="priority=90.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::122"
+match_nd_ns="priority=92.*${match_r1_metadata}.*icmp_type=135.*nd_target=10::122"
 
 as hv2
 OVS_WAIT_UNTIL([