
[ovs-dev,ovn,v2] ovn-northd: Limit ARP/ND broadcast domain whenever possible.

Message ID 1571814640-30749-1-git-send-email-dceara@redhat.com
State Superseded
Series [ovs-dev,ovn,v2] ovn-northd: Limit ARP/ND broadcast domain whenever possible.

Commit Message

Dumitru Ceara Oct. 23, 2019, 7:10 a.m. UTC
ARP request and ND NS packets for router-owned IPs were being
flooded in the complete L2 domain (using the MC_FLOOD multicast group).
However, this creates a scaling issue in scenarios where aggregation
logical switches are connected to many logical routers (~350). The
logical pipelines of all routers would have to be executed before the
packet is finally replied to by a single router, the owner of the IP
address.

This commit limits the broadcast domain by bypassing the L2 Lookup stage
for ARP requests that will be replied to by a single router. The packets
are still flooded in the L2 domain, but not on any of the patch ports
towards other routers connected to the switch. This restricted flooding
is done by using a new multicast group (MC_ARP_ND).

IPs that are owned by the routers and for which this fix applies are:
- IP addresses configured on the router ports.
- VIPs.
- NAT IPs.
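
For illustration, with a router port "sw0-lr0" that owns 10.0.0.1 (port
name and address are hypothetical), the new logical flow looks roughly
like:

  table=17(ls_in_l2_lkup), priority=75,
    match=(arp.tpa == 10.0.0.1 && arp.op == 1 && is_chassis_resident("sw0-lr0")),
    action=(clone { outport = "sw0-lr0"; output; }; outport = "_MC_arp_nd"; output;)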

Reported-at: https://bugzilla.redhat.com/1756945
Reported-by: Anil Venkata <vkommadi@redhat.com>
Signed-off-by: Dumitru Ceara <dceara@redhat.com>

---
v2: Move ARP broadcast domain limiting to table S_SWITCH_IN_L2_LKUP to
address localnet ports too.
---
 lib/mcast-group-index.h |   1 +
 northd/ovn-northd.8.xml |  16 ++++
 northd/ovn-northd.c     | 190 +++++++++++++++++++++++++++++++++++++++++-------
 tests/ovn.at            |   4 +-
 4 files changed, 181 insertions(+), 30 deletions(-)

Comments

Numan Siddique Oct. 25, 2019, 4:04 a.m. UTC | #1
On Wed, Oct 23, 2019 at 12:41 PM Dumitru Ceara <dceara@redhat.com> wrote:

> [...]
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 22b272a..3bad363 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -10719,7 +10719,7 @@ ovn-nbctl --wait=hv --timeout=3 sync
>  # Check that there is a logical flow in logical switch foo's pipeline
>  # to set the outport to rp-foo (which is expected).
>  OVS_WAIT_UNTIL([test 1 = `ovn-sbctl dump-flows foo | grep ls_in_l2_lkup | \
> -grep rp-foo | grep -v is_chassis_resident | wc -l`])
> +grep rp-foo | grep -v is_chassis_resident | grep "eth.dst" | wc -l`])
>

Hi Dumitru,

The patch LGTM. Can you please also grep for "priority=<expected_no>" to be
sure.
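
For example, something like this (the priority value below is only a
placeholder for the expected one):

  OVS_WAIT_UNTIL([test 1 = `ovn-sbctl dump-flows foo | grep ls_in_l2_lkup | \
  grep rp-foo | grep "priority=50" | grep -v is_chassis_resident | wc -l`])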

It would be great if you could also add a few test cases.

Thanks
Numan


>  # Set the option 'reside-on-redirect-chassis' for foo
>  ovn-nbctl set logical_router_port foo options:reside-on-redirect-chassis=true
> @@ -10727,7 +10727,7 @@ ovn-nbctl set logical_router_port foo options:reside-on-redirect-chassis=true
>  # to set the outport to rp-foo with the condition is_chassis_redirect.
>  ovn-sbctl dump-flows foo
>  OVS_WAIT_UNTIL([test 1 = `ovn-sbctl dump-flows foo | grep ls_in_l2_lkup | \
> -grep rp-foo | grep is_chassis_resident | wc -l`])
> +grep rp-foo | grep is_chassis_resident | grep -v clone | wc -l`])
>
Han Zhou Oct. 31, 2019, 1:05 a.m. UTC | #2
On Wed, Oct 23, 2019 at 12:11 AM Dumitru Ceara <dceara@redhat.com> wrote:
>
> [...]
>

Thanks Dumitru for addressing the issue. I have only one concern, but I am
not sure if it would cause a real issue. The concern is that this patch
changes the existing behavior: originally, if an ARP request was broadcast
by an external router, all OVN routers would learn the MAC-IP binding from
it, but with this change only the router that owns the requested IP would
learn it. At the same time, what if an ARP response (or GARP) comes in?
Would it still trigger the same problem, since it has to go through all the
router pipelines?

Thanks,
Han
Dumitru Ceara Oct. 31, 2019, 8:12 a.m. UTC | #3
On Thu, Oct 31, 2019 at 2:05 AM Han Zhou <hzhou@ovn.org> wrote:
>
> On Wed, Oct 23, 2019 at 12:11 AM Dumitru Ceara <dceara@redhat.com> wrote:
> >
> > [...]
>
> Thanks Dumitru for addressing the issue. I have only one concern, but I am
> not sure if it would cause a real issue. The concern is that this patch
> changes the existing behavior: originally, if an ARP request was broadcast
> by an external router, all OVN routers would learn the MAC-IP binding from
> it, but with this change only the router that owns the requested IP would
> learn it. At the same time, what if an ARP response (or GARP) comes in?
> Would it still trigger the same problem, since it has to go through all the
> router pipelines?

Hi Han,

Indeed, without the patch all connected OVN routers would learn the
MAC-IP binding from an ARP request. However, even with a reasonably
sized topology we can easily end up going over the 4K resubmit limit
for an ARP request broadcast packet, because we run all the router
pipelines. I think that's a bigger problem, because there's no way to
make sure that the ARP request reaches at least the router that owns
the IP address, so we might end up unable to reach part of the network.
With the fix, other routers will now have to resolve the hosts that they
would've previously learned from the ARP request, but in the end we
should have proper connectivity in the whole network.

The patch changes the behavior only for ARP requests, because replies
are usually (though not always) unicast. GARP requests from external
hosts are not affected by the fix, because we do a host match on the
target IP and make sure it matches an IP owned by OVN. In the case of an
external GARP, the packet will still get flooded to all router pipelines.
While we might hit the 4K resubmit issue there, the reasoning is that at
least some routers get the packet and learn the MAC binding.
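
For example, a GARP for an address OVN doesn't own matches none of the new
priority-75 flows and falls through to the pre-existing flood flow, roughly
(flow shown for illustration only):

  table=17(ls_in_l2_lkup), priority=70,
    match=(eth.mcast), action=(outport = "_MC_flood"; output;)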

I'll be sending a v3 soon, as I need to rebase, add autotests, and also
fix an issue I found with packets coming from VXLAN tunnels.

Thanks,
Dumitru

>
> Thanks,
> Han
Dumitru Ceara Oct. 31, 2019, 11:21 a.m. UTC | #4
On Thu, Oct 31, 2019 at 9:12 AM Dumitru Ceara <dceara@redhat.com> wrote:
>
>
> [...]
>
> I'll be sending a v3 soon, as I need to rebase, add autotests, and also
> fix an issue I found with packets coming from VXLAN tunnels.

Here's v3, which should cover the newly added IPv6 NAT case and address
Numan's comments:
https://patchwork.ozlabs.org/patch/1187378/

Thanks,
Dumitru

Patch

diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h
index ba995ba..06bd8b3 100644
--- a/lib/mcast-group-index.h
+++ b/lib/mcast-group-index.h
@@ -27,6 +27,7 @@  enum ovn_mcast_tunnel_keys {
 
     OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
     OVN_MCAST_UNKNOWN_TUNNEL_KEY,
+    OVN_MCAST_ARP_ND_TUNNEL_KEY,
     OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY,
     OVN_MCAST_MROUTER_STATIC_TUNNEL_KEY,
     OVN_MCAST_STATIC_TUNNEL_KEY,
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index d3e0e5e..6bba8c9 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -1005,6 +1005,22 @@  output;
       </li>
 
       <li>
+        Priority-80 flows for each port connected to a logical router,
+        matching self-originated GARP/ARP request/ND packets. These packets
+        are flooded to the <code>MC_FLOOD</code> multicast group, which
+        contains all logical ports.
+      </li>
+
+      <li>
+        Priority-75 flows for each IP address/VIP/NAT address owned by a
+        router port connected to the switch. These flows match ARP requests
+        and ND packets for the specific IP addresses.  Matched packets are
+        forwarded in the L3 domain only to the router that owns the IP
+        address and flooded in the L2 domain on all ports except patch
+        ports connected to logical routers.
+      </li>
+
+      <li>
         A priority-70 flow that outputs all packets with an Ethernet broadcast
         or multicast <code>eth.dst</code> to the <code>MC_FLOOD</code>
         multicast group.
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index ea8ad7c..649c0ac 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -1193,6 +1193,34 @@  ovn_port_allocate_key(struct ovn_datapath *od)
                           1, (1u << 15) - 1, &od->port_key_hint);
 }
 
+/* Returns true if the logical switch port 'enabled' column is empty or
+ * set to true.  Otherwise, returns false. */
+static bool
+lsp_is_enabled(const struct nbrec_logical_switch_port *lsp)
+{
+    return !lsp->n_enabled || *lsp->enabled;
+}
+
+/* Returns true only if the logical switch port 'up' column is set to true.
+ * Otherwise, if the column is not set or set to false, returns false. */
+static bool
+lsp_is_up(const struct nbrec_logical_switch_port *lsp)
+{
+    return lsp->n_up && *lsp->up;
+}
+
+static bool
+lsp_is_external(const struct nbrec_logical_switch_port *nbsp)
+{
+    return !strcmp(nbsp->type, "external");
+}
+
+static bool
+lrport_is_enabled(const struct nbrec_logical_router_port *lrport)
+{
+    return !lrport->enabled || *lrport->enabled;
+}
+
 static char *
 chassis_redirect_name(const char *port_name)
 {
@@ -3018,6 +3046,10 @@  static const struct multicast_group mc_static =
 static const struct multicast_group mc_unknown =
     { MC_UNKNOWN, OVN_MCAST_UNKNOWN_TUNNEL_KEY };
 
+#define MC_ARP_ND "_MC_arp_nd"
+static const struct multicast_group mc_arp_nd =
+    { MC_ARP_ND, OVN_MCAST_ARP_ND_TUNNEL_KEY };
+
 static bool
 multicast_group_equal(const struct multicast_group *a,
                       const struct multicast_group *b)
@@ -3719,28 +3751,6 @@  build_port_security_ip(enum ovn_pipeline pipeline, struct ovn_port *op,
 
 }
 
-/* Returns true if the logical switch port 'enabled' column is empty or
- * set to true.  Otherwise, returns false. */
-static bool
-lsp_is_enabled(const struct nbrec_logical_switch_port *lsp)
-{
-    return !lsp->n_enabled || *lsp->enabled;
-}
-
-/* Returns true only if the logical switch port 'up' column is set to true.
- * Otherwise, if the column is not set or set to false, returns false. */
-static bool
-lsp_is_up(const struct nbrec_logical_switch_port *lsp)
-{
-    return lsp->n_up && *lsp->up;
-}
-
-static bool
-lsp_is_external(const struct nbrec_logical_switch_port *nbsp)
-{
-    return !strcmp(nbsp->type, "external");
-}
-
 static bool
 build_dhcpv4_action(struct ovn_port *op, ovs_be32 offer_ip,
                     struct ds *options_action, struct ds *response_action,
@@ -5143,6 +5153,121 @@  build_lrouter_groups(struct hmap *ports, struct ovs_list *lr_list)
     }
 }
 
+/*
+ * Ingress table 17: Flows that forward ARP/ND requests only to the routers
+ * that own the addresses. Packets are still flooded in the switching domain
+ * as regular broadcast.
+ */
+static void
+build_lswitch_rport_arp_flow(const char *target_address, int addr_family,
+                             struct ovn_port *patch_op,
+                             struct ovn_datapath *od,
+                             uint32_t priority,
+                             struct hmap *lflows)
+{
+    struct ds match   = DS_EMPTY_INITIALIZER;
+    struct ds actions = DS_EMPTY_INITIALIZER;
+
+    if (addr_family == AF_INET) {
+        ds_put_format(&match, "arp.tpa == %s && arp.op == 1", target_address);
+    } else {
+        ds_put_format(&match, "nd.target == %s && nd_ns", target_address);
+    }
+
+    ds_put_format(&match, " && is_chassis_resident(%s)", patch_op->json_key);
+
+    /* Send a clone of the packet to the router pipeline and flood the
+     * original in the broadcast domain (skipping router ports). */
+    ds_put_format(&actions,
+                  "clone { outport = %s; output; }; "
+                  "outport = \""MC_ARP_ND"\"; output;",
+                  patch_op->json_key);
+    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, priority,
+                  ds_cstr(&match), ds_cstr(&actions));
+
+    ds_destroy(&match);
+    ds_destroy(&actions);
+}
+
+/*
+ * Ingress table 17: Flows that forward ARP/ND requests only to the routers
+ * that own the addresses.
+ * Priorities:
+ * - 80: self-originated GARPs that need to follow regular processing.
+ * - 75: ARP requests to router-owned IPs (interface IP/LB/NAT).
+ */
+static void
+build_lswitch_rport_arp_responders(struct ovn_port *op,
+                                   struct ovn_datapath *sw_od,
+                                   struct ovn_port *sw_op,
+                                   struct hmap *lflows)
+{
+    if (!op || !op->nbrp) {
+        return;
+    }
+
+    if (!lrport_is_enabled(op->nbrp)) {
+        return;
+    }
+
+    struct ds match = DS_EMPTY_INITIALIZER;
+
+    /* Self-originated (G)ARP requests/ND need to be flooded as usual.
+     * Priority: 80.
+     */
+    ds_put_format(&match, "inport == %s && (arp.op == 1 || nd_ns)",
+                  sw_op->json_key);
+    ovn_lflow_add(lflows, sw_od, S_SWITCH_IN_L2_LKUP, 80,
+                  ds_cstr(&match),
+                  "outport = \""MC_FLOOD"\"; output;");
+
+    ds_destroy(&match);
+
+    /* Forward ARP requests for IPs configured on the router only to this
+     * router port.
+     * Priority: 75.
+     */
+    for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
+        build_lswitch_rport_arp_flow(op->lrp_networks.ipv4_addrs[i].addr_s,
+                                     AF_INET, sw_op, sw_od, 75, lflows);
+    }
+    for (int i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
+        build_lswitch_rport_arp_flow(op->lrp_networks.ipv6_addrs[i].addr_s,
+                                     AF_INET6, sw_op, sw_od, 75, lflows);
+    }
+
+    /* Forward ARP requests to load-balancer VIPs configured on the router
+     * only to this router port.
+     * Priority: 75.
+     */
+    struct sset all_ips = SSET_INITIALIZER(&all_ips);
+    const char *ip_address;
+    int addr_family;
+
+    get_router_load_balancer_ips(op->od, &all_ips, &addr_family);
+
+    SSET_FOR_EACH (ip_address, &all_ips) {
+        build_lswitch_rport_arp_flow(ip_address, addr_family, sw_op, sw_od,
+                                     75, lflows);
+    }
+    sset_destroy(&all_ips);
+
+    /* Forward ARP requests to NAT addresses configured on the router
+     * only to this router port.
+     * Priority: 75.
+     */
+    for (int i = 0; i < op->od->nbr->n_nat; i++) {
+        const struct nbrec_nat *nat = op->od->nbr->nat[i];
+
+        if (!strcmp(nat->type, "snat")) {
+            continue;
+        }
+
+        build_lswitch_rport_arp_flow(nat->external_ip, AF_INET, sw_op, sw_od,
+                                     75, lflows);
+    }
+}
+
 static void
 build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
                     struct hmap *port_groups, struct hmap *lflows,
@@ -5730,6 +5855,15 @@  build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
             continue;
         }
 
+        /* For ports connected to logical routers add flows to bypass the
+         * broadcast flooding of ARP/ND requests in table 17. We direct the
+         * requests only to the router port that owns the IP address.
+         */
+        if (!strcmp(op->nbsp->type, "router")) {
+            build_lswitch_rport_arp_responders(op->peer, op->od, op,
+                                               lflows);
+        }
+
         for (size_t i = 0; i < op->nbsp->n_addresses; i++) {
             /* Addresses are owned by the logical port.
              * Ethernet address followed by zero or more IPv4
@@ -5861,12 +5995,6 @@  build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
     ds_destroy(&actions);
 }
 
-static bool
-lrport_is_enabled(const struct nbrec_logical_router_port *lrport)
-{
-    return !lrport->enabled || *lrport->enabled;
-}
-
 /* Returns a string of the IP address of the router port 'op' that
  * overlaps with 'ip_s".  If one is not found, returns NULL.
  *
@@ -9129,6 +9257,12 @@  build_mcast_groups(struct northd_context *ctx,
         } else if (op->nbsp && lsp_is_enabled(op->nbsp)) {
             ovn_multicast_add(mcast_groups, &mc_flood, op);
 
+            /* Add all non-router ports to the ARP/ND L2 broadcast flood
+             * domain entry. */
+            if (strcmp(op->nbsp->type, "router")) {
+                ovn_multicast_add(mcast_groups, &mc_arp_nd, op);
+            }
+
             /* If this port is connected to a multicast router then add it
              * to the MC_MROUTER_FLOOD group.
              */
diff --git a/tests/ovn.at b/tests/ovn.at
index 22b272a..3bad363 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -10719,7 +10719,7 @@  ovn-nbctl --wait=hv --timeout=3 sync
 # Check that there is a logical flow in logical switch foo's pipeline
 # to set the outport to rp-foo (which is expected).
 OVS_WAIT_UNTIL([test 1 = `ovn-sbctl dump-flows foo | grep ls_in_l2_lkup | \
-grep rp-foo | grep -v is_chassis_resident | wc -l`])
+grep rp-foo | grep -v is_chassis_resident | grep "eth.dst" | wc -l`])
 
 # Set the option 'reside-on-redirect-chassis' for foo
 ovn-nbctl set logical_router_port foo options:reside-on-redirect-chassis=true
@@ -10727,7 +10727,7 @@  ovn-nbctl set logical_router_port foo options:reside-on-redirect-chassis=true
 # to set the outport to rp-foo with the condition is_chassis_redirect.
 ovn-sbctl dump-flows foo
 OVS_WAIT_UNTIL([test 1 = `ovn-sbctl dump-flows foo | grep ls_in_l2_lkup | \
-grep rp-foo | grep is_chassis_resident | wc -l`])
+grep rp-foo | grep is_chassis_resident | grep -v clone | wc -l`])
 
 echo "---------NB dump-----"
 ovn-nbctl show