diff mbox series

[ovs-dev,5/5] controller, northd: pass arp/nd from HW VTEP to lrouter pipeline

Message ID 20230519181859.1195040-6-odivlad@gmail.com
State Changes Requested
Headers show
Series VTEP lport ARP handling fixes & GARP support | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_ovn-kubernetes success github build: passed
ovsrobot/github-robot-_Build_and_Test fail github build: failed

Commit Message

Vladislav Odintsov May 19, 2023, 6:18 p.m. UTC
This patch is intended to make next two changes:

1. Support create/update of MAC_Binding for GARP/ND from HW VTEP.

Prior to this patch MAC_Binding records were created only when LRP issued
ARP request/Neighbor solicitation.  If IP to MAC in network attached via
vtep lport was changed the old MAC_Binding prevented normal
communication.  Now router port (chassisredirect) in logical switch with
vtep lport catches GARP or Neighbor Solicitation and updates MAC_Binding.

New flow with max priority was added in ls_in_arp_nd_resp and additional
OF rule in table 37 (OFTABLE_REMOTE_OUTPUT) to process multicast packets
received from RAMP_SWITCH.

2. Answer to ARP/ND on requests coming from HW VTEP in lrouter pipeline.

This is needed to reduce duplicated arp-responses, which were generated
by OVN arp-responder flows in node, where chassisredirect port located
with responses coming directly from VM interfaces.

As a result of this change, now vifs located on same chassis, where
chassisredirect ports are located receive ARP/ND mcast packets, which
previously were suppressed by arp-responder on the node.

Tests and documentation were updated to conform to new behaviour.

Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
---
 controller/binding.c    | 48 ++++++++++++++++++++
 controller/local_data.h |  3 ++
 controller/physical.c   | 46 +++++++++++++++++--
 northd/northd.c         | 10 +++++
 northd/ovn-northd.8.xml |  6 +++
 tests/ovn.at            | 99 ++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 207 insertions(+), 5 deletions(-)

Comments

Dumitru Ceara May 30, 2023, 9:27 a.m. UTC | #1
On 5/19/23 20:18, Vladislav Odintsov wrote:
> This patch is intended to make next two changes:
> 
> 1. Support create/update of MAC_Binding for GARP/ND from HW VTEP.
> 
> Prior to this patch MAC_Binding records were created only when LRP issued
> ARP request/Neighbor solicitation.  If IP to MAC in network attached via
> vtep lport was changed the old MAC_Binding prevented normal
> communication.  Now router port (chassisredirect) in logical switch with
> vtep lport catches GARP or Neighbor Solicitation and updates MAC_Binding.
> 
> New flow with max priority was added in ls_in_arp_nd_resp and additional
> OF rule in table 37 (OFTABLE_REMOTE_OUTPUT) to process multicast packets
> received from RAMP_SWITCH.
> 
> 2. Answer to ARP/ND on requests coming from HW VTEP in lrouter pipeline.
> 
> This is needed to reduce duplicated arp-responses, which were generated
> by OVN arp-responder flows in node, where chassisredirect port located
> with responses coming directly from VM interfaces.
> 
> As a result of this change, now vifs located on same chassis, where
> chassisredirect ports are located receive ARP/ND mcast packets, which
> previously were suppressed by arp-responder on the node.
> 
> Tests and documentation were updated to conform to new behaviour.
> 
> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
> ---

Hi Vladislav,

Thanks for the patch.

>  controller/binding.c    | 48 ++++++++++++++++++++
>  controller/local_data.h |  3 ++
>  controller/physical.c   | 46 +++++++++++++++++--
>  northd/northd.c         | 10 +++++
>  northd/ovn-northd.8.xml |  6 +++
>  tests/ovn.at            | 99 ++++++++++++++++++++++++++++++++++++++++-

Could you please add a NEWS entry for this behavior change?

>  6 files changed, 207 insertions(+), 5 deletions(-)
> 
> diff --git a/controller/binding.c b/controller/binding.c
> index c7a2b3f10..5932ad785 100644
> --- a/controller/binding.c
> +++ b/controller/binding.c
> @@ -509,6 +509,30 @@ update_ld_multichassis_ports(const struct sbrec_port_binding *binding_rec,
>      }
>  }
>  
> +static void
> +update_ld_vtep_port(const struct sbrec_port_binding *binding_rec,
> +                    struct hmap *local_datapaths)
> +{
> +    struct local_datapath *ld
> +        = get_local_datapath(local_datapaths,
> +                             binding_rec->datapath->tunnel_key);
> +    if (!ld) {
> +        return;
> +    }
> +
> +    if (ld->vtep_port && strcmp(ld->vtep_port->logical_port,
> +                                binding_rec->logical_port)) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
> +        VLOG_WARN_RL(&rl, "vtep port '%s' already set for datapath "
> +                     "'%"PRId64"', skipping the new port '%s'.",
> +                     ld->vtep_port->logical_port,
> +                     binding_rec->datapath->tunnel_key,
> +                     binding_rec->logical_port);
> +        return;
> +    }
> +    ld->vtep_port = binding_rec;
> +}
> +
>  static void
>  update_ld_localnet_port(const struct sbrec_port_binding *binding_rec,
>                          struct shash *bridge_mappings,
> @@ -1987,6 +2011,7 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
>      struct ovs_list external_lports = OVS_LIST_INITIALIZER(&external_lports);
>      struct ovs_list multichassis_ports = OVS_LIST_INITIALIZER(
>                                                          &multichassis_ports);
> +    struct ovs_list vtep_lports = OVS_LIST_INITIALIZER(&vtep_lports);
>  
>      struct lport {
>          struct ovs_list list_node;
> @@ -2010,8 +2035,13 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
>  
>          switch (lport_type) {
>          case LP_PATCH:
> +            update_related_lport(pb, b_ctx_out);
> +            break;
>          case LP_VTEP:
>              update_related_lport(pb, b_ctx_out);
> +            struct lport *vtep_lport = xmalloc(sizeof *vtep_lport);
> +            vtep_lport->pb = pb;
> +            ovs_list_push_back(&vtep_lports, &vtep_lport->list_node);
>              break;
>  
>          case LP_LOCALPORT:
> @@ -2111,6 +2141,16 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
>          free(multichassis_lport);
>      }
>  
> +    /* Run through vtep lport list to see if there are vtep ports
> +     * on local datapaths discovered from above loop, and update the
> +     * corresponding local datapath accordingly. */
> +    struct lport *vtep_lport;
> +    ovs_list_size(&vtep_lports);

I guess this is a leftover.

> +    LIST_FOR_EACH_POP (vtep_lport, list_node, &vtep_lports) {
> +        update_ld_vtep_port(vtep_lport->pb, b_ctx_out->local_datapaths);
> +        free(vtep_lport);
> +    }
> +
>      shash_destroy(&bridge_mappings);
>      /* Remove stale QoS entries. */
>      ovs_qos_entries_gc(b_ctx_in->ovs_idl_txn, b_ctx_in->ovsrec_port_by_qos,
> @@ -2175,6 +2215,11 @@ remove_pb_from_local_datapath(const struct sbrec_port_binding *pb,
>          }
>      } else if (!strcmp(pb->type, "external")) {
>          remove_local_datapath_external_port(ld, pb->logical_port);
> +    } else if (!strcmp(pb->type, "vtep")) {
> +        if (ld->vtep_port && !strcmp(ld->vtep_port->logical_port,
> +                                     pb->logical_port)) {
> +            ld->vtep_port = NULL;
> +        }
>      }
>      remove_local_datapath_multichassis_port(ld, pb->logical_port);
>  }
> @@ -2836,6 +2881,7 @@ handle_updated_port(struct binding_ctx_in *b_ctx_in,
>  
>      case LP_VTEP:
>          update_related_lport(pb, b_ctx_out);
> +        update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>          /* VTEP lports are claimed/released by ovn-controller-vteps.
>           * We are not sure what changed. */
>          b_ctx_out->non_vif_ports_changed = true;
> @@ -3091,6 +3137,8 @@ delete_done:
>                  } else if (pb->n_additional_chassis) {
>                      update_ld_multichassis_ports(pb,
>                                                   b_ctx_out->local_datapaths);
> +                } else if (lport_type == LP_VTEP) {
> +                    update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>                  }

Nit: I think I'd move this "else if (lport_type == LP_VTEP) {" just
under the LP_EXTERNAL case.

>              }
>              sbrec_port_binding_index_destroy_row(target);
> diff --git a/controller/local_data.h b/controller/local_data.h
> index 748f009aa..19d13a558 100644
> --- a/controller/local_data.h
> +++ b/controller/local_data.h
> @@ -50,6 +50,9 @@ struct local_datapath {
>      /* The localnet port in this datapath, if any (at most one is allowed). */
>      const struct sbrec_port_binding *localnet_port;
>  
> +    /* The vtep port in this datapath, if any (at most one is allowed). */
> +    const struct sbrec_port_binding *vtep_port;
> +
>      struct {
>          const struct sbrec_port_binding *local;
>          const struct sbrec_port_binding *remote;
> diff --git a/controller/physical.c b/controller/physical.c
> index 2925dcd1d..b75e120c7 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -194,6 +194,15 @@ get_localnet_port(const struct hmap *local_datapaths, int64_t tunnel_key)
>  }
>  
>  
> +static const struct sbrec_port_binding *
> +get_vtep_port(const struct hmap *local_datapaths, int64_t tunnel_key)
> +{
> +    const struct local_datapath *ld = get_local_datapath(local_datapaths,
> +                                                         tunnel_key);
> +    return ld ? ld->vtep_port : NULL;
> +}
> +
> +
>  static struct zone_ids
>  get_zone_ids(const struct sbrec_port_binding *binding,
>               const struct simap *ct_zones)
> @@ -1732,7 +1741,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>      struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis);
>      struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis);
>  
> -    struct match match;
> +    struct match match, match_ramp;
>      match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key);
>  
>      /* Go through all of the ports in the multicast group:
> @@ -1755,10 +1764,10 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>       *      set the output port to be the router patch port for which
>       *      the redirect port was added.
>       */
> -    struct ofpbuf ofpacts;
> +    struct ofpbuf ofpacts, remote_ofpacts, remote_ofpacts_ramp;
>      ofpbuf_init(&ofpacts, 0);
> -    struct ofpbuf remote_ofpacts;
>      ofpbuf_init(&remote_ofpacts, 0);
> +    ofpbuf_init(&remote_ofpacts_ramp, 0);
>      for (size_t i = 0; i < mc->n_ports; i++) {
>          struct sbrec_port_binding *port = mc->ports[i];
>  
> @@ -1783,6 +1792,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>                  local_output_pb(port->tunnel_key, &ofpacts);
>              } else {
>                  local_output_pb(port->tunnel_key, &remote_ofpacts);
> +                local_output_pb(port->tunnel_key, &remote_ofpacts_ramp);
>              }
>          } if (!strcmp(port->type, "remote")) {
>              if (port->chassis) {
> @@ -1866,6 +1876,35 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>               * multicast group as the logical output port. */
>              put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
>                       &remote_ofpacts);
> +
> +            if (get_vtep_port(local_datapaths, mc->datapath->tunnel_key)) {

I'd declare 'match_ramp' here as it's only used on this branch of the if
statement.

> +                match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, 0,
> +                                     MLF_RCV_FROM_RAMP);
> +

Are we going to match less traffic now?  We add this "flags &
MLF_RCV_FROM_RAMP" condition to the already existing priority-100
OFTABLE_REMOTE_OUTPUT flow.  Does this break multicast traffic not
received from RAMP (vtep) ports?

> +                put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
> +                         &remote_ofpacts_ramp);
> +
> +                /* MCAST traffic which was originally received from RAMP_SWITCH
> +                 * is not allowed to be re-sent to remote_chassis.
> +                 * Process "patch" port binding for routing in
> +                 * OFTABLE_REMOTE_OUTPUT and resubmit packets to
> +                 * OFTABLE_LOCAL_OUTPUT for local delivery. */
> +
> +                match_outport_dp_and_port_keys(&match_ramp, dp_key,
> +                                               mc->tunnel_key);
> +
> +                /* Match packets coming from RAMP_SWITCH and allowed for
> +                * loopback processing (including routing). */
> +                match_set_reg_masked(&match_ramp, MFF_LOG_FLAGS - MFF_REG0,
> +                                     MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK,
> +                                     MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK);
> +
> +                put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts_ramp);
> +
> +                ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 120,
> +                                mc->header_.uuid.parts[0], &match_ramp,
> +                                &remote_ofpacts_ramp, &mc->header_.uuid);
> +            }
>          }
>  
>          fanout_to_chassis(mff_ovn_geneve, &remote_chassis, chassis_tunnels,
> @@ -1886,6 +1925,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>      }
>      ofpbuf_uninit(&ofpacts);
>      ofpbuf_uninit(&remote_ofpacts);
> +    ofpbuf_uninit(&remote_ofpacts_ramp);
>      sset_destroy(&remote_chassis);
>      sset_destroy(&vtep_chassis);
>  }
> diff --git a/northd/northd.c b/northd/northd.c
> index e713973e7..6dcbf0e1a 100644
> --- a/northd/northd.c
> +++ b/northd/northd.c
> @@ -7849,6 +7849,16 @@ build_vtep_hairpin(struct ovn_datapath *od, struct hmap *lflows)
>                            ds_cstr(&match), "next;");
>          }
>      }
> +
> +    /* ARP/Neighbor Solicitation requests must skip ls_in_arp_rsp table for
> +     * packets arrived from HW VTEP (ramp) switch.
> +     * Neighbor resolution for router ports is done in logical router ingress
> +     * pipeline.  ARP resolution for vif lports is done directly by vif ports.
> +     */
> +    ovn_lflow_add(lflows, od, S_SWITCH_IN_ARP_ND_RSP, 65535,
> +                  REGBIT_FROM_RAMP" == 1 && (arp || nd_ns)",
> +                  "flags.loopback = 1; next;");
> +
>      ds_destroy(&match);
>  }
>  
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index 8164be300..1a9e50b6f 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -1378,6 +1378,12 @@
>       </p>
>  
>      <ul>
> +      <li>
> +        If packet was received from HW VTEP (ramp switch), and this packet is
> +        ARP or Neighbor Solicitation, such packet is passed to next table with
> +        max proirity.  ARP/ND requests from HW VTEP must be handled in logical
> +        router ingress pipeline.
> +      </li>
>        <li>
>          If the logical switch has no router ports with options:arp_proxy
>          configured add a priority-100 flows to skip the ARP responder if inport
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 53349530b..cb643c39d 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -4267,7 +4267,7 @@ ovn_start
>  ovn-nbctl ls-add lsw0
>  
>  ovn-nbctl lsp-add lsw0 lp1
> -ovn-nbctl lsp-set-addresses lp1 f0:00:00:00:00:01
> +ovn-nbctl lsp-set-addresses lp1 'f0:00:00:00:00:01 192.168.1.24'
>  
>  ovn-nbctl lsp-add lsw0 lp2
>  ovn-nbctl lsp-set-addresses lp2 f0:00:00:00:00:02
> @@ -4288,7 +4288,6 @@ ovn-nbctl set Logical_Switch_Port lpr \
>  ovn-nbctl lrp-set-gateway-chassis lrp1 hv1
>  
>  ovn-nbctl lsp-add lsw0 lpr2
> -ovn-nbctl lr-add lr2
>  ovn-nbctl lrp-add lr lrp2 f0:00:00:00:00:f2 192.168.1.254/24
>  ovn-nbctl set Logical_Switch_Port lpr2 \
>      type=router \
> @@ -4421,6 +4420,7 @@ spa=`ip_to_hex 192 168 1 2`
>  tpa=`ip_to_hex 192 168 1 1`
>  request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
>  as hv3 ovs-appctl netdev-dummy/receive vif3 $request
> +echo $request >> 1.expected
>  echo $request >> 2.expected
>  
>  lrpmac=f000000000f1
> @@ -4431,12 +4431,107 @@ response=${sha}${lrpmac}08060001080006040002${lrpmac}${tpa}${sha}${spa}
>  # we expect arp reply packet on hv3
>  echo $response >> 3.expected
>  
> +# Ensure there is no MAC_Binding, send GARP from vtep side,
> +# check that GARP was received by vif1, vif2 and MAC_Binding was created.
> +wait_row_count MAC_Binding 0 ip="192.168.1.200" mac='"f0:00:00:00:00:12"'
> +
> +sha=0200000000ff
> +tha=ffffffffffff
> +spa=`ip_to_hex 192 168 1 200`
> +tpa=$spa
> +garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
> +as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
> +echo $garp >> 1.expected
> +echo $garp >> 2.expected
> +
> +wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ff"'
> +
> +# Send same GARP with changed SHA and ensure MAC_Binding was updated.
> +sha=0200000000ee
> +garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
> +as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
> +echo $garp >> 1.expected
> +echo $garp >> 2.expected
> +
> +wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ee"'
> +
> +# Remove local ports on hv1 and check that ARP logic is stil working. -->
> +check as hv1 ovs-vsctl remove interface vif1 external_ids iface-id
> +hv_uuid=$(fetch_column Chassis _uuid name=hv1)
> +wait_row_count Port_Binding 0 chassis="$hv_uuid" type="''"
> +
> +# Cleanup MAC_Bindings after previous checks.
> +check ovn-sbctl --all destroy MAC_Binding
> +
> +# ARP request from VTEP to LRP should be responded by ARP responder.
> +sha=f00000000003
> +spa=`ip_to_hex 192 168 1 2`
> +tpa=`ip_to_hex 192 168 1 1`
> +request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
> +as hv3 ovs-appctl netdev-dummy/receive vif3 $request
> +echo $request >> 2.expected
> +
> +lrpmac=f000000000f1
> +response=${sha}${lrpmac}08060001080006040002${lrpmac}${tpa}${sha}${spa}
> +# since lrp1 has gateway chassis set on hv1, hv1 will suppress arp request and
> +# answer with arp reply by OVS directly to vtep lport. all other lports,
> +# except lport from which this request was initiated, will receive arp request.
> +# we expect arp reply packet on hv3
> +echo $response >> 3.expected
> +
> +# Ensure there is no MAC_Binding, send GARP from vtep side,
> +# check that GARP was received by vif1, vif2 and MAC_Binding was created.
> +wait_row_count MAC_Binding 0 ip="192.168.1.200" mac='"f0:00:00:00:00:12"'
> +
> +sha=0200000000ff
> +tha=ffffffffffff
> +spa=`ip_to_hex 192 168 1 200`
> +tpa=$spa
> +garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
> +as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
> +echo $garp >> 2.expected
> +
> +wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ff"'
> +
> +# Send same GARP with changed SHA and ensure MAC_Binding was updated.
> +sha=0200000000ee
> +garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
> +as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
> +echo $garp >> 2.expected
> +
> +wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ee"'
> +
> +# Restore.
> +check as hv1 ovs-vsctl set interface vif1 external_ids:iface-id=lp1
> +wait_row_count Port_Binding 1 chassis="$hv_uuid" type='""'
> +# <--
> +
> +# Check ARP response to request for IP on VIF port sent from vtep.
> +# ARP request from VTEP to LSP's IP must be answered only by LSP vif1 (lp1).
> +sha=f00000000003
> +spa=`ip_to_hex 192 168 1 2`
> +tpa=`ip_to_hex 192 168 1 24`
> +request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
> +as hv3 ovs-appctl netdev-dummy/receive vif3 $request
> +echo $request >> 1.expected
> +echo $request >> 2.expected
> +
> +# Simulate response from vif.
> +sha=f00000000001
> +tha=f00000000003
> +spa=`ip_to_hex 192 168 1 200`
> +tpa=`ip_to_hex 192 168 1 2`
> +response=${tha}${sha}08060001080006040002${sha}${spa}${tha}${tpa}
> +as hv1 ovs-appctl netdev-dummy/receive vif1 $response
> +echo $response >> 3.expected
> +
>  # First ensure basic flow contents are as we expect.
>  AT_CHECK([ovn-sbctl lflow-list lsw0 | grep 'reg0[\[14\]]' | sort | sed 's/table=../table=??/g'], [0], [dnl
>    table=??(ls_in_check_port_sec), priority=70   , match=(inport == "lp-vtep"), action=(reg0[[14]] = 1; next(pipeline=ingress, table=??);)
>    table=??(ls_in_hairpin      ), priority=1000 , match=(reg0[[14]] == 1), action=(next(pipeline=ingress, table=??);)
>    table=??(ls_in_hairpin      ), priority=2000 , match=(reg0[[14]] == 1 && is_chassis_resident("cr-lrp1")), action=(next;)
>    table=??(ls_in_hairpin      ), priority=2000 , match=(reg0[[14]] == 1 && is_chassis_resident("cr-lrp2")), action=(next;)
> +  table=??(ls_in_arp_rsp      ), priority=65535, match=(reg0[[14]] == 1 && (arp || nd_ns)), action=(flags.loopback = 1; next;)
>  ])
>  
>  # dump information with counters

Regards,
Dumitru
Vladislav Odintsov May 30, 2023, 9:59 a.m. UTC | #2
Hi Dumitru,

thanks for the review!
My answers are inline.

> On 30 May 2023, at 12:27, Dumitru Ceara <dceara@redhat.com> wrote:
> 
> On 5/19/23 20:18, Vladislav Odintsov wrote:
>> This patch is intended to make next two changes:
>> 
>> 1. Support create/update of MAC_Binding for GARP/ND from HW VTEP.
>> 
>> Prior to this patch MAC_Binding records were created only when LRP issued
>> ARP request/Neighbor solicitation.  If IP to MAC in network attached via
>> vtep lport was changed the old MAC_Binding prevented normal
>> communication.  Now router port (chassisredirect) in logical switch with
>> vtep lport catches GARP or Neighbor Solicitation and updates MAC_Binding.
>> 
>> New flow with max priority was added in ls_in_arp_nd_resp and additional
>> OF rule in table 37 (OFTABLE_REMOTE_OUTPUT) to process multicast packets
>> received from RAMP_SWITCH.
>> 
>> 2. Answer to ARP/ND on requests coming from HW VTEP in lrouter pipeline.
>> 
>> This is needed to reduce duplicated arp-responses, which were generated
>> by OVN arp-responder flows in node, where chassisredirect port located
>> with responses coming directly from VM interfaces.
>> 
>> As a result of this change, now vifs located on same chassis, where
>> chassisredirect ports are located receive ARP/ND mcast packets, which
>> previously were suppressed by arp-responder on the node.
>> 
>> Tests and documentation were updated to conform to new behaviour.
>> 
>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>> ---
> 
> Hi Vladislav,
> 
> Thanks for the patch.
> 
>> controller/binding.c    | 48 ++++++++++++++++++++
>> controller/local_data.h |  3 ++
>> controller/physical.c   | 46 +++++++++++++++++--
>> northd/northd.c         | 10 +++++
>> northd/ovn-northd.8.xml |  6 +++
>> tests/ovn.at <http://ovn.at/>            | 99 ++++++++++++++++++++++++++++++++++++++++-
> 
> Could you please add a NEWS entry for this behavior change?

Sure, will do it in v2.

> 
>> 6 files changed, 207 insertions(+), 5 deletions(-)
>> 
>> diff --git a/controller/binding.c b/controller/binding.c
>> index c7a2b3f10..5932ad785 100644
>> --- a/controller/binding.c
>> +++ b/controller/binding.c
>> @@ -509,6 +509,30 @@ update_ld_multichassis_ports(const struct sbrec_port_binding *binding_rec,
>>     }
>> }
>> 
>> +static void
>> +update_ld_vtep_port(const struct sbrec_port_binding *binding_rec,
>> +                    struct hmap *local_datapaths)
>> +{
>> +    struct local_datapath *ld
>> +        = get_local_datapath(local_datapaths,
>> +                             binding_rec->datapath->tunnel_key);
>> +    if (!ld) {
>> +        return;
>> +    }
>> +
>> +    if (ld->vtep_port && strcmp(ld->vtep_port->logical_port,
>> +                                binding_rec->logical_port)) {
>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
>> +        VLOG_WARN_RL(&rl, "vtep port '%s' already set for datapath "
>> +                     "'%"PRId64"', skipping the new port '%s'.",
>> +                     ld->vtep_port->logical_port,
>> +                     binding_rec->datapath->tunnel_key,
>> +                     binding_rec->logical_port);
>> +        return;
>> +    }
>> +    ld->vtep_port = binding_rec;
>> +}
>> +
>> static void
>> update_ld_localnet_port(const struct sbrec_port_binding *binding_rec,
>>                         struct shash *bridge_mappings,
>> @@ -1987,6 +2011,7 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
>>     struct ovs_list external_lports = OVS_LIST_INITIALIZER(&external_lports);
>>     struct ovs_list multichassis_ports = OVS_LIST_INITIALIZER(
>>                                                         &multichassis_ports);
>> +    struct ovs_list vtep_lports = OVS_LIST_INITIALIZER(&vtep_lports);
>> 
>>     struct lport {
>>         struct ovs_list list_node;
>> @@ -2010,8 +2035,13 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
>> 
>>         switch (lport_type) {
>>         case LP_PATCH:
>> +            update_related_lport(pb, b_ctx_out);
>> +            break;
>>         case LP_VTEP:
>>             update_related_lport(pb, b_ctx_out);
>> +            struct lport *vtep_lport = xmalloc(sizeof *vtep_lport);
>> +            vtep_lport->pb = pb;
>> +            ovs_list_push_back(&vtep_lports, &vtep_lport->list_node);
>>             break;
>> 
>>         case LP_LOCALPORT:
>> @@ -2111,6 +2141,16 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
>>         free(multichassis_lport);
>>     }
>> 
>> +    /* Run through vtep lport list to see if there are vtep ports
>> +     * on local datapaths discovered from above loop, and update the
>> +     * corresponding local datapath accordingly. */
>> +    struct lport *vtep_lport;
>> +    ovs_list_size(&vtep_lports);
> 
> I guess this is a leftover.

Oops, will fix it in v2.

> 
>> +    LIST_FOR_EACH_POP (vtep_lport, list_node, &vtep_lports) {
>> +        update_ld_vtep_port(vtep_lport->pb, b_ctx_out->local_datapaths);
>> +        free(vtep_lport);
>> +    }
>> +
>>     shash_destroy(&bridge_mappings);
>>     /* Remove stale QoS entries. */
>>     ovs_qos_entries_gc(b_ctx_in->ovs_idl_txn, b_ctx_in->ovsrec_port_by_qos,
>> @@ -2175,6 +2215,11 @@ remove_pb_from_local_datapath(const struct sbrec_port_binding *pb,
>>         }
>>     } else if (!strcmp(pb->type, "external")) {
>>         remove_local_datapath_external_port(ld, pb->logical_port);
>> +    } else if (!strcmp(pb->type, "vtep")) {
>> +        if (ld->vtep_port && !strcmp(ld->vtep_port->logical_port,
>> +                                     pb->logical_port)) {
>> +            ld->vtep_port = NULL;
>> +        }
>>     }
>>     remove_local_datapath_multichassis_port(ld, pb->logical_port);
>> }
>> @@ -2836,6 +2881,7 @@ handle_updated_port(struct binding_ctx_in *b_ctx_in,
>> 
>>     case LP_VTEP:
>>         update_related_lport(pb, b_ctx_out);
>> +        update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>>         /* VTEP lports are claimed/released by ovn-controller-vteps.
>>          * We are not sure what changed. */
>>         b_ctx_out->non_vif_ports_changed = true;
>> @@ -3091,6 +3137,8 @@ delete_done:
>>                 } else if (pb->n_additional_chassis) {
>>                     update_ld_multichassis_ports(pb,
>>                                                  b_ctx_out->local_datapaths);
>> +                } else if (lport_type == LP_VTEP) {
>> +                    update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>>                 }
> 
> Nit: I think I'd move this "else if (lport_type == LP_VTEP) {" just
> under the LP_EXTERNAL case.

I’ll move it, but I guess it’s not possible to have additional chassis for vtep pb.

> 
>>             }
>>             sbrec_port_binding_index_destroy_row(target);
>> diff --git a/controller/local_data.h b/controller/local_data.h
>> index 748f009aa..19d13a558 100644
>> --- a/controller/local_data.h
>> +++ b/controller/local_data.h
>> @@ -50,6 +50,9 @@ struct local_datapath {
>>     /* The localnet port in this datapath, if any (at most one is allowed). */
>>     const struct sbrec_port_binding *localnet_port;
>> 
>> +    /* The vtep port in this datapath, if any (at most one is allowed). */
>> +    const struct sbrec_port_binding *vtep_port;
>> +
>>     struct {
>>         const struct sbrec_port_binding *local;
>>         const struct sbrec_port_binding *remote;
>> diff --git a/controller/physical.c b/controller/physical.c
>> index 2925dcd1d..b75e120c7 100644
>> --- a/controller/physical.c
>> +++ b/controller/physical.c
>> @@ -194,6 +194,15 @@ get_localnet_port(const struct hmap *local_datapaths, int64_t tunnel_key)
>> }
>> 
>> 
>> +static const struct sbrec_port_binding *
>> +get_vtep_port(const struct hmap *local_datapaths, int64_t tunnel_key)
>> +{
>> +    const struct local_datapath *ld = get_local_datapath(local_datapaths,
>> +                                                         tunnel_key);
>> +    return ld ? ld->vtep_port : NULL;
>> +}
>> +
>> +
>> static struct zone_ids
>> get_zone_ids(const struct sbrec_port_binding *binding,
>>              const struct simap *ct_zones)
>> @@ -1732,7 +1741,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>>     struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis);
>>     struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis);
>> 
>> -    struct match match;
>> +    struct match match, match_ramp;
>>     match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key);
>> 
>>     /* Go through all of the ports in the multicast group:
>> @@ -1755,10 +1764,10 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>>      *      set the output port to be the router patch port for which
>>      *      the redirect port was added.
>>      */
>> -    struct ofpbuf ofpacts;
>> +    struct ofpbuf ofpacts, remote_ofpacts, remote_ofpacts_ramp;
>>     ofpbuf_init(&ofpacts, 0);
>> -    struct ofpbuf remote_ofpacts;
>>     ofpbuf_init(&remote_ofpacts, 0);
>> +    ofpbuf_init(&remote_ofpacts_ramp, 0);
>>     for (size_t i = 0; i < mc->n_ports; i++) {
>>         struct sbrec_port_binding *port = mc->ports[i];
>> 
>> @@ -1783,6 +1792,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>>                 local_output_pb(port->tunnel_key, &ofpacts);
>>             } else {
>>                 local_output_pb(port->tunnel_key, &remote_ofpacts);
>> +                local_output_pb(port->tunnel_key, &remote_ofpacts_ramp);
>>             }
>>         } if (!strcmp(port->type, "remote")) {
>>             if (port->chassis) {
>> @@ -1866,6 +1876,35 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>>              * multicast group as the logical output port. */
>>             put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
>>                      &remote_ofpacts);
>> +
>> +            if (get_vtep_port(local_datapaths, mc->datapath->tunnel_key)) {
> 
> I'd declare 'match_ramp' here as it's only used on this branch of the if
> statement.

Okay, will be fixed in v2.

> 
>> +                match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, 0,
>> +                                     MLF_RCV_FROM_RAMP);
>> +
> 
> Are we going to match less traffic now?  We add this "flags &
> MLF_RCV_FROM_RAMP" condition to the already existing priority-100
> OFTABLE_REMOTE_OUTPUT flow.  Does this break multicast traffic not
> received from RAMP (vtep) ports?

Yes, this limits traffic matching on this flow.
Such flow matches only traffic, which was received from non-RAMP lports.
We need to flood traffic to all multicast group ports only if it was received from non-RAMP ports.
This is due to fact that flood on the directon "from vtep" is done on the vtep (ramp) switch: it floods BUM to all hypervisors.
If we don’t do it, we’ll duplicate BUM traffic (first packet is originally sent from VTEP switch to all hypervisors) and others are replicated on all hypervisors and re-sent to other hypervisors.

Please, let me know if this makes sense.

> 
>> +                put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
>> +                         &remote_ofpacts_ramp);
>> +
>> +                /* MCAST traffic which was originally received from RAMP_SWITCH
>> +                 * is not allowed to be re-sent to remote_chassis.
>> +                 * Process "patch" port binding for routing in
>> +                 * OFTABLE_REMOTE_OUTPUT and resubmit packets to
>> +                 * OFTABLE_LOCAL_OUTPUT for local delivery. */
>> +
>> +                match_outport_dp_and_port_keys(&match_ramp, dp_key,
>> +                                               mc->tunnel_key);
>> +
>> +                /* Match packets coming from RAMP_SWITCH and allowed for
>> +                * loopback processing (including routing). */
>> +                match_set_reg_masked(&match_ramp, MFF_LOG_FLAGS - MFF_REG0,
>> +                                     MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK,
>> +                                     MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK);
>> +
>> +                put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts_ramp);
>> +
>> +                ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 120,
>> +                                mc->header_.uuid.parts[0], &match_ramp,
>> +                                &remote_ofpacts_ramp, &mc->header_.uuid);
>> +            }
>>         }
>> 
>>         fanout_to_chassis(mff_ovn_geneve, &remote_chassis, chassis_tunnels,
>> @@ -1886,6 +1925,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>>     }
>>     ofpbuf_uninit(&ofpacts);
>>     ofpbuf_uninit(&remote_ofpacts);
>> +    ofpbuf_uninit(&remote_ofpacts_ramp);
>>     sset_destroy(&remote_chassis);
>>     sset_destroy(&vtep_chassis);
>> }
>> diff --git a/northd/northd.c b/northd/northd.c
>> index e713973e7..6dcbf0e1a 100644
>> --- a/northd/northd.c
>> +++ b/northd/northd.c
>> @@ -7849,6 +7849,16 @@ build_vtep_hairpin(struct ovn_datapath *od, struct hmap *lflows)
>>                           ds_cstr(&match), "next;");
>>         }
>>     }
>> +
>> +    /* ARP/Neighbor Solicitation requests must skip ls_in_arp_rsp table for
>> +     * packets arrived from HW VTEP (ramp) switch.
>> +     * Neighbor resolution for router ports is done in logical router ingress
>> +     * pipeline.  ARP resolution for vif lports is done directly by vif ports.
>> +     */
>> +    ovn_lflow_add(lflows, od, S_SWITCH_IN_ARP_ND_RSP, 65535,
>> +                  REGBIT_FROM_RAMP" == 1 && (arp || nd_ns)",
>> +                  "flags.loopback = 1; next;");
>> +
>>     ds_destroy(&match);
>> }
>> 
>> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
>> index 8164be300..1a9e50b6f 100644
>> --- a/northd/ovn-northd.8.xml
>> +++ b/northd/ovn-northd.8.xml
>> @@ -1378,6 +1378,12 @@
>>      </p>
>> 
>>     <ul>
>> +      <li>
>> +        If packet was received from HW VTEP (ramp switch), and this packet is
>> +        ARP or Neighbor Solicitation, such packet is passed to next table with
>> +        max proirity.  ARP/ND requests from HW VTEP must be handled in logical
>> +        router ingress pipeline.
>> +      </li>
>>       <li>
>>         If the logical switch has no router ports with options:arp_proxy
>>         configured add a priority-100 flows to skip the ARP responder if inport
>> diff --git a/tests/ovn.at b/tests/ovn.at
>> index 53349530b..cb643c39d 100644
>> --- a/tests/ovn.at
>> +++ b/tests/ovn.at
>> @@ -4267,7 +4267,7 @@ ovn_start
>> ovn-nbctl ls-add lsw0
>> 
>> ovn-nbctl lsp-add lsw0 lp1
>> -ovn-nbctl lsp-set-addresses lp1 f0:00:00:00:00:01
>> +ovn-nbctl lsp-set-addresses lp1 'f0:00:00:00:00:01 192.168.1.24'
>> 
>> ovn-nbctl lsp-add lsw0 lp2
>> ovn-nbctl lsp-set-addresses lp2 f0:00:00:00:00:02
>> @@ -4288,7 +4288,6 @@ ovn-nbctl set Logical_Switch_Port lpr \
>> ovn-nbctl lrp-set-gateway-chassis lrp1 hv1
>> 
>> ovn-nbctl lsp-add lsw0 lpr2
>> -ovn-nbctl lr-add lr2
>> ovn-nbctl lrp-add lr lrp2 f0:00:00:00:00:f2 192.168.1.254/24
>> ovn-nbctl set Logical_Switch_Port lpr2 \
>>     type=router \
>> @@ -4421,6 +4420,7 @@ spa=`ip_to_hex 192 168 1 2`
>> tpa=`ip_to_hex 192 168 1 1`
>> request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
>> as hv3 ovs-appctl netdev-dummy/receive vif3 $request
>> +echo $request >> 1.expected
>> echo $request >> 2.expected
>> 
>> lrpmac=f000000000f1
>> @@ -4431,12 +4431,107 @@ response=${sha}${lrpmac}08060001080006040002${lrpmac}${tpa}${sha}${spa}
>> # we expect arp reply packet on hv3
>> echo $response >> 3.expected
>> 
>> +# Ensure there is no MAC_Binding, send GARP from vtep side,
>> +# check that GARP was received by vif1, vif2 and MAC_Binding was created.
>> +wait_row_count MAC_Binding 0 ip="192.168.1.200" mac='"f0:00:00:00:00:12"'
>> +
>> +sha=0200000000ff
>> +tha=ffffffffffff
>> +spa=`ip_to_hex 192 168 1 200`
>> +tpa=$spa
>> +garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
>> +as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
>> +echo $garp >> 1.expected
>> +echo $garp >> 2.expected
>> +
>> +wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ff"'
>> +
>> +# Send same GARP with changed SHA and ensure MAC_Binding was updated.
>> +sha=0200000000ee
>> +garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
>> +as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
>> +echo $garp >> 1.expected
>> +echo $garp >> 2.expected
>> +
>> +wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ee"'
>> +
>> +# Remove local ports on hv1 and check that ARP logic is stil working. -->
>> +check as hv1 ovs-vsctl remove interface vif1 external_ids iface-id
>> +hv_uuid=$(fetch_column Chassis _uuid name=hv1)
>> +wait_row_count Port_Binding 0 chassis="$hv_uuid" type="''"
>> +
>> +# Cleanup MAC_Bindings after previous checks.
>> +check ovn-sbctl --all destroy MAC_Binding
>> +
>> +# ARP request from VTEP to LRP should be responded by ARP responder.
>> +sha=f00000000003
>> +spa=`ip_to_hex 192 168 1 2`
>> +tpa=`ip_to_hex 192 168 1 1`
>> +request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
>> +as hv3 ovs-appctl netdev-dummy/receive vif3 $request
>> +echo $request >> 2.expected
>> +
>> +lrpmac=f000000000f1
>> +response=${sha}${lrpmac}08060001080006040002${lrpmac}${tpa}${sha}${spa}
>> +# since lrp1 has gateway chassis set on hv1, hv1 will suppress arp request and
>> +# answer with arp reply by OVS directly to vtep lport. all other lports,
>> +# except lport from which this request was initiated, will receive arp request.
>> +# we expect arp reply packet on hv3
>> +echo $response >> 3.expected
>> +
>> +# Ensure there is no MAC_Binding, send GARP from vtep side,
>> +# check that GARP was received by vif1, vif2 and MAC_Binding was created.
>> +wait_row_count MAC_Binding 0 ip="192.168.1.200" mac='"f0:00:00:00:00:12"'
>> +
>> +sha=0200000000ff
>> +tha=ffffffffffff
>> +spa=`ip_to_hex 192 168 1 200`
>> +tpa=$spa
>> +garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
>> +as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
>> +echo $garp >> 2.expected
>> +
>> +wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ff"'
>> +
>> +# Send same GARP with changed SHA and ensure MAC_Binding was updated.
>> +sha=0200000000ee
>> +garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
>> +as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
>> +echo $garp >> 2.expected
>> +
>> +wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ee"'
>> +
>> +# Restore.
>> +check as hv1 ovs-vsctl set interface vif1 external_ids:iface-id=lp1
>> +wait_row_count Port_Binding 1 chassis="$hv_uuid" type='""'
>> +# <--
>> +
>> +# Check ARP response to request for IP on VIF port sent from vtep.
>> +# ARP request from VTEP to LSP's IP must be answered only by LSP vif1 (lp1).
>> +sha=f00000000003
>> +spa=`ip_to_hex 192 168 1 2`
>> +tpa=`ip_to_hex 192 168 1 24`
>> +request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
>> +as hv3 ovs-appctl netdev-dummy/receive vif3 $request
>> +echo $request >> 1.expected
>> +echo $request >> 2.expected
>> +
>> +# Simulate response from vif.
>> +sha=f00000000001
>> +tha=f00000000003
>> +spa=`ip_to_hex 192 168 1 200`
>> +tpa=`ip_to_hex 192 168 1 2`
>> +response=${tha}${sha}08060001080006040002${sha}${spa}${tha}${tpa}
>> +as hv1 ovs-appctl netdev-dummy/receive vif1 $response
>> +echo $response >> 3.expected
>> +
>> # First ensure basic flow contents are as we expect.
>> AT_CHECK([ovn-sbctl lflow-list lsw0 | grep 'reg0[\[14\]]' | sort | sed 's/table=../table=??/g'], [0], [dnl
>>   table=??(ls_in_check_port_sec), priority=70   , match=(inport == "lp-vtep"), action=(reg0[[14]] = 1; next(pipeline=ingress, table=??);)
>>   table=??(ls_in_hairpin      ), priority=1000 , match=(reg0[[14]] == 1), action=(next(pipeline=ingress, table=??);)
>>   table=??(ls_in_hairpin      ), priority=2000 , match=(reg0[[14]] == 1 && is_chassis_resident("cr-lrp1")), action=(next;)
>>   table=??(ls_in_hairpin      ), priority=2000 , match=(reg0[[14]] == 1 && is_chassis_resident("cr-lrp2")), action=(next;)
>> +  table=??(ls_in_arp_rsp      ), priority=65535, match=(reg0[[14]] == 1 && (arp || nd_ns)), action=(flags.loopback = 1; next;)
>> ])
>> 
>> # dump information with counters
> 
> Regards,
> Dumitru
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org <mailto:dev@openvswitch.org>
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Regards,
Vladislav Odintsov
Dumitru Ceara May 30, 2023, 10:10 a.m. UTC | #3
On 5/30/23 11:59, Vladislav Odintsov wrote:
> Hi Dumitru,
> 
> thanks for the review!
> My answers are inline.
> 
>> On 30 May 2023, at 12:27, Dumitru Ceara <dceara@redhat.com> wrote:
>>
>> On 5/19/23 20:18, Vladislav Odintsov wrote:
>>> This patch is intended to make next two changes:
>>>
>>> 1. Support create/update of MAC_Binding for GARP/ND from HW VTEP.
>>>
>>> Prior to this patch MAC_Binding records were created only when LRP issued
>>> ARP request/Neighbor solicitation.  If IP to MAC in network attached via
>>> vtep lport was changed the old MAC_Binding prevented normal
>>> communication.  Now router port (chassisredirect) in logical switch with
>>> vtep lport catches GARP or Neighbor Solicitation and updates MAC_Binding.
>>>
>>> New flow with max priority was added in ls_in_arp_nd_resp and additional
>>> OF rule in table 37 (OFTABLE_REMOTE_OUTPUT) to process multicast packets
>>> received from RAMP_SWITCH.
>>>
>>> 2. Answer to ARP/ND on requests coming from HW VTEP in lrouter pipeline.
>>>
>>> This is needed to reduce duplicated arp-responses, which were generated
>>> by OVN arp-responder flows in node, where chassisredirect port located
>>> with responses coming directly from VM interfaces.
>>>
>>> As a result of this change, now vifs located on same chassis, where
>>> chassisredirect ports are located receive ARP/ND mcast packets, which
>>> previously were suppressed by arp-responder on the node.
>>>
>>> Tests and documentation were updated to conform to new behaviour.
>>>
>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>>> ---
>>
>> Hi Vladislav,
>>
>> Thanks for the patch.
>>
>>> controller/binding.c    | 48 ++++++++++++++++++++
>>> controller/local_data.h |  3 ++
>>> controller/physical.c   | 46 +++++++++++++++++--
>>> northd/northd.c         | 10 +++++
>>> northd/ovn-northd.8.xml |  6 +++
>>> tests/ovn.at <http://ovn.at/>            | 99
>>> ++++++++++++++++++++++++++++++++++++++++-
>>
>> Could you please add a NEWS entry for this behavior change?
> 
> Sure, will do it in v2.
> 
>>
>>> 6 files changed, 207 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/controller/binding.c b/controller/binding.c
>>> index c7a2b3f10..5932ad785 100644
>>> --- a/controller/binding.c
>>> +++ b/controller/binding.c
>>> @@ -509,6 +509,30 @@ update_ld_multichassis_ports(const struct
>>> sbrec_port_binding *binding_rec,
>>>     }
>>> }
>>>
>>> +static void
>>> +update_ld_vtep_port(const struct sbrec_port_binding *binding_rec,
>>> +                    struct hmap *local_datapaths)
>>> +{
>>> +    struct local_datapath *ld
>>> +        = get_local_datapath(local_datapaths,
>>> +                             binding_rec->datapath->tunnel_key);
>>> +    if (!ld) {
>>> +        return;
>>> +    }
>>> +
>>> +    if (ld->vtep_port && strcmp(ld->vtep_port->logical_port,
>>> +                                binding_rec->logical_port)) {
>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
>>> +        VLOG_WARN_RL(&rl, "vtep port '%s' already set for datapath "
>>> +                     "'%"PRId64"', skipping the new port '%s'.",
>>> +                     ld->vtep_port->logical_port,
>>> +                     binding_rec->datapath->tunnel_key,
>>> +                     binding_rec->logical_port);
>>> +        return;
>>> +    }
>>> +    ld->vtep_port = binding_rec;
>>> +}
>>> +
>>> static void
>>> update_ld_localnet_port(const struct sbrec_port_binding *binding_rec,
>>>                         struct shash *bridge_mappings,
>>> @@ -1987,6 +2011,7 @@ binding_run(struct binding_ctx_in *b_ctx_in,
>>> struct binding_ctx_out *b_ctx_out)
>>>     struct ovs_list external_lports =
>>> OVS_LIST_INITIALIZER(&external_lports);
>>>     struct ovs_list multichassis_ports = OVS_LIST_INITIALIZER(
>>>                                                         &multichassis_ports);
>>> +    struct ovs_list vtep_lports = OVS_LIST_INITIALIZER(&vtep_lports);
>>>
>>>     struct lport {
>>>         struct ovs_list list_node;
>>> @@ -2010,8 +2035,13 @@ binding_run(struct binding_ctx_in *b_ctx_in,
>>> struct binding_ctx_out *b_ctx_out)
>>>
>>>         switch (lport_type) {
>>>         case LP_PATCH:
>>> +            update_related_lport(pb, b_ctx_out);
>>> +            break;
>>>         case LP_VTEP:
>>>             update_related_lport(pb, b_ctx_out);
>>> +            struct lport *vtep_lport = xmalloc(sizeof *vtep_lport);
>>> +            vtep_lport->pb = pb;
>>> +            ovs_list_push_back(&vtep_lports, &vtep_lport->list_node);
>>>             break;
>>>
>>>         case LP_LOCALPORT:
>>> @@ -2111,6 +2141,16 @@ binding_run(struct binding_ctx_in *b_ctx_in,
>>> struct binding_ctx_out *b_ctx_out)
>>>         free(multichassis_lport);
>>>     }
>>>
>>> +    /* Run through vtep lport list to see if there are vtep ports
>>> +     * on local datapaths discovered from above loop, and update the
>>> +     * corresponding local datapath accordingly. */
>>> +    struct lport *vtep_lport;
>>> +    ovs_list_size(&vtep_lports);
>>
>> I guess this is a leftover.
> 
> Oops, will fix it in v2.
> 
>>
>>> +    LIST_FOR_EACH_POP (vtep_lport, list_node, &vtep_lports) {
>>> +        update_ld_vtep_port(vtep_lport->pb, b_ctx_out->local_datapaths);
>>> +        free(vtep_lport);
>>> +    }
>>> +
>>>     shash_destroy(&bridge_mappings);
>>>     /* Remove stale QoS entries. */
>>>     ovs_qos_entries_gc(b_ctx_in->ovs_idl_txn,
>>> b_ctx_in->ovsrec_port_by_qos,
>>> @@ -2175,6 +2215,11 @@ remove_pb_from_local_datapath(const struct
>>> sbrec_port_binding *pb,
>>>         }
>>>     } else if (!strcmp(pb->type, "external")) {
>>>         remove_local_datapath_external_port(ld, pb->logical_port);
>>> +    } else if (!strcmp(pb->type, "vtep")) {
>>> +        if (ld->vtep_port && !strcmp(ld->vtep_port->logical_port,
>>> +                                     pb->logical_port)) {
>>> +            ld->vtep_port = NULL;
>>> +        }
>>>     }
>>>     remove_local_datapath_multichassis_port(ld, pb->logical_port);
>>> }
>>> @@ -2836,6 +2881,7 @@ handle_updated_port(struct binding_ctx_in
>>> *b_ctx_in,
>>>
>>>     case LP_VTEP:
>>>         update_related_lport(pb, b_ctx_out);
>>> +        update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>>>         /* VTEP lports are claimed/released by ovn-controller-vteps.
>>>          * We are not sure what changed. */
>>>         b_ctx_out->non_vif_ports_changed = true;
>>> @@ -3091,6 +3137,8 @@ delete_done:
>>>                 } else if (pb->n_additional_chassis) {
>>>                     update_ld_multichassis_ports(pb,
>>>                                                  b_ctx_out->local_datapaths);
>>> +                } else if (lport_type == LP_VTEP) {
>>> +                    update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>>>                 }
>>
>> Nit: I think I'd move this "else if (lport_type == LP_VTEP) {" just
>> under the LP_EXTERNAL case.
> 
> I’ll move it, but I guess it’s not possible to have additional chassis
> for vtep pb.
> 
>>
>>>             }
>>>             sbrec_port_binding_index_destroy_row(target);
>>> diff --git a/controller/local_data.h b/controller/local_data.h
>>> index 748f009aa..19d13a558 100644
>>> --- a/controller/local_data.h
>>> +++ b/controller/local_data.h
>>> @@ -50,6 +50,9 @@ struct local_datapath {
>>>     /* The localnet port in this datapath, if any (at most one is
>>> allowed). */
>>>     const struct sbrec_port_binding *localnet_port;
>>>
>>> +    /* The vtep port in this datapath, if any (at most one is
>>> allowed). */
>>> +    const struct sbrec_port_binding *vtep_port;
>>> +
>>>     struct {
>>>         const struct sbrec_port_binding *local;
>>>         const struct sbrec_port_binding *remote;
>>> diff --git a/controller/physical.c b/controller/physical.c
>>> index 2925dcd1d..b75e120c7 100644
>>> --- a/controller/physical.c
>>> +++ b/controller/physical.c
>>> @@ -194,6 +194,15 @@ get_localnet_port(const struct hmap
>>> *local_datapaths, int64_t tunnel_key)
>>> }
>>>
>>>
>>> +static const struct sbrec_port_binding *
>>> +get_vtep_port(const struct hmap *local_datapaths, int64_t tunnel_key)
>>> +{
>>> +    const struct local_datapath *ld =
>>> get_local_datapath(local_datapaths,
>>> +                                                         tunnel_key);
>>> +    return ld ? ld->vtep_port : NULL;
>>> +}
>>> +
>>> +
>>> static struct zone_ids
>>> get_zone_ids(const struct sbrec_port_binding *binding,
>>>              const struct simap *ct_zones)
>>> @@ -1732,7 +1741,7 @@ consider_mc_group(struct ovsdb_idl_index
>>> *sbrec_port_binding_by_name,
>>>     struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis);
>>>     struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis);
>>>
>>> -    struct match match;
>>> +    struct match match, match_ramp;
>>>     match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key);
>>>
>>>     /* Go through all of the ports in the multicast group:
>>> @@ -1755,10 +1764,10 @@ consider_mc_group(struct ovsdb_idl_index
>>> *sbrec_port_binding_by_name,
>>>      *      set the output port to be the router patch port for which
>>>      *      the redirect port was added.
>>>      */
>>> -    struct ofpbuf ofpacts;
>>> +    struct ofpbuf ofpacts, remote_ofpacts, remote_ofpacts_ramp;
>>>     ofpbuf_init(&ofpacts, 0);
>>> -    struct ofpbuf remote_ofpacts;
>>>     ofpbuf_init(&remote_ofpacts, 0);
>>> +    ofpbuf_init(&remote_ofpacts_ramp, 0);
>>>     for (size_t i = 0; i < mc->n_ports; i++) {
>>>         struct sbrec_port_binding *port = mc->ports[i];
>>>
>>> @@ -1783,6 +1792,7 @@ consider_mc_group(struct ovsdb_idl_index
>>> *sbrec_port_binding_by_name,
>>>                 local_output_pb(port->tunnel_key, &ofpacts);
>>>             } else {
>>>                 local_output_pb(port->tunnel_key, &remote_ofpacts);
>>> +                local_output_pb(port->tunnel_key, &remote_ofpacts_ramp);
>>>             }
>>>         } if (!strcmp(port->type, "remote")) {
>>>             if (port->chassis) {
>>> @@ -1866,6 +1876,35 @@ consider_mc_group(struct ovsdb_idl_index
>>> *sbrec_port_binding_by_name,
>>>              * multicast group as the logical output port. */
>>>             put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
>>>                      &remote_ofpacts);
>>> +
>>> +            if (get_vtep_port(local_datapaths,
>>> mc->datapath->tunnel_key)) {
>>
>> I'd declare 'match_ramp' here as it's only used on this branch of the if
>> statement.
> 
> Okay, will be fixed in v2.
> 
>>
>>> +                match_set_reg_masked(&match, MFF_LOG_FLAGS -
>>> MFF_REG0, 0,
>>> +                                     MLF_RCV_FROM_RAMP);
>>> +
>>
>> Are we going to match less traffic now?  We add this "flags &
>> MLF_RCV_FROM_RAMP" condition to the already existing priority-100
>> OFTABLE_REMOTE_OUTPUT flow.  Does this break multicast traffic not
>> received from RAMP (vtep) ports?
> 
> Yes, this limits traffic matching on this flow.
> Such flow matches only traffic, which was received from non-RAMP lports.
> We need to flood traffic to all multicast group ports only if it was
> received from non-RAMP ports.
> This is due to fact that flood on the directon "from vtep" is done on
> the vtep (ramp) switch: it floods BUM to all hypervisors.
> If we don’t do it, we’ll duplicate BUM traffic (first packet is
> originally sent from VTEP switch to all hypervisors) and others are
> replicated on all hypervisors and re-sent to other hypervisors.
> 
> Please, let me know if this makes sense.
> 

It does, sorry for the noise.  I missed the value = 0 argument.

Thanks for the clarification!

I'm planning to apply the first 4 patches today and I'll look forward
for a v2 of this patch afterwards.

Regards,
Dumitru
Vladislav Odintsov May 30, 2023, 10:12 a.m. UTC | #4
> On 30 May 2023, at 13:10, Dumitru Ceara <dceara@redhat.com> wrote:
> 
> On 5/30/23 11:59, Vladislav Odintsov wrote:
>> Hi Dumitru,
>> 
>> thanks for the review!
>> My answers are inline.
>> 
>>> On 30 May 2023, at 12:27, Dumitru Ceara <dceara@redhat.com> wrote:
>>> 
>>> On 5/19/23 20:18, Vladislav Odintsov wrote:
>>>> This patch is intended to make next two changes:
>>>> 
>>>> 1. Support create/update of MAC_Binding for GARP/ND from HW VTEP.
>>>> 
>>>> Prior to this patch MAC_Binding records were created only when LRP issued
>>>> ARP request/Neighbor solicitation.  If IP to MAC in network attached via
>>>> vtep lport was changed the old MAC_Binding prevented normal
>>>> communication.  Now router port (chassisredirect) in logical switch with
>>>> vtep lport catches GARP or Neighbor Solicitation and updates MAC_Binding.
>>>> 
>>>> New flow with max priority was added in ls_in_arp_nd_resp and additional
>>>> OF rule in table 37 (OFTABLE_REMOTE_OUTPUT) to process multicast packets
>>>> received from RAMP_SWITCH.
>>>> 
>>>> 2. Answer to ARP/ND on requests coming from HW VTEP in lrouter pipeline.
>>>> 
>>>> This is needed to reduce duplicated arp-responses, which were generated
>>>> by OVN arp-responder flows in node, where chassisredirect port located
>>>> with responses coming directly from VM interfaces.
>>>> 
>>>> As a result of this change, now vifs located on same chassis, where
>>>> chassisredirect ports are located receive ARP/ND mcast packets, which
>>>> previously were suppressed by arp-responder on the node.
>>>> 
>>>> Tests and documentation were updated to conform to new behaviour.
>>>> 
>>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>>>> ---
>>> 
>>> Hi Vladislav,
>>> 
>>> Thanks for the patch.
>>> 
>>>> controller/binding.c    | 48 ++++++++++++++++++++
>>>> controller/local_data.h |  3 ++
>>>> controller/physical.c   | 46 +++++++++++++++++--
>>>> northd/northd.c         | 10 +++++
>>>> northd/ovn-northd.8.xml |  6 +++
>>>> tests/ovn.at <http://ovn.at/>            | 99
>>>> ++++++++++++++++++++++++++++++++++++++++-
>>> 
>>> Could you please add a NEWS entry for this behavior change?
>> 
>> Sure, will do it in v2.
>> 
>>> 
>>>> 6 files changed, 207 insertions(+), 5 deletions(-)
>>>> 
>>>> diff --git a/controller/binding.c b/controller/binding.c
>>>> index c7a2b3f10..5932ad785 100644
>>>> --- a/controller/binding.c
>>>> +++ b/controller/binding.c
>>>> @@ -509,6 +509,30 @@ update_ld_multichassis_ports(const struct
>>>> sbrec_port_binding *binding_rec,
>>>>     }
>>>> }
>>>> 
>>>> +static void
>>>> +update_ld_vtep_port(const struct sbrec_port_binding *binding_rec,
>>>> +                    struct hmap *local_datapaths)
>>>> +{
>>>> +    struct local_datapath *ld
>>>> +        = get_local_datapath(local_datapaths,
>>>> +                             binding_rec->datapath->tunnel_key);
>>>> +    if (!ld) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (ld->vtep_port && strcmp(ld->vtep_port->logical_port,
>>>> +                                binding_rec->logical_port)) {
>>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
>>>> +        VLOG_WARN_RL(&rl, "vtep port '%s' already set for datapath "
>>>> +                     "'%"PRId64"', skipping the new port '%s'.",
>>>> +                     ld->vtep_port->logical_port,
>>>> +                     binding_rec->datapath->tunnel_key,
>>>> +                     binding_rec->logical_port);
>>>> +        return;
>>>> +    }
>>>> +    ld->vtep_port = binding_rec;
>>>> +}
>>>> +
>>>> static void
>>>> update_ld_localnet_port(const struct sbrec_port_binding *binding_rec,
>>>>                         struct shash *bridge_mappings,
>>>> @@ -1987,6 +2011,7 @@ binding_run(struct binding_ctx_in *b_ctx_in,
>>>> struct binding_ctx_out *b_ctx_out)
>>>>     struct ovs_list external_lports =
>>>> OVS_LIST_INITIALIZER(&external_lports);
>>>>     struct ovs_list multichassis_ports = OVS_LIST_INITIALIZER(
>>>>                                                         &multichassis_ports);
>>>> +    struct ovs_list vtep_lports = OVS_LIST_INITIALIZER(&vtep_lports);
>>>> 
>>>>     struct lport {
>>>>         struct ovs_list list_node;
>>>> @@ -2010,8 +2035,13 @@ binding_run(struct binding_ctx_in *b_ctx_in,
>>>> struct binding_ctx_out *b_ctx_out)
>>>> 
>>>>         switch (lport_type) {
>>>>         case LP_PATCH:
>>>> +            update_related_lport(pb, b_ctx_out);
>>>> +            break;
>>>>         case LP_VTEP:
>>>>             update_related_lport(pb, b_ctx_out);
>>>> +            struct lport *vtep_lport = xmalloc(sizeof *vtep_lport);
>>>> +            vtep_lport->pb = pb;
>>>> +            ovs_list_push_back(&vtep_lports, &vtep_lport->list_node);
>>>>             break;
>>>> 
>>>>         case LP_LOCALPORT:
>>>> @@ -2111,6 +2141,16 @@ binding_run(struct binding_ctx_in *b_ctx_in,
>>>> struct binding_ctx_out *b_ctx_out)
>>>>         free(multichassis_lport);
>>>>     }
>>>> 
>>>> +    /* Run through vtep lport list to see if there are vtep ports
>>>> +     * on local datapaths discovered from above loop, and update the
>>>> +     * corresponding local datapath accordingly. */
>>>> +    struct lport *vtep_lport;
>>>> +    ovs_list_size(&vtep_lports);
>>> 
>>> I guess this is a leftover.
>> 
>> Oops, will fix it in v2.
>> 
>>> 
>>>> +    LIST_FOR_EACH_POP (vtep_lport, list_node, &vtep_lports) {
>>>> +        update_ld_vtep_port(vtep_lport->pb, b_ctx_out->local_datapaths);
>>>> +        free(vtep_lport);
>>>> +    }
>>>> +
>>>>     shash_destroy(&bridge_mappings);
>>>>     /* Remove stale QoS entries. */
>>>>     ovs_qos_entries_gc(b_ctx_in->ovs_idl_txn,
>>>> b_ctx_in->ovsrec_port_by_qos,
>>>> @@ -2175,6 +2215,11 @@ remove_pb_from_local_datapath(const struct
>>>> sbrec_port_binding *pb,
>>>>         }
>>>>     } else if (!strcmp(pb->type, "external")) {
>>>>         remove_local_datapath_external_port(ld, pb->logical_port);
>>>> +    } else if (!strcmp(pb->type, "vtep")) {
>>>> +        if (ld->vtep_port && !strcmp(ld->vtep_port->logical_port,
>>>> +                                     pb->logical_port)) {
>>>> +            ld->vtep_port = NULL;
>>>> +        }
>>>>     }
>>>>     remove_local_datapath_multichassis_port(ld, pb->logical_port);
>>>> }
>>>> @@ -2836,6 +2881,7 @@ handle_updated_port(struct binding_ctx_in
>>>> *b_ctx_in,
>>>> 
>>>>     case LP_VTEP:
>>>>         update_related_lport(pb, b_ctx_out);
>>>> +        update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>>>>         /* VTEP lports are claimed/released by ovn-controller-vteps.
>>>>          * We are not sure what changed. */
>>>>         b_ctx_out->non_vif_ports_changed = true;
>>>> @@ -3091,6 +3137,8 @@ delete_done:
>>>>                 } else if (pb->n_additional_chassis) {
>>>>                     update_ld_multichassis_ports(pb,
>>>>                                                  b_ctx_out->local_datapaths);
>>>> +                } else if (lport_type == LP_VTEP) {
>>>> +                    update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>>>>                 }
>>> 
>>> Nit: I think I'd move this "else if (lport_type == LP_VTEP) {" just
>>> under the LP_EXTERNAL case.
>> 
>> I’ll move it, but I guess it’s not possible to have additional chassis
>> for vtep pb.
>> 
>>> 
>>>>             }
>>>>             sbrec_port_binding_index_destroy_row(target);
>>>> diff --git a/controller/local_data.h b/controller/local_data.h
>>>> index 748f009aa..19d13a558 100644
>>>> --- a/controller/local_data.h
>>>> +++ b/controller/local_data.h
>>>> @@ -50,6 +50,9 @@ struct local_datapath {
>>>>     /* The localnet port in this datapath, if any (at most one is
>>>> allowed). */
>>>>     const struct sbrec_port_binding *localnet_port;
>>>> 
>>>> +    /* The vtep port in this datapath, if any (at most one is
>>>> allowed). */
>>>> +    const struct sbrec_port_binding *vtep_port;
>>>> +
>>>>     struct {
>>>>         const struct sbrec_port_binding *local;
>>>>         const struct sbrec_port_binding *remote;
>>>> diff --git a/controller/physical.c b/controller/physical.c
>>>> index 2925dcd1d..b75e120c7 100644
>>>> --- a/controller/physical.c
>>>> +++ b/controller/physical.c
>>>> @@ -194,6 +194,15 @@ get_localnet_port(const struct hmap
>>>> *local_datapaths, int64_t tunnel_key)
>>>> }
>>>> 
>>>> 
>>>> +static const struct sbrec_port_binding *
>>>> +get_vtep_port(const struct hmap *local_datapaths, int64_t tunnel_key)
>>>> +{
>>>> +    const struct local_datapath *ld =
>>>> get_local_datapath(local_datapaths,
>>>> +                                                         tunnel_key);
>>>> +    return ld ? ld->vtep_port : NULL;
>>>> +}
>>>> +
>>>> +
>>>> static struct zone_ids
>>>> get_zone_ids(const struct sbrec_port_binding *binding,
>>>>              const struct simap *ct_zones)
>>>> @@ -1732,7 +1741,7 @@ consider_mc_group(struct ovsdb_idl_index
>>>> *sbrec_port_binding_by_name,
>>>>     struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis);
>>>>     struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis);
>>>> 
>>>> -    struct match match;
>>>> +    struct match match, match_ramp;
>>>>     match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key);
>>>> 
>>>>     /* Go through all of the ports in the multicast group:
>>>> @@ -1755,10 +1764,10 @@ consider_mc_group(struct ovsdb_idl_index
>>>> *sbrec_port_binding_by_name,
>>>>      *      set the output port to be the router patch port for which
>>>>      *      the redirect port was added.
>>>>      */
>>>> -    struct ofpbuf ofpacts;
>>>> +    struct ofpbuf ofpacts, remote_ofpacts, remote_ofpacts_ramp;
>>>>     ofpbuf_init(&ofpacts, 0);
>>>> -    struct ofpbuf remote_ofpacts;
>>>>     ofpbuf_init(&remote_ofpacts, 0);
>>>> +    ofpbuf_init(&remote_ofpacts_ramp, 0);
>>>>     for (size_t i = 0; i < mc->n_ports; i++) {
>>>>         struct sbrec_port_binding *port = mc->ports[i];
>>>> 
>>>> @@ -1783,6 +1792,7 @@ consider_mc_group(struct ovsdb_idl_index
>>>> *sbrec_port_binding_by_name,
>>>>                 local_output_pb(port->tunnel_key, &ofpacts);
>>>>             } else {
>>>>                 local_output_pb(port->tunnel_key, &remote_ofpacts);
>>>> +                local_output_pb(port->tunnel_key, &remote_ofpacts_ramp);
>>>>             }
>>>>         } if (!strcmp(port->type, "remote")) {
>>>>             if (port->chassis) {
>>>> @@ -1866,6 +1876,35 @@ consider_mc_group(struct ovsdb_idl_index
>>>> *sbrec_port_binding_by_name,
>>>>              * multicast group as the logical output port. */
>>>>             put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
>>>>                      &remote_ofpacts);
>>>> +
>>>> +            if (get_vtep_port(local_datapaths,
>>>> mc->datapath->tunnel_key)) {
>>> 
>>> I'd declare 'match_ramp' here as it's only used on this branch of the if
>>> statement.
>> 
>> Okay, will be fixed in v2.
>> 
>>> 
>>>> +                match_set_reg_masked(&match, MFF_LOG_FLAGS -
>>>> MFF_REG0, 0,
>>>> +                                     MLF_RCV_FROM_RAMP);
>>>> +
>>> 
>>> Are we going to match less traffic now?  We add this "flags &
>>> MLF_RCV_FROM_RAMP" condition to the already existing priority-100
>>> OFTABLE_REMOTE_OUTPUT flow.  Does this break multicast traffic not
>>> received from RAMP (vtep) ports?
>> 
>> Yes, this limits traffic matching on this flow.
>> Such flow matches only traffic, which was received from non-RAMP lports.
>> We need to flood traffic to all multicast group ports only if it was
>> received from non-RAMP ports.
>> This is due to fact that flood on the directon "from vtep" is done on
>> the vtep (ramp) switch: it floods BUM to all hypervisors.
>> If we don’t do it, we’ll duplicate BUM traffic (first packet is
>> originally sent from VTEP switch to all hypervisors) and others are
>> replicated on all hypervisors and re-sent to other hypervisors.
>> 
>> Please, let me know if this makes sense.
>> 
> 
> It does, sorry for the noise.  I missed the value = 0 argument.
> 
> Thanks for the clarification!
> 
> I'm planning to apply the first 4 patches today and I'll look forward
> for a v2 of this patch afterwards.

Okay, thanks.
I’ll submit v2 right after #1-#4 merge.

> 
> Regards,
> Dumitru
> 


Regards,
Vladislav Odintsov
Vladislav Odintsov May 30, 2023, 3:05 p.m. UTC | #5
v2 of patch #5 was submitted:

https://patchwork.ozlabs.org/project/ovn/patch/20230530150328.1224972-1-odivlad@gmail.com/

> On 30 May 2023, at 13:12, Vladislav Odintsov <odivlad@gmail.com> wrote:
> 
> 
> 
>> On 30 May 2023, at 13:10, Dumitru Ceara <dceara@redhat.com> wrote:
>> 
>> On 5/30/23 11:59, Vladislav Odintsov wrote:
>>> Hi Dumitru,
>>> 
>>> thanks for the review!
>>> My answers are inline.
>>> 
>>>> On 30 May 2023, at 12:27, Dumitru Ceara <dceara@redhat.com> wrote:
>>>> 
>>>> On 5/19/23 20:18, Vladislav Odintsov wrote:
>>>>> This patch is intended to make next two changes:
>>>>> 
>>>>> 1. Support create/update of MAC_Binding for GARP/ND from HW VTEP.
>>>>> 
>>>>> Prior to this patch MAC_Binding records were created only when LRP issued
>>>>> ARP request/Neighbor solicitation.  If IP to MAC in network attached via
>>>>> vtep lport was changed the old MAC_Binding prevented normal
>>>>> communication.  Now router port (chassisredirect) in logical switch with
>>>>> vtep lport catches GARP or Neighbor Solicitation and updates MAC_Binding.
>>>>> 
>>>>> New flow with max priority was added in ls_in_arp_nd_resp and additional
>>>>> OF rule in table 37 (OFTABLE_REMOTE_OUTPUT) to process multicast packets
>>>>> received from RAMP_SWITCH.
>>>>> 
>>>>> 2. Answer to ARP/ND on requests coming from HW VTEP in lrouter pipeline.
>>>>> 
>>>>> This is needed to reduce duplicated arp-responses, which were generated
>>>>> by OVN arp-responder flows in node, where chassisredirect port located
>>>>> with responses coming directly from VM interfaces.
>>>>> 
>>>>> As a result of this change, now vifs located on same chassis, where
>>>>> chassisredirect ports are located receive ARP/ND mcast packets, which
>>>>> previously were suppressed by arp-responder on the node.
>>>>> 
>>>>> Tests and documentation were updated to conform to new behaviour.
>>>>> 
>>>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>>>>> ---
>>>> 
>>>> Hi Vladislav,
>>>> 
>>>> Thanks for the patch.
>>>> 
>>>>> controller/binding.c    | 48 ++++++++++++++++++++
>>>>> controller/local_data.h |  3 ++
>>>>> controller/physical.c   | 46 +++++++++++++++++--
>>>>> northd/northd.c         | 10 +++++
>>>>> northd/ovn-northd.8.xml |  6 +++
>>>>> tests/ovn.at <http://ovn.at/>            | 99
>>>>> ++++++++++++++++++++++++++++++++++++++++-
>>>> 
>>>> Could you please add a NEWS entry for this behavior change?
>>> 
>>> Sure, will do it in v2.
>>> 
>>>> 
>>>>> 6 files changed, 207 insertions(+), 5 deletions(-)
>>>>> 
>>>>> diff --git a/controller/binding.c b/controller/binding.c
>>>>> index c7a2b3f10..5932ad785 100644
>>>>> --- a/controller/binding.c
>>>>> +++ b/controller/binding.c
>>>>> @@ -509,6 +509,30 @@ update_ld_multichassis_ports(const struct
>>>>> sbrec_port_binding *binding_rec,
>>>>>    }
>>>>> }
>>>>> 
>>>>> +static void
>>>>> +update_ld_vtep_port(const struct sbrec_port_binding *binding_rec,
>>>>> +                    struct hmap *local_datapaths)
>>>>> +{
>>>>> +    struct local_datapath *ld
>>>>> +        = get_local_datapath(local_datapaths,
>>>>> +                             binding_rec->datapath->tunnel_key);
>>>>> +    if (!ld) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    if (ld->vtep_port && strcmp(ld->vtep_port->logical_port,
>>>>> +                                binding_rec->logical_port)) {
>>>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
>>>>> +        VLOG_WARN_RL(&rl, "vtep port '%s' already set for datapath "
>>>>> +                     "'%"PRId64"', skipping the new port '%s'.",
>>>>> +                     ld->vtep_port->logical_port,
>>>>> +                     binding_rec->datapath->tunnel_key,
>>>>> +                     binding_rec->logical_port);
>>>>> +        return;
>>>>> +    }
>>>>> +    ld->vtep_port = binding_rec;
>>>>> +}
>>>>> +
>>>>> static void
>>>>> update_ld_localnet_port(const struct sbrec_port_binding *binding_rec,
>>>>>                        struct shash *bridge_mappings,
>>>>> @@ -1987,6 +2011,7 @@ binding_run(struct binding_ctx_in *b_ctx_in,
>>>>> struct binding_ctx_out *b_ctx_out)
>>>>>    struct ovs_list external_lports =
>>>>> OVS_LIST_INITIALIZER(&external_lports);
>>>>>    struct ovs_list multichassis_ports = OVS_LIST_INITIALIZER(
>>>>>                                                        &multichassis_ports);
>>>>> +    struct ovs_list vtep_lports = OVS_LIST_INITIALIZER(&vtep_lports);
>>>>> 
>>>>>    struct lport {
>>>>>        struct ovs_list list_node;
>>>>> @@ -2010,8 +2035,13 @@ binding_run(struct binding_ctx_in *b_ctx_in,
>>>>> struct binding_ctx_out *b_ctx_out)
>>>>> 
>>>>>        switch (lport_type) {
>>>>>        case LP_PATCH:
>>>>> +            update_related_lport(pb, b_ctx_out);
>>>>> +            break;
>>>>>        case LP_VTEP:
>>>>>            update_related_lport(pb, b_ctx_out);
>>>>> +            struct lport *vtep_lport = xmalloc(sizeof *vtep_lport);
>>>>> +            vtep_lport->pb = pb;
>>>>> +            ovs_list_push_back(&vtep_lports, &vtep_lport->list_node);
>>>>>            break;
>>>>> 
>>>>>        case LP_LOCALPORT:
>>>>> @@ -2111,6 +2141,16 @@ binding_run(struct binding_ctx_in *b_ctx_in,
>>>>> struct binding_ctx_out *b_ctx_out)
>>>>>        free(multichassis_lport);
>>>>>    }
>>>>> 
>>>>> +    /* Run through vtep lport list to see if there are vtep ports
>>>>> +     * on local datapaths discovered from above loop, and update the
>>>>> +     * corresponding local datapath accordingly. */
>>>>> +    struct lport *vtep_lport;
>>>>> +    ovs_list_size(&vtep_lports);
>>>> 
>>>> I guess this is a leftover.
>>> 
>>> Oops, will fix it in v2.
>>> 
>>>> 
>>>>> +    LIST_FOR_EACH_POP (vtep_lport, list_node, &vtep_lports) {
>>>>> +        update_ld_vtep_port(vtep_lport->pb, b_ctx_out->local_datapaths);
>>>>> +        free(vtep_lport);
>>>>> +    }
>>>>> +
>>>>>    shash_destroy(&bridge_mappings);
>>>>>    /* Remove stale QoS entries. */
>>>>>    ovs_qos_entries_gc(b_ctx_in->ovs_idl_txn,
>>>>> b_ctx_in->ovsrec_port_by_qos,
>>>>> @@ -2175,6 +2215,11 @@ remove_pb_from_local_datapath(const struct
>>>>> sbrec_port_binding *pb,
>>>>>        }
>>>>>    } else if (!strcmp(pb->type, "external")) {
>>>>>        remove_local_datapath_external_port(ld, pb->logical_port);
>>>>> +    } else if (!strcmp(pb->type, "vtep")) {
>>>>> +        if (ld->vtep_port && !strcmp(ld->vtep_port->logical_port,
>>>>> +                                     pb->logical_port)) {
>>>>> +            ld->vtep_port = NULL;
>>>>> +        }
>>>>>    }
>>>>>    remove_local_datapath_multichassis_port(ld, pb->logical_port);
>>>>> }
>>>>> @@ -2836,6 +2881,7 @@ handle_updated_port(struct binding_ctx_in
>>>>> *b_ctx_in,
>>>>> 
>>>>>    case LP_VTEP:
>>>>>        update_related_lport(pb, b_ctx_out);
>>>>> +        update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>>>>>        /* VTEP lports are claimed/released by ovn-controller-vteps.
>>>>>         * We are not sure what changed. */
>>>>>        b_ctx_out->non_vif_ports_changed = true;
>>>>> @@ -3091,6 +3137,8 @@ delete_done:
>>>>>                } else if (pb->n_additional_chassis) {
>>>>>                    update_ld_multichassis_ports(pb,
>>>>>                                                 b_ctx_out->local_datapaths);
>>>>> +                } else if (lport_type == LP_VTEP) {
>>>>> +                    update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
>>>>>                }
>>>> 
>>>> Nit: I think I'd move this "else if (lport_type == LP_VTEP) {" just
>>>> under the LP_EXTERNAL case.
>>> 
>>> I’ll move it, but I guess it’s not possible to have additional chassis
>>> for vtep pb.
>>> 
>>>> 
>>>>>            }
>>>>>            sbrec_port_binding_index_destroy_row(target);
>>>>> diff --git a/controller/local_data.h b/controller/local_data.h
>>>>> index 748f009aa..19d13a558 100644
>>>>> --- a/controller/local_data.h
>>>>> +++ b/controller/local_data.h
>>>>> @@ -50,6 +50,9 @@ struct local_datapath {
>>>>>    /* The localnet port in this datapath, if any (at most one is
>>>>> allowed). */
>>>>>    const struct sbrec_port_binding *localnet_port;
>>>>> 
>>>>> +    /* The vtep port in this datapath, if any (at most one is
>>>>> allowed). */
>>>>> +    const struct sbrec_port_binding *vtep_port;
>>>>> +
>>>>>    struct {
>>>>>        const struct sbrec_port_binding *local;
>>>>>        const struct sbrec_port_binding *remote;
>>>>> diff --git a/controller/physical.c b/controller/physical.c
>>>>> index 2925dcd1d..b75e120c7 100644
>>>>> --- a/controller/physical.c
>>>>> +++ b/controller/physical.c
>>>>> @@ -194,6 +194,15 @@ get_localnet_port(const struct hmap
>>>>> *local_datapaths, int64_t tunnel_key)
>>>>> }
>>>>> 
>>>>> 
>>>>> +static const struct sbrec_port_binding *
>>>>> +get_vtep_port(const struct hmap *local_datapaths, int64_t tunnel_key)
>>>>> +{
>>>>> +    const struct local_datapath *ld =
>>>>> get_local_datapath(local_datapaths,
>>>>> +                                                         tunnel_key);
>>>>> +    return ld ? ld->vtep_port : NULL;
>>>>> +}
>>>>> +
>>>>> +
>>>>> static struct zone_ids
>>>>> get_zone_ids(const struct sbrec_port_binding *binding,
>>>>>             const struct simap *ct_zones)
>>>>> @@ -1732,7 +1741,7 @@ consider_mc_group(struct ovsdb_idl_index
>>>>> *sbrec_port_binding_by_name,
>>>>>    struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis);
>>>>>    struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis);
>>>>> 
>>>>> -    struct match match;
>>>>> +    struct match match, match_ramp;
>>>>>    match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key);
>>>>> 
>>>>>    /* Go through all of the ports in the multicast group:
>>>>> @@ -1755,10 +1764,10 @@ consider_mc_group(struct ovsdb_idl_index
>>>>> *sbrec_port_binding_by_name,
>>>>>     *      set the output port to be the router patch port for which
>>>>>     *      the redirect port was added.
>>>>>     */
>>>>> -    struct ofpbuf ofpacts;
>>>>> +    struct ofpbuf ofpacts, remote_ofpacts, remote_ofpacts_ramp;
>>>>>    ofpbuf_init(&ofpacts, 0);
>>>>> -    struct ofpbuf remote_ofpacts;
>>>>>    ofpbuf_init(&remote_ofpacts, 0);
>>>>> +    ofpbuf_init(&remote_ofpacts_ramp, 0);
>>>>>    for (size_t i = 0; i < mc->n_ports; i++) {
>>>>>        struct sbrec_port_binding *port = mc->ports[i];
>>>>> 
>>>>> @@ -1783,6 +1792,7 @@ consider_mc_group(struct ovsdb_idl_index
>>>>> *sbrec_port_binding_by_name,
>>>>>                local_output_pb(port->tunnel_key, &ofpacts);
>>>>>            } else {
>>>>>                local_output_pb(port->tunnel_key, &remote_ofpacts);
>>>>> +                local_output_pb(port->tunnel_key, &remote_ofpacts_ramp);
>>>>>            }
>>>>>        } if (!strcmp(port->type, "remote")) {
>>>>>            if (port->chassis) {
>>>>> @@ -1866,6 +1876,35 @@ consider_mc_group(struct ovsdb_idl_index
>>>>> *sbrec_port_binding_by_name,
>>>>>             * multicast group as the logical output port. */
>>>>>            put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
>>>>>                     &remote_ofpacts);
>>>>> +
>>>>> +            if (get_vtep_port(local_datapaths,
>>>>> mc->datapath->tunnel_key)) {
>>>> 
>>>> I'd declare 'match_ramp' here as it's only used on this branch of the if
>>>> statement.
>>> 
>>> Okay, will be fixed in v2.
>>> 
>>>> 
>>>>> +                match_set_reg_masked(&match, MFF_LOG_FLAGS -
>>>>> MFF_REG0, 0,
>>>>> +                                     MLF_RCV_FROM_RAMP);
>>>>> +
>>>> 
>>>> Are we going to match less traffic now?  We add this "flags &
>>>> MLF_RCV_FROM_RAMP" condition to the already existing priority-100
>>>> OFTABLE_REMOTE_OUTPUT flow.  Does this break multicast traffic not
>>>> received from RAMP (vtep) ports?
>>> 
>>> Yes, this limits traffic matching on this flow.
>>> Such flow matches only traffic, which was received from non-RAMP lports.
>>> We need to flood traffic to all multicast group ports only if it was
>>> received from non-RAMP ports.
>>> This is due to fact that flood on the directon "from vtep" is done on
>>> the vtep (ramp) switch: it floods BUM to all hypervisors.
>>> If we don’t do it, we’ll duplicate BUM traffic (first packet is
>>> originally sent from VTEP switch to all hypervisors) and others are
>>> replicated on all hypervisors and re-sent to other hypervisors.
>>> 
>>> Please, let me know if this makes sense.
>>> 
>> 
>> It does, sorry for the noise.  I missed the value = 0 argument.
>> 
>> Thanks for the clarification!
>> 
>> I'm planning to apply the first 4 patches today and I'll look forward
>> for a v2 of this patch afterwards.
> 
> Okay, thanks.
> I’ll submit v2 right after #1-#4 merge.
> 
>> 
>> Regards,
>> Dumitru
>> 
> 
> 
> Regards,
> Vladislav Odintsov
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org <mailto:dev@openvswitch.org>
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Regards,
Vladislav Odintsov
diff mbox series

Patch

diff --git a/controller/binding.c b/controller/binding.c
index c7a2b3f10..5932ad785 100644
--- a/controller/binding.c
+++ b/controller/binding.c
@@ -509,6 +509,30 @@  update_ld_multichassis_ports(const struct sbrec_port_binding *binding_rec,
     }
 }
 
+static void
+update_ld_vtep_port(const struct sbrec_port_binding *binding_rec,
+                    struct hmap *local_datapaths)
+{
+    struct local_datapath *ld
+        = get_local_datapath(local_datapaths,
+                             binding_rec->datapath->tunnel_key);
+    if (!ld) {
+        return;
+    }
+
+    if (ld->vtep_port && strcmp(ld->vtep_port->logical_port,
+                                binding_rec->logical_port)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+        VLOG_WARN_RL(&rl, "vtep port '%s' already set for datapath "
+                     "'%"PRId64"', skipping the new port '%s'.",
+                     ld->vtep_port->logical_port,
+                     binding_rec->datapath->tunnel_key,
+                     binding_rec->logical_port);
+        return;
+    }
+    ld->vtep_port = binding_rec;
+}
+
 static void
 update_ld_localnet_port(const struct sbrec_port_binding *binding_rec,
                         struct shash *bridge_mappings,
@@ -1987,6 +2011,7 @@  binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
     struct ovs_list external_lports = OVS_LIST_INITIALIZER(&external_lports);
     struct ovs_list multichassis_ports = OVS_LIST_INITIALIZER(
                                                         &multichassis_ports);
+    struct ovs_list vtep_lports = OVS_LIST_INITIALIZER(&vtep_lports);
 
     struct lport {
         struct ovs_list list_node;
@@ -2010,8 +2035,13 @@  binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
 
         switch (lport_type) {
         case LP_PATCH:
+            update_related_lport(pb, b_ctx_out);
+            break;
         case LP_VTEP:
             update_related_lport(pb, b_ctx_out);
+            struct lport *vtep_lport = xmalloc(sizeof *vtep_lport);
+            vtep_lport->pb = pb;
+            ovs_list_push_back(&vtep_lports, &vtep_lport->list_node);
             break;
 
         case LP_LOCALPORT:
@@ -2111,6 +2141,16 @@  binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
         free(multichassis_lport);
     }
 
+    /* Run through vtep lport list to see if there are vtep ports
+     * on local datapaths discovered from above loop, and update the
+     * corresponding local datapath accordingly. */
+    struct lport *vtep_lport;
+    ovs_list_size(&vtep_lports);
+    LIST_FOR_EACH_POP (vtep_lport, list_node, &vtep_lports) {
+        update_ld_vtep_port(vtep_lport->pb, b_ctx_out->local_datapaths);
+        free(vtep_lport);
+    }
+
     shash_destroy(&bridge_mappings);
     /* Remove stale QoS entries. */
     ovs_qos_entries_gc(b_ctx_in->ovs_idl_txn, b_ctx_in->ovsrec_port_by_qos,
@@ -2175,6 +2215,11 @@  remove_pb_from_local_datapath(const struct sbrec_port_binding *pb,
         }
     } else if (!strcmp(pb->type, "external")) {
         remove_local_datapath_external_port(ld, pb->logical_port);
+    } else if (!strcmp(pb->type, "vtep")) {
+        if (ld->vtep_port && !strcmp(ld->vtep_port->logical_port,
+                                     pb->logical_port)) {
+            ld->vtep_port = NULL;
+        }
     }
     remove_local_datapath_multichassis_port(ld, pb->logical_port);
 }
@@ -2836,6 +2881,7 @@  handle_updated_port(struct binding_ctx_in *b_ctx_in,
 
     case LP_VTEP:
         update_related_lport(pb, b_ctx_out);
+        update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
         /* VTEP lports are claimed/released by ovn-controller-vteps.
          * We are not sure what changed. */
         b_ctx_out->non_vif_ports_changed = true;
@@ -3091,6 +3137,8 @@  delete_done:
                 } else if (pb->n_additional_chassis) {
                     update_ld_multichassis_ports(pb,
                                                  b_ctx_out->local_datapaths);
+                } else if (lport_type == LP_VTEP) {
+                    update_ld_vtep_port(pb, b_ctx_out->local_datapaths);
                 }
             }
             sbrec_port_binding_index_destroy_row(target);
diff --git a/controller/local_data.h b/controller/local_data.h
index 748f009aa..19d13a558 100644
--- a/controller/local_data.h
+++ b/controller/local_data.h
@@ -50,6 +50,9 @@  struct local_datapath {
     /* The localnet port in this datapath, if any (at most one is allowed). */
     const struct sbrec_port_binding *localnet_port;
 
+    /* The vtep port in this datapath, if any (at most one is allowed). */
+    const struct sbrec_port_binding *vtep_port;
+
     struct {
         const struct sbrec_port_binding *local;
         const struct sbrec_port_binding *remote;
diff --git a/controller/physical.c b/controller/physical.c
index 2925dcd1d..b75e120c7 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -194,6 +194,15 @@  get_localnet_port(const struct hmap *local_datapaths, int64_t tunnel_key)
 }
 
 
+static const struct sbrec_port_binding *
+get_vtep_port(const struct hmap *local_datapaths, int64_t tunnel_key)
+{
+    const struct local_datapath *ld = get_local_datapath(local_datapaths,
+                                                         tunnel_key);
+    return ld ? ld->vtep_port : NULL;
+}
+
+
 static struct zone_ids
 get_zone_ids(const struct sbrec_port_binding *binding,
              const struct simap *ct_zones)
@@ -1732,7 +1741,7 @@  consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
     struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis);
     struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis);
 
-    struct match match;
+    struct match match, match_ramp;
     match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key);
 
     /* Go through all of the ports in the multicast group:
@@ -1755,10 +1764,10 @@  consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
      *      set the output port to be the router patch port for which
      *      the redirect port was added.
      */
-    struct ofpbuf ofpacts;
+    struct ofpbuf ofpacts, remote_ofpacts, remote_ofpacts_ramp;
     ofpbuf_init(&ofpacts, 0);
-    struct ofpbuf remote_ofpacts;
     ofpbuf_init(&remote_ofpacts, 0);
+    ofpbuf_init(&remote_ofpacts_ramp, 0);
     for (size_t i = 0; i < mc->n_ports; i++) {
         struct sbrec_port_binding *port = mc->ports[i];
 
@@ -1783,6 +1792,7 @@  consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                 local_output_pb(port->tunnel_key, &ofpacts);
             } else {
                 local_output_pb(port->tunnel_key, &remote_ofpacts);
+                local_output_pb(port->tunnel_key, &remote_ofpacts_ramp);
             }
         } if (!strcmp(port->type, "remote")) {
             if (port->chassis) {
@@ -1866,6 +1876,35 @@  consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
              * multicast group as the logical output port. */
             put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
                      &remote_ofpacts);
+
+            if (get_vtep_port(local_datapaths, mc->datapath->tunnel_key)) {
+                match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, 0,
+                                     MLF_RCV_FROM_RAMP);
+
+                put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
+                         &remote_ofpacts_ramp);
+
+                /* MCAST traffic which was originally received from RAMP_SWITCH
+                 * is not allowed to be re-sent to remote_chassis.
+                 * Process "patch" port binding for routing in
+                 * OFTABLE_REMOTE_OUTPUT and resubmit packets to
+                 * OFTABLE_LOCAL_OUTPUT for local delivery. */
+
+                match_outport_dp_and_port_keys(&match_ramp, dp_key,
+                                               mc->tunnel_key);
+
+                /* Match packets coming from RAMP_SWITCH and allowed for
+                * loopback processing (including routing). */
+                match_set_reg_masked(&match_ramp, MFF_LOG_FLAGS - MFF_REG0,
+                                     MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK,
+                                     MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK);
+
+                put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts_ramp);
+
+                ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 120,
+                                mc->header_.uuid.parts[0], &match_ramp,
+                                &remote_ofpacts_ramp, &mc->header_.uuid);
+            }
         }
 
         fanout_to_chassis(mff_ovn_geneve, &remote_chassis, chassis_tunnels,
@@ -1886,6 +1925,7 @@  consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
     }
     ofpbuf_uninit(&ofpacts);
     ofpbuf_uninit(&remote_ofpacts);
+    ofpbuf_uninit(&remote_ofpacts_ramp);
     sset_destroy(&remote_chassis);
     sset_destroy(&vtep_chassis);
 }
diff --git a/northd/northd.c b/northd/northd.c
index e713973e7..6dcbf0e1a 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -7849,6 +7849,16 @@  build_vtep_hairpin(struct ovn_datapath *od, struct hmap *lflows)
                           ds_cstr(&match), "next;");
         }
     }
+
+    /* ARP/Neighbor Solicitation requests must skip ls_in_arp_rsp table for
+     * packets arrived from HW VTEP (ramp) switch.
+     * Neighbor resolution for router ports is done in logical router ingress
+     * pipeline.  ARP resolution for vif lports is done directly by vif ports.
+     */
+    ovn_lflow_add(lflows, od, S_SWITCH_IN_ARP_ND_RSP, 65535,
+                  REGBIT_FROM_RAMP" == 1 && (arp || nd_ns)",
+                  "flags.loopback = 1; next;");
+
     ds_destroy(&match);
 }
 
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 8164be300..1a9e50b6f 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -1378,6 +1378,12 @@ 
      </p>
 
     <ul>
+      <li>
+        If packet was received from HW VTEP (ramp switch), and this packet is
+        ARP or Neighbor Solicitation, such packet is passed to next table with
+        max proirity.  ARP/ND requests from HW VTEP must be handled in logical
+        router ingress pipeline.
+      </li>
       <li>
         If the logical switch has no router ports with options:arp_proxy
         configured add a priority-100 flows to skip the ARP responder if inport
diff --git a/tests/ovn.at b/tests/ovn.at
index 53349530b..cb643c39d 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -4267,7 +4267,7 @@  ovn_start
 ovn-nbctl ls-add lsw0
 
 ovn-nbctl lsp-add lsw0 lp1
-ovn-nbctl lsp-set-addresses lp1 f0:00:00:00:00:01
+ovn-nbctl lsp-set-addresses lp1 'f0:00:00:00:00:01 192.168.1.24'
 
 ovn-nbctl lsp-add lsw0 lp2
 ovn-nbctl lsp-set-addresses lp2 f0:00:00:00:00:02
@@ -4288,7 +4288,6 @@  ovn-nbctl set Logical_Switch_Port lpr \
 ovn-nbctl lrp-set-gateway-chassis lrp1 hv1
 
 ovn-nbctl lsp-add lsw0 lpr2
-ovn-nbctl lr-add lr2
 ovn-nbctl lrp-add lr lrp2 f0:00:00:00:00:f2 192.168.1.254/24
 ovn-nbctl set Logical_Switch_Port lpr2 \
     type=router \
@@ -4421,6 +4420,7 @@  spa=`ip_to_hex 192 168 1 2`
 tpa=`ip_to_hex 192 168 1 1`
 request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
 as hv3 ovs-appctl netdev-dummy/receive vif3 $request
+echo $request >> 1.expected
 echo $request >> 2.expected
 
 lrpmac=f000000000f1
@@ -4431,12 +4431,107 @@  response=${sha}${lrpmac}08060001080006040002${lrpmac}${tpa}${sha}${spa}
 # we expect arp reply packet on hv3
 echo $response >> 3.expected
 
+# Ensure there is no MAC_Binding, send GARP from vtep side,
+# check that GARP was received by vif1, vif2 and MAC_Binding was created.
+wait_row_count MAC_Binding 0 ip="192.168.1.200" mac='"f0:00:00:00:00:12"'
+
+sha=0200000000ff
+tha=ffffffffffff
+spa=`ip_to_hex 192 168 1 200`
+tpa=$spa
+garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
+as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
+echo $garp >> 1.expected
+echo $garp >> 2.expected
+
+wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ff"'
+
+# Send same GARP with changed SHA and ensure MAC_Binding was updated.
+sha=0200000000ee
+garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
+as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
+echo $garp >> 1.expected
+echo $garp >> 2.expected
+
+wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ee"'
+
+# Remove local ports on hv1 and check that ARP logic is stil working. -->
+check as hv1 ovs-vsctl remove interface vif1 external_ids iface-id
+hv_uuid=$(fetch_column Chassis _uuid name=hv1)
+wait_row_count Port_Binding 0 chassis="$hv_uuid" type="''"
+
+# Cleanup MAC_Bindings after previous checks.
+check ovn-sbctl --all destroy MAC_Binding
+
+# ARP request from VTEP to LRP should be responded by ARP responder.
+sha=f00000000003
+spa=`ip_to_hex 192 168 1 2`
+tpa=`ip_to_hex 192 168 1 1`
+request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
+as hv3 ovs-appctl netdev-dummy/receive vif3 $request
+echo $request >> 2.expected
+
+lrpmac=f000000000f1
+response=${sha}${lrpmac}08060001080006040002${lrpmac}${tpa}${sha}${spa}
+# since lrp1 has gateway chassis set on hv1, hv1 will suppress arp request and
+# answer with arp reply by OVS directly to vtep lport. all other lports,
+# except lport from which this request was initiated, will receive arp request.
+# we expect arp reply packet on hv3
+echo $response >> 3.expected
+
+# Ensure there is no MAC_Binding, send GARP from vtep side,
+# check that GARP was received by vif1, vif2 and MAC_Binding was created.
+wait_row_count MAC_Binding 0 ip="192.168.1.200" mac='"f0:00:00:00:00:12"'
+
+sha=0200000000ff
+tha=ffffffffffff
+spa=`ip_to_hex 192 168 1 200`
+tpa=$spa
+garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
+as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
+echo $garp >> 2.expected
+
+wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ff"'
+
+# Send same GARP with changed SHA and ensure MAC_Binding was updated.
+sha=0200000000ee
+garp=ffffffffffff${sha}08060001080006040002${sha}${spa}${tha}${tpa}
+as hv3 ovs-appctl netdev-dummy/receive vif3 $garp
+echo $garp >> 2.expected
+
+wait_row_count MAC_Binding 2 ip="192.168.1.200" mac='"02:00:00:00:00:ee"'
+
+# Restore.
+check as hv1 ovs-vsctl set interface vif1 external_ids:iface-id=lp1
+wait_row_count Port_Binding 1 chassis="$hv_uuid" type='""'
+# <--
+
+# Check ARP response to request for IP on VIF port sent from vtep.
+# ARP request from VTEP to LSP's IP must be answered only by LSP vif1 (lp1).
+sha=f00000000003
+spa=`ip_to_hex 192 168 1 2`
+tpa=`ip_to_hex 192 168 1 24`
+request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
+as hv3 ovs-appctl netdev-dummy/receive vif3 $request
+echo $request >> 1.expected
+echo $request >> 2.expected
+
+# Simulate response from vif.
+sha=f00000000001
+tha=f00000000003
+spa=`ip_to_hex 192 168 1 200`
+tpa=`ip_to_hex 192 168 1 2`
+response=${tha}${sha}08060001080006040002${sha}${spa}${tha}${tpa}
+as hv1 ovs-appctl netdev-dummy/receive vif1 $response
+echo $response >> 3.expected
+
 # First ensure basic flow contents are as we expect.
 AT_CHECK([ovn-sbctl lflow-list lsw0 | grep 'reg0[\[14\]]' | sort | sed 's/table=../table=??/g'], [0], [dnl
   table=??(ls_in_check_port_sec), priority=70   , match=(inport == "lp-vtep"), action=(reg0[[14]] = 1; next(pipeline=ingress, table=??);)
   table=??(ls_in_hairpin      ), priority=1000 , match=(reg0[[14]] == 1), action=(next(pipeline=ingress, table=??);)
   table=??(ls_in_hairpin      ), priority=2000 , match=(reg0[[14]] == 1 && is_chassis_resident("cr-lrp1")), action=(next;)
   table=??(ls_in_hairpin      ), priority=2000 , match=(reg0[[14]] == 1 && is_chassis_resident("cr-lrp2")), action=(next;)
+  table=??(ls_in_arp_rsp      ), priority=65535, match=(reg0[[14]] == 1 && (arp || nd_ns)), action=(flags.loopback = 1; next;)
 ])
 
 # dump information with counters