diff mbox series

[ovs-dev,V2,05/14] netdev-offload-dpdk: Implement HW miss packet recover for vport

Message ID 20210210152702.4898-6-elibr@nvidia.com
State New
Headers show
Series Netdev vxlan-decap offload | expand

Commit Message

Eli Britstein Feb. 10, 2021, 3:26 p.m. UTC
A miss in virtual port offloads means the flow with tnl_pop was
offloaded, but not the following one. Recover the state and continue
with SW processing.

Signed-off-by: Eli Britstein <elibr@nvidia.com>
Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
---
 lib/netdev-offload-dpdk.c | 95 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

Comments

Sriharsha Basavapatna Feb. 23, 2021, 1:10 p.m. UTC | #1
On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein <elibr@nvidia.com> wrote:
>
> A miss in virtual port offloads means the flow with tnl_pop was
> offloaded, but not the following one. Recover the state and continue
> with SW processing.

Relates to my comment on Patch-1; please explain what is meant by
recovering the packet state.

>
> Signed-off-by: Eli Britstein <elibr@nvidia.com>
> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
> ---
>  lib/netdev-offload-dpdk.c | 95 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 95 insertions(+)
>
> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> index 8cc90d0f1..21aa26b42 100644
> --- a/lib/netdev-offload-dpdk.c
> +++ b/lib/netdev-offload-dpdk.c
> @@ -1610,6 +1610,100 @@ netdev_offload_dpdk_flow_dump_destroy(struct netdev_flow_dump *dump)
>      return 0;
>  }
>
> +static struct netdev *
> +get_vport_netdev(const char *dpif_type,
> +                 struct rte_flow_tunnel *tunnel,
> +                 odp_port_t *odp_port)
> +{
> +    const struct netdev_tunnel_config *tnl_cfg;
> +    struct netdev_flow_dump **netdev_dumps;
> +    struct netdev *vport = NULL;
> +    bool found = false;
> +    int num_ports = 0;
> +    int err;
> +    int i;
> +
> +    netdev_dumps = netdev_ports_flow_dump_create(dpif_type, &num_ports, false);

This relates to my comment in Patch-3; flow_dump_create() in this
context is very confusing since we are not really dumping flows. We
might as well walk the list of ports/netdevs looking for a specific
netdev, just like other netdev_ports_*() routines in netdev-offload.c;
may be add a new function in netdev-offload.c:

netdev_ports_get_tunnel_vport(dpif_type, tunnel_type, tp_dst):    /*
example: tunnel_type == "vxlan", tp_dst = 4789 */

HMAP_FOR_EACH (data, portno_node, &port_to_netdev) {
    if (netdev_get_dpif_type(data->netdev) == dpif_type &&
        netdev_get_type(data->netdev) == tunnel_type) {
            tnl_cfg = netdev_get_tunnel_config(data->netdev);
             if (tnl_cfg && tnl_cfg->dst_port == tp_dst) {
             ....
             ....
            }
    }

> +    for (i = 0; i < num_ports; i++) {
> +        if (!found && tunnel->type == RTE_FLOW_ITEM_TYPE_VXLAN &&
> +            !strcmp(netdev_get_type(netdev_dumps[i]->netdev), "vxlan")) {
> +            tnl_cfg = netdev_get_tunnel_config(netdev_dumps[i]->netdev);
> +            if (tnl_cfg && tnl_cfg->dst_port == tunnel->tp_dst) {
> +                found = true;
> +                vport = netdev_dumps[i]->netdev;
> +                netdev_ref(vport);
> +                *odp_port = netdev_dumps[i]->port;
> +            }
> +        }
> +        err = netdev_flow_dump_destroy(netdev_dumps[i]);
> +        if (err != 0 && err != EOPNOTSUPP) {
> +            VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
> +        }
> +    }
> +    return vport;
> +}
> +
> +static int
> +netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *netdev,
> +                                           struct dp_packet *packet)
> +{
> +    struct rte_flow_restore_info rte_restore_info;
> +    struct rte_flow_tunnel *rte_tnl;
> +    struct rte_flow_error error;
> +    struct netdev *vport_netdev;
> +    struct pkt_metadata *md;
> +    struct flow_tnl *md_tnl;
> +    odp_port_t vport_odp;
> +
> +    if (netdev_dpdk_rte_flow_get_restore_info(netdev, packet,
> +                                              &rte_restore_info, &error)) {
> +        /* This function is called for every packet, and in most cases there
> +         * will be no restore info from the HW, thus error is expected.

Right now this API (get_restore_info) is needed only in the case of
tunnel offloads; is there some way we could avoid calling this for
every packet (e.g non-offloaded, non-tunnel-offloaded packets) ?
What is expected from the PMD using the given 'packet' argument ? This
is not clear from the API description in rte_flow.h.
Is the PMD supposed to parse this packet and return success only if it
is an encapsulated packet ? Are there any other additional conditions
like the packet should be marked (i.e., the PMD should validate
rte_mbuf->hash.fdir fields etc) ? If it is a marked packet, does OVS
set the mark action or should the PMD implicitly add a mark action,
while offloading flow F1 ? If the PMD implicitly adds a mark action,
won't it conflict with mark-ids managed by OVS ?

> +         */
> +        (void) error;
> +        return -1;
> +    }
> +
> +    rte_tnl = &rte_restore_info.tunnel;
> +    if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_TUNNEL) {
> +        vport_netdev = get_vport_netdev(netdev->dpif_type, rte_tnl,
> +                                        &vport_odp);
> +        md = &packet->md;
> +        if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_ENCAPSULATED) {
> +            if (!vport_netdev || !vport_netdev->netdev_class ||
> +                !vport_netdev->netdev_class->pop_header) {
> +                VLOG_ERR("vport nedtdev=%s with no pop_header method",
> +                         netdev_get_name(vport_netdev));
> +                return -1;
> +            }
> +            vport_netdev->netdev_class->pop_header(packet);
> +            netdev_close(vport_netdev);
> +         } else {
The else condition handles the case when the packet is tunneled, but
it is not encapsulated ? It is not clear how/when this could happen ?
Thanks,
-Harsha

> +             md_tnl = &md->tunnel;
> +             if (rte_tnl->is_ipv6) {
> +                 memcpy(&md_tnl->ipv6_src, &rte_tnl->ipv6.src_addr,
> +                        sizeof md_tnl->ipv6_src);
> +                 memcpy(&md_tnl->ipv6_dst, &rte_tnl->ipv6.dst_addr,
> +                        sizeof md_tnl->ipv6_dst);
> +             } else {
> +                 md_tnl->ip_src = rte_tnl->ipv4.src_addr;
> +                 md_tnl->ip_dst = rte_tnl->ipv4.dst_addr;
> +             }
> +             md_tnl->tun_id = htonll(rte_tnl->tun_id);
> +             md_tnl->flags = rte_tnl->tun_flags;
> +             md_tnl->ip_tos = rte_tnl->tos;
> +             md_tnl->ip_ttl = rte_tnl->ttl;
> +             md_tnl->tp_src = rte_tnl->tp_src;
> +         }
> +         if (vport_netdev) {
> +             md->in_port.odp_port = vport_odp;
> +         }
> +    }
> +    dp_packet_reset_offload(packet);
> +
> +    return 0;
> +}
> +
>  const struct netdev_flow_api netdev_offload_dpdk = {
>      .type = "dpdk_flow_api",
>      .flow_put = netdev_offload_dpdk_flow_put,
> @@ -1619,4 +1713,5 @@ const struct netdev_flow_api netdev_offload_dpdk = {
>      .flow_flush = netdev_offload_dpdk_flow_flush,
>      .flow_dump_create = netdev_offload_dpdk_flow_dump_create,
>      .flow_dump_destroy = netdev_offload_dpdk_flow_dump_destroy,
> +    .hw_miss_packet_recover = netdev_offload_dpdk_hw_miss_packet_recover,
>  };
> --
> 2.28.0.546.g385c171
>
Eli Britstein Feb. 23, 2021, 1:54 p.m. UTC | #2
On 2/23/2021 3:10 PM, Sriharsha Basavapatna wrote:
> On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein <elibr@nvidia.com> wrote:
>> A miss in virtual port offloads means the flow with tnl_pop was
>> offloaded, but not the following one. Recover the state and continue
>> with SW processing.
> Relates to my comment on Patch-1; please explain what is meant by
> recovering the packet state.
See my response there.
>
>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
>> ---
>>   lib/netdev-offload-dpdk.c | 95 +++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 95 insertions(+)
>>
>> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
>> index 8cc90d0f1..21aa26b42 100644
>> --- a/lib/netdev-offload-dpdk.c
>> +++ b/lib/netdev-offload-dpdk.c
>> @@ -1610,6 +1610,100 @@ netdev_offload_dpdk_flow_dump_destroy(struct netdev_flow_dump *dump)
>>       return 0;
>>   }
>>
>> +static struct netdev *
>> +get_vport_netdev(const char *dpif_type,
>> +                 struct rte_flow_tunnel *tunnel,
>> +                 odp_port_t *odp_port)
>> +{
>> +    const struct netdev_tunnel_config *tnl_cfg;
>> +    struct netdev_flow_dump **netdev_dumps;
>> +    struct netdev *vport = NULL;
>> +    bool found = false;
>> +    int num_ports = 0;
>> +    int err;
>> +    int i;
>> +
>> +    netdev_dumps = netdev_ports_flow_dump_create(dpif_type, &num_ports, false);
> This relates to my comment in Patch-3; flow_dump_create() in this
> context is very confusing since we are not really dumping flows. We
> might as well walk the list of ports/netdevs looking for a specific
> netdev, just like other netdev_ports_*() routines in netdev-offload.c;
> may be add a new function in netdev-offload.c:
As in the response there. I used an existing API, not introducing new ones.
>
> netdev_ports_get_tunnel_vport(dpif_type, tunnel_type, tp_dst):    /*
> example: tunnel_type == "vxlan", tp_dst = 4789 */
>
> HMAP_FOR_EACH (data, portno_node, &port_to_netdev) {
>      if (netdev_get_dpif_type(data->netdev) == dpif_type &&
>          netdev_get_type(data->netdev) == tunnel_type) {
>              tnl_cfg = netdev_get_tunnel_config(data->netdev);
>               if (tnl_cfg && tnl_cfg->dst_port == tp_dst) {
>               ....
>               ....
>              }
>      }
>
>> +    for (i = 0; i < num_ports; i++) {
>> +        if (!found && tunnel->type == RTE_FLOW_ITEM_TYPE_VXLAN &&
>> +            !strcmp(netdev_get_type(netdev_dumps[i]->netdev), "vxlan")) {
>> +            tnl_cfg = netdev_get_tunnel_config(netdev_dumps[i]->netdev);
>> +            if (tnl_cfg && tnl_cfg->dst_port == tunnel->tp_dst) {
>> +                found = true;
>> +                vport = netdev_dumps[i]->netdev;
>> +                netdev_ref(vport);
>> +                *odp_port = netdev_dumps[i]->port;
>> +            }
>> +        }
>> +        err = netdev_flow_dump_destroy(netdev_dumps[i]);
>> +        if (err != 0 && err != EOPNOTSUPP) {
>> +            VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
>> +        }
>> +    }
>> +    return vport;
>> +}
>> +
>> +static int
>> +netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *netdev,
>> +                                           struct dp_packet *packet)
>> +{
>> +    struct rte_flow_restore_info rte_restore_info;
>> +    struct rte_flow_tunnel *rte_tnl;
>> +    struct rte_flow_error error;
>> +    struct netdev *vport_netdev;
>> +    struct pkt_metadata *md;
>> +    struct flow_tnl *md_tnl;
>> +    odp_port_t vport_odp;
>> +
>> +    if (netdev_dpdk_rte_flow_get_restore_info(netdev, packet,
>> +                                              &rte_restore_info, &error)) {
>> +        /* This function is called for every packet, and in most cases there
>> +         * will be no restore info from the HW, thus error is expected.
> Right now this API (get_restore_info) is needed only in the case of
> tunnel offloads; is there some way we could avoid calling this for
> every packet (e.g non-offloaded, non-tunnel-offloaded packets) ?
No other way I can think of. Suggestions?
> What is expected from the PMD using the given 'packet' argument ? This
> is not clear from the API description in rte_flow.h.

The PMD is expected to use any data on the packet in order to provide 
the HW info.

mlx5 PMD uses mark to do it, but each PMD is free to use whatever 
implementation suitable for its HW.

If the description is not clear enough, I suggest you comment/fix in 
DPDK ML.

> Is the PMD supposed to parse this packet and return success only if it
> is an encapsulated packet ? Are there any other additional conditions
> like the packet should be marked (i.e., the PMD should validate
> rte_mbuf->hash.fdir fields etc) ? If it is a marked packet, does OVS
> set the mark action or should the PMD implicitly add a mark action,
> while offloading flow F1 ? If the PMD implicitly adds a mark action,
> won't it conflict with mark-ids managed by OVS ?

Again, it is the PMD decision. In mlx5, there is a devarg to enable 
tunnel offload APIs. In this mode, the PMD uses internal marks, and deny 
the user to use his own MARK actions, so no conflict.

OVS partial offload doesn't work in this mode.

>
>> +         */
>> +        (void) error;
>> +        return -1;
>> +    }
>> +
>> +    rte_tnl = &rte_restore_info.tunnel;
>> +    if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_TUNNEL) {
>> +        vport_netdev = get_vport_netdev(netdev->dpif_type, rte_tnl,
>> +                                        &vport_odp);
>> +        md = &packet->md;
>> +        if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_ENCAPSULATED) {
>> +            if (!vport_netdev || !vport_netdev->netdev_class ||
>> +                !vport_netdev->netdev_class->pop_header) {
>> +                VLOG_ERR("vport nedtdev=%s with no pop_header method",
>> +                         netdev_get_name(vport_netdev));
>> +                return -1;
>> +            }
>> +            vport_netdev->netdev_class->pop_header(packet);
>> +            netdev_close(vport_netdev);
>> +         } else {
> The else condition handles the case when the packet is tunneled, but
> it is not encapsulated ? It is not clear how/when this could happen ?
Again, this is the PMD decision. OVS should handle.
> Thanks,
> -Harsha
>
>> +             md_tnl = &md->tunnel;
>> +             if (rte_tnl->is_ipv6) {
>> +                 memcpy(&md_tnl->ipv6_src, &rte_tnl->ipv6.src_addr,
>> +                        sizeof md_tnl->ipv6_src);
>> +                 memcpy(&md_tnl->ipv6_dst, &rte_tnl->ipv6.dst_addr,
>> +                        sizeof md_tnl->ipv6_dst);
>> +             } else {
>> +                 md_tnl->ip_src = rte_tnl->ipv4.src_addr;
>> +                 md_tnl->ip_dst = rte_tnl->ipv4.dst_addr;
>> +             }
>> +             md_tnl->tun_id = htonll(rte_tnl->tun_id);
>> +             md_tnl->flags = rte_tnl->tun_flags;
>> +             md_tnl->ip_tos = rte_tnl->tos;
>> +             md_tnl->ip_ttl = rte_tnl->ttl;
>> +             md_tnl->tp_src = rte_tnl->tp_src;
>> +         }
>> +         if (vport_netdev) {
>> +             md->in_port.odp_port = vport_odp;
>> +         }
>> +    }
>> +    dp_packet_reset_offload(packet);
>> +
>> +    return 0;
>> +}
>> +
>>   const struct netdev_flow_api netdev_offload_dpdk = {
>>       .type = "dpdk_flow_api",
>>       .flow_put = netdev_offload_dpdk_flow_put,
>> @@ -1619,4 +1713,5 @@ const struct netdev_flow_api netdev_offload_dpdk = {
>>       .flow_flush = netdev_offload_dpdk_flow_flush,
>>       .flow_dump_create = netdev_offload_dpdk_flow_dump_create,
>>       .flow_dump_destroy = netdev_offload_dpdk_flow_dump_destroy,
>> +    .hw_miss_packet_recover = netdev_offload_dpdk_hw_miss_packet_recover,
>>   };
>> --
>> 2.28.0.546.g385c171
>>
Sriharsha Basavapatna Feb. 25, 2021, 7:34 a.m. UTC | #3
On Tue, Feb 23, 2021 at 7:24 PM Eli Britstein <elibr@nvidia.com> wrote:
>
>
> On 2/23/2021 3:10 PM, Sriharsha Basavapatna wrote:
> > On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein <elibr@nvidia.com> wrote:
> >> A miss in virtual port offloads means the flow with tnl_pop was
> >> offloaded, but not the following one. Recover the state and continue
> >> with SW processing.
> > Relates to my comment on Patch-1; please explain what is meant by
> > recovering the packet state.
> See my response there.

Responded to your comment.
> >
> >> Signed-off-by: Eli Britstein <elibr@nvidia.com>
> >> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
> >> ---
> >>   lib/netdev-offload-dpdk.c | 95 +++++++++++++++++++++++++++++++++++++++
> >>   1 file changed, 95 insertions(+)
> >>
> >> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> >> index 8cc90d0f1..21aa26b42 100644
> >> --- a/lib/netdev-offload-dpdk.c
> >> +++ b/lib/netdev-offload-dpdk.c
> >> @@ -1610,6 +1610,100 @@ netdev_offload_dpdk_flow_dump_destroy(struct netdev_flow_dump *dump)
> >>       return 0;
> >>   }
> >>
> >> +static struct netdev *
> >> +get_vport_netdev(const char *dpif_type,
> >> +                 struct rte_flow_tunnel *tunnel,
> >> +                 odp_port_t *odp_port)
> >> +{
> >> +    const struct netdev_tunnel_config *tnl_cfg;
> >> +    struct netdev_flow_dump **netdev_dumps;
> >> +    struct netdev *vport = NULL;
> >> +    bool found = false;
> >> +    int num_ports = 0;
> >> +    int err;
> >> +    int i;
> >> +
> >> +    netdev_dumps = netdev_ports_flow_dump_create(dpif_type, &num_ports, false);
> > This relates to my comment in Patch-3; flow_dump_create() in this
> > context is very confusing since we are not really dumping flows. We
> > might as well walk the list of ports/netdevs looking for a specific
> > netdev, just like other netdev_ports_*() routines in netdev-offload.c;
> > may be add a new function in netdev-offload.c:
> As in the response there. I used an existing API, not introducing new ones.

Like I said, this can be confusing that while trying to offload a
flow, we invoke flow-dump API. And the flow-dump API implementation
does nothing, apart from getting a reference to a netdev. IMO, it'd be
better to add a new API to keep it simple and clear, as shown below.
> >
> > netdev_ports_get_tunnel_vport(dpif_type, tunnel_type, tp_dst):    /*
> > example: tunnel_type == "vxlan", tp_dst = 4789 */
> >
> > HMAP_FOR_EACH (data, portno_node, &port_to_netdev) {
> >      if (netdev_get_dpif_type(data->netdev) == dpif_type &&
> >          netdev_get_type(data->netdev) == tunnel_type) {
> >              tnl_cfg = netdev_get_tunnel_config(data->netdev);
> >               if (tnl_cfg && tnl_cfg->dst_port == tp_dst) {
> >               ....
> >               ....
> >              }
> >      }
> >
> >> +    for (i = 0; i < num_ports; i++) {
> >> +        if (!found && tunnel->type == RTE_FLOW_ITEM_TYPE_VXLAN &&
> >> +            !strcmp(netdev_get_type(netdev_dumps[i]->netdev), "vxlan")) {
> >> +            tnl_cfg = netdev_get_tunnel_config(netdev_dumps[i]->netdev);
> >> +            if (tnl_cfg && tnl_cfg->dst_port == tunnel->tp_dst) {
> >> +                found = true;
> >> +                vport = netdev_dumps[i]->netdev;
> >> +                netdev_ref(vport);
> >> +                *odp_port = netdev_dumps[i]->port;
> >> +            }
> >> +        }
> >> +        err = netdev_flow_dump_destroy(netdev_dumps[i]);
> >> +        if (err != 0 && err != EOPNOTSUPP) {
> >> +            VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
> >> +        }
> >> +    }
> >> +    return vport;
> >> +}
> >> +
> >> +static int
> >> +netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *netdev,
> >> +                                           struct dp_packet *packet)
> >> +{
> >> +    struct rte_flow_restore_info rte_restore_info;
> >> +    struct rte_flow_tunnel *rte_tnl;
> >> +    struct rte_flow_error error;
> >> +    struct netdev *vport_netdev;
> >> +    struct pkt_metadata *md;
> >> +    struct flow_tnl *md_tnl;
> >> +    odp_port_t vport_odp;
> >> +
> >> +    if (netdev_dpdk_rte_flow_get_restore_info(netdev, packet,
> >> +                                              &rte_restore_info, &error)) {
> >> +        /* This function is called for every packet, and in most cases there
> >> +         * will be no restore info from the HW, thus error is expected.
> > Right now this API (get_restore_info) is needed only in the case of
> > tunnel offloads; is there some way we could avoid calling this for
> > every packet (e.g non-offloaded, non-tunnel-offloaded packets) ?
> No other way I can think of. Suggestions?
Suggested in Patch-6 (code refactoring in dfc_processing())

> > What is expected from the PMD using the given 'packet' argument ? This
> > is not clear from the API description in rte_flow.h.
>
> The PMD is expected to use any data on the packet in order to provide
> the HW info.
>
> mlx5 PMD uses mark to do it, but each PMD is free to use whatever
> implementation suitable for its HW.
>
> If the description is not clear enough, I suggest you comment/fix in
> DPDK ML.

I'm commenting here, as I'm reviewing OVS patches.
>
> > Is the PMD supposed to parse this packet and return success only if it
> > is an encapsulated packet ? Are there any other additional conditions
> > like the packet should be marked (i.e., the PMD should validate
> > rte_mbuf->hash.fdir fields etc) ? If it is a marked packet, does OVS
> > set the mark action or should the PMD implicitly add a mark action,
> > while offloading flow F1 ? If the PMD implicitly adds a mark action,
> > won't it conflict with mark-ids managed by OVS ?
>
> Again, it is the PMD decision. In mlx5, there is a devarg to enable
> tunnel offload APIs. In this mode, the PMD uses internal marks, and deny
> the user to use his own MARK actions, so no conflict.
>
> OVS partial offload doesn't work in this mode.

Ok, thanks for the info on mlx5 PMD.
>
> >
> >> +         */
> >> +        (void) error;
> >> +        return -1;
> >> +    }
> >> +
> >> +    rte_tnl = &rte_restore_info.tunnel;
> >> +    if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_TUNNEL) {
> >> +        vport_netdev = get_vport_netdev(netdev->dpif_type, rte_tnl,
> >> +                                        &vport_odp);
> >> +        md = &packet->md;
> >> +        if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_ENCAPSULATED) {
> >> +            if (!vport_netdev || !vport_netdev->netdev_class ||
> >> +                !vport_netdev->netdev_class->pop_header) {
> >> +                VLOG_ERR("vport nedtdev=%s with no pop_header method",
> >> +                         netdev_get_name(vport_netdev));
> >> +                return -1;
> >> +            }
> >> +            vport_netdev->netdev_class->pop_header(packet);
> >> +            netdev_close(vport_netdev);
> >> +         } else {
> > The else condition handles the case when the packet is tunneled, but
> > it is not encapsulated ? It is not clear how/when this could happen ?
> Again, this is the PMD decision. OVS should handle.
> > Thanks,
> > -Harsha
> >
> >> +             md_tnl = &md->tunnel;
> >> +             if (rte_tnl->is_ipv6) {
> >> +                 memcpy(&md_tnl->ipv6_src, &rte_tnl->ipv6.src_addr,
> >> +                        sizeof md_tnl->ipv6_src);
> >> +                 memcpy(&md_tnl->ipv6_dst, &rte_tnl->ipv6.dst_addr,
> >> +                        sizeof md_tnl->ipv6_dst);
> >> +             } else {
> >> +                 md_tnl->ip_src = rte_tnl->ipv4.src_addr;
> >> +                 md_tnl->ip_dst = rte_tnl->ipv4.dst_addr;
> >> +             }
> >> +             md_tnl->tun_id = htonll(rte_tnl->tun_id);
> >> +             md_tnl->flags = rte_tnl->tun_flags;
> >> +             md_tnl->ip_tos = rte_tnl->tos;
> >> +             md_tnl->ip_ttl = rte_tnl->ttl;
> >> +             md_tnl->tp_src = rte_tnl->tp_src;
> >> +         }
> >> +         if (vport_netdev) {
> >> +             md->in_port.odp_port = vport_odp;
> >> +         }
> >> +    }
> >> +    dp_packet_reset_offload(packet);
> >> +
> >> +    return 0;
> >> +}
> >> +
> >>   const struct netdev_flow_api netdev_offload_dpdk = {
> >>       .type = "dpdk_flow_api",
> >>       .flow_put = netdev_offload_dpdk_flow_put,
> >> @@ -1619,4 +1713,5 @@ const struct netdev_flow_api netdev_offload_dpdk = {
> >>       .flow_flush = netdev_offload_dpdk_flow_flush,
> >>       .flow_dump_create = netdev_offload_dpdk_flow_dump_create,
> >>       .flow_dump_destroy = netdev_offload_dpdk_flow_dump_destroy,
> >> +    .hw_miss_packet_recover = netdev_offload_dpdk_hw_miss_packet_recover,
> >>   };
> >> --
> >> 2.28.0.546.g385c171
> >>
Eli Britstein Feb. 25, 2021, 2:20 p.m. UTC | #4
On 2/25/2021 9:34 AM, Sriharsha Basavapatna wrote:
> On Tue, Feb 23, 2021 at 7:24 PM Eli Britstein <elibr@nvidia.com> wrote:
>>
>> On 2/23/2021 3:10 PM, Sriharsha Basavapatna wrote:
>>> On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein <elibr@nvidia.com> wrote:
>>>> A miss in virtual port offloads means the flow with tnl_pop was
>>>> offloaded, but not the following one. Recover the state and continue
>>>> with SW processing.
>>> Relates to my comment on Patch-1; please explain what is meant by
>>> recovering the packet state.
>> See my response there.
> Responded to your comment.
>>>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
>>>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
>>>> ---
>>>>    lib/netdev-offload-dpdk.c | 95 +++++++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 95 insertions(+)
>>>>
>>>> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
>>>> index 8cc90d0f1..21aa26b42 100644
>>>> --- a/lib/netdev-offload-dpdk.c
>>>> +++ b/lib/netdev-offload-dpdk.c
>>>> @@ -1610,6 +1610,100 @@ netdev_offload_dpdk_flow_dump_destroy(struct netdev_flow_dump *dump)
>>>>        return 0;
>>>>    }
>>>>
>>>> +static struct netdev *
>>>> +get_vport_netdev(const char *dpif_type,
>>>> +                 struct rte_flow_tunnel *tunnel,
>>>> +                 odp_port_t *odp_port)
>>>> +{
>>>> +    const struct netdev_tunnel_config *tnl_cfg;
>>>> +    struct netdev_flow_dump **netdev_dumps;
>>>> +    struct netdev *vport = NULL;
>>>> +    bool found = false;
>>>> +    int num_ports = 0;
>>>> +    int err;
>>>> +    int i;
>>>> +
>>>> +    netdev_dumps = netdev_ports_flow_dump_create(dpif_type, &num_ports, false);
>>> This relates to my comment in Patch-3; flow_dump_create() in this
>>> context is very confusing since we are not really dumping flows. We
>>> might as well walk the list of ports/netdevs looking for a specific
>>> netdev, just like other netdev_ports_*() routines in netdev-offload.c;
>>> may be add a new function in netdev-offload.c:
>> As in the response there. I used an existing API, not introducing new ones.
> Like I said, this can be confusing that while trying to offload a
> flow, we invoke flow-dump API. And the flow-dump API implementation
> does nothing, apart from getting a reference to a netdev. IMO, it'd be
> better to add a new API to keep it simple and clear, as shown below.
>>> netdev_ports_get_tunnel_vport(dpif_type, tunnel_type, tp_dst):    /*
>>> example: tunnel_type == "vxlan", tp_dst = 4789 */
>>>
>>> HMAP_FOR_EACH (data, portno_node, &port_to_netdev) {
>>>       if (netdev_get_dpif_type(data->netdev) == dpif_type &&
>>>           netdev_get_type(data->netdev) == tunnel_type) {
>>>               tnl_cfg = netdev_get_tunnel_config(data->netdev);
>>>                if (tnl_cfg && tnl_cfg->dst_port == tp_dst) {
>>>                ....
>>>                ....
>>>               }
>>>       }
An alternative I can suggest is something like the following:

New API:

void
netdev_ports_traverse(const char *dpif_type,
                       void (*cb)(struct netdev *, odp_port_t, void *),
                       void *cookie)
{
     struct port_to_netdev_data *data;

ovs_rwlock_rdlock(&netdev_hmap_rwlock);
     HMAP_FOR_EACH (data, portno_node, &port_to_netdev) {
         if (netdev_get_dpif_type(data->netdev) == dpif_type) {
             cb(data->netdev, data->dpif_port.port_no, cookie);
}
}
ovs_rwlock_unlock(&netdev_hmap_rwlock);
}

Then, implement cb to find the tunnel in netdev-offload-dpdk.c. What do 
you think?

>>>
>>>> +    for (i = 0; i < num_ports; i++) {
>>>> +        if (!found && tunnel->type == RTE_FLOW_ITEM_TYPE_VXLAN &&
>>>> +            !strcmp(netdev_get_type(netdev_dumps[i]->netdev), "vxlan")) {
>>>> +            tnl_cfg = netdev_get_tunnel_config(netdev_dumps[i]->netdev);
>>>> +            if (tnl_cfg && tnl_cfg->dst_port == tunnel->tp_dst) {
>>>> +                found = true;
>>>> +                vport = netdev_dumps[i]->netdev;
>>>> +                netdev_ref(vport);
>>>> +                *odp_port = netdev_dumps[i]->port;
>>>> +            }
>>>> +        }
>>>> +        err = netdev_flow_dump_destroy(netdev_dumps[i]);
>>>> +        if (err != 0 && err != EOPNOTSUPP) {
>>>> +            VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
>>>> +        }
>>>> +    }
>>>> +    return vport;
>>>> +}
>>>> +
>>>> +static int
>>>> +netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *netdev,
>>>> +                                           struct dp_packet *packet)
>>>> +{
>>>> +    struct rte_flow_restore_info rte_restore_info;
>>>> +    struct rte_flow_tunnel *rte_tnl;
>>>> +    struct rte_flow_error error;
>>>> +    struct netdev *vport_netdev;
>>>> +    struct pkt_metadata *md;
>>>> +    struct flow_tnl *md_tnl;
>>>> +    odp_port_t vport_odp;
>>>> +
>>>> +    if (netdev_dpdk_rte_flow_get_restore_info(netdev, packet,
>>>> +                                              &rte_restore_info, &error)) {
>>>> +        /* This function is called for every packet, and in most cases there
>>>> +         * will be no restore info from the HW, thus error is expected.
>>> Right now this API (get_restore_info) is needed only in the case of
>>> tunnel offloads; is there some way we could avoid calling this for
>>> every packet (e.g non-offloaded, non-tunnel-offloaded packets) ?
>> No other way I can think of. Suggestions?
> Suggested in Patch-6 (code refactoring in dfc_processing())
Yes, I will refactor. However it will avoid this call only if 
hw-offload=false.
>
>>> What is expected from the PMD using the given 'packet' argument ? This
>>> is not clear from the API description in rte_flow.h.
>> The PMD is expected to use any data on the packet in order to provide
>> the HW info.
>>
>> mlx5 PMD uses mark to do it, but each PMD is free to use whatever
>> implementation suitable for its HW.
>>
>> If the description is not clear enough, I suggest you comment/fix in
>> DPDK ML.
> I'm commenting here, as I'm reviewing OVS patches.

>>> Is the PMD supposed to parse this packet and return success only if it
>>> is an encapsulated packet ? Are there any other additional conditions
>>> like the packet should be marked (i.e., the PMD should validate
>>> rte_mbuf->hash.fdir fields etc) ? If it is a marked packet, does OVS
>>> set the mark action or should the PMD implicitly add a mark action,
>>> while offloading flow F1 ? If the PMD implicitly adds a mark action,
>>> won't it conflict with mark-ids managed by OVS ?
>> Again, it is the PMD decision. In mlx5, there is a devarg to enable
>> tunnel offload APIs. In this mode, the PMD uses internal marks, and deny
>> the user to use his own MARK actions, so no conflict.
>>
>> OVS partial offload doesn't work in this mode.
> Ok, thanks for the info on mlx5 PMD.
>>>> +         */
>>>> +        (void) error;
>>>> +        return -1;
>>>> +    }
>>>> +
>>>> +    rte_tnl = &rte_restore_info.tunnel;
>>>> +    if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_TUNNEL) {
>>>> +        vport_netdev = get_vport_netdev(netdev->dpif_type, rte_tnl,
>>>> +                                        &vport_odp);
>>>> +        md = &packet->md;
>>>> +        if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_ENCAPSULATED) {
>>>> +            if (!vport_netdev || !vport_netdev->netdev_class ||
>>>> +                !vport_netdev->netdev_class->pop_header) {
>>>> +                VLOG_ERR("vport nedtdev=%s with no pop_header method",
>>>> +                         netdev_get_name(vport_netdev));
>>>> +                return -1;
>>>> +            }
>>>> +            vport_netdev->netdev_class->pop_header(packet);
>>>> +            netdev_close(vport_netdev);
>>>> +         } else {
>>> The else condition handles the case when the packet is tunneled, but
>>> it is not encapsulated ? It is not clear how/when this could happen ?
>> Again, this is the PMD decision. OVS should handle.

The PMD may provide an info with RTE_FLOW_RESTORE_INFO_TUNNEL on or off.

In the case it is on, it means the packet is still encapsulated, and we 
do the pop in SW.

In the case it is off, it means the packet is already decapsulated, and 
the tunnel info is provided in the tunnel struct. For this case we take 
it to OVS metadata.

It is the up to the PMD to provide the info. OVS should handle either case.

>>> Thanks,
>>> -Harsha
>>>
>>>> +             md_tnl = &md->tunnel;
>>>> +             if (rte_tnl->is_ipv6) {
>>>> +                 memcpy(&md_tnl->ipv6_src, &rte_tnl->ipv6.src_addr,
>>>> +                        sizeof md_tnl->ipv6_src);
>>>> +                 memcpy(&md_tnl->ipv6_dst, &rte_tnl->ipv6.dst_addr,
>>>> +                        sizeof md_tnl->ipv6_dst);
>>>> +             } else {
>>>> +                 md_tnl->ip_src = rte_tnl->ipv4.src_addr;
>>>> +                 md_tnl->ip_dst = rte_tnl->ipv4.dst_addr;
>>>> +             }
>>>> +             md_tnl->tun_id = htonll(rte_tnl->tun_id);
>>>> +             md_tnl->flags = rte_tnl->tun_flags;
>>>> +             md_tnl->ip_tos = rte_tnl->tos;
>>>> +             md_tnl->ip_ttl = rte_tnl->ttl;
>>>> +             md_tnl->tp_src = rte_tnl->tp_src;
>>>> +         }
>>>> +         if (vport_netdev) {
>>>> +             md->in_port.odp_port = vport_odp;
>>>> +         }
>>>> +    }
>>>> +    dp_packet_reset_offload(packet);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>    const struct netdev_flow_api netdev_offload_dpdk = {
>>>>        .type = "dpdk_flow_api",
>>>>        .flow_put = netdev_offload_dpdk_flow_put,
>>>> @@ -1619,4 +1713,5 @@ const struct netdev_flow_api netdev_offload_dpdk = {
>>>>        .flow_flush = netdev_offload_dpdk_flow_flush,
>>>>        .flow_dump_create = netdev_offload_dpdk_flow_dump_create,
>>>>        .flow_dump_destroy = netdev_offload_dpdk_flow_dump_destroy,
>>>> +    .hw_miss_packet_recover = netdev_offload_dpdk_hw_miss_packet_recover,
>>>>    };
>>>> --
>>>> 2.28.0.546.g385c171
>>>>
Sriharsha Basavapatna Feb. 25, 2021, 5:19 p.m. UTC | #5
On Thu, Feb 25, 2021 at 7:50 PM Eli Britstein <elibr@nvidia.com> wrote:
>
>
> On 2/25/2021 9:34 AM, Sriharsha Basavapatna wrote:
> > On Tue, Feb 23, 2021 at 7:24 PM Eli Britstein <elibr@nvidia.com> wrote:
> >>
> >> On 2/23/2021 3:10 PM, Sriharsha Basavapatna wrote:
> >>> On Wed, Feb 10, 2021 at 8:57 PM Eli Britstein <elibr@nvidia.com> wrote:
> >>>> A miss in virtual port offloads means the flow with tnl_pop was
> >>>> offloaded, but not the following one. Recover the state and continue
> >>>> with SW processing.
> >>> Relates to my comment on Patch-1; please explain what is meant by
> >>> recovering the packet state.
> >> See my response there.
> > Responded to your comment.
> >>>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
> >>>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
> >>>> ---
> >>>>    lib/netdev-offload-dpdk.c | 95 +++++++++++++++++++++++++++++++++++++++
> >>>>    1 file changed, 95 insertions(+)
> >>>>
> >>>> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> >>>> index 8cc90d0f1..21aa26b42 100644
> >>>> --- a/lib/netdev-offload-dpdk.c
> >>>> +++ b/lib/netdev-offload-dpdk.c
> >>>> @@ -1610,6 +1610,100 @@ netdev_offload_dpdk_flow_dump_destroy(struct netdev_flow_dump *dump)
> >>>>        return 0;
> >>>>    }
> >>>>
> >>>> +static struct netdev *
> >>>> +get_vport_netdev(const char *dpif_type,
> >>>> +                 struct rte_flow_tunnel *tunnel,
> >>>> +                 odp_port_t *odp_port)
> >>>> +{
> >>>> +    const struct netdev_tunnel_config *tnl_cfg;
> >>>> +    struct netdev_flow_dump **netdev_dumps;
> >>>> +    struct netdev *vport = NULL;
> >>>> +    bool found = false;
> >>>> +    int num_ports = 0;
> >>>> +    int err;
> >>>> +    int i;
> >>>> +
> >>>> +    netdev_dumps = netdev_ports_flow_dump_create(dpif_type, &num_ports, false);
> >>> This relates to my comment in Patch-3; flow_dump_create() in this
> >>> context is very confusing since we are not really dumping flows. We
> >>> might as well walk the list of ports/netdevs looking for a specific
> >>> netdev, just like other netdev_ports_*() routines in netdev-offload.c;
> >>> may be add a new function in netdev-offload.c:
> >> As in the response there. I used an existing API, not introducing new ones.
> > Like I said, this can be confusing that while trying to offload a
> > flow, we invoke flow-dump API. And the flow-dump API implementation
> > does nothing, apart from getting a reference to a netdev. IMO, it'd be
> > better to add a new API to keep it simple and clear, as shown below.
> >>> netdev_ports_get_tunnel_vport(dpif_type, tunnel_type, tp_dst):    /*
> >>> example: tunnel_type == "vxlan", tp_dst = 4789 */
> >>>
> >>> HMAP_FOR_EACH (data, portno_node, &port_to_netdev) {
> >>>       if (netdev_get_dpif_type(data->netdev) == dpif_type &&
> >>>           netdev_get_type(data->netdev) == tunnel_type) {
> >>>               tnl_cfg = netdev_get_tunnel_config(data->netdev);
> >>>                if (tnl_cfg && tnl_cfg->dst_port == tp_dst) {
> >>>                ....
> >>>                ....
> >>>               }
> >>>       }
> An alternative I can suggest is something like the following:
>
> New API:
>
> void
> netdev_ports_traverse(const char *dpif_type,
>                        void (*cb)(struct netdev *, odp_port_t, void *),
>                        void *cookie)
> {
>      struct port_to_netdev_data *data;
>
> ovs_rwlock_rdlock(&netdev_hmap_rwlock);
>      HMAP_FOR_EACH (data, portno_node, &port_to_netdev) {
>          if (netdev_get_dpif_type(data->netdev) == dpif_type) {
>              cb(data->netdev, data->dpif_port.port_no, cookie);
> }
> }
> ovs_rwlock_unlock(&netdev_hmap_rwlock);
> }
>
> Then, implement cb to find the tunnel in netdev-offload-dpdk.c. What do
> you think?

Yes, that should be ok; it might need some additional logic to stop
traversal when you find the required node ? Callback routine could
return a value that indicates continue or stop traversal ? Anyway,
please go ahead with the changes.

Thanks,
-Harsha

>
> >>>
> >>>> +    for (i = 0; i < num_ports; i++) {
> >>>> +        if (!found && tunnel->type == RTE_FLOW_ITEM_TYPE_VXLAN &&
> >>>> +            !strcmp(netdev_get_type(netdev_dumps[i]->netdev), "vxlan")) {
> >>>> +            tnl_cfg = netdev_get_tunnel_config(netdev_dumps[i]->netdev);
> >>>> +            if (tnl_cfg && tnl_cfg->dst_port == tunnel->tp_dst) {
> >>>> +                found = true;
> >>>> +                vport = netdev_dumps[i]->netdev;
> >>>> +                netdev_ref(vport);
> >>>> +                *odp_port = netdev_dumps[i]->port;
> >>>> +            }
> >>>> +        }
> >>>> +        err = netdev_flow_dump_destroy(netdev_dumps[i]);
> >>>> +        if (err != 0 && err != EOPNOTSUPP) {
> >>>> +            VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
> >>>> +        }
> >>>> +    }
> >>>> +    return vport;
> >>>> +}
> >>>> +
> >>>> +static int
> >>>> +netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *netdev,
> >>>> +                                           struct dp_packet *packet)
> >>>> +{
> >>>> +    struct rte_flow_restore_info rte_restore_info;
> >>>> +    struct rte_flow_tunnel *rte_tnl;
> >>>> +    struct rte_flow_error error;
> >>>> +    struct netdev *vport_netdev;
> >>>> +    struct pkt_metadata *md;
> >>>> +    struct flow_tnl *md_tnl;
> >>>> +    odp_port_t vport_odp;
> >>>> +
> >>>> +    if (netdev_dpdk_rte_flow_get_restore_info(netdev, packet,
> >>>> +                                              &rte_restore_info, &error)) {
> >>>> +        /* This function is called for every packet, and in most cases there
> >>>> +         * will be no restore info from the HW, thus error is expected.
> >>> Right now this API (get_restore_info) is needed only in the case of
> >>> tunnel offloads; is there some way we could avoid calling this for
> >>> every packet (e.g non-offloaded, non-tunnel-offloaded packets) ?
> >> No other way I can think of. Suggestions?
> > Suggested in Patch-6 (code refactoring in dfc_processing())
> Yes, I will refactor. However it will avoid this call only if
> hw-offload=false.
> >
> >>> What is expected from the PMD using the given 'packet' argument ? This
> >>> is not clear from the API description in rte_flow.h.
> >> The PMD is expected to use any data on the packet in order to provide
> >> the HW info.
> >>
> >> mlx5 PMD uses mark to do it, but each PMD is free to use whatever
> >> implementation suitable for its HW.
> >>
> >> If the description is not clear enough, I suggest you comment/fix in
> >> DPDK ML.
> > I'm commenting here, as I'm reviewing OVS patches.
>
> >>> Is the PMD supposed to parse this packet and return success only if it
> >>> is an encapsulated packet ? Are there any other additional conditions
> >>> like the packet should be marked (i.e., the PMD should validate
> >>> rte_mbuf->hash.fdir fields etc) ? If it is a marked packet, does OVS
> >>> set the mark action or should the PMD implicitly add a mark action,
> >>> while offloading flow F1 ? If the PMD implicitly adds a mark action,
> >>> won't it conflict with mark-ids managed by OVS ?
> >> Again, it is the PMD decision. In mlx5, there is a devarg to enable
> >> tunnel offload APIs. In this mode, the PMD uses internal marks, and deny
> >> the user to use his own MARK actions, so no conflict.
> >>
> >> OVS partial offload doesn't work in this mode.
> > Ok, thanks for the info on mlx5 PMD.
> >>>> +         */
> >>>> +        (void) error;
> >>>> +        return -1;
> >>>> +    }
> >>>> +
> >>>> +    rte_tnl = &rte_restore_info.tunnel;
> >>>> +    if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_TUNNEL) {
> >>>> +        vport_netdev = get_vport_netdev(netdev->dpif_type, rte_tnl,
> >>>> +                                        &vport_odp);
> >>>> +        md = &packet->md;
> >>>> +        if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_ENCAPSULATED) {
> >>>> +            if (!vport_netdev || !vport_netdev->netdev_class ||
> >>>> +                !vport_netdev->netdev_class->pop_header) {
> >>>> +                VLOG_ERR("vport nedtdev=%s with no pop_header method",
> >>>> +                         netdev_get_name(vport_netdev));
> >>>> +                return -1;
> >>>> +            }
> >>>> +            vport_netdev->netdev_class->pop_header(packet);
> >>>> +            netdev_close(vport_netdev);
> >>>> +         } else {
> >>> The else condition handles the case when the packet is tunneled, but
> >>> it is not encapsulated ? It is not clear how/when this could happen ?
> >> Again, this is the PMD decision. OVS should handle.
>
> The PMD may provide an info with RTE_FLOW_RESTORE_INFO_TUNNEL on or off.
>
> In the case it is on, it means the packet is still encapsulated, and we
> do the pop in SW.
>
> In the case it is off, it means the packet is already decapsulated, and
> the tunnel info is provided in the tunnel struct. For this case we take
> it to OVS metadata.
>
> It is the up to the PMD to provide the info. OVS should handle either case.
>
> >>> Thanks,
> >>> -Harsha
> >>>
> >>>> +             md_tnl = &md->tunnel;
> >>>> +             if (rte_tnl->is_ipv6) {
> >>>> +                 memcpy(&md_tnl->ipv6_src, &rte_tnl->ipv6.src_addr,
> >>>> +                        sizeof md_tnl->ipv6_src);
> >>>> +                 memcpy(&md_tnl->ipv6_dst, &rte_tnl->ipv6.dst_addr,
> >>>> +                        sizeof md_tnl->ipv6_dst);
> >>>> +             } else {
> >>>> +                 md_tnl->ip_src = rte_tnl->ipv4.src_addr;
> >>>> +                 md_tnl->ip_dst = rte_tnl->ipv4.dst_addr;
> >>>> +             }
> >>>> +             md_tnl->tun_id = htonll(rte_tnl->tun_id);
> >>>> +             md_tnl->flags = rte_tnl->tun_flags;
> >>>> +             md_tnl->ip_tos = rte_tnl->tos;
> >>>> +             md_tnl->ip_ttl = rte_tnl->ttl;
> >>>> +             md_tnl->tp_src = rte_tnl->tp_src;
> >>>> +         }
> >>>> +         if (vport_netdev) {
> >>>> +             md->in_port.odp_port = vport_odp;
> >>>> +         }
> >>>> +    }
> >>>> +    dp_packet_reset_offload(packet);
> >>>> +
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>>    const struct netdev_flow_api netdev_offload_dpdk = {
> >>>>        .type = "dpdk_flow_api",
> >>>>        .flow_put = netdev_offload_dpdk_flow_put,
> >>>> @@ -1619,4 +1713,5 @@ const struct netdev_flow_api netdev_offload_dpdk = {
> >>>>        .flow_flush = netdev_offload_dpdk_flow_flush,
> >>>>        .flow_dump_create = netdev_offload_dpdk_flow_dump_create,
> >>>>        .flow_dump_destroy = netdev_offload_dpdk_flow_dump_destroy,
> >>>> +    .hw_miss_packet_recover = netdev_offload_dpdk_hw_miss_packet_recover,
> >>>>    };
> >>>> --
> >>>> 2.28.0.546.g385c171
> >>>>
diff mbox series

Patch

diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
index 8cc90d0f1..21aa26b42 100644
--- a/lib/netdev-offload-dpdk.c
+++ b/lib/netdev-offload-dpdk.c
@@ -1610,6 +1610,100 @@  netdev_offload_dpdk_flow_dump_destroy(struct netdev_flow_dump *dump)
     return 0;
 }
 
+static struct netdev *
+get_vport_netdev(const char *dpif_type,
+                 struct rte_flow_tunnel *tunnel,
+                 odp_port_t *odp_port)
+{
+    const struct netdev_tunnel_config *tnl_cfg;
+    struct netdev_flow_dump **netdev_dumps;
+    struct netdev *vport = NULL;
+    bool found = false;
+    int num_ports = 0;
+    int err;
+    int i;
+
+    netdev_dumps = netdev_ports_flow_dump_create(dpif_type, &num_ports, false);
+    for (i = 0; i < num_ports; i++) {
+        if (!found && tunnel->type == RTE_FLOW_ITEM_TYPE_VXLAN &&
+            !strcmp(netdev_get_type(netdev_dumps[i]->netdev), "vxlan")) {
+            tnl_cfg = netdev_get_tunnel_config(netdev_dumps[i]->netdev);
+            if (tnl_cfg && tnl_cfg->dst_port == tunnel->tp_dst) {
+                found = true;
+                vport = netdev_dumps[i]->netdev;
+                netdev_ref(vport);
+                *odp_port = netdev_dumps[i]->port;
+            }
+        }
+        err = netdev_flow_dump_destroy(netdev_dumps[i]);
+        if (err != 0 && err != EOPNOTSUPP) {
+            VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
+        }
+    }
+    return vport;
+}
+
+static int
+netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *netdev,
+                                           struct dp_packet *packet)
+{
+    struct rte_flow_restore_info rte_restore_info;
+    struct rte_flow_tunnel *rte_tnl;
+    struct rte_flow_error error;
+    struct netdev *vport_netdev;
+    struct pkt_metadata *md;
+    struct flow_tnl *md_tnl;
+    odp_port_t vport_odp;
+
+    if (netdev_dpdk_rte_flow_get_restore_info(netdev, packet,
+                                              &rte_restore_info, &error)) {
+        /* This function is called for every packet, and in most cases there
+         * will be no restore info from the HW, thus error is expected.
+         */
+        (void) error;
+        return -1;
+    }
+
+    rte_tnl = &rte_restore_info.tunnel;
+    if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_TUNNEL) {
+        vport_netdev = get_vport_netdev(netdev->dpif_type, rte_tnl,
+                                        &vport_odp);
+        md = &packet->md;
+        if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_ENCAPSULATED) {
+            if (!vport_netdev || !vport_netdev->netdev_class ||
+                !vport_netdev->netdev_class->pop_header) {
+                VLOG_ERR("vport nedtdev=%s with no pop_header method",
+                         netdev_get_name(vport_netdev));
+                return -1;
+            }
+            vport_netdev->netdev_class->pop_header(packet);
+            netdev_close(vport_netdev);
+         } else {
+             md_tnl = &md->tunnel;
+             if (rte_tnl->is_ipv6) {
+                 memcpy(&md_tnl->ipv6_src, &rte_tnl->ipv6.src_addr,
+                        sizeof md_tnl->ipv6_src);
+                 memcpy(&md_tnl->ipv6_dst, &rte_tnl->ipv6.dst_addr,
+                        sizeof md_tnl->ipv6_dst);
+             } else {
+                 md_tnl->ip_src = rte_tnl->ipv4.src_addr;
+                 md_tnl->ip_dst = rte_tnl->ipv4.dst_addr;
+             }
+             md_tnl->tun_id = htonll(rte_tnl->tun_id);
+             md_tnl->flags = rte_tnl->tun_flags;
+             md_tnl->ip_tos = rte_tnl->tos;
+             md_tnl->ip_ttl = rte_tnl->ttl;
+             md_tnl->tp_src = rte_tnl->tp_src;
+         }
+         if (vport_netdev) {
+             md->in_port.odp_port = vport_odp;
+         }
+    }
+    dp_packet_reset_offload(packet);
+
+    return 0;
+}
+
 const struct netdev_flow_api netdev_offload_dpdk = {
     .type = "dpdk_flow_api",
     .flow_put = netdev_offload_dpdk_flow_put,
@@ -1619,4 +1713,5 @@  const struct netdev_flow_api netdev_offload_dpdk = {
     .flow_flush = netdev_offload_dpdk_flow_flush,
     .flow_dump_create = netdev_offload_dpdk_flow_dump_create,
     .flow_dump_destroy = netdev_offload_dpdk_flow_dump_destroy,
+    .hw_miss_packet_recover = netdev_offload_dpdk_hw_miss_packet_recover,
 };