[ovs-dev,v8,1/3] dpif-netlink: Detect Out-Of-Resource condition on a netdev

Message ID 20181018161314.27854-2-sriharsha.basavapatna@broadcom.com
State New
Headers show
Series
  • Support dynamic rebalancing of offloaded flows
Related show

Commit Message

Sriharsha Basavapatna via dev Oct. 18, 2018, 4:13 p.m.
This is the first patch in the patch-set to support dynamic rebalancing
of offloaded flows.

The patch detects OOR condition on a netdev port when ENOSPC error is
returned by TC-Flower while adding a flow rule. A new structure is added
to the netdev called "netdev_hw_info", to store OOR related information
required to perform dynamic offload-rebalancing.

Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Co-authored-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
Signed-off-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
Reviewed-by: Sathya Perla <sathya.perla@broadcom.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Ben Pfaff <blp@ovn.org>
---
 lib/dpif-netlink.c    | 18 +++++++++++++++++-
 lib/flow.c            | 25 +++++++++++++++++++++++++
 lib/flow.h            |  1 +
 lib/netdev-provider.h | 11 +++++++++++
 lib/netdev.c          | 34 ++++++++++++++++++++++++++++++++++
 lib/netdev.h          |  3 +++
 6 files changed, 91 insertions(+), 1 deletion(-)

Comments

0-day Robot Oct. 18, 2018, 4:58 p.m. | #1
Bleep bloop.  Greetings Sriharsha Basavapatna via dev, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
ERROR: Author Sriharsha Basavapatna via dev <ovs-dev@openvswitch.org> needs to sign off.
WARNING: Unexpected sign-offs from developers who are not authors or co-authors or committers: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Lines checked: 224, Warnings: 1, Errors: 1


Please check this out.  If you feel there has been an error, please email aconole@bytheb.org

Thanks,
0-day Robot
Eelco Chaudron Oct. 19, 2018, 2:22 p.m. | #2
On 18 Oct 2018, at 18:13, Sriharsha Basavapatna via dev wrote:

> This is the first patch in the patch-set to support dynamic 
> rebalancing
> of offloaded flows.
>
> The patch detects OOR condition on a netdev port when ENOSPC error is
> returned by TC-Flower while adding a flow rule. A new structure is 
> added
> to the netdev called "netdev_hw_info", to store OOR related 
> information
> required to perform dynamic offload-rebalancing.
>
> Signed-off-by: Sriharsha Basavapatna 
> <sriharsha.basavapatna@broadcom.com>
> Co-authored-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
> Signed-off-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
> Reviewed-by: Sathya Perla <sathya.perla@broadcom.com>
> Reviewed-by: Simon Horman <simon.horman@netronome.com>
> Reviewed-by: Ben Pfaff <blp@ovn.org>
> ---
>  lib/dpif-netlink.c    | 18 +++++++++++++++++-
>  lib/flow.c            | 25 +++++++++++++++++++++++++
>  lib/flow.h            |  1 +
>  lib/netdev-provider.h | 11 +++++++++++
>  lib/netdev.c          | 34 ++++++++++++++++++++++++++++++++++
>  lib/netdev.h          |  3 +++
>  6 files changed, 91 insertions(+), 1 deletion(-)
>
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index e6d5a6ec5..b9ce9cbe2 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -2178,7 +2178,23 @@ parse_flow_put(struct dpif_netlink *dpif, 
> struct dpif_flow_put *put)
>
>          VLOG_DBG("added flow");
>      } else if (err != EEXIST) {
> -        VLOG_ERR_RL(&rl, "failed to offload flow: %s", 
> ovs_strerror(err));
> +        struct netdev *oor_netdev = NULL;
> +        if (err == ENOSPC && 
> netdev_is_offload_rebalance_policy_enabled()) {
> +            /*
> +             * We need to set OOR on the input netdev (i.e, 'dev') 
> for the
> +             * flow. But if the flow has a tunnel attribute (i.e, 
> decap action,
> +             * with a virtual device like a VxLAN interface as its 
> in-port),
> +             * then lookup and set OOR on the underlying tunnel 
> (real) netdev.
> +             */
> +            oor_netdev = flow_get_tunnel_netdev(&match.flow.tunnel);
> +            if (!oor_netdev) {
> +                /* Not a 'tunnel' flow */
> +                oor_netdev = dev;
> +            }
> +            netdev_set_hw_info(oor_netdev, HW_INFO_TYPE_OOR, true);

Why not just oor_netdev->hw_info.oor = true, see also below.

I have a general comment, don't know where to put it, so I put it here. 
Some hardware might have multiple tables. If one type of table is full 
the ENOSPC might be returned, but it does not mean all type of flows can 
no longer be offloaded. This might be a situation to think about.

> +        }
> +        VLOG_ERR_RL(&rl, "failed to offload flow: %s: %s", 
> ovs_strerror(err),
> +                    (oor_netdev ? oor_netdev->name : dev->name));
>      }
>
>  out:
> diff --git a/lib/flow.c b/lib/flow.c
> index 77ed3d9df..a39807908 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -19,6 +19,7 @@
>  #include <errno.h>
>  #include <inttypes.h>
>  #include <limits.h>
> +#include <net/if.h>
>  #include <netinet/in.h>
>  #include <netinet/icmp6.h>
>  #include <netinet/ip6.h>
> @@ -41,6 +42,8 @@
>  #include "unaligned.h"
>  #include "util.h"
>  #include "openvswitch/nsh.h"
> +#include "ovs-router.h"
> +#include "lib/netdev-provider.h"
>
>  COVERAGE_DEFINE(flow_extract);
>  COVERAGE_DEFINE(miniflow_malloc);
> @@ -3403,3 +3406,25 @@ flow_limit_vlans(int vlan_limit)
>          flow_vlan_limit = MIN(vlan_limit, FLOW_MAX_VLAN_HEADERS);
>      }
>  }
> +
> +struct netdev *
> +flow_get_tunnel_netdev(struct flow_tnl *tunnel)
> +{
> +    char iface[IFNAMSIZ];
> +    struct in6_addr ip6;
> +    struct in6_addr gw;
> +
> +    if (tunnel->ip_src) {
> +        in6_addr_set_mapped_ipv4(&ip6, tunnel->ip_src);
> +    } else if (ipv6_addr_is_set(&tunnel->ipv6_src)) {
> +        ip6 = tunnel->ipv6_src;
> +    } else {
> +        return NULL;
> +    }
> +
> +    if (!ovs_router_lookup(0, &ip6, iface, NULL, &gw)) {
> +        return NULL;
> +    }
> +
> +    return netdev_from_name(iface);
> +}
> diff --git a/lib/flow.h b/lib/flow.h
> index d03f1ba9c..aca60c41a 100644
> --- a/lib/flow.h
> +++ b/lib/flow.h
> @@ -73,6 +73,7 @@ void flow_extract(struct dp_packet *, struct flow 
> *);
>  void flow_zero_wildcards(struct flow *, const struct flow_wildcards 
> *);
>  void flow_unwildcard_tp_ports(const struct flow *, struct 
> flow_wildcards *);
>  void flow_get_metadata(const struct flow *, struct match 
> *flow_metadata);
> +struct netdev *flow_get_tunnel_netdev(struct flow_tnl *tunnel);
>
>  const char *ct_state_to_string(uint32_t state);
>  uint32_t ct_state_from_string(const char *);
> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> index 5a7947351..e320dad61 100644
> --- a/lib/netdev-provider.h
> +++ b/lib/netdev-provider.h
> @@ -35,6 +35,15 @@ extern "C" {
>  struct netdev_tnl_build_header_params;
>  #define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC
>
> +/* Offload-capable (HW) netdev information */
> +struct netdev_hw_info {
> +    bool oor;		/* Out of Offload Resources ? */
> +};
> +
> +enum hw_info_type {
> +    HW_INFO_TYPE_OOR = 1	/* OOR state */
> +};
> +
>  /* A network device (e.g. an Ethernet device).
>   *
>   * Network device implementations may read these members but should 
> not modify
> @@ -80,6 +89,8 @@ struct netdev {
>      int n_rxq;
>      struct shash_node *node;            /* Pointer to element in 
> global map. */
>      struct ovs_list saved_flags_list; /* Contains "struct 
> netdev_saved_flags". */
> +
> +    struct netdev_hw_info hw_info;	/* offload-capable netdev info */
>  };
>
>  static inline void
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 690d28409..f3fa08ca3 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -415,6 +415,7 @@ netdev_open(const char *name, const char *type, 
> struct netdev **netdevp)
>                  netdev->reconfigure_seq = seq_create();
>                  netdev->last_reconfigure_seq =
>                      seq_read(netdev->reconfigure_seq);
> +                netdev->hw_info.oor = false;
>                  netdev->node = shash_add(&netdev_shash, name, 
> netdev);
>
 >                  /* By default enable one tx and rx queue per netdev. 
*/
> @@ -2252,6 +2253,31 @@ netdev_get_block_id(struct netdev *netdev)
>              : 0);
>  }
>
> +/*
> + * Get the value of the hw info parameter specified by type.
> + * Returns the value on success (>= 0). Returns -1 on failure.
> + */
> +int
> +netdev_get_hw_info(struct netdev *netdev, int type)
> +{
> +    if (type == HW_INFO_TYPE_OOR) {
> +        return netdev->hw_info.oor;
> +    }
> +
> +    return -1;
> +}
> +
> +/*
> + * Set the value of the hw info parameter specified by type.
> + */
> +void
> +netdev_set_hw_info(struct netdev *netdev, int type, int val)
> +{
> +    if (type == HW_INFO_TYPE_OOR) {
> +        netdev->hw_info.oor = val;
> +    }
> +}
> +
>  bool
>  netdev_is_flow_api_enabled(void)
>  {
> @@ -2488,6 +2514,14 @@ netdev_free_custom_stats_counters(struct 
> netdev_custom_stats *custom_stats)
>      }
>  }

I do not like these type of functions as they are easy to cause 
problems. For example, you are using a general parameter to set a bool, 
what if your int input needs to set a uint32_t or uint64_t? What on 
return values, you might need to explicitly cast them.

I would suggest that if this structure values need to be used outside of 
the netdev framework define the specific function. Inside just 
reference/change the values.

So something like bool netdev_get(set)_hw_offload_oor().

>
> +static bool netdev_offload_rebalance_policy = false;
> +
> +bool
> +netdev_is_offload_rebalance_policy_enabled(void)
> +{
> +    return netdev_offload_rebalance_policy;
> +}
> +

See general comment on the cover letter, about a more granular 
configuration option.

>  #ifdef __linux__
>  static void
>  netdev_ports_flow_init(void)
> diff --git a/lib/netdev.h b/lib/netdev.h
> index 556676046..b0e5c5b72 100644
> --- a/lib/netdev.h
> +++ b/lib/netdev.h
> @@ -227,8 +227,11 @@ int netdev_flow_del(struct netdev *, const 
> ovs_u128 *,
>                      struct dpif_flow_stats *);
>  int netdev_init_flow_api(struct netdev *);
>  uint32_t netdev_get_block_id(struct netdev *);
> +int netdev_get_hw_info(struct netdev *, int);
> +void netdev_set_hw_info(struct netdev *, int, int);
>  bool netdev_is_flow_api_enabled(void);
>  void netdev_set_flow_api_enabled(const struct smap 
> *ovs_other_config);
> +bool netdev_is_offload_rebalance_policy_enabled(void);
>
>  struct dpif_port;
>  int netdev_ports_insert(struct netdev *, const struct dpif_class *,
> -- 
> 2.18.0.rc1.1.g6f333ff
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Sriharsha Basavapatna via dev Oct. 25, 2018, 2 p.m. | #3
Hi Eelco,

Thanks for your comments, please see my response below.
On Fri, Oct 19, 2018 at 7:52 PM Eelco Chaudron <echaudro@redhat.com> wrote:
>
> On 18 Oct 2018, at 18:13, Sriharsha Basavapatna via dev wrote:
>
> > This is the first patch in the patch-set to support dynamic
> > rebalancing
> > of offloaded flows.
> >
> > The patch detects OOR condition on a netdev port when ENOSPC error is
> > returned by TC-Flower while adding a flow rule. A new structure is
> > added
> > to the netdev called "netdev_hw_info", to store OOR related
> > information
> > required to perform dynamic offload-rebalancing.
> >
> > Signed-off-by: Sriharsha Basavapatna
> > <sriharsha.basavapatna@broadcom.com>
> > Co-authored-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
> > Signed-off-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
> > Reviewed-by: Sathya Perla <sathya.perla@broadcom.com>
> > Reviewed-by: Simon Horman <simon.horman@netronome.com>
> > Reviewed-by: Ben Pfaff <blp@ovn.org>
> > ---
> >  lib/dpif-netlink.c    | 18 +++++++++++++++++-
> >  lib/flow.c            | 25 +++++++++++++++++++++++++
> >  lib/flow.h            |  1 +
> >  lib/netdev-provider.h | 11 +++++++++++
> >  lib/netdev.c          | 34 ++++++++++++++++++++++++++++++++++
> >  lib/netdev.h          |  3 +++
> >  6 files changed, 91 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> > index e6d5a6ec5..b9ce9cbe2 100644
> > --- a/lib/dpif-netlink.c
> > +++ b/lib/dpif-netlink.c
> > @@ -2178,7 +2178,23 @@ parse_flow_put(struct dpif_netlink *dpif,
> > struct dpif_flow_put *put)
> >
> >          VLOG_DBG("added flow");
> >      } else if (err != EEXIST) {
> > -        VLOG_ERR_RL(&rl, "failed to offload flow: %s",
> > ovs_strerror(err));
> > +        struct netdev *oor_netdev = NULL;
> > +        if (err == ENOSPC &&
> > netdev_is_offload_rebalance_policy_enabled()) {
> > +            /*
> > +             * We need to set OOR on the input netdev (i.e, 'dev')
> > for the
> > +             * flow. But if the flow has a tunnel attribute (i.e,
> > decap action,
> > +             * with a virtual device like a VxLAN interface as its
> > in-port),
> > +             * then lookup and set OOR on the underlying tunnel
> > (real) netdev.
> > +             */
> > +            oor_netdev = flow_get_tunnel_netdev(&match.flow.tunnel);
> > +            if (!oor_netdev) {
> > +                /* Not a 'tunnel' flow */
> > +                oor_netdev = dev;
> > +            }
> > +            netdev_set_hw_info(oor_netdev, HW_INFO_TYPE_OOR, true);
>
> Why not just oor_netdev->hw_info.oor = true, see also below.

The original code was directly accessing netdev members. It was changed
based on a review comment to avoid direct access and add an interface.

>
> I have a general comment, don't know where to put it, so I put it here.
> Some hardware might have multiple tables. If one type of table is full
> the ENOSPC might be returned, but it does not mean all type of flows can
> no longer be offloaded. This might be a situation to think about.

Ok, thanks for bringing it up. Currently from OvS daemon's perspective a
request to add/delete a flow is issued on a netdev and the failure indicates
that the particular netdev is out of resources. If we need to handle the
condition where HW has different tables, we need to further extend this
design and the tc interfaces to propagate this fine grained information.

>
> > +        }
> > +        VLOG_ERR_RL(&rl, "failed to offload flow: %s: %s",
> > ovs_strerror(err),
> > +                    (oor_netdev ? oor_netdev->name : dev->name));
> >      }
> >
> >  out:
> > diff --git a/lib/flow.c b/lib/flow.c
> > index 77ed3d9df..a39807908 100644
> > --- a/lib/flow.c
> > +++ b/lib/flow.c
> > @@ -19,6 +19,7 @@
> >  #include <errno.h>
> >  #include <inttypes.h>
> >  #include <limits.h>
> > +#include <net/if.h>
> >  #include <netinet/in.h>
> >  #include <netinet/icmp6.h>
> >  #include <netinet/ip6.h>
> > @@ -41,6 +42,8 @@
> >  #include "unaligned.h"
> >  #include "util.h"
> >  #include "openvswitch/nsh.h"
> > +#include "ovs-router.h"
> > +#include "lib/netdev-provider.h"
> >
> >  COVERAGE_DEFINE(flow_extract);
> >  COVERAGE_DEFINE(miniflow_malloc);
> > @@ -3403,3 +3406,25 @@ flow_limit_vlans(int vlan_limit)
> >          flow_vlan_limit = MIN(vlan_limit, FLOW_MAX_VLAN_HEADERS);
> >      }
> >  }
> > +
> > +struct netdev *
> > +flow_get_tunnel_netdev(struct flow_tnl *tunnel)
> > +{
> > +    char iface[IFNAMSIZ];
> > +    struct in6_addr ip6;
> > +    struct in6_addr gw;
> > +
> > +    if (tunnel->ip_src) {
> > +        in6_addr_set_mapped_ipv4(&ip6, tunnel->ip_src);
> > +    } else if (ipv6_addr_is_set(&tunnel->ipv6_src)) {
> > +        ip6 = tunnel->ipv6_src;
> > +    } else {
> > +        return NULL;
> > +    }
> > +
> > +    if (!ovs_router_lookup(0, &ip6, iface, NULL, &gw)) {
> > +        return NULL;
> > +    }
> > +
> > +    return netdev_from_name(iface);
> > +}
> > diff --git a/lib/flow.h b/lib/flow.h
> > index d03f1ba9c..aca60c41a 100644
> > --- a/lib/flow.h
> > +++ b/lib/flow.h
> > @@ -73,6 +73,7 @@ void flow_extract(struct dp_packet *, struct flow
> > *);
> >  void flow_zero_wildcards(struct flow *, const struct flow_wildcards
> > *);
> >  void flow_unwildcard_tp_ports(const struct flow *, struct
> > flow_wildcards *);
> >  void flow_get_metadata(const struct flow *, struct match
> > *flow_metadata);
> > +struct netdev *flow_get_tunnel_netdev(struct flow_tnl *tunnel);
> >
> >  const char *ct_state_to_string(uint32_t state);
> >  uint32_t ct_state_from_string(const char *);
> > diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> > index 5a7947351..e320dad61 100644
> > --- a/lib/netdev-provider.h
> > +++ b/lib/netdev-provider.h
> > @@ -35,6 +35,15 @@ extern "C" {
> >  struct netdev_tnl_build_header_params;
> >  #define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC
> >
> > +/* Offload-capable (HW) netdev information */
> > +struct netdev_hw_info {
> > +    bool oor;                /* Out of Offload Resources ? */
> > +};
> > +
> > +enum hw_info_type {
> > +    HW_INFO_TYPE_OOR = 1     /* OOR state */
> > +};
> > +
> >  /* A network device (e.g. an Ethernet device).
> >   *
> >   * Network device implementations may read these members but should
> > not modify
> > @@ -80,6 +89,8 @@ struct netdev {
> >      int n_rxq;
> >      struct shash_node *node;            /* Pointer to element in
> > global map. */
> >      struct ovs_list saved_flags_list; /* Contains "struct
> > netdev_saved_flags". */
> > +
> > +    struct netdev_hw_info hw_info;   /* offload-capable netdev info */
> >  };
> >
> >  static inline void
> > diff --git a/lib/netdev.c b/lib/netdev.c
> > index 690d28409..f3fa08ca3 100644
> > --- a/lib/netdev.c
> > +++ b/lib/netdev.c
> > @@ -415,6 +415,7 @@ netdev_open(const char *name, const char *type,
> > struct netdev **netdevp)
> >                  netdev->reconfigure_seq = seq_create();
> >                  netdev->last_reconfigure_seq =
> >                      seq_read(netdev->reconfigure_seq);
> > +                netdev->hw_info.oor = false;
> >                  netdev->node = shash_add(&netdev_shash, name,
> > netdev);
> >
>  >                  /* By default enable one tx and rx queue per netdev.
> */
> > @@ -2252,6 +2253,31 @@ netdev_get_block_id(struct netdev *netdev)
> >              : 0);
> >  }
> >
> > +/*
> > + * Get the value of the hw info parameter specified by type.
> > + * Returns the value on success (>= 0). Returns -1 on failure.
> > + */
> > +int
> > +netdev_get_hw_info(struct netdev *netdev, int type)
> > +{
> > +    if (type == HW_INFO_TYPE_OOR) {
> > +        return netdev->hw_info.oor;
> > +    }
> > +
> > +    return -1;
> > +}
> > +
> > +/*
> > + * Set the value of the hw info parameter specified by type.
> > + */
> > +void
> > +netdev_set_hw_info(struct netdev *netdev, int type, int val)
> > +{
> > +    if (type == HW_INFO_TYPE_OOR) {
> > +        netdev->hw_info.oor = val;
> > +    }
> > +}
> > +
> >  bool
> >  netdev_is_flow_api_enabled(void)
> >  {
> > @@ -2488,6 +2514,14 @@ netdev_free_custom_stats_counters(struct
> > netdev_custom_stats *custom_stats)
> >      }
> >  }
>
> I do not like these type of functions as they are easy to cause
> problems. For example, you are using a general parameter to set a bool,
> what if your int input needs to set a uint32_t or uint64_t? What on
> return values, you might need to explicitly cast them.
>
> I would suggest that if this structure values need to be used outside of
> the netdev framework define the specific function. Inside just
> reference/change the values.
>
> So something like bool netdev_get(set)_hw_offload_oor().

I didn't want to proliferate hw-info interfaces in netdev; we can change
it in the future if we need to process other types like you mentioned.
>
> >
> > +static bool netdev_offload_rebalance_policy = false;
> > +
> > +bool
> > +netdev_is_offload_rebalance_policy_enabled(void)
> > +{
> > +    return netdev_offload_rebalance_policy;
> > +}
> > +
>
> See general comment on the cover letter, about a more granular
> configuration option.

I responded earlier to this comment on the cover letter.

Thanks,
-Harsha
>
> >  #ifdef __linux__
> >  static void
> >  netdev_ports_flow_init(void)
> > diff --git a/lib/netdev.h b/lib/netdev.h
> > index 556676046..b0e5c5b72 100644
> > --- a/lib/netdev.h
> > +++ b/lib/netdev.h
> > @@ -227,8 +227,11 @@ int netdev_flow_del(struct netdev *, const
> > ovs_u128 *,
> >                      struct dpif_flow_stats *);
> >  int netdev_init_flow_api(struct netdev *);
> >  uint32_t netdev_get_block_id(struct netdev *);
> > +int netdev_get_hw_info(struct netdev *, int);
> > +void netdev_set_hw_info(struct netdev *, int, int);
> >  bool netdev_is_flow_api_enabled(void);
> >  void netdev_set_flow_api_enabled(const struct smap
> > *ovs_other_config);
> > +bool netdev_is_offload_rebalance_policy_enabled(void);
> >
> >  struct dpif_port;
> >  int netdev_ports_insert(struct netdev *, const struct dpif_class *,
> > --
> > 2.18.0.rc1.1.g6f333ff
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Eelco Chaudron Oct. 26, 2018, 7:58 a.m. | #4
On 25 Oct 2018, at 16:00, Sriharsha Basavapatna wrote:

> Hi Eelco,
>
> Thanks for your comments, please see my response below.
> On Fri, Oct 19, 2018 at 7:52 PM Eelco Chaudron <echaudro@redhat.com> 
> wrote:
>>
>> On 18 Oct 2018, at 18:13, Sriharsha Basavapatna via dev wrote:
>>
>>> This is the first patch in the patch-set to support dynamic
>>> rebalancing
>>> of offloaded flows.
>>>
>>> The patch detects OOR condition on a netdev port when ENOSPC error 
>>> is
>>> returned by TC-Flower while adding a flow rule. A new structure is
>>> added
>>> to the netdev called "netdev_hw_info", to store OOR related
>>> information
>>> required to perform dynamic offload-rebalancing.
>>>
>>> Signed-off-by: Sriharsha Basavapatna
>>> <sriharsha.basavapatna@broadcom.com>
>>> Co-authored-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
>>> Signed-off-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
>>> Reviewed-by: Sathya Perla <sathya.perla@broadcom.com>
>>> Reviewed-by: Simon Horman <simon.horman@netronome.com>
>>> Reviewed-by: Ben Pfaff <blp@ovn.org>
>>> ---
>>>  lib/dpif-netlink.c    | 18 +++++++++++++++++-
>>>  lib/flow.c            | 25 +++++++++++++++++++++++++
>>>  lib/flow.h            |  1 +
>>>  lib/netdev-provider.h | 11 +++++++++++
>>>  lib/netdev.c          | 34 ++++++++++++++++++++++++++++++++++
>>>  lib/netdev.h          |  3 +++
>>>  6 files changed, 91 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>>> index e6d5a6ec5..b9ce9cbe2 100644
>>> --- a/lib/dpif-netlink.c
>>> +++ b/lib/dpif-netlink.c
>>> @@ -2178,7 +2178,23 @@ parse_flow_put(struct dpif_netlink *dpif,
>>> struct dpif_flow_put *put)
>>>
>>>          VLOG_DBG("added flow");
>>>      } else if (err != EEXIST) {
>>> -        VLOG_ERR_RL(&rl, "failed to offload flow: %s",
>>> ovs_strerror(err));
>>> +        struct netdev *oor_netdev = NULL;
>>> +        if (err == ENOSPC &&
>>> netdev_is_offload_rebalance_policy_enabled()) {
>>> +            /*
>>> +             * We need to set OOR on the input netdev (i.e, 'dev')
>>> for the
>>> +             * flow. But if the flow has a tunnel attribute (i.e,
>>> decap action,
>>> +             * with a virtual device like a VxLAN interface as its
>>> in-port),
>>> +             * then lookup and set OOR on the underlying tunnel
>>> (real) netdev.
>>> +             */
>>> +            oor_netdev = 
>>> flow_get_tunnel_netdev(&match.flow.tunnel);
>>> +            if (!oor_netdev) {
>>> +                /* Not a 'tunnel' flow */
>>> +                oor_netdev = dev;
>>> +            }
>>> +            netdev_set_hw_info(oor_netdev, HW_INFO_TYPE_OOR, true);
>>
>> Why not just oor_netdev->hw_info.oor = true, see also below.
>
> The original code was directly accessing netdev members. It was 
> changed
> based on a review comment to avoid direct access and add an interface.
>
>>
>> I have a general comment, don't know where to put it, so I put it 
>> here.
>> Some hardware might have multiple tables. If one type of table is 
>> full
>> the ENOSPC might be returned, but it does not mean all type of flows 
>> can
>> no longer be offloaded. This might be a situation to think about.
>
> Ok, thanks for bringing it up. Currently from OvS daemon's perspective 
> a
> request to add/delete a flow is issued on a netdev and the failure 
> indicates
> that the particular netdev is out of resources. If we need to handle 
> the
> condition where HW has different tables, we need to further extend 
> this
> design and the tc interfaces to propagate this fine grained 
> information.

Would be good if other hardware vendors can comment here?

>>
>>> +        }
>>> +        VLOG_ERR_RL(&rl, "failed to offload flow: %s: %s",
>>> ovs_strerror(err),
>>> +                    (oor_netdev ? oor_netdev->name : dev->name));
>>>      }
>>>
>>>  out:
>>> diff --git a/lib/flow.c b/lib/flow.c
>>> index 77ed3d9df..a39807908 100644
>>> --- a/lib/flow.c
>>> +++ b/lib/flow.c
>>> @@ -19,6 +19,7 @@
>>>  #include <errno.h>
>>>  #include <inttypes.h>
>>>  #include <limits.h>
>>> +#include <net/if.h>
>>>  #include <netinet/in.h>
>>>  #include <netinet/icmp6.h>
>>>  #include <netinet/ip6.h>
>>> @@ -41,6 +42,8 @@
>>>  #include "unaligned.h"
>>>  #include "util.h"
>>>  #include "openvswitch/nsh.h"
>>> +#include "ovs-router.h"
>>> +#include "lib/netdev-provider.h"
>>>
>>>  COVERAGE_DEFINE(flow_extract);
>>>  COVERAGE_DEFINE(miniflow_malloc);
>>> @@ -3403,3 +3406,25 @@ flow_limit_vlans(int vlan_limit)
>>>          flow_vlan_limit = MIN(vlan_limit, FLOW_MAX_VLAN_HEADERS);
>>>      }
>>>  }
>>> +
>>> +struct netdev *
>>> +flow_get_tunnel_netdev(struct flow_tnl *tunnel)
>>> +{
>>> +    char iface[IFNAMSIZ];
>>> +    struct in6_addr ip6;
>>> +    struct in6_addr gw;
>>> +
>>> +    if (tunnel->ip_src) {
>>> +        in6_addr_set_mapped_ipv4(&ip6, tunnel->ip_src);
>>> +    } else if (ipv6_addr_is_set(&tunnel->ipv6_src)) {
>>> +        ip6 = tunnel->ipv6_src;
>>> +    } else {
>>> +        return NULL;
>>> +    }
>>> +
>>> +    if (!ovs_router_lookup(0, &ip6, iface, NULL, &gw)) {
>>> +        return NULL;
>>> +    }
>>> +
>>> +    return netdev_from_name(iface);
>>> +}
>>> diff --git a/lib/flow.h b/lib/flow.h
>>> index d03f1ba9c..aca60c41a 100644
>>> --- a/lib/flow.h
>>> +++ b/lib/flow.h
>>> @@ -73,6 +73,7 @@ void flow_extract(struct dp_packet *, struct flow
>>> *);
>>>  void flow_zero_wildcards(struct flow *, const struct flow_wildcards
>>> *);
>>>  void flow_unwildcard_tp_ports(const struct flow *, struct
>>> flow_wildcards *);
>>>  void flow_get_metadata(const struct flow *, struct match
>>> *flow_metadata);
>>> +struct netdev *flow_get_tunnel_netdev(struct flow_tnl *tunnel);
>>>
>>>  const char *ct_state_to_string(uint32_t state);
>>>  uint32_t ct_state_from_string(const char *);
>>> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
>>> index 5a7947351..e320dad61 100644
>>> --- a/lib/netdev-provider.h
>>> +++ b/lib/netdev-provider.h
>>> @@ -35,6 +35,15 @@ extern "C" {
>>>  struct netdev_tnl_build_header_params;
>>>  #define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC
>>>
>>> +/* Offload-capable (HW) netdev information */
>>> +struct netdev_hw_info {
>>> +    bool oor;                /* Out of Offload Resources ? */
>>> +};
>>> +
>>> +enum hw_info_type {
>>> +    HW_INFO_TYPE_OOR = 1     /* OOR state */
>>> +};
>>> +
>>>  /* A network device (e.g. an Ethernet device).
>>>   *
>>>   * Network device implementations may read these members but should
>>> not modify
>>> @@ -80,6 +89,8 @@ struct netdev {
>>>      int n_rxq;
>>>      struct shash_node *node;            /* Pointer to element in
>>> global map. */
>>>      struct ovs_list saved_flags_list; /* Contains "struct
>>> netdev_saved_flags". */
>>> +
>>> +    struct netdev_hw_info hw_info;   /* offload-capable netdev info 
>>> */
>>>  };
>>>
>>>  static inline void
>>> diff --git a/lib/netdev.c b/lib/netdev.c
>>> index 690d28409..f3fa08ca3 100644
>>> --- a/lib/netdev.c
>>> +++ b/lib/netdev.c
>>> @@ -415,6 +415,7 @@ netdev_open(const char *name, const char *type,
>>> struct netdev **netdevp)
>>>                  netdev->reconfigure_seq = seq_create();
>>>                  netdev->last_reconfigure_seq =
>>>                      seq_read(netdev->reconfigure_seq);
>>> +                netdev->hw_info.oor = false;
>>>                  netdev->node = shash_add(&netdev_shash, name,
>>> netdev);
>>>
>>>                  /* By default enable one tx and rx queue per 
>>> netdev.
>> */
>>> @@ -2252,6 +2253,31 @@ netdev_get_block_id(struct netdev *netdev)
>>>              : 0);
>>>  }
>>>
>>> +/*
>>> + * Get the value of the hw info parameter specified by type.
>>> + * Returns the value on success (>= 0). Returns -1 on failure.
>>> + */
>>> +int
>>> +netdev_get_hw_info(struct netdev *netdev, int type)
>>> +{
>>> +    if (type == HW_INFO_TYPE_OOR) {
>>> +        return netdev->hw_info.oor;
>>> +    }
>>> +
>>> +    return -1;
>>> +}
>>> +
>>> +/*
>>> + * Set the value of the hw info parameter specified by type.
>>> + */
>>> +void
>>> +netdev_set_hw_info(struct netdev *netdev, int type, int val)
>>> +{
>>> +    if (type == HW_INFO_TYPE_OOR) {
>>> +        netdev->hw_info.oor = val;
>>> +    }
>>> +}
>>> +
>>>  bool
>>>  netdev_is_flow_api_enabled(void)
>>>  {
>>> @@ -2488,6 +2514,14 @@ netdev_free_custom_stats_counters(struct
>>> netdev_custom_stats *custom_stats)
>>>      }
>>>  }
>>
>> I do not like these type of functions as they are easy to cause
>> problems. For example, you are using a general parameter to set a 
>> bool,
>> what if your int input needs to set a uint32_t or uint64_t? What on
>> return values, you might need to explicitly cast them.
>>
>> I would suggest that if this structure values need to be used outside 
>> of
>> the netdev framework define the specific function. Inside just
>> reference/change the values.
>>
>> So something like bool netdev_get(set)_hw_offload_oor().
>
> I didn't want to proliferate hw-info interfaces in netdev; we can 
> change
> it in the future if we need to process other types like you mentioned.
>>
>>>
>>> +static bool netdev_offload_rebalance_policy = false;
>>> +
>>> +bool
>>> +netdev_is_offload_rebalance_policy_enabled(void)
>>> +{
>>> +    return netdev_offload_rebalance_policy;
>>> +}
>>> +
>>
>> See general comment on the cover letter, about a more granular
>> configuration option.
>
> I responded earlier to this comment on the cover letter.
>
> Thanks,
> -Harsha
>>
>>>  #ifdef __linux__
>>>  static void
>>>  netdev_ports_flow_init(void)
>>> diff --git a/lib/netdev.h b/lib/netdev.h
>>> index 556676046..b0e5c5b72 100644
>>> --- a/lib/netdev.h
>>> +++ b/lib/netdev.h
>>> @@ -227,8 +227,11 @@ int netdev_flow_del(struct netdev *, const
>>> ovs_u128 *,
>>>                      struct dpif_flow_stats *);
>>>  int netdev_init_flow_api(struct netdev *);
>>>  uint32_t netdev_get_block_id(struct netdev *);
>>> +int netdev_get_hw_info(struct netdev *, int);
>>> +void netdev_set_hw_info(struct netdev *, int, int);
>>>  bool netdev_is_flow_api_enabled(void);
>>>  void netdev_set_flow_api_enabled(const struct smap
>>> *ovs_other_config);
>>> +bool netdev_is_offload_rebalance_policy_enabled(void);
>>>
>>>  struct dpif_port;
>>>  int netdev_ports_insert(struct netdev *, const struct dpif_class *,
>>> --
>>> 2.18.0.rc1.1.g6f333ff
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Simon Horman Oct. 26, 2018, 9:53 a.m. | #5
On Fri, 26 Oct 2018 at 08:58, Eelco Chaudron <echaudro@redhat.com> wrote:

>
>
> On 25 Oct 2018, at 16:00, Sriharsha Basavapatna wrote:
>
> > Hi Eelco,
> >
> > Thanks for your comments, please see my response below.
> > On Fri, Oct 19, 2018 at 7:52 PM Eelco Chaudron <echaudro@redhat.com>
> > wrote:
> >>
> >> On 18 Oct 2018, at 18:13, Sriharsha Basavapatna via dev wrote:
> >>
> >>> This is the first patch in the patch-set to support dynamic
> >>> rebalancing
> >>> of offloaded flows.
> >>>
> >>> The patch detects OOR condition on a netdev port when ENOSPC error
> >>> is
> >>> returned by TC-Flower while adding a flow rule. A new structure is
> >>> added
> >>> to the netdev called "netdev_hw_info", to store OOR related
> >>> information
> >>> required to perform dynamic offload-rebalancing.
> >>>
> >>> Signed-off-by: Sriharsha Basavapatna
> >>> <sriharsha.basavapatna@broadcom.com>
> >>> Co-authored-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
> >>> Signed-off-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
> >>> Reviewed-by: Sathya Perla <sathya.perla@broadcom.com>
> >>> Reviewed-by: Simon Horman <simon.horman@netronome.com>
> >>> Reviewed-by: Ben Pfaff <blp@ovn.org>
> >>> ---
> >>>  lib/dpif-netlink.c    | 18 +++++++++++++++++-
> >>>  lib/flow.c            | 25 +++++++++++++++++++++++++
> >>>  lib/flow.h            |  1 +
> >>>  lib/netdev-provider.h | 11 +++++++++++
> >>>  lib/netdev.c          | 34 ++++++++++++++++++++++++++++++++++
> >>>  lib/netdev.h          |  3 +++
> >>>  6 files changed, 91 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> >>> index e6d5a6ec5..b9ce9cbe2 100644
> >>> --- a/lib/dpif-netlink.c
> >>> +++ b/lib/dpif-netlink.c
> >>> @@ -2178,7 +2178,23 @@ parse_flow_put(struct dpif_netlink *dpif,
> >>> struct dpif_flow_put *put)
> >>>
> >>>          VLOG_DBG("added flow");
> >>>      } else if (err != EEXIST) {
> >>> -        VLOG_ERR_RL(&rl, "failed to offload flow: %s",
> >>> ovs_strerror(err));
> >>> +        struct netdev *oor_netdev = NULL;
> >>> +        if (err == ENOSPC &&
> >>> netdev_is_offload_rebalance_policy_enabled()) {
> >>> +            /*
> >>> +             * We need to set OOR on the input netdev (i.e, 'dev')
> >>> for the
> >>> +             * flow. But if the flow has a tunnel attribute (i.e,
> >>> decap action,
> >>> +             * with a virtual device like a VxLAN interface as its
> >>> in-port),
> >>> +             * then lookup and set OOR on the underlying tunnel
> >>> (real) netdev.
> >>> +             */
> >>> +            oor_netdev =
> >>> flow_get_tunnel_netdev(&match.flow.tunnel);
> >>> +            if (!oor_netdev) {
> >>> +                /* Not a 'tunnel' flow */
> >>> +                oor_netdev = dev;
> >>> +            }
> >>> +            netdev_set_hw_info(oor_netdev, HW_INFO_TYPE_OOR, true);
> >>
> >> Why not just oor_netdev->hw_info.oor = true, see also below.
> >
> > The original code was directly accessing netdev members. It was
> > changed
> > based on a review comment to avoid direct access and add an interface.
> >
> >>
> >> I have a general comment, don't know where to put it, so I put it
> >> here.
> >> Some hardware might have multiple tables. If one type of table is
> >> full
> >> the ENOSPC might be returned, but it does not mean all type of flows
> >> can
> >> no longer be offloaded. This might be a situation to think about.
> >
> > Ok, thanks for bringing it up. Currently from OvS daemon's perspective
> > a
> > request to add/delete a flow is issued on a netdev and the failure
> > indicates
> > that the particular netdev is out of resources. If we need to handle
> > the
> > condition where HW has different tables, we need to further extend
> > this
> > design and the tc interfaces to propagate this fine grained
> > information.
>
> Would be good if other hardware vendors can comment here?
>

There was a discussion in another forum involving at least Mellanox,
Broadcom and Netronome.
From a Netronome point of view this scheme is satisfactory and my
recollection is that
was the agreement of those involved in the discussion.


>
> >>
> >>> +        }
> >>> +        VLOG_ERR_RL(&rl, "failed to offload flow: %s: %s",
> >>> ovs_strerror(err),
> >>> +                    (oor_netdev ? oor_netdev->name : dev->name));
> >>>      }
> >>>
> >>>  out:
> >>> diff --git a/lib/flow.c b/lib/flow.c
> >>> index 77ed3d9df..a39807908 100644
> >>> --- a/lib/flow.c
> >>> +++ b/lib/flow.c
> >>> @@ -19,6 +19,7 @@
> >>>  #include <errno.h>
> >>>  #include <inttypes.h>
> >>>  #include <limits.h>
> >>> +#include <net/if.h>
> >>>  #include <netinet/in.h>
> >>>  #include <netinet/icmp6.h>
> >>>  #include <netinet/ip6.h>
> >>> @@ -41,6 +42,8 @@
> >>>  #include "unaligned.h"
> >>>  #include "util.h"
> >>>  #include "openvswitch/nsh.h"
> >>> +#include "ovs-router.h"
> >>> +#include "lib/netdev-provider.h"
> >>>
> >>>  COVERAGE_DEFINE(flow_extract);
> >>>  COVERAGE_DEFINE(miniflow_malloc);
> >>> @@ -3403,3 +3406,25 @@ flow_limit_vlans(int vlan_limit)
> >>>          flow_vlan_limit = MIN(vlan_limit, FLOW_MAX_VLAN_HEADERS);
> >>>      }
> >>>  }
> >>> +
> >>> +struct netdev *
> >>> +flow_get_tunnel_netdev(struct flow_tnl *tunnel)
> >>> +{
> >>> +    char iface[IFNAMSIZ];
> >>> +    struct in6_addr ip6;
> >>> +    struct in6_addr gw;
> >>> +
> >>> +    if (tunnel->ip_src) {
> >>> +        in6_addr_set_mapped_ipv4(&ip6, tunnel->ip_src);
> >>> +    } else if (ipv6_addr_is_set(&tunnel->ipv6_src)) {
> >>> +        ip6 = tunnel->ipv6_src;
> >>> +    } else {
> >>> +        return NULL;
> >>> +    }
> >>> +
> >>> +    if (!ovs_router_lookup(0, &ip6, iface, NULL, &gw)) {
> >>> +        return NULL;
> >>> +    }
> >>> +
> >>> +    return netdev_from_name(iface);
> >>> +}
> >>> diff --git a/lib/flow.h b/lib/flow.h
> >>> index d03f1ba9c..aca60c41a 100644
> >>> --- a/lib/flow.h
> >>> +++ b/lib/flow.h
> >>> @@ -73,6 +73,7 @@ void flow_extract(struct dp_packet *, struct flow
> >>> *);
> >>>  void flow_zero_wildcards(struct flow *, const struct flow_wildcards
> >>> *);
> >>>  void flow_unwildcard_tp_ports(const struct flow *, struct
> >>> flow_wildcards *);
> >>>  void flow_get_metadata(const struct flow *, struct match
> >>> *flow_metadata);
> >>> +struct netdev *flow_get_tunnel_netdev(struct flow_tnl *tunnel);
> >>>
> >>>  const char *ct_state_to_string(uint32_t state);
> >>>  uint32_t ct_state_from_string(const char *);
> >>> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> >>> index 5a7947351..e320dad61 100644
> >>> --- a/lib/netdev-provider.h
> >>> +++ b/lib/netdev-provider.h
> >>> @@ -35,6 +35,15 @@ extern "C" {
> >>>  struct netdev_tnl_build_header_params;
> >>>  #define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC
> >>>
> >>> +/* Offload-capable (HW) netdev information */
> >>> +struct netdev_hw_info {
> >>> +    bool oor;                /* Out of Offload Resources ? */
> >>> +};
> >>> +
> >>> +enum hw_info_type {
> >>> +    HW_INFO_TYPE_OOR = 1     /* OOR state */
> >>> +};
> >>> +
> >>>  /* A network device (e.g. an Ethernet device).
> >>>   *
> >>>   * Network device implementations may read these members but should
> >>> not modify
> >>> @@ -80,6 +89,8 @@ struct netdev {
> >>>      int n_rxq;
> >>>      struct shash_node *node;            /* Pointer to element in
> >>> global map. */
> >>>      struct ovs_list saved_flags_list; /* Contains "struct
> >>> netdev_saved_flags". */
> >>> +
> >>> +    struct netdev_hw_info hw_info;   /* offload-capable netdev info
> >>> */
> >>>  };
> >>>
> >>>  static inline void
> >>> diff --git a/lib/netdev.c b/lib/netdev.c
> >>> index 690d28409..f3fa08ca3 100644
> >>> --- a/lib/netdev.c
> >>> +++ b/lib/netdev.c
> >>> @@ -415,6 +415,7 @@ netdev_open(const char *name, const char *type,
> >>> struct netdev **netdevp)
> >>>                  netdev->reconfigure_seq = seq_create();
> >>>                  netdev->last_reconfigure_seq =
> >>>                      seq_read(netdev->reconfigure_seq);
> >>> +                netdev->hw_info.oor = false;
> >>>                  netdev->node = shash_add(&netdev_shash, name,
> >>> netdev);
> >>>
> >>>                  /* By default enable one tx and rx queue per
> >>> netdev.
> >> */
> >>> @@ -2252,6 +2253,31 @@ netdev_get_block_id(struct netdev *netdev)
> >>>              : 0);
> >>>  }
> >>>
> >>> +/*
> >>> + * Get the value of the hw info parameter specified by type.
> >>> + * Returns the value on success (>= 0). Returns -1 on failure.
> >>> + */
> >>> +int
> >>> +netdev_get_hw_info(struct netdev *netdev, int type)
> >>> +{
> >>> +    if (type == HW_INFO_TYPE_OOR) {
> >>> +        return netdev->hw_info.oor;
> >>> +    }
> >>> +
> >>> +    return -1;
> >>> +}
> >>> +
> >>> +/*
> >>> + * Set the value of the hw info parameter specified by type.
> >>> + */
> >>> +void
> >>> +netdev_set_hw_info(struct netdev *netdev, int type, int val)
> >>> +{
> >>> +    if (type == HW_INFO_TYPE_OOR) {
> >>> +        netdev->hw_info.oor = val;
> >>> +    }
> >>> +}
> >>> +
> >>>  bool
> >>>  netdev_is_flow_api_enabled(void)
> >>>  {
> >>> @@ -2488,6 +2514,14 @@ netdev_free_custom_stats_counters(struct
> >>> netdev_custom_stats *custom_stats)
> >>>      }
> >>>  }
> >>
> >> I do not like these type of functions as they are easy to cause
> >> problems. For example, you are using a general parameter to set a
> >> bool,
> >> what if your int input needs to set a uint32_t or uint64_t? What on
> >> return values, you might need to explicitly cast them.
> >>
> >> I would suggest that if this structure values need to be used outside
> >> of
> >> the netdev framework define the specific function. Inside just
> >> reference/change the values.
> >>
> >> So something like bool netdev_get(set)_hw_offload_oor().
> >
> > I didn't want to proliferate hw-info interfaces in netdev; we can
> > change
> > it in the future if we need to process other types like you mentioned.
> >>
> >>>
> >>> +static bool netdev_offload_rebalance_policy = false;
> >>> +
> >>> +bool
> >>> +netdev_is_offload_rebalance_policy_enabled(void)
> >>> +{
> >>> +    return netdev_offload_rebalance_policy;
> >>> +}
> >>> +
> >>
> >> See general comment on the cover letter, about a more granular
> >> configuration option.
> >
> > I responded earlier to this comment on the cover letter.
> >
> > Thanks,
> > -Harsha
> >>
> >>>  #ifdef __linux__
> >>>  static void
> >>>  netdev_ports_flow_init(void)
> >>> diff --git a/lib/netdev.h b/lib/netdev.h
> >>> index 556676046..b0e5c5b72 100644
> >>> --- a/lib/netdev.h
> >>> +++ b/lib/netdev.h
> >>> @@ -227,8 +227,11 @@ int netdev_flow_del(struct netdev *, const
> >>> ovs_u128 *,
> >>>                      struct dpif_flow_stats *);
> >>>  int netdev_init_flow_api(struct netdev *);
> >>>  uint32_t netdev_get_block_id(struct netdev *);
> >>> +int netdev_get_hw_info(struct netdev *, int);
> >>> +void netdev_set_hw_info(struct netdev *, int, int);
> >>>  bool netdev_is_flow_api_enabled(void);
> >>>  void netdev_set_flow_api_enabled(const struct smap
> >>> *ovs_other_config);
> >>> +bool netdev_is_offload_rebalance_policy_enabled(void);
> >>>
> >>>  struct dpif_port;
> >>>  int netdev_ports_insert(struct netdev *, const struct dpif_class *,
> >>> --
> >>> 2.18.0.rc1.1.g6f333ff
> >>>
> >>> _______________________________________________
> >>> dev mailing list
> >>> dev@openvswitch.org
> >>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Eelco Chaudron Oct. 26, 2018, 10:02 a.m. | #6
On 26 Oct 2018, at 11:53, Simon Horman wrote:

> On Fri, 26 Oct 2018 at 08:58, Eelco Chaudron <echaudro@redhat.com> wrote:
>

<SNIP>
>>>> I have a general comment, don't know where to put it, so I put it
>>>> here.
>>>> Some hardware might have multiple tables. If one type of table is
>>>> full
>>>> the ENOSPC might be returned, but it does not mean all type of flows
>>>> can
>>>> no longer be offloaded. This might be a situation to think about.
>>>
>>> Ok, thanks for bringing it up. Currently from OvS daemon's perspective
>>> a
>>> request to add/delete a flow is issued on a netdev and the failure
>>> indicates
>>> that the particular netdev is out of resources. If we need to handle
>>> the
>>> condition where HW has different tables, we need to further extend
>>> this
>>> design and the tc interfaces to propagate this fine grained
>>> information.
>>
>> Would be good if other hardware vendors can comment here?
>>
>
> There was a discussion in another forum involving at least Mellanox,
> Broadcom and Netronome.
> From a Netronome point of view this scheme is satisfactory and my
> recollection is that
> was the agreement of those involved in the discussion.

Thanks for the clarification…

Patch

diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index e6d5a6ec5..b9ce9cbe2 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -2178,7 +2178,23 @@  parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
 
         VLOG_DBG("added flow");
     } else if (err != EEXIST) {
-        VLOG_ERR_RL(&rl, "failed to offload flow: %s", ovs_strerror(err));
+        struct netdev *oor_netdev = NULL;
+        if (err == ENOSPC && netdev_is_offload_rebalance_policy_enabled()) {
+            /*
+             * We need to set OOR on the input netdev (i.e, 'dev') for the
+             * flow. But if the flow has a tunnel attribute (i.e, decap action,
+             * with a virtual device like a VxLAN interface as its in-port),
+             * then lookup and set OOR on the underlying tunnel (real) netdev.
+             */
+            oor_netdev = flow_get_tunnel_netdev(&match.flow.tunnel);
+            if (!oor_netdev) {
+                /* Not a 'tunnel' flow */
+                oor_netdev = dev;
+            }
+            netdev_set_hw_info(oor_netdev, HW_INFO_TYPE_OOR, true);
+        }
+        VLOG_ERR_RL(&rl, "failed to offload flow: %s: %s", ovs_strerror(err),
+                    (oor_netdev ? oor_netdev->name : dev->name));
     }
 
 out:
diff --git a/lib/flow.c b/lib/flow.c
index 77ed3d9df..a39807908 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -19,6 +19,7 @@ 
 #include <errno.h>
 #include <inttypes.h>
 #include <limits.h>
+#include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
@@ -41,6 +42,8 @@ 
 #include "unaligned.h"
 #include "util.h"
 #include "openvswitch/nsh.h"
+#include "ovs-router.h"
+#include "lib/netdev-provider.h"
 
 COVERAGE_DEFINE(flow_extract);
 COVERAGE_DEFINE(miniflow_malloc);
@@ -3403,3 +3406,25 @@  flow_limit_vlans(int vlan_limit)
         flow_vlan_limit = MIN(vlan_limit, FLOW_MAX_VLAN_HEADERS);
     }
 }
+
+struct netdev *
+flow_get_tunnel_netdev(struct flow_tnl *tunnel)
+{
+    char iface[IFNAMSIZ];
+    struct in6_addr ip6;
+    struct in6_addr gw;
+
+    if (tunnel->ip_src) {
+        in6_addr_set_mapped_ipv4(&ip6, tunnel->ip_src);
+    } else if (ipv6_addr_is_set(&tunnel->ipv6_src)) {
+        ip6 = tunnel->ipv6_src;
+    } else {
+        return NULL;
+    }
+
+    if (!ovs_router_lookup(0, &ip6, iface, NULL, &gw)) {
+        return NULL;
+    }
+
+    return netdev_from_name(iface);
+}
diff --git a/lib/flow.h b/lib/flow.h
index d03f1ba9c..aca60c41a 100644
--- a/lib/flow.h
+++ b/lib/flow.h
@@ -73,6 +73,7 @@  void flow_extract(struct dp_packet *, struct flow *);
 void flow_zero_wildcards(struct flow *, const struct flow_wildcards *);
 void flow_unwildcard_tp_ports(const struct flow *, struct flow_wildcards *);
 void flow_get_metadata(const struct flow *, struct match *flow_metadata);
+struct netdev *flow_get_tunnel_netdev(struct flow_tnl *tunnel);
 
 const char *ct_state_to_string(uint32_t state);
 uint32_t ct_state_from_string(const char *);
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 5a7947351..e320dad61 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -35,6 +35,15 @@  extern "C" {
 struct netdev_tnl_build_header_params;
 #define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC
 
+/* Offload-capable (HW) netdev information */
+struct netdev_hw_info {
+    bool oor;		/* Out of Offload Resources ? */
+};
+
+enum hw_info_type {
+    HW_INFO_TYPE_OOR = 1	/* OOR state */
+};
+
 /* A network device (e.g. an Ethernet device).
  *
  * Network device implementations may read these members but should not modify
@@ -80,6 +89,8 @@  struct netdev {
     int n_rxq;
     struct shash_node *node;            /* Pointer to element in global map. */
     struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". */
+
+    struct netdev_hw_info hw_info;	/* offload-capable netdev info */
 };
 
 static inline void
diff --git a/lib/netdev.c b/lib/netdev.c
index 690d28409..f3fa08ca3 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -415,6 +415,7 @@  netdev_open(const char *name, const char *type, struct netdev **netdevp)
                 netdev->reconfigure_seq = seq_create();
                 netdev->last_reconfigure_seq =
                     seq_read(netdev->reconfigure_seq);
+                netdev->hw_info.oor = false;
                 netdev->node = shash_add(&netdev_shash, name, netdev);
 
                 /* By default enable one tx and rx queue per netdev. */
@@ -2252,6 +2253,31 @@  netdev_get_block_id(struct netdev *netdev)
             : 0);
 }
 
+/*
+ * Get the value of the hw info parameter specified by type.
+ * Returns the value on success (>= 0). Returns -1 on failure.
+ */
+int
+netdev_get_hw_info(struct netdev *netdev, int type)
+{
+    if (type == HW_INFO_TYPE_OOR) {
+        return netdev->hw_info.oor;
+    }
+
+    return -1;
+}
+
+/*
+ * Set the value of the hw info parameter specified by type.
+ */
+void
+netdev_set_hw_info(struct netdev *netdev, int type, int val)
+{
+    if (type == HW_INFO_TYPE_OOR) {
+        netdev->hw_info.oor = val;
+    }
+}
+
 bool
 netdev_is_flow_api_enabled(void)
 {
@@ -2488,6 +2514,14 @@  netdev_free_custom_stats_counters(struct netdev_custom_stats *custom_stats)
     }
 }
 
+static bool netdev_offload_rebalance_policy = false;
+
+bool
+netdev_is_offload_rebalance_policy_enabled(void)
+{
+    return netdev_offload_rebalance_policy;
+}
+
 #ifdef __linux__
 static void
 netdev_ports_flow_init(void)
diff --git a/lib/netdev.h b/lib/netdev.h
index 556676046..b0e5c5b72 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -227,8 +227,11 @@  int netdev_flow_del(struct netdev *, const ovs_u128 *,
                     struct dpif_flow_stats *);
 int netdev_init_flow_api(struct netdev *);
 uint32_t netdev_get_block_id(struct netdev *);
+int netdev_get_hw_info(struct netdev *, int);
+void netdev_set_hw_info(struct netdev *, int, int);
 bool netdev_is_flow_api_enabled(void);
 void netdev_set_flow_api_enabled(const struct smap *ovs_other_config);
+bool netdev_is_offload_rebalance_policy_enabled(void);
 
 struct dpif_port;
 int netdev_ports_insert(struct netdev *, const struct dpif_class *,