
[RFC] net: add a rate_limit attribute to netdev_queue and a rtnetlink

Message ID 20130710140409.6691.77084.stgit@nitbit.x32
State RFC, archived
Delegated to: David Miller

Commit Message

John Fastabend July 10, 2013, 2:04 p.m. UTC
Add a rate_limit attribute to netdev_queue and a corresponding
ndo op to pass the parameter to the driver. Also provide an
rtnetlink attribute to dump and set arbitrary queue attributes,
the first of which is rate_limit.

Netlink attribute layout is shown here:

/* Queue Attributes management
 *      Nested layout of queue attributes is:
 *      [IFLA_QUEUE_ATTRIBS]
 *              [IFLA_QUEUE_ATTRIB]
 *                      [IFLA_QUEUE_INDEX]      <required>
 *                      [IFLA_QUEUE_RATE]       <optional>
 *              [IFLA_QUEUE_ATTRIB]
 *                      [...]
 */

Adding this as a queue attribute and _not_ a qdisc option allows
using rate limits with qdisc schemes that may not align with tx rings,
and also allows combining QoS schemes with rate limits.

A sample implementation is provided for ixgbe. Any improvements or
suggestions are welcome. I would also be interested to know whether
this works with other hardware and whether Mbps is a good default unit.

The ndo op is only called when the netlink API is invoked, so drivers
may need to adjust internal registers themselves on link events (speed
changes). More attributes can be added later as the need arises.

A test patch for iproute2 will be provided.
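
Until then, a rough sketch of the set operation from userspace,
assuming libmnl (illustrative only, not the iproute2 patch; the
IFLA_QUEUE_* values come from the patched if_link.h):

#include <libmnl/libmnl.h>
#include <linux/if_link.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
#include <stdint.h>
#include <time.h>

/* Send RTM_SETLINK with one nested queue attrib: cap one tx queue
 * of the given device at 'rate' Mbps.
 */
static int set_queue_rate(const char *ifname, uint32_t qindex, uint32_t rate)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct mnl_socket *nl;
	struct nlmsghdr *nlh;
	struct ifinfomsg *ifm;
	struct nlattr *attribs, *attrib;
	int ret = -1;

	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = RTM_SETLINK;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	nlh->nlmsg_seq = time(NULL);

	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
	ifm->ifi_family = AF_UNSPEC;
	ifm->ifi_index = if_nametoindex(ifname);

	/* [IFLA_QUEUE_ATTRIBS] -> [IFLA_QUEUE_ATTRIB] -> INDEX + RATE */
	attribs = mnl_attr_nest_start(nlh, IFLA_QUEUE_ATTRIBS);
	attrib = mnl_attr_nest_start(nlh, IFLA_QUEUE_ATTRIB);
	mnl_attr_put_u32(nlh, IFLA_QUEUE_INDEX, qindex);	/* required */
	mnl_attr_put_u32(nlh, IFLA_QUEUE_RATE, rate);		/* optional */
	mnl_attr_nest_end(nlh, attrib);
	mnl_attr_nest_end(nlh, attribs);

	nl = mnl_socket_open(NETLINK_ROUTE);
	if (!nl)
		return -1;
	if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) >= 0 &&
	    mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0 &&
	    mnl_socket_recvfrom(nl, buf, sizeof(buf)) >= 0)
		ret = 0;	/* a real tool would parse the ack here */
	mnl_socket_close(nl);
	return ret;
}

For example, set_queue_rate("eth1", 2, 500) asks the driver to cap tx
queue 2 at 500Mbps.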

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  |   16 ++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c |   47 +++++++++++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h |    2 +
 include/linux/netdevice.h                      |   14 ++++
 include/uapi/linux/if_link.h                   |   24 ++++++
 net/core/rtnetlink.c                           |   90 ++++++++++++++++++++++++
 6 files changed, 187 insertions(+), 6 deletions(-)



Comments

Tom Herbert July 10, 2013, 9:18 p.m. UTC | #1
John, thanks for posting this.

Should we have separate guaranteed and maximum rates?  Will this be
usable in the case that more than one queue shares a rate?  Do you
think other parameters, like priority weights, should be done the
same way?

Tom



On Wed, Jul 10, 2013 at 7:04 AM, John Fastabend
<john.fastabend@gmail.com> wrote:
> [...]
John Fastabend July 11, 2013, 3:10 a.m. UTC | #2
On 7/10/2013 2:18 PM, Tom Herbert wrote:
> John, thanks for posting this.
>
> Should we have separate guaranteed and maximum rates?  Will this be
> usable in the case that more than one queue shares a rate?  Do you
> think other parameters, like priority weights, should be done the
> same way?
>
> Tom
>

If hardware can provide a guaranteed min rate as well as a max rate
per queue, then I think adding it here would be correct. Note that
guaranteed minimum rates and max rates can already be provided for a
set of queues (a traffic class, or tc in the netdev struct) using the
DCB netlink interface.

So we have a hierarchy of attributes, namely netdev->tc->queue.
It might be interesting to note that a tc can consist of a single
queue. However, hardware may not necessarily be able to rate limit a
set of queues or, vice versa, a single queue.
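
For concreteness, the tc level of that hierarchy is what the existing
netdev helpers express; a rough sketch with made-up values:

#include <linux/netdevice.h>

/* Illustrative only: declare 4 tcs over 16 queues, giving tc 0
 * queues 0-3, tc 1 queues 4-7, and so on. This is the
 * netdev->tc->queue mapping referred to above.
 */
static void example_map_tcs(struct net_device *dev)
{
	int tc;

	netdev_set_num_tc(dev, 4);
	for (tc = 0; tc < 4; tc++)
		netdev_set_tc_queue(dev, tc, 4, tc * 4);
}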

> Will this be usable in the case that more than one queue shares a rate?

Here I must be misunderstanding the question. Each queue has an
independent rate limiter, at least in the hardware supported by ixgbe.
Maybe other hardware has a different implementation?

> Do you think other parameters, [...]

With regard to other attributes, my intent was to create a structure
to which we could easily add additional per-queue attributes as needed.
So yes, adding priority weights here makes some sense to me, although
I wonder if priority weights overlap with the 'queue set' and 'tc'
notions already existing in the DCB netlink interface for configuring
link strict classes. With dcbnl and mqprio we can create up to 16 sets
of queues, each with a priority mapping, and the hardware will process
traffic in priority order. (ixgbe caveat: 82599 supports 8 sets of
queues, X540 supports 4.)
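
For reference, that mapping is carried in struct tc_mqprio_qopt from
linux/pkt_sched.h; a rough sketch of the 4-tc case with made-up values:

#include <linux/pkt_sched.h>

/* Illustrative only: 4 traffic classes over 16 tx queues, 4 queues per
 * class, with the 16 possible priorities folded onto the 4 classes.
 * tc passes this option block to the mqprio qdisc, which programs the
 * tc-to-queue mapping and asks the driver to offload num_tc.
 */
static const struct tc_mqprio_qopt example_qopt = {
	.num_tc      = 4,
	.prio_tc_map = { 0, 0, 1, 1, 2, 2, 3, 3,
			 0, 0, 1, 1, 2, 2, 3, 3 },
	.hw          = 1,			/* offload to hardware */
	.count       = { 4, 4, 4, 4 },		/* queues per tc */
	.offset      = { 0, 4, 8, 12 },		/* first queue of each tc */
};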

.John
David Miller July 11, 2013, 11:35 p.m. UTC | #3
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 10 Jul 2013 07:04:12 -0700

> /* Queue Attributes management
>  *      Nested layout of queue attributes is:
>  *      [IFLA_QUEUE_ATTRIBS]
>  *              [IFLA_QUEUE_ATTRIB]
>  *                      [IFLA_QUEUE_INDEX]      <required>
>  *                      [IFLA_QUEUE_RATE]       <optional>
>  *              [IFLA_QUEUE_ATTRIB]
>  *                      [...]
>  */

Since these are specifically TX queue attributes, and not
RX queue attributes, please add "_TX_" to the names of both
the attributes themselves and the ->ndo_*() method.

> +static const struct nla_policy ifla_queue_policy[IFLA_QUEUE_MAX+1] = {
> +	[IFLA_QUEUE_INDEX]	= { .type = NLA_U32},
> +	[IFLA_QUEUE_RATE]	= { .type = NLA_U32},
> +};

A space is needed between NLA_U32 and the closing brace.
Jamal Hadi Salim July 14, 2013, 5:44 p.m. UTC | #4
On 13-07-10 11:10 PM, John Fastabend wrote:


>
>> Will this be usable in the case that more than one queue shares a rate?
>
> Here I must be misunderstanding the question. Each queue has an
> independent rate limiter, at least in the hardware supported by ixgbe.
> Maybe other hardware has a different implementation?
>

I think Tom might be alluding to scenarios where a rate (as a resource)
is shared by multiple ingress as well as egress interfaces (in your
case, queues). This is a very popular use case with tc (in particular
in conjunction with IFB).
In such a case, the rate by itself would need to be modeled as an
indexable attribute.

Some hardware (for example, modern Broadcom switching chips, maybe
yours) is not capable, so the index would have local semantics, as in
your case. But I know of hardware that does support shared rates.
Maybe you could come up with IFLA_QUEUE_RATE_LOCAL vs
IFLA_QUEUE_RATE_GLOBAL, where the latter encapsulates a global index
for the shared rate.

cheers,
jamal
Tom Herbert July 14, 2013, 9:14 p.m. UTC | #5
> I think Tom might be alluding to scenarios where a rate (as a resource)
> is shared by multiple ingress as well as egress interfaces (in your
> case, queues). This is a very popular use case with tc (in particular
> in conjunction with IFB).
> In such a case, the rate by itself would need to be modeled as an
> indexable attribute.
>
Right, conceptually we want to rate limit users or applications as
opposed to queues.  For instance, we may want to allocate four queues
to a particular user for scaling, but rate limit the user to 1Gbps in
aggregate.  Assigning each queue 250Mbps is not the same!
The generalization of this is to define QoS classes describing rate
limits and other properties, and to allow queues to be mapped to
these classes.

Tom

> Some hardware (for example, modern Broadcom switching chips, maybe
> yours) is not capable, so the index would have local semantics, as in
> your case. But I know of hardware that does support shared rates.
> Maybe you could come up with IFLA_QUEUE_RATE_LOCAL vs
> IFLA_QUEUE_RATE_GLOBAL, where the latter encapsulates a global index
> for the shared rate.
>
> cheers,
> jamal
John Fastabend July 15, 2013, 3:02 a.m. UTC | #6
On 07/14/2013 02:14 PM, Tom Herbert wrote:
>> I think Tom might be alluding to scenarios where a rate (as a resource)
>> is shared by multiple ingress as well as egress interfaces (in your
>> case, queues). This is a very popular use case with tc (in particular
>> in conjunction with IFB).
>> In such a case, the rate by itself would need to be modeled as an
>> indexable attribute.
>>

Interesting. Just so I am clear on the example: with tc this would look
like classifying packets on egress and sending them to an IFB device,
similarly matching packets on ingress and sending them to the same IFB
device, and then applying a rate limit on the IFB?

The ingress AND egress part I hadn't considered. If it's just ingress
(exclusive) OR egress, then it's basically an 802.1Q traffic class,
which in my case (and I assume for most hardware) is just a set of
queues.


> Right, conceptually we want to rate limit users or applications as
> opposed to queues.  For instance, we may want to allocate four queues
> to a particular user for scaling, but rate limit the user to 1Gbps in
> aggregate.  Assigning each queue 250Mbps is not the same!
> The generalization of this is to define QoS classes describing rate
> limits and other properties, and to allow queues to be mapped to
> these classes.
>
> Tom
>

How would this be different from the DCB netlink interface that already
exists? Is the piece I am missing a global rate covering both egress
and ingress?

Today a traffic class is given a queue set via the mqprio interface,
and then the max rate for a traffic class can be configured via
DCB_ATTR_IEEE_MAXRATE. netprio_cgroup is then used to map PIDs to the
traffic classes. So not exactly per-user rates, but per-application.
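
For comparison, that per-tc max rate travels as struct ieee_maxrate
(linux/dcbnl.h) nested as DCB_ATTR_IEEE_MAXRATE inside DCB_ATTR_IEEE
of a DCB_CMD_IEEE_SET message. A sketch of the payload; note the rate
units are whatever the driver's ieee_setmaxrate hook expects, so check
the driver:

#include <linux/dcbnl.h>

/* Illustrative only: cap traffic class 0 (whatever queue set mqprio
 * assigned to tc 0) and leave the other classes unlimited.
 */
static struct ieee_maxrate maxrate_example = {
	.tc_maxrate = { [0] = 1000 },	/* e.g. 1Gbps if units are Mbps */
};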

.John

Patch

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 047ebaa..6a9b99c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5579,6 +5579,19 @@  static void ixgbe_check_hang_subtask(struct ixgbe_adapter *adapter)
 
 }
 
+static void ixgbe_reset_rate_limits(struct ixgbe_adapter *adapter)
+{
+	struct net_device *dev = adapter->netdev;
+	int i;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		struct netdev_queue *q = netdev_get_tx_queue(dev, i);
+		u32 rate = q->rate_limit;
+
+		ixgbe_set_rate_limit(adapter->netdev, i, &rate);
+	}
+}
+
 /**
  * ixgbe_watchdog_update_link - update the link status
  * @adapter: pointer to the device adapter structure
@@ -5620,6 +5633,8 @@  static void ixgbe_watchdog_update_link(struct ixgbe_adapter *adapter)
 
 	adapter->link_up = link_up;
 	adapter->link_speed = link_speed;
+
+	ixgbe_reset_rate_limits(adapter);
 }
 
 static void ixgbe_update_default_up(struct ixgbe_adapter *adapter)
@@ -7244,6 +7259,7 @@  static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_fdb_add		= ixgbe_ndo_fdb_add,
 	.ndo_bridge_setlink	= ixgbe_ndo_bridge_setlink,
 	.ndo_bridge_getlink	= ixgbe_ndo_bridge_getlink,
+	.ndo_set_ratelimit	= ixgbe_set_rate_limit,
 };
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 1e7d587..1ea78f1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -1107,17 +1107,14 @@  static int ixgbe_link_mbps(struct ixgbe_adapter *adapter)
 	}
 }
 
-static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
+static u32 ixgbe_bcnrc_from_rate(struct ixgbe_adapter *adapter,
+				 u16 tx_rate, int link_speed)
 {
-	struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ];
-	struct ixgbe_hw *hw = &adapter->hw;
 	u32 bcnrc_val = 0;
-	u16 queue, queues_per_pool;
-	u16 tx_rate = adapter->vfinfo[vf].tx_rate;
 
 	if (tx_rate) {
 		/* start with base link speed value */
-		bcnrc_val = adapter->vf_rate_link_speed;
+		bcnrc_val = link_speed;
 
 		/* Calculate the rate factor values to set */
 		bcnrc_val <<= IXGBE_RTTBCNRC_RF_INT_SHIFT;
@@ -1131,6 +1128,11 @@  static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
 		bcnrc_val |= IXGBE_RTTBCNRC_RS_ENA;
 	}
 
+	return bcnrc_val;
+}
+
+static void ixgbe_set_xmit_compensation(struct ixgbe_hw *hw)
+{
 	/*
 	 * Set global transmit compensation time to the MMW_SIZE in RTTBCNRM
 	 * register. Typically MMW_SIZE=0x014 if 9728-byte jumbo is supported
@@ -1146,6 +1148,39 @@  static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
 	default:
 		break;
 	}
+}
+
+int ixgbe_set_rate_limit(struct net_device *dev, int index, u32 *tx_rate)
+{
+	struct ixgbe_adapter *a = netdev_priv(dev);
+	struct ixgbe_hw *hw = &a->hw;
+	int linkspeed = ixgbe_link_mbps(a);
+	u8 reg_idx = a->tx_ring[index]->reg_idx;
+	u32 bcnrc = ixgbe_bcnrc_from_rate(a, *tx_rate, linkspeed);
+
+	/* a nonzero rate limit must be greater than 10Mbps */
+	if (*tx_rate && (*tx_rate <= 10))
+		return -EINVAL;
+
+	ixgbe_set_xmit_compensation(hw);
+
+	IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, reg_idx);
+	IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, bcnrc);
+
+	return 0;
+}
+
+static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
+{
+	struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ];
+	struct ixgbe_hw *hw = &adapter->hw;
+	u32 bcnrc_val = 0;
+	u16 queue, queues_per_pool;
+	u16 tx_rate = adapter->vfinfo[vf].tx_rate;
+
+	bcnrc_val = ixgbe_bcnrc_from_rate(adapter, tx_rate,
+					  adapter->vf_rate_link_speed);
+	ixgbe_set_xmit_compensation(hw);
 
 	/* determine how many queues per pool based on VMDq mask */
 	queues_per_pool = __ALIGN_MASK(1, ~vmdq->mask);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
index 4713f9f..d8b4bbe 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
@@ -56,5 +56,7 @@  static inline void ixgbe_set_vmvir(struct ixgbe_adapter *adapter,
 	IXGBE_WRITE_REG(hw, IXGBE_VMVIR(vf), vmvir);
 }
 
+int ixgbe_set_rate_limit(struct net_device *dev, int index, u32 *tx_rate);
+
 #endif /* _IXGBE_SRIOV_H_ */
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 09b4188..0e67b09 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -574,6 +574,7 @@  struct netdev_queue {
 #ifdef CONFIG_BQL
 	struct dql		dql;
 #endif
+	u32			rate_limit;
 } ____cacheline_aligned_in_smp;
 
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
@@ -932,6 +933,16 @@  struct netdev_fcoe_hbainfo {
  *	that determine carrier state from physical hardware properties (eg
  *	network cables) or protocol-dependent mechanisms (eg
  *	USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
+ *
+ * int (*ndo_set_ratelimit)(struct net_device *dev,
+ *			    int queue_index, u32 *maxrate)
+ *	Called to set the rate limit, in Mbps, of the queue given by
+ *	queue_index to maxrate. It is expected that hardware may quantize
+ *	the rate limits; in that case the driver should guarantee the
+ *	specified maxrate is not exceeded and return the value actually
+ *	set in maxrate. Return zero on success, otherwise an appropriate
+ *	error code. Drivers must handle any updates required by link
+ *	speed changes within normal driver code paths.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1060,6 +1071,9 @@  struct net_device_ops {
 						      struct nlmsghdr *nlh);
 	int			(*ndo_change_carrier)(struct net_device *dev,
 						      bool new_carrier);
+	int			(*ndo_set_ratelimit)(struct net_device *dev,
+						     int queue_index,
+						     u32 *max_rate);
 };
 
 /*
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 03f6170..67a353c 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -143,6 +143,7 @@  enum {
 	IFLA_NUM_TX_QUEUES,
 	IFLA_NUM_RX_QUEUES,
 	IFLA_CARRIER,
+	IFLA_QUEUE_ATTRIBS,
 	__IFLA_MAX
 };
 
@@ -449,6 +450,29 @@  struct ifla_port_vsi {
 	__u8 pad[3];
 };
 
+/* Queue Attributes management
+ *	Nested layout of queue attributes is:
+ *	[IFLA_QUEUE_ATTRIBS]
+ *		[IFLA_QUEUE_ATTRIB]
+ *			[IFLA_QUEUE_INDEX]	<required>
+ *			[IFLA_QUEUE_RATE]	<optional>
+ *		[IFLA_QUEUE_ATTRIB]
+ *			[...]
+ */
+enum {
+	IFLA_QUEUE_ATTRIB_UNSPEC,
+	IFLA_QUEUE_ATTRIB,			/* nest */
+	__IFLA_QUEUE_ATTRIB_MAX,
+};
+#define IFLA_QUEUE_ATTRIB_MAX (__IFLA_QUEUE_ATTRIB_MAX - 1)
+
+enum {
+	IFLA_QUEUE_UNSPEC,
+	IFLA_QUEUE_INDEX,		/* __u32 */
+	IFLA_QUEUE_RATE,		/* __u32 */
+	__IFLA_QUEUE_MAX,
+};
+#define IFLA_QUEUE_MAX (__IFLA_QUEUE_MAX - 1)
 
 /* IPoIB section */
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9007533..d7ff1c2 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -846,6 +846,45 @@  static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
+static int rtnl_queue_fill(struct sk_buff *skb, struct net_device *dev)
+{
+	int i;
+	struct nlattr *attribs, *attrib = NULL;
+
+	attribs = nla_nest_start(skb, IFLA_QUEUE_ATTRIBS);
+	if (!attribs)
+		return -EMSGSIZE;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		struct netdev_queue *q = netdev_get_tx_queue(dev, i);
+
+		if (!q->rate_limit)
+			continue;
+
+		attrib = nla_nest_start(skb, IFLA_QUEUE_ATTRIB);
+		if (!attrib) {
+			nla_nest_cancel(skb, attribs);
+			return -EMSGSIZE;
+		}
+
+		if (nla_put_u32(skb, IFLA_QUEUE_INDEX, i) ||
+		    nla_put_u32(skb, IFLA_QUEUE_RATE, q->rate_limit)) {
+			nla_nest_cancel(skb, attrib);
+			nla_nest_cancel(skb, attribs);
+			return -EMSGSIZE;
+		}
+
+		nla_nest_end(skb, attrib);
+	}
+
+	if (attrib)
+		nla_nest_end(skb, attribs);
+	else
+		nla_nest_cancel(skb, attribs);
+
+	return 0;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask)
@@ -997,6 +1036,9 @@  static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (rtnl_port_fill(skb, dev))
 		goto nla_put_failure;
 
+	if (rtnl_queue_fill(skb, dev))
+		goto nla_put_failure;
+
 	if (dev->rtnl_link_ops) {
 		if (rtnl_link_fill(skb, dev) < 0)
 			goto nla_put_failure;
@@ -1150,6 +1192,11 @@  static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
 	[IFLA_PORT_RESPONSE]	= { .type = NLA_U16, },
 };
 
+static const struct nla_policy ifla_queue_policy[IFLA_QUEUE_MAX+1] = {
+	[IFLA_QUEUE_INDEX]	= { .type = NLA_U32},
+	[IFLA_QUEUE_RATE]	= { .type = NLA_U32},
+};
+
 struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
 {
 	struct net *net;
@@ -1503,6 +1550,49 @@  static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 			goto errout;
 		modified = 1;
 	}
+	if (tb[IFLA_QUEUE_ATTRIBS]) {
+		struct nlattr *attr;
+		int rem;
+
+		nla_for_each_nested(attr, tb[IFLA_QUEUE_ATTRIBS], rem) {
+			struct nlattr *queue[IFLA_QUEUE_MAX+1];
+			struct netdev_queue *q = NULL;
+			unsigned int qindex, qrate;
+
+			if (nla_type(attr) != IFLA_QUEUE_ATTRIB)
+				continue;
+
+			err = nla_parse_nested(queue, IFLA_QUEUE_MAX,
+					       attr, ifla_queue_policy);
+			if  (err < 0)
+				goto errout;
+
+			err = -EINVAL;
+			if (queue[IFLA_QUEUE_INDEX]) {
+				qindex = nla_get_u32(queue[IFLA_QUEUE_INDEX]);
+				if (qindex >= dev->real_num_tx_queues)
+					goto errout;
+
+				q = netdev_get_tx_queue(dev, qindex);
+			} else {
+				goto errout;
+			}
+
+			err = -EOPNOTSUPP;
+			if (queue[IFLA_QUEUE_RATE]) {
+				if (!ops->ndo_set_ratelimit)
+					goto errout;
+
+				qrate = nla_get_u32(queue[IFLA_QUEUE_RATE]);
+				err = ops->ndo_set_ratelimit(dev,
+							     qindex, &qrate);
+				if (err < 0)
+					goto errout;
+				q->rate_limit = qrate;
+				modified = 1;
+			}
+		}
+	}
 
 	if (tb[IFLA_AF_SPEC]) {
 		struct nlattr *af;