[next-queue,v7,4/6] net/sched: Introduce Credit Based Shaper (CBS) qdisc

Message ID 20171013004005.17416-5-vinicius.gomes@intel.com
State Superseded
Headers show
Series
  • TSN: Add qdisc based config interface for CBS
Related show

Commit Message

Vinicius Costa Gomes Oct. 13, 2017, 12:40 a.m.
This queueing discipline implements the shaper algorithm defined by
the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L.

Its primary usage is to apply some bandwidth reservation to user
defined traffic classes, which are mapped to different queues via the
mqprio qdisc.

Only a simple software implementation is added for now.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 include/uapi/linux/pkt_sched.h |  18 +++
 net/sched/Kconfig              |  11 ++
 net/sched/Makefile             |   1 +
 net/sched/sch_cbs.c            | 314 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 344 insertions(+)
 create mode 100644 net/sched/sch_cbs.c

Comments

Eric Dumazet Oct. 13, 2017, 2:42 a.m. | #1
On Thu, 2017-10-12 at 17:40 -0700, Vinicius Costa Gomes wrote:
> This queueing discipline implements the shaper algorithm defined by
> the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L.
> 
> It's primary usage is to apply some bandwidth reservation to user
> defined traffic classes, which are mapped to different queues via the
> mqprio qdisc.
> 
> Only a simple software implementation is added for now.
> 
> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
> ---

> +/* timediff is in ns, slope is in kbps */
> +static s64 timediff_to_credits(s64 timediff, s32 slope)
> +{
> +	s64 credits = timediff * slope * BYTES_PER_KBIT;
> +
> +	do_div(credits, NSEC_PER_SEC);
> +
> +	return credits;
> +}
> +
> +static s64 delay_from_credits(s64 credits, s32 slope)
> +{
> +	s64 rate = slope * BYTES_PER_KBIT;
> +	s64 delay;
> +
> +	if (unlikely(rate == 0))
> +		return S64_MAX;
> +
> +	delay = -credits * NSEC_PER_SEC;
> +	do_div(delay, rate);
> +
> +	return delay;
> +}
> +
> +static s64 credits_from_len(unsigned int len, s32 slope, s64 port_rate)
> +{
> +	/* As do_div() only works on unsigned quantities, convert
> +	 * slope to a positive number here, and credits to a negative
> +	 * number before returning.
> +	 */
> +	s64 rate = -slope * BYTES_PER_KBIT;
> +	s64 credits;
> +
> +	if (unlikely(port_rate == 0))
> +		return S64_MAX;
> +
> +	credits = len * rate;
> +	do_div(credits, port_rate);
> +
> +	return -credits;
> +}
> +


Your mixing of s64 and u64 is disturbing.

do_div() handles u64, not s64.

div64_s64() might be needed in place of do_div()
Vinicius Costa Gomes Oct. 13, 2017, 5:12 p.m. | #2
Hi,

Eric Dumazet <eric.dumazet@gmail.com> writes:

[...]

>
> Your mixing of s64 and u64 is disturbing.
>
> do_div() handles u64, not s64.
>
> div64_s64() might be needed in place of do_div()

I wasn't very comfortable with the sign juggling either. I didn't know
about div64_s64(); it looks much better. Will fix, thanks.


Cheers,
--
Vinicius
Ivan Khoronzhuk Oct. 13, 2017, 7:59 p.m. | #3
On Thu, Oct 12, 2017 at 05:40:03PM -0700, Vinicius Costa Gomes wrote:
> This queueing discipline implements the shaper algorithm defined by
> the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L.
> 
> It's primary usage is to apply some bandwidth reservation to user
> defined traffic classes, which are mapped to different queues via the
> mqprio qdisc.
> 
> Only a simple software implementation is added for now.
> 
> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
> ---
>  include/uapi/linux/pkt_sched.h |  18 +++
>  net/sched/Kconfig              |  11 ++
>  net/sched/Makefile             |   1 +
>  net/sched/sch_cbs.c            | 314 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 344 insertions(+)
>  create mode 100644 net/sched/sch_cbs.c
> 
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 099bf5528fed..41e349df4bf4 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -871,4 +871,22 @@ struct tc_pie_xstats {
>  	__u32 maxq;             /* maximum queue size */
>  	__u32 ecn_mark;         /* packets marked with ecn*/
>  };
> +
> +/* CBS */
> +struct tc_cbs_qopt {
> +	__u8 offload;
> +	__s32 hicredit;
> +	__s32 locredit;
> +	__s32 idleslope;
> +	__s32 sendslope;
> +};
> +
> +enum {
> +	TCA_CBS_UNSPEC,
> +	TCA_CBS_PARMS,
> +	__TCA_CBS_MAX,
> +};
> +
> +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
> +
>  #endif
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index e70ed26485a2..c03d86a7775e 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -172,6 +172,17 @@ config NET_SCH_TBF
>  	  To compile this code as a module, choose M here: the
>  	  module will be called sch_tbf.
>  
> +config NET_SCH_CBS
> +	tristate "Credit Based Shaper (CBS)"
> +	---help---
> +	  Say Y here if you want to use the Credit Based Shaper (CBS) packet
> +	  scheduling algorithm.
> +
> +	  See the top of <file:net/sched/sch_cbs.c> for more details.
> +
> +	  To compile this code as a module, choose M here: the
> +	  module will be called sch_cbs.
> +
>  config NET_SCH_GRED
>  	tristate "Generic Random Early Detection (GRED)"
>  	---help---
> diff --git a/net/sched/Makefile b/net/sched/Makefile
> index 7b915d226de7..80c8f92d162d 100644
> --- a/net/sched/Makefile
> +++ b/net/sched/Makefile
> @@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
>  obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
>  obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
>  obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
> +obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
>  
>  obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
>  obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
> diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
> new file mode 100644
> index 000000000000..0643587e6dc8
> --- /dev/null
> +++ b/net/sched/sch_cbs.c
> @@ -0,0 +1,314 @@
> +/*
> + * net/sched/sch_cbs.c	Credit Based Shaper
> + *
> + *		This program is free software; you can redistribute it and/or
> + *		modify it under the terms of the GNU General Public License
> + *		as published by the Free Software Foundation; either version
> + *		2 of the License, or (at your option) any later version.
> + *
> + * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
> + *
> + */
> +
> +/* Credit Based Shaper (CBS)
> + * =========================
> + *
> + * This is a simple rate-limiting shaper aimed at TSN applications on
> + * systems with known traffic workloads.
> + *
> + * Its algorithm is defined by the IEEE 802.1Q-2014 Specification,
> + * Section 8.6.8.2, and explained in more detail in the Annex L of the
> + * same specification.
> + *
> + * There are four tunables to be considered:
> + *
> + *	'idleslope': Idleslope is the rate of credits that is
> + *	accumulated (in kilobits per second) when there is at least
> + *	one packet waiting for transmission. Packets are transmitted
> + *	when the current value of credits is equal or greater than
> + *	zero. When there is no packet to be transmitted the amount of
> + *	credits is set to zero. This is the main tunable of the CBS
> + *	algorithm.
> + *
> + *	'sendslope':
> + *	Sendslope is the rate of credits that is depleted (it should be a
> + *	negative number of kilobits per second) when a transmission is
> + *	ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
> + *	8.6.8.2 item g):
> + *
> + *	sendslope = idleslope - port_transmit_rate
> + *
> + *	'hicredit': Hicredit defines the maximum amount of credits (in
> + *	bytes) that can be accumulated. Hicredit depends on the
> + *	characteristics of interfering traffic,
> + *	'max_interference_size' is the maximum size of any burst of
> + *	traffic that can delay the transmission of a frame that is
> + *	available for transmission for this traffic class, (IEEE
> + *	802.1Q-2014 Annex L, Equation L-3):
> + *
> + *	hicredit = max_interference_size * (idleslope / port_transmit_rate)
> + *
> + *	'locredit': Locredit is the minimum amount of credits that can
> + *	be reached. It is a function of the traffic flowing through
> + *	this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2):
> + *
> + *	locredit = max_frame_size * (sendslope / port_transmit_rate)
> + */
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/string.h>
> +#include <linux/errno.h>
> +#include <linux/skbuff.h>
> +#include <net/netlink.h>
> +#include <net/sch_generic.h>
> +#include <net/pkt_sched.h>
> +
> +#define BYTES_PER_KBIT (1000 / 8)
> +
> +struct cbs_sched_data {
> +	s64 port_rate; /* in bytes/s */
> +	s64 last; /* timestamp in ns */
> +	s64 credits; /* in bytes */
> +	s32 locredit; /* in bytes */
> +	s32 hicredit; /* in bytes */
> +	s32 sendslope; /* in kbits/s */
> +	s32 idleslope; /* in kbits/s */
> +	struct qdisc_watchdog watchdog;
> +	int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch);
> +	struct sk_buff *(*dequeue)(struct Qdisc *sch);
> +};
> +
> +static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +	struct cbs_sched_data *q = qdisc_priv(sch);
> +
> +	if (sch->q.qlen == 0 && q->credits > 0) {
> +		/* We need to stop accumulating credits when there's
> +		 * no packet enqueued packets and q->credits is
Typo: "no packet enqueued packets" -> "no enqueued packets".

> +		 * positive.
> +		 */
> +		q->credits = 0;
> +		q->last = ktime_get_ns();
> +	}
> +
> +	return qdisc_enqueue_tail(skb, sch);
> +}
> +
> +static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> +		       struct sk_buff **to_free)
> +{
> +	struct cbs_sched_data *q = qdisc_priv(sch);
> +
> +	return q->enqueue(skb, sch);
> +}
> +
> +/* timediff is in ns, slope is in kbps */
> +static s64 timediff_to_credits(s64 timediff, s32 slope)
> +{
> +	s64 credits = timediff * slope * BYTES_PER_KBIT;
> +
> +	do_div(credits, NSEC_PER_SEC);
> +
> +	return credits;
> +}
> +
> +static s64 delay_from_credits(s64 credits, s32 slope)
> +{
> +	s64 rate = slope * BYTES_PER_KBIT;
> +	s64 delay;
> +
> +	if (unlikely(rate == 0))
> +		return S64_MAX;
> +
> +	delay = -credits * NSEC_PER_SEC;
> +	do_div(delay, rate);
> +
> +	return delay;
> +}
> +
> +static s64 credits_from_len(unsigned int len, s32 slope, s64 port_rate)
> +{
> +	/* As do_div() only works on unsigned quantities, convert
> +	 * slope to a positive number here, and credits to a negative
> +	 * number before returning.
> +	 */
> +	s64 rate = -slope * BYTES_PER_KBIT;
> +	s64 credits;
> +
> +	if (unlikely(port_rate == 0))
> +		return S64_MAX;
> +
> +	credits = len * rate;
> +	do_div(credits, port_rate);
> +
> +	return -credits;
> +}
> +
> +static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
> +{
> +	struct cbs_sched_data *q = qdisc_priv(sch);
> +	s64 now = ktime_get_ns();
> +	struct sk_buff *skb;
> +	s64 credits;
> +	int len;
> +
> +	if (q->credits < 0) {
> +		credits = timediff_to_credits(now - q->last, q->idleslope);
It may be better to add a small optimization by moving some calculations out
of the data path: store idleslope in bytes per second instead of kbit/s, rather
than converting it for every packet. Both delay_from_credits() and
timediff_to_credits() are used only once, and only with idleslope, and both of
them convert it.

The same applies to credits_from_len() and sendslope: store it in the same
units as port_rate.

> +
> +		credits = q->credits + credits;
> +		q->credits = min_t(s64, credits, q->hicredit);
> +
> +		if (q->credits < 0) {
> +			s64 delay;
> +
> +			delay = delay_from_credits(q->credits, q->idleslope);
> +			qdisc_watchdog_schedule_ns(&q->watchdog, now + delay);
> +
> +			q->last = now;
> +
> +			return NULL;
> +		}
> +	}
> +
> +	skb = qdisc_dequeue_head(sch);
> +	if (!skb)
> +		return NULL;
> +
> +	len = qdisc_pkt_len(skb);
> +
> +	/* As sendslope is a negative number, this will decrease the
> +	 * amount of q->credits.
> +	 */
> +	credits = credits_from_len(len, q->sendslope, q->port_rate);
> +	credits += q->credits;
> +
> +	q->credits = max_t(s64, credits, q->locredit);
> +	q->last = now;
> +
> +	return skb;
> +}
> +
> +static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
> +{
> +	struct cbs_sched_data *q = qdisc_priv(sch);
> +
> +	return q->dequeue(sch);
> +}
> +
> +static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = {
> +	[TCA_CBS_PARMS]	= { .len = sizeof(struct tc_cbs_qopt) },
> +};
> +
> +static int cbs_change(struct Qdisc *sch, struct nlattr *opt)
> +{
> +	struct cbs_sched_data *q = qdisc_priv(sch);
> +	struct net_device *dev = qdisc_dev(sch);
> +	struct nlattr *tb[TCA_CBS_MAX + 1];
> +	struct ethtool_link_ksettings ecmd;
> +	struct tc_cbs_qopt *qopt;
> +	s64 link_speed;
> +	int err;
> +
> +	err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL);
> +	if (err < 0)
> +		return err;
> +
> +	if (!tb[TCA_CBS_PARMS])
> +		return -EINVAL;
> +
> +	qopt = nla_data(tb[TCA_CBS_PARMS]);
> +
> +	if (qopt->offload)
> +		return -EOPNOTSUPP;
> +
> +	if (!__ethtool_get_link_ksettings(dev, &ecmd))
> +		link_speed = ecmd.base.speed;
> +	else
> +		link_speed = SPEED_1000;
> +
> +	q->port_rate = link_speed * 1000 * BYTES_PER_KBIT;
> +
> +	q->enqueue = cbs_enqueue_soft;
> +	q->dequeue = cbs_dequeue_soft;
> +
> +	q->hicredit = qopt->hicredit;
> +	q->locredit = qopt->locredit;
> +	q->idleslope = qopt->idleslope;
> +	q->sendslope = qopt->sendslope;
> +
> +	return 0;
> +}
> +
> +static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
> +{
> +	struct cbs_sched_data *q = qdisc_priv(sch);
> +
> +	if (!opt)
> +		return -EINVAL;
> +
> +	qdisc_watchdog_init(&q->watchdog, sch);
> +
> +	return cbs_change(sch, opt);
> +}
> +
> +static void cbs_destroy(struct Qdisc *sch)
> +{
> +	struct cbs_sched_data *q = qdisc_priv(sch);
> +
> +	qdisc_watchdog_cancel(&q->watchdog);
> +}
> +
> +static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +	struct cbs_sched_data *q = qdisc_priv(sch);
> +	struct tc_cbs_qopt opt = { };
> +	struct nlattr *nest;
> +
> +	nest = nla_nest_start(skb, TCA_OPTIONS);
> +	if (!nest)
> +		goto nla_put_failure;
> +
> +	opt.hicredit = q->hicredit;
> +	opt.locredit = q->locredit;
> +	opt.sendslope = q->sendslope;
> +	opt.idleslope = q->idleslope;
> +	opt.offload = 0;
> +
> +	if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt))
> +		goto nla_put_failure;
> +
> +	return nla_nest_end(skb, nest);
> +
> +nla_put_failure:
> +	nla_nest_cancel(skb, nest);
> +	return -1;
> +}
> +
> +static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
> +	.id		=	"cbs",
> +	.priv_size	=	sizeof(struct cbs_sched_data),
> +	.enqueue	=	cbs_enqueue,
> +	.dequeue	=	cbs_dequeue,
> +	.peek		=	qdisc_peek_dequeued,
> +	.init		=	cbs_init,
> +	.reset		=	qdisc_reset_queue,
> +	.destroy	=	cbs_destroy,
> +	.change		=	cbs_change,
> +	.dump		=	cbs_dump,
> +	.owner		=	THIS_MODULE,
> +};
> +
> +static int __init cbs_module_init(void)
> +{
> +	return register_qdisc(&cbs_qdisc_ops);
> +}
> +
> +static void __exit cbs_module_exit(void)
> +{
> +	unregister_qdisc(&cbs_qdisc_ops);
> +}
> +module_init(cbs_module_init)
> +module_exit(cbs_module_exit)
> +MODULE_LICENSE("GPL");
> -- 
> 2.14.2
>
Vinicius Costa Gomes Oct. 13, 2017, 10:54 p.m. | #4
Hi,

Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org> writes:

[...]

>> +
>> +static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch)
>> +{
>> +	struct cbs_sched_data *q = qdisc_priv(sch);
>> +
>> +	if (sch->q.qlen == 0 && q->credits > 0) {
>> +		/* We need to stop accumulating credits when there's
>> +		 * no packet enqueued packets and q->credits is
> no packet -> no

Ugh. Fixed.

>
>> +		 * positive.
>> +		 */
>> +		q->credits = 0;
>> +		q->last = ktime_get_ns();
>> +	}
>> +
>> +	return qdisc_enqueue_tail(skb, sch);
>> +}
>> +

[...]

>> +static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
>> +{
>> +	struct cbs_sched_data *q = qdisc_priv(sch);
>> +	s64 now = ktime_get_ns();
>> +	struct sk_buff *skb;
>> +	s64 credits;
>> +	int len;
>> +
>> +	if (q->credits < 0) {
>> +		credits = timediff_to_credits(now - q->last, q->idleslope);
> Maybe be better to add small optimization by moving some calculations from data
> path, I mean, save idle_slope in bytes instead of kbit and converting it for
> every packet. Both delay_from_credits() and timediff_to_credits() is used only
> once and with idle_slope only...and both of them converting it.
>
> Same for credits_from_len() and send slope, save it in units of port_rate.
>

Done. Thanks.


Cheers,
--
Vinicius
David Laight Oct. 16, 2017, 9:14 a.m. | #5
From: Ivan Khoronzhuk
> Sent: 13 October 2017 20:59
> On Thu, Oct 12, 2017 at 05:40:03PM -0700, Vinicius Costa Gomes wrote:
> > This queueing discipline implements the shaper algorithm defined by
> > the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L.
> >
> > It's primary usage is to apply some bandwidth reservation to user
> > defined traffic classes, which are mapped to different queues via the
> > mqprio qdisc.
> >
> > Only a simple software implementation is added for now.
> >
> > Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> > Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
> > ---
> >  include/uapi/linux/pkt_sched.h |  18 +++
> >  net/sched/Kconfig              |  11 ++
> >  net/sched/Makefile             |   1 +
> >  net/sched/sch_cbs.c            | 314 +++++++++++++++++++++++++++++++++++++++++
> >  4 files changed, 344 insertions(+)
> >  create mode 100644 net/sched/sch_cbs.c
> >
> > diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> > index 099bf5528fed..41e349df4bf4 100644
> > --- a/include/uapi/linux/pkt_sched.h
> > +++ b/include/uapi/linux/pkt_sched.h
> > @@ -871,4 +871,22 @@ struct tc_pie_xstats {
> >  	__u32 maxq;             /* maximum queue size */
> >  	__u32 ecn_mark;         /* packets marked with ecn*/
> >  };
> > +
> > +/* CBS */
> > +struct tc_cbs_qopt {
> > +	__u8 offload;

You probably don't want unnamed padding in a uapi structure.

> > +	__s32 hicredit;
> > +	__s32 locredit;
> > +	__s32 idleslope;
> > +	__s32 sendslope;
> > +};
> > +
> > +enum {
> > +	TCA_CBS_UNSPEC,
> > +	TCA_CBS_PARMS,
> > +	__TCA_CBS_MAX,
> > +};
> > +
> > +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)

Why not:
	TCA_CBS_PARMS,
	TCA_CBS_NEXT,
	TCA_CBS_MAX = TCA_CBS_NEXT - 1,

...
	David
Vinicius Costa Gomes Oct. 16, 2017, 10:13 p.m. | #6
Hi David,

David Laight <David.Laight@ACULAB.COM> writes:

[...]

>> > index 099bf5528fed..41e349df4bf4 100644
>> > --- a/include/uapi/linux/pkt_sched.h
>> > +++ b/include/uapi/linux/pkt_sched.h
>> > @@ -871,4 +871,22 @@ struct tc_pie_xstats {
>> >  	__u32 maxq;             /* maximum queue size */
>> >  	__u32 ecn_mark;         /* packets marked with ecn*/
>> >  };
>> > +
>> > +/* CBS */
>> > +struct tc_cbs_qopt {
>> > +	__u8 offload;
>
> You probably don't want unnamed padding in a uapi structure.

Yeah, this needs to be fixed.

>
>> > +	__s32 hicredit;
>> > +	__s32 locredit;
>> > +	__s32 idleslope;
>> > +	__s32 sendslope;
>> > +};
>> > +
>> > +enum {
>> > +	TCA_CBS_UNSPEC,
>> > +	TCA_CBS_PARMS,
>> > +	__TCA_CBS_MAX,
>> > +};
>> > +
>> > +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
>
> Why not:
> 	TCA_CBS_PARMS,
> 	TCA_CBS_NEXT,
> 	TCA_CBS_MAX = TCA_CBS_NEXT - 1,

The way it is proposed is at least consistent with the rest of the
file. So, if you don't have any stronger reasons, I'd like to keep it
this way.

>
> ...
> 	David


Cheers,
--
Vinicius

Patch

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 099bf5528fed..41e349df4bf4 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -871,4 +871,22 @@  struct tc_pie_xstats {
 	__u32 maxq;             /* maximum queue size */
 	__u32 ecn_mark;         /* packets marked with ecn*/
 };
+
+/* CBS */
+struct tc_cbs_qopt {
+	__u8 offload;
+	__s32 hicredit;
+	__s32 locredit;
+	__s32 idleslope;
+	__s32 sendslope;
+};
+
+enum {
+	TCA_CBS_UNSPEC,
+	TCA_CBS_PARMS,
+	__TCA_CBS_MAX,
+};
+
+#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e70ed26485a2..c03d86a7775e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -172,6 +172,17 @@  config NET_SCH_TBF
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_tbf.
 
+config NET_SCH_CBS
+	tristate "Credit Based Shaper (CBS)"
+	---help---
+	  Say Y here if you want to use the Credit Based Shaper (CBS) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_cbs.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_cbs.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 7b915d226de7..80c8f92d162d 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -52,6 +52,7 @@  obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
 obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
+obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
new file mode 100644
index 000000000000..0643587e6dc8
--- /dev/null
+++ b/net/sched/sch_cbs.c
@@ -0,0 +1,314 @@ 
+/*
+ * net/sched/sch_cbs.c	Credit Based Shaper
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+/* Credit Based Shaper (CBS)
+ * =========================
+ *
+ * This is a simple rate-limiting shaper aimed at TSN applications on
+ * systems with known traffic workloads.
+ *
+ * Its algorithm is defined by the IEEE 802.1Q-2014 Specification,
+ * Section 8.6.8.2, and explained in more detail in the Annex L of the
+ * same specification.
+ *
+ * There are four tunables to be considered:
+ *
+ *	'idleslope': Idleslope is the rate of credits that is
+ *	accumulated (in kilobits per second) when there is at least
+ *	one packet waiting for transmission. Packets are transmitted
+ *	when the current value of credits is equal or greater than
+ *	zero. When there is no packet to be transmitted the amount of
+ *	credits is set to zero. This is the main tunable of the CBS
+ *	algorithm.
+ *
+ *	'sendslope':
+ *	Sendslope is the rate of credits that is depleted (it should be a
+ *	negative number of kilobits per second) when a transmission is
+ *	ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
+ *	8.6.8.2 item g):
+ *
+ *	sendslope = idleslope - port_transmit_rate
+ *
+ *	'hicredit': Hicredit defines the maximum amount of credits (in
+ *	bytes) that can be accumulated. Hicredit depends on the
+ *	characteristics of interfering traffic,
+ *	'max_interference_size' is the maximum size of any burst of
+ *	traffic that can delay the transmission of a frame that is
+ *	available for transmission for this traffic class, (IEEE
+ *	802.1Q-2014 Annex L, Equation L-3):
+ *
+ *	hicredit = max_interference_size * (idleslope / port_transmit_rate)
+ *
+ *	'locredit': Locredit is the minimum amount of credits that can
+ *	be reached. It is a function of the traffic flowing through
+ *	this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2):
+ *
+ *	locredit = max_frame_size * (sendslope / port_transmit_rate)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+#define BYTES_PER_KBIT (1000 / 8)
+
+struct cbs_sched_data {
+	s64 port_rate; /* in bytes/s */
+	s64 last; /* timestamp in ns */
+	s64 credits; /* in bytes */
+	s32 locredit; /* in bytes */
+	s32 hicredit; /* in bytes */
+	s32 sendslope; /* in kbits/s */
+	s32 idleslope; /* in kbits/s */
+	struct qdisc_watchdog watchdog;
+	int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch);
+	struct sk_buff *(*dequeue)(struct Qdisc *sch);
+};
+
+static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	if (sch->q.qlen == 0 && q->credits > 0) {
+		/* We need to stop accumulating credits when there's
+		 * no packet enqueued packets and q->credits is
+		 * positive.
+		 */
+		q->credits = 0;
+		q->last = ktime_get_ns();
+	}
+
+	return qdisc_enqueue_tail(skb, sch);
+}
+
+static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	return q->enqueue(skb, sch);
+}
+
+/* timediff is in ns, slope is in kbps */
+static s64 timediff_to_credits(s64 timediff, s32 slope)
+{
+	s64 credits = timediff * slope * BYTES_PER_KBIT;
+
+	do_div(credits, NSEC_PER_SEC);
+
+	return credits;
+}
+
+static s64 delay_from_credits(s64 credits, s32 slope)
+{
+	s64 rate = slope * BYTES_PER_KBIT;
+	s64 delay;
+
+	if (unlikely(rate == 0))
+		return S64_MAX;
+
+	delay = -credits * NSEC_PER_SEC;
+	do_div(delay, rate);
+
+	return delay;
+}
+
+static s64 credits_from_len(unsigned int len, s32 slope, s64 port_rate)
+{
+	/* As do_div() only works on unsigned quantities, convert
+	 * slope to a positive number here, and credits to a negative
+	 * number before returning.
+	 */
+	s64 rate = -slope * BYTES_PER_KBIT;
+	s64 credits;
+
+	if (unlikely(port_rate == 0))
+		return S64_MAX;
+
+	credits = len * rate;
+	do_div(credits, port_rate);
+
+	return -credits;
+}
+
+static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	s64 now = ktime_get_ns();
+	struct sk_buff *skb;
+	s64 credits;
+	int len;
+
+	if (q->credits < 0) {
+		credits = timediff_to_credits(now - q->last, q->idleslope);
+
+		credits = q->credits + credits;
+		q->credits = min_t(s64, credits, q->hicredit);
+
+		if (q->credits < 0) {
+			s64 delay;
+
+			delay = delay_from_credits(q->credits, q->idleslope);
+			qdisc_watchdog_schedule_ns(&q->watchdog, now + delay);
+
+			q->last = now;
+
+			return NULL;
+		}
+	}
+
+	skb = qdisc_dequeue_head(sch);
+	if (!skb)
+		return NULL;
+
+	len = qdisc_pkt_len(skb);
+
+	/* As sendslope is a negative number, this will decrease the
+	 * amount of q->credits.
+	 */
+	credits = credits_from_len(len, q->sendslope, q->port_rate);
+	credits += q->credits;
+
+	q->credits = max_t(s64, credits, q->locredit);
+	q->last = now;
+
+	return skb;
+}
+
+static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	return q->dequeue(sch);
+}
+
+static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = {
+	[TCA_CBS_PARMS]	= { .len = sizeof(struct tc_cbs_qopt) },
+};
+
+static int cbs_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct nlattr *tb[TCA_CBS_MAX + 1];
+	struct ethtool_link_ksettings ecmd;
+	struct tc_cbs_qopt *qopt;
+	s64 link_speed;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_CBS_PARMS])
+		return -EINVAL;
+
+	qopt = nla_data(tb[TCA_CBS_PARMS]);
+
+	if (qopt->offload)
+		return -EOPNOTSUPP;
+
+	if (!__ethtool_get_link_ksettings(dev, &ecmd))
+		link_speed = ecmd.base.speed;
+	else
+		link_speed = SPEED_1000;
+
+	q->port_rate = link_speed * 1000 * BYTES_PER_KBIT;
+
+	q->enqueue = cbs_enqueue_soft;
+	q->dequeue = cbs_dequeue_soft;
+
+	q->hicredit = qopt->hicredit;
+	q->locredit = qopt->locredit;
+	q->idleslope = qopt->idleslope;
+	q->sendslope = qopt->sendslope;
+
+	return 0;
+}
+
+static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	if (!opt)
+		return -EINVAL;
+
+	qdisc_watchdog_init(&q->watchdog, sch);
+
+	return cbs_change(sch, opt);
+}
+
+static void cbs_destroy(struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct tc_cbs_qopt opt = { };
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	opt.hicredit = q->hicredit;
+	opt.locredit = q->locredit;
+	opt.sendslope = q->sendslope;
+	opt.idleslope = q->idleslope;
+	opt.offload = 0;
+
+	if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
+	.id		=	"cbs",
+	.priv_size	=	sizeof(struct cbs_sched_data),
+	.enqueue	=	cbs_enqueue,
+	.dequeue	=	cbs_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	cbs_init,
+	.reset		=	qdisc_reset_queue,
+	.destroy	=	cbs_destroy,
+	.change		=	cbs_change,
+	.dump		=	cbs_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init cbs_module_init(void)
+{
+	return register_qdisc(&cbs_qdisc_ops);
+}
+
+static void __exit cbs_module_exit(void)
+{
+	unregister_qdisc(&cbs_qdisc_ops);
+}
+module_init(cbs_module_init)
+module_exit(cbs_module_exit)
+MODULE_LICENSE("GPL");