[RFC,v3,net-next,13/18] net/sched: Introduce the TBS Qdisc

Message ID: 20180307011230.24001-14-jesus.sanchez-palencia@intel.com
State: RFC, archived
Delegated to: David Miller
Series: Time based packet transmission

Commit Message

Jesus Sanchez-Palencia March 7, 2018, 1:12 a.m. UTC
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>

TBS (Time Based Scheduler) uses the information added earlier in this
series (the socket option SO_TXTIME and the new role of
sk_buff->tstamp) to schedule traffic transmission based on absolute
time.

For some workloads, just bandwidth enforcement is not enough, and
precise control of the transmission of packets is necessary.

Example:

$ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \
           map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0

$ tc qdisc add dev enp2s0 parent 100:1 tbs delta 100000 \
           clockid CLOCK_REALTIME sorting

In this example, the Qdisc will provide SW best-effort control of the
transmission time to the network adapter, the timestamps set on the
sockets are in reference to the clockid CLOCK_REALTIME, and packets
leave the Qdisc "delta" (100000) nanoseconds before their transmission
time. It will also enable sorting of the buffered packets based on
their txtime.

The qdisc will drop packets on enqueue() if their skbuff clockid does not
match the clock reference of the Qdisc. Moreover, the tc_drop_if_late
flag from skbuffs will be used on dequeue() to determine if a packet
that has expired while sitting in the queue should be dropped or not.
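
For reference, a minimal talker sketch against this series (enabling
SO_TXTIME as a boolean setsockopt and the SCM_TXTIME cmsg name with a u64
nanosecond payload are assumptions based on this RFC, not a verified ABI):

	#include <string.h>
	#include <stdint.h>
	#include <sys/socket.h>

	/* assumes this series' uapi definitions of SO_TXTIME/SCM_TXTIME */
	static ssize_t send_at(int fd, void *buf, size_t len, uint64_t txtime_ns)
	{
		char cbuf[CMSG_SPACE(sizeof(txtime_ns))];
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		struct msghdr msg = {
			.msg_iov = &iov,
			.msg_iovlen = 1,
			.msg_control = cbuf,
			.msg_controllen = sizeof(cbuf),
		};
		struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

		cm->cmsg_level = SOL_SOCKET;
		cm->cmsg_type = SCM_TXTIME;	/* assumed cmsg name */
		cm->cmsg_len = CMSG_LEN(sizeof(txtime_ns));
		memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

		/* txtime_ns is absolute, in the clock the socket selected */
		return sendmsg(fd, &msg, 0);
	}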

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 include/linux/netdevice.h      |   1 +
 include/uapi/linux/pkt_sched.h |  17 ++
 net/sched/Kconfig              |  11 +
 net/sched/Makefile             |   1 +
 net/sched/sch_tbs.c            | 474 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 504 insertions(+)
 create mode 100644 net/sched/sch_tbs.c

Comments

Thomas Gleixner March 21, 2018, 1:46 p.m. UTC | #1
On Tue, 6 Mar 2018, Jesus Sanchez-Palencia wrote:
> +struct tbs_sched_data {
> +	bool sorting;
> +	int clockid;
> +	int queue;
> +	s32 delta; /* in ns */
> +	ktime_t last; /* The txtime of the last skb sent to the netdevice. */
> +	struct rb_root head;

Hmm. You are reimplementing timerqueue open coded. Have you checked whether
you could reuse the timerqueue implementation?

That requires adding a timerqueue node to struct sk_buff

@@ -671,7 +671,8 @@ struct sk_buff {
 				unsigned long		dev_scratch;
 			};
 		};
-		struct rb_node	rbnode; /* used in netem & tcp stack */
+		struct rb_node		rbnode; /* used in netem & tcp stack */
+		struct timerqueue_node	tqnode;
 	};
 	struct sock		*sk;

Then you can use timerqueue_head in your scheduler data and all the open
coded rbtree handling goes away.
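
As a sketch (one possible shape, not the actual next version), the sorted
enqueue then shrinks to something like this, assuming a timerqueue_head
'tqhead' replaces the rb_root in tbs_sched_data:

	static int tbs_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
					      struct sk_buff **to_free)
	{
		struct tbs_sched_data *q = qdisc_priv(sch);

		if (!is_packet_valid(sch, nskb))
			return qdisc_drop(nskb, sch, to_free);

		/* timerqueue keeps the rbtree ordered by .expires and caches
		 * the leftmost node, so peek becomes O(1) */
		nskb->tqnode.expires = nskb->tstamp;
		timerqueue_add(&q->tqhead, &nskb->tqnode);

		qdisc_qstats_backlog_inc(sch, nskb);
		sch->q.qlen++;

		/* may need to re-arm the qdisc watchdog for the new packet */
		reset_watchdog(sch);

		return NET_XMIT_SUCCESS;
	}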

> +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
> +{
> +	struct tbs_sched_data *q = qdisc_priv(sch);
> +	ktime_t txtime = nskb->tstamp;
> +	struct sock *sk = nskb->sk;
> +	ktime_t now;
> +
> +	if (sk && !sock_flag(sk, SOCK_TXTIME))
> +		return false;
> +
> +	/* We don't perform crosstimestamping.
> +	 * Drop if packet's clockid differs from qdisc's.
> +	 */
> +	if (nskb->txtime_clockid != q->clockid)
> +		return false;
> +
> +	now = get_time_by_clockid(q->clockid);

If you store the time getter function pointer in tbs_sched_data then you
avoid the lookup and can just do

       now = q->get_time();

That applies to lots of other places.
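
For instance, resolved once at init time (a sketch; get_time would be a
"ktime_t (*get_time)(void)" member of tbs_sched_data, and the getters are
the standard kernel ones):

	switch (q->clockid) {
	case CLOCK_REALTIME:
		q->get_time = ktime_get_real;
		break;
	case CLOCK_MONOTONIC:
		q->get_time = ktime_get;
		break;
	case CLOCK_BOOTTIME:
		q->get_time = ktime_get_boottime;
		break;
	case CLOCK_TAI:
		q->get_time = ktime_get_clocktai;
		break;
	default:
		return -EINVAL;
	}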

> +	if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
> +		return false;
> +
> +	return true;
> +}
> +
> +static struct sk_buff *tbs_peek(struct Qdisc *sch)
> +{
> +	struct tbs_sched_data *q = qdisc_priv(sch);
> +
> +	return q->peek(sch);
> +}
> +
> +static struct sk_buff *tbs_peek_timesortedlist(struct Qdisc *sch)
> +{
> +	struct tbs_sched_data *q = qdisc_priv(sch);
> +	struct rb_node *p;
> +
> +	p = rb_first(&q->head);

timerqueue gives you direct access to the first expiring entry w/o walking
the rbtree. So that would become:

	p = timerqueue_getnext(&q->tqhead);
	return p ? rb_to_skb(p) : NULL;

> +	if (!p)
> +		return NULL;
> +
> +	return rb_to_skb(p);
> +}

> +static int tbs_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
> +				      struct sk_buff **to_free)
> +{
> +	struct tbs_sched_data *q = qdisc_priv(sch);
> +	struct rb_node **p = &q->head.rb_node, *parent = NULL;
> +	ktime_t txtime = nskb->tstamp;
> +
> +	if (!is_packet_valid(sch, nskb))
> +		return qdisc_drop(nskb, sch, to_free);
> +
> +	while (*p) {
> +		struct sk_buff *skb;
> +
> +		parent = *p;
> +		skb = rb_to_skb(parent);
> +		if (ktime_after(txtime, skb->tstamp))
> +			p = &parent->rb_right;
> +		else
> +			p = &parent->rb_left;
> +	}
> +	rb_link_node(&nskb->rbnode, parent, p);
> +	rb_insert_color(&nskb->rbnode, &q->head);

That'd become:

       nskb->tqnode.expires = txtime;
       timerqueue_add(&q->tqhead, &nskb->tqnode);

> +	qdisc_qstats_backlog_inc(sch, nskb);
> +	sch->q.qlen++;
> +
> +	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
> +	reset_watchdog(sch);
> +
> +	return NET_XMIT_SUCCESS;
> +}
> +
> +static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
> +				 bool drop)
> +{
> +	struct tbs_sched_data *q = qdisc_priv(sch);
> +
> +	rb_erase(&skb->rbnode, &q->head);
> +
> +	qdisc_qstats_backlog_dec(sch, skb);
> +
> +	if (drop) {
> +		struct sk_buff *to_free = NULL;
> +
> +		qdisc_drop(skb, sch, &to_free);
> +		kfree_skb_list(to_free);
> +		qdisc_qstats_overlimit(sch);
> +	} else {
> +		qdisc_bstats_update(sch, skb);
> +
> +		q->last = skb->tstamp;
> +	}
> +
> +	sch->q.qlen--;
> +
> +	/* The rbnode field in the skb re-uses these fields, now that
> +	 * we are done with the rbnode, reset them.
> +	 */
> +	skb->next = NULL;
> +	skb->prev = NULL;
> +	skb->dev = qdisc_dev(sch);
> +}
> +
> +static struct sk_buff *tbs_dequeue(struct Qdisc *sch)
> +{
> +	struct tbs_sched_data *q = qdisc_priv(sch);
> +
> +	return q->dequeue(sch);
> +}
> +
> +static struct sk_buff *tbs_dequeue_scheduledfifo(struct Qdisc *sch)
> +{
> +	struct tbs_sched_data *q = qdisc_priv(sch);
> +	struct sk_buff *skb = tbs_peek(sch);
> +	ktime_t now, next;
> +
> +	if (!skb)
> +		return NULL;
> +
> +	now = get_time_by_clockid(q->clockid);
> +
> +	/* Drop if packet has expired while in queue and the drop_if_late
> +	 * flag is set.
> +	 */
> +	if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) {
> +		struct sk_buff *to_free = NULL;
> +
> +		qdisc_queue_drop_head(sch, &to_free);
> +		kfree_skb_list(to_free);
> +		qdisc_qstats_overlimit(sch);
> +
> +		skb = NULL;
> +		goto out;

Instead of going out immediately you should check whether the next skb
is already due for sending.
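
A sketch of that, reusing this patch's helpers (the loop shape here is an
assumption, not a literal proposal):

	while ((skb = tbs_peek(sch)) &&
	       skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) {
		struct sk_buff *to_free = NULL;

		qdisc_queue_drop_head(sch, &to_free);
		kfree_skb_list(to_free);
		qdisc_qstats_overlimit(sch);
	}
	if (!skb)
		goto out;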

> +	}
> +
> +	next = ktime_sub_ns(skb->tstamp, q->delta);
> +
> +	/* Dequeue only if now is within the [txtime - delta, txtime] range. */
> +	if (ktime_after(now, next))
> +		skb = qdisc_dequeue_head(sch);
> +	else
> +		skb = NULL;
> +
> +out:
> +	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
> +	reset_watchdog(sch);
> +
> +	return skb;
> +}
> +
> +static struct sk_buff *tbs_dequeue_timesortedlist(struct Qdisc *sch)
> +{
> +	struct tbs_sched_data *q = qdisc_priv(sch);
> +	struct sk_buff *skb;
> +	ktime_t now, next;
> +
> +	skb = tbs_peek(sch);
> +	if (!skb)
> +		return NULL;
> +
> +	now = get_time_by_clockid(q->clockid);
> +
> +	/* Drop if packet has expired while in queue and the drop_if_late
> +	 * flag is set.
> +	 */
> +	if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) {
> +		timesortedlist_erase(sch, skb, true);
> +		skb = NULL;
> +		goto out;

Same as above.

> +	}
> +
> +	next = ktime_sub_ns(skb->tstamp, q->delta);
> +
> +	/* Dequeue only if now is within the [txtime - delta, txtime] range. */
> +	if (ktime_after(now, next))
> +		timesortedlist_erase(sch, skb, false);
> +	else
> +		skb = NULL;
> +
> +out:
> +	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
> +	reset_watchdog(sch);
> +
> +	return skb;
> +}
> +
> +static inline void setup_queueing_mode(struct tbs_sched_data *q)
> +{
> +	if (q->sorting) {
> +		q->enqueue = tbs_enqueue_timesortedlist;
> +		q->dequeue = tbs_dequeue_timesortedlist;
> +		q->peek = tbs_peek_timesortedlist;
> +	} else {
> +		q->enqueue = tbs_enqueue_scheduledfifo;
> +		q->dequeue = tbs_dequeue_scheduledfifo;
> +		q->peek = qdisc_peek_head;

I don't see the point of these two modes and all the duplicated code it
involves.

FIFO mode limits usage to a single thread which has to guarantee that the
packets are queued in time order.

If you look at the use cases of TDM in various fields then FIFO mode is
pretty much useless. In industrial/automotive fieldbus applications the
various time slices are filled by different threads or even processes.

Sure, the rbtree queue/dequeue has overhead compared to a simple linked
list, but you pay for that with more indirections and lots of mostly
duplicated code. And in the worst case one of these code paths is going to
be rarely used and prone to bitrot.

Thanks,

	tglx
Thomas Gleixner March 21, 2018, 10:29 p.m. UTC | #2
On Wed, 21 Mar 2018, Thomas Gleixner wrote:
> If you look at the use cases of TDM in various fields then FIFO mode is
> pretty much useless. In industrial/automotive fieldbus applications the
> various time slices are filled by different threads or even processes.

That brings me to a related question. The TDM cases I'm familiar with which
aim to use this utilize multiple periodic time slices, aka 802.1Qbv
time-aware scheduling.

Simple example:

[1a][1b][1c][1d]		[1a][1b][1c][1d]		[.....
		[2a][2b]			[2c][2d]
			[3a]				[3b]
			    [4a]			    [4b]
---------------------------------------------------------------------->	t		    

where 1-4 is the slice level and a-d are network nodes.

In most cases the slice levels on a node are handled by different
applications or threads. Some of the protocols utilize dedicated time slice
levels - let's assume '4' in the above example - to run general network
traffic which might even be allowed to have collisions, i.e. [4a-d] would
become [4] and any node can send; the involved components like switches are
supposed to handle that.

I'm not seeing how TBS is going to assist with any of that. It requires
everything to be handled at the application level. Not really useful,
especially not for general traffic which does not know about the scheduling
bands at all.

If you look at an industrial control node, it basically does:

	queue_first_packet(tx, slice1);
   	while (!stop) {
		if (wait_for_packet(rx) == ERROR)
			goto errorhandling;
		tx = do_computation(rx);
		queue_next_tx(tx, slice1);
	}

that's a pretty common pattern for this kind of application. For audio
sources queue_next() might be triggered by the input sampler which needs to
be synchronized to the network slices anyway in order to work properly.

TBS per current implementation is nice as a proof of concept, but it solves
just a small portion of the complete problem space. I have the suspicion
that this was 'designed' to replace the user space hack in the AVNU stack
with something close to it. Not really a good plan to be honest.

I think what we really want is a strict periodic scheduler which supports
multiple slices as shown above because that's what all relevant TDM use
cases need: A/V, industrial fieldbusses .....

  |---------------------------------------------------------|
  |                                                         |
  |                           TAS                           |<- Config
  |    1               2               3               4    |
  |---------------------------------------------------------|
       |               |               |               |
       |               |               |               |
       |               |               |               |
       |               |               |               |
  [DirectSocket]   [Qdisc FIFO]   [Qdisc Prio]     [Qdisc FIFO]
                       |               |               |
		       |               |               |
		    [Socket]   	    [Socket]     [General traffic]


The interesting thing here is that it does not require any time stamp
information brought in from the application. That's especially good for
general network traffic which is routed through a dedicated time slot. If
we don't have that then we need a user space scheduler which does exactly
the same thing and we have to route the general traffic out to user space
and back into the kernel, which is obviously a pointless exercise.

There are all kinds of TDM schemes out there which are not directly driven
by applications, but rather route categorized traffic like VLANs through
dedicated time slices. That works pretty well with the above scheme because
in that case the applications might be completely oblivious about the tx
time schedule.

Surely there are protocols which do not utilize every time slice they could
use, so we need a way to tell the number of empty slices between two
consecutive packets. There are also different policies vs. the unused time
slices, like sending dummy frames or nothing at all, which needs to be
addressed, but I don't think that changes the general approach.

There might be some special cases for setup or node hotplug, but the
protocols I'm familiar with handle these in dedicated time slices or
through general traffic so it should just fit in.

I'm surely missing some details, but from my knowledge about the protocols
which want to utilize this, the general direction should be fine.

Feel free to tell me that I'm missing the point completely though :)

Thoughts?

Thanks,

	tglx
Jesus Sanchez-Palencia March 22, 2018, 8:25 p.m. UTC | #3
Hi Thomas,


On 03/21/2018 03:29 PM, Thomas Gleixner wrote:
> On Wed, 21 Mar 2018, Thomas Gleixner wrote:
>> If you look at the use cases of TDM in various fields then FIFO mode is
>> pretty much useless. In industrial/automotive fieldbus applications the
>> various time slices are filled by different threads or even processes.
> 
> That brings me to a related question. The TDM cases I'm familiar with which
> aim to use this utilize multiple periodic time slices, aka 802.1Qbv
> time-aware scheduling.
> 
> Simple example:
> 
> [1a][1b][1c][1d]		[1a][1b][1c][1d]		[.....
> 		[2a][2b]			[2c][2d]
> 			[3a]				[3b]
> 			    [4a]			    [4b]
> ---------------------------------------------------------------------->	t		    
> 
> where 1-4 is the slice level and a-d are network nodes.
> 
> In most cases the slice levels on a node are handled by different
> applications or threads. Some of the protocols utilize dedicated time slice
> levels - let's assume '4' in the above example - to run general network
> traffic which might even be allowed to have collisions, i.e. [4a-d] would
> become [4] and any node can send; the involved components like switches are
> supposed to handle that.
> 
> I'm not seeing how TBS is going to assist with any of that. It requires
> everything to be handled at the application level. Not really useful,
> especially not for general traffic which does not know about the scheduling
> bands at all.
> 
> If you look at an industrial control node, it basically does:
> 
> 	queue_first_packet(tx, slice1);
>    	while (!stop) {
> 		if (wait_for_packet(rx) == ERROR)
> 			goto errorhandling;
> 		tx = do_computation(rx);
> 		queue_next_tx(tx, slice1);
> 	}
> 
> that's a pretty common pattern for this kind of application. For audio
> sources queue_next() might be triggered by the input sampler which needs to
> be synchronized to the network slices anyway in order to work properly.
> 
> TBS per current implementation is nice as a proof of concept, but it solves
> just a small portion of the complete problem space. I have the suspicion
> that this was 'designed' to replace the user space hack in the AVNU stack
> with something close to it. Not really a good plan to be honest.
> 
> I think what we really want is a strict periodic scheduler which supports
> multiple slices as shown above because thats what all relevant TDM use
> cases need: A/V, industrial fieldbusses .....
> 
>   |---------------------------------------------------------|
>   |                                                         |
>   |                           TAS                           |<- Config
>   |    1               2               3               4    |
>   |---------------------------------------------------------|
>        |               |               |               |
>        |               |               |               |
>        |               |               |               |
>        |               |               |               |
>   [DirectSocket]   [Qdisc FIFO]   [Qdisc Prio]     [Qdisc FIFO]
>                        |               |               |
> 		       |               |               |
> 		    [Socket]   	    [Socket]     [General traffic]
> 
> 
> The interesting thing here is that it does not require any time stamp
> information brought in from the application. That's especially good for
> general network traffic which is routed through a dedicated time slot. If
> we don't have that then we need a user space scheduler which does exactly
> the same thing and we have to route the general traffic out to user space
> and back into the kernel, which is obviously a pointless exercise.
> 
> There are all kinds of TDM schemes out there which are not directly driven
> by applications, but rather route categorized traffic like VLANs through
> dedicated time slices. That works pretty well with the above scheme because
> in that case the applications might be completely oblivious about the tx
> time schedule.
> 
> Surely there are protocols which do not utilize every time slice they could
> use, so we need a way to tell the number of empty slices between two
> consecutive packets. There are also different policies vs. the unused time
> slices, like sending dummy frames or nothing at all, which needs to be
> addressed, but I don't think that changes the general approach.
> 
> There might be some special cases for setup or node hotplug, but the
> protocols I'm familiar with handle these in dedicated time slices or
> through general traffic so it should just fit in.
> 
> I'm surely missing some details, but from my knowledge about the protocols
> which want to utilize this, the general direction should be fine.
> 
> Feel free to tell me that I'm missing the point completely though :)
> 
> Thoughts?


We agree with most of the above. :)
Actually, last year Vinicius shared our ideas for a "time-aware priority" root
qdisc as part of the cbs RFC cover letter, dubbed 'taprio':

https://patchwork.ozlabs.org/cover/808504/

Our plan was to work directly with the Qbv-like scheduling (per-port) just after
the cbs qdisc (Qav), but the feedback here and offline was that there were use
cases for a more simplistic launchtime approach (per-queue) as well. We've
decided to invest in it first (and postpone the 'taprio' qdisc until there was
a NIC available with HW support for it, basically).

You are right, and we agree, that using tbs for a per-port schedule of any sort
will require a SW scheduler to be developed on top of it, but we've never said
the contrary either. Our vision has always been that these are separate
mechanisms with different use-cases, so we do see the value for the kernel to
provide both.

In other words, tbs is not the final solution for Qbv, and we agree that a 'TAS'
qdisc is still necessary. And due to the wide range of applications and hw being
used for those out there, we need both, especially given that one does not
block the other.


What do you think?

Thanks,
Jesus
Jesus Sanchez-Palencia March 22, 2018, 8:29 p.m. UTC | #4
Hi Thomas,


On 03/21/2018 06:46 AM, Thomas Gleixner wrote:
> On Tue, 6 Mar 2018, Jesus Sanchez-Palencia wrote:
>> +struct tbs_sched_data {
>> +	bool sorting;
>> +	int clockid;
>> +	int queue;
>> +	s32 delta; /* in ns */
>> +	ktime_t last; /* The txtime of the last skb sent to the netdevice. */
>> +	struct rb_root head;
> 
> Hmm. You are reimplementing timerqueue open coded. Have you checked whether
> you could reuse the timerqueue implementation?
> 
> That requires adding a timerqueue node to struct sk_buff
> 
> @@ -671,7 +671,8 @@ struct sk_buff {
>  				unsigned long		dev_scratch;
>  			};
>  		};
> -		struct rb_node	rbnode; /* used in netem & tcp stack */
> +		struct rb_node		rbnode; /* used in netem & tcp stack */
> +		struct timerqueue_node	tqnode;
>  	};
>  	struct sock		*sk;
> 
> Then you can use timerqueue_head in your scheduler data and all the open
> coded rbtree handling goes away.


Yes, you are right. We actually looked into that for the first prototype of this
qdisc, but we weren't so sure about adding the timerqueue node to the sk_buff's
union and whether it would impact the other usages here. Looking again now,
it looks fine.

We'll fix for the next version, thanks.


> 
>> +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
>> +{
>> +	struct tbs_sched_data *q = qdisc_priv(sch);
>> +	ktime_t txtime = nskb->tstamp;
>> +	struct sock *sk = nskb->sk;
>> +	ktime_t now;
>> +
>> +	if (sk && !sock_flag(sk, SOCK_TXTIME))
>> +		return false;
>> +
>> +	/* We don't perform crosstimestamping.
>> +	 * Drop if packet's clockid differs from qdisc's.
>> +	 */
>> +	if (nskb->txtime_clockid != q->clockid)
>> +		return false;
>> +
>> +	now = get_time_by_clockid(q->clockid);
> 
> If you store the time getter function pointer in tbs_sched_data then you
> avoid the lookup and can just do
> 
>        now = q->get_time();
> 
> That applies to lots of other places.


Good idea, thanks. Will fix.



>> +
>> +static struct sk_buff *tbs_peek_timesortedlist(struct Qdisc *sch)
>> +{
>> +	struct tbs_sched_data *q = qdisc_priv(sch);
>> +	struct rb_node *p;
>> +
>> +	p = rb_first(&q->head);
> 
> timerqueue gives you direct access to the first expiring entry w/o walking
> the rbtree. So that would become:
> 
> 	p = timerqueue_getnext(&q->tqhead);
> 	return p ? rb_to_skb(p) : NULL;

OK.

(...)

>> +static struct sk_buff *tbs_dequeue_scheduledfifo(struct Qdisc *sch)
>> +{
>> +	struct tbs_sched_data *q = qdisc_priv(sch);
>> +	struct sk_buff *skb = tbs_peek(sch);
>> +	ktime_t now, next;
>> +
>> +	if (!skb)
>> +		return NULL;
>> +
>> +	now = get_time_by_clockid(q->clockid);
>> +
>> +	/* Drop if packet has expired while in queue and the drop_if_late
>> +	 * flag is set.
>> +	 */
>> +	if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) {
>> +		struct sk_buff *to_free = NULL;
>> +
>> +		qdisc_queue_drop_head(sch, &to_free);
>> +		kfree_skb_list(to_free);
>> +		qdisc_qstats_overlimit(sch);
>> +
>> +		skb = NULL;
>> +		goto out;
> 
> Instead of going out immediately you should check whether the next skb
> is already due for sending.

We wanted to have a baseline before starting with the optimizations, so we left
this for a later patchset. It was one of the open issues we had listed on the v2
cover letter IIRC, but we'll look into it.


(...)


>> +	}
>> +
>> +	next = ktime_sub_ns(skb->tstamp, q->delta);
>> +
>> +	/* Dequeue only if now is within the [txtime - delta, txtime] range. */
>> +	if (ktime_after(now, next))
>> +		timesortedlist_erase(sch, skb, false);
>> +	else
>> +		skb = NULL;
>> +
>> +out:
>> +	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
>> +	reset_watchdog(sch);
>> +
>> +	return skb;
>> +}
>> +
>> +static inline void setup_queueing_mode(struct tbs_sched_data *q)
>> +{
>> +	if (q->sorting) {
>> +		q->enqueue = tbs_enqueue_timesortedlist;
>> +		q->dequeue = tbs_dequeue_timesortedlist;
>> +		q->peek = tbs_peek_timesortedlist;
>> +	} else {
>> +		q->enqueue = tbs_enqueue_scheduledfifo;
>> +		q->dequeue = tbs_dequeue_scheduledfifo;
>> +		q->peek = qdisc_peek_head;
> 
> I don't see the point of these two modes and all the duplicated code it
> involves.
> 
> FIFO mode limits usage to a single thread which has to guarantee that the
> packets are queued in time order.
> 
> If you look at the use cases of TDM in various fields then FIFO mode is
> pretty much useless. In industrial/automotive fieldbus applications the
> various time slices are filled by different threads or even processes.
> 
> Sure, the rbtree queue/dequeue has overhead compared to a simple linked
> list, but you pay for that with more indirections and lots of mostly
> duplicated code. And in the worst case one of these code paths is going to
> be rarely used and prone to bitrot.


Our initial version (on RFC v2) was performing the sorting for all modes. After
all the feedback we got we decided to make it optional and provide FIFO modes as
well. For the SW fallback we need the scheduled FIFO, and for "pure" hw offload
we need the "raw" FIFO.

This was a way to accommodate all the use cases without imposing too much of a
burden onto anyone, regardless of their application's segment (i.e. industrial,
pro a/v, automotive, etc).

Having the sorting always enabled requires that a valid static clockid is passed
to the qdisc. For the hw offload mode, that means that the PHC and one of the
system clocks must be synchronized since hrtimers do not support dynamic clocks.
Not all systems do that or want to, and given that we do not want to perform
crosstimestamping between the packets' clock reference and the qdisc's one, the
only solution for these systems would be using the raw hw offload mode.


Thanks,
Jesus
Thomas Gleixner March 22, 2018, 10:11 p.m. UTC | #5
On Thu, 22 Mar 2018, Jesus Sanchez-Palencia wrote:
> On 03/21/2018 06:46 AM, Thomas Gleixner wrote:
> > If you look at the use cases of TDM in various fields then FIFO mode is
> > pretty much useless. In industrial/automotive fieldbus applications the
> > various time slices are filled by different threads or even processes.
> > 
> > Sure, the rbtree queue/dequeue has overhead compared to a simple linked
> > list, but you pay for that with more indirections and lots of mostly
> > duplicated code. And in the worst case one of these code paths is going to
> > be rarely used and prone to bitrot.
> 
> 
> Our initial version (on RFC v2) was performing the sorting for all modes. After
> all the feedback we got we decided to make it optional and provide FIFO modes as
> well. For the SW fallback we need the scheduled FIFO, and for "pure" hw offload
> we need the "raw" FIFO.

I don't see how FIFO ever works without the issue that a newly queued
packet which has an earlier time stamp than the head of the FIFO list will
lose. Why would you even want to have that mode? Just because some weird
existing application misdesign thinks it's required? That doesn't make it a
good idea.

With pure hardware offload the packets are immediately handed off to the
network card and that one is responsible for sending it on time. So there
is no FIFO at all. It's actually a bypass mode.

> This was a way to accommodate all the use cases without imposing too much of a
> burden onto anyone, regardless of their application's segment (i.e. industrial,
> pro a/v, automotive, etc).

I'm not buying that argument at all. That's all handwaving.

The whole approach is a burden on every application segment because it
pushes the whole schedule and time slice management out to user space,
which also requires that you route general traffic down to that user space
scheduling entity and then queue it back into the proper time slice. And
FIFO makes that even worse.

> Having the sorting always enabled requires that a valid static clockid is passed
> to the qdisc. For the hw offload mode, that means that the PHC and one of the
> system clocks must be synchronized since hrtimers do not support dynamic clocks.
> Not all systems do that or want to, and given that we do not want to perform
> crosstimestamping between the packets' clock reference and the qdisc's one, the
> only solution for these systems would be using the raw hw offload mode.

There are two variants of hardware offload:

1) Full hardware offload

   That bypasses the queue completely. You just stick the thing into the
   scatter gather buffers. Except when there is no room anymore, then you
   have to queue, but it does not make any difference if you queue in FIFO
   or in time order. The packets go out in time order anyway.

2) Single packet hardware offload

   What you do here is to schedule a hrtimer a bit earlier than the first
   packet tx time and when it fires stick the packet into the hardware and
   rearm the timer for the next one.

   The whole point of TSN with hardware support is that you have:

       - Global network time

       and

       - Frequency adjustment of the system time base

    PTP is TAI based and the kernel exposes clock TAI directly through
    hrtimers. You don't need dynamic clocks for that.

    You can even use clock MONOTONIC as it basically is just

       TAI - offset

If the network card uses anything other than TAI or a time stamp with a
strict correlation to TAI for actual TX scheduling then the whole thing is
broken to begin with.
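
A sketch of variant 2) (assumptions: tbs_sched_data grows a 'struct hrtimer
timer' member, and tbs_xmit_head() is a hypothetical helper that hands the
head skb to the hardware and returns the txtime of the new head, or 0 when
the queue is empty):

	static enum hrtimer_restart tbs_timer_cb(struct hrtimer *timer)
	{
		struct tbs_sched_data *q = container_of(timer,
					struct tbs_sched_data, timer);
		ktime_t next = tbs_xmit_head(q);	/* hypothetical */

		if (!next)
			return HRTIMER_NORESTART;

		/* fire 'delta' ns ahead of the next packet's txtime */
		hrtimer_set_expires(timer, ktime_sub_ns(next, q->delta));
		return HRTIMER_RESTART;
	}

	hrtimer_init(&q->timer, CLOCK_TAI, HRTIMER_MODE_ABS);
	q->timer.function = tbs_timer_cb;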

Thanks,

	tglx
Thomas Gleixner March 22, 2018, 10:52 p.m. UTC | #6
On Thu, 22 Mar 2018, Jesus Sanchez-Palencia wrote:
> Our plan was to work directly with the Qbv-like scheduling (per-port) just after
> the cbs qdisc (Qav), but the feedback here and offline was that there were use
> cases for a more simplistic launchtime approach (per-queue) as well. We've
> decided to invest in it first (and postpone the 'taprio' qdisc until there was
> a NIC available with HW support for it, basically).

I missed that discussion due to other urgent stuff on my plate. Just
skimmed through it. More below.

> You are right, and we agree, that using tbs for a per-port schedule of any sort
> will require a SW scheduler to be developed on top of it, but we've never said
> the contrary either. Our vision has always been that these are separate
> mechanisms with different use-cases, so we do see the value for the kernel to
> provide both.
> 
> In other words, tbs is not the final solution for Qbv, and we agree that a 'TAS'
> qdisc is still necessary. And due to the wide range of applications and hw being
> used for those out there, we need both, especially given that one does not block
> the other.

So what's the plan for this? Having TAS as a separate entity or TAS feeding
into the proposed 'basic' time transmission thing?

The general objection I have with the current approach is that it creates
the playground for all flavours of misdesigned user space implementations
and just replaces the home-brewed and ugly user mode network adapter
drivers.

But that's not helping the cause at all. There is enough crappy stuff out
there already and I'd rather see a properly designed slice management which can
be utilized and improved by all involved parties.

All variants which utilize the basic time driven packet transmission are
based on periodic explicit plan scheduling with (local) network wide time
slice assignment.

It does not matter whether you feed VLAN traffic into a time slice, where
the VLAN itself does not even have to know about it, or if you have aware
applications feeding packets to a designated timeslot. The basic principle
of this is always the same.

So coming back to last year's discussion. It totally went into the wrong
direction because it turned from an approach (the patches) which came from
the big picture to a single use case and application centric view. That's
just wrong and I regret that I didn't have the time to pay attention back
then.

You always need to look at the big picture first and design from there, not
the other way round. There will always be the argument:

    But my application is special and needs X

It's easy to fall for that. From long experience I know that none of
these claims ever held. These arguments are made because the people making
them have either never looked at the big picture or are simply refusing to
do so because it would cause them work.

If you start from the use case and application centric view and ignore the
big picture then you end up in a gazillion of extra magic features over
time which could have been completely avoided if you had put your foot down
and made everyone agree on a proper and versatile design in the first
place.

The more low level access you hand out in the beginning the less commonly
used, improved and maintained infrastructure you will get in the end. That
has happened before in other areas and it will happen here as well. You
create a user space ABI which you can't get rid of and before you come out
with the proper interface after that a large number of involved parties
have gone off and implemented on top of the low level ABI and they will
never look back.

In the (not so) long run this will create a lot more issues than it
solves. A simple example is that you cannot run two applications which
easily could share the network in parallel without major surgery because
both require to be the management authority.

I've not yet seen a convincing argument why this low level stuff with all
of its weird flavours is superior to something which reflects the basic
operating principle of TSN.

Thanks,

	tglx
Jesus Sanchez-Palencia March 22, 2018, 11:26 p.m. UTC | #7
Hi Thomas,


On 03/22/2018 03:11 PM, Thomas Gleixner wrote:

(...)

>> Having the sorting always enabled requires that a valid static clockid is passed
>> to the qdisc. For the hw offload mode, that means that the PHC and one of the
>> system clocks must be synchronized since hrtimers do not support dynamic clocks.
>> Not all systems do that or want to, and given that we do not want to perform
>> crosstimestamping between the packets' clock reference and the qdisc's one, the
>> only solution for these systems would be using the raw hw offload mode.
> 
> There are two variants of hardware offload:
> 
> 1) Full hardware offload
> 
>    That bypasses the queue completely. You just stick the thing into the
>    scatter gather buffers. Except when there is no room anymore, then you
>    have to queue, but it does not make any difference if you queue in FIFO
>    or in time order. The packets go out in time order anyway.


Mapping your variants onto the current qdisc's setup arguments:

The above is:
- sorting off
- offload on

(I call it a 'raw' fifo as a reference to the usage of qdisc_enqueue_tail() and
qdisc_dequeue_head(), basically.)


> 
> 2) Single packet hardware offload
> 
>    What you do here is to schedule a hrtimer a bit earlier than the first
>    packet tx time and when it fires stick the packet into the hardware and
>    rearm the timer for the next one.


The above is:
- sorting on
- offload on

right?


So, are you just opposing the case where sorting off + offload off is used?
(i.e. the scheduled FIFO case)



> 
>    The whole point of TSN with hardware support is that you have:
> 
>        - Global network time
> 
>        and
> 
>        - Frequency adjustment of the system time base
> 
>     PTP is TAI based and the kernel exposes clock TAI directly through
>     hrtimers. You don't need dynamic clocks for that.
> 
>     You can even use clock MONOTONIC as it basically is just
> 
>        TAI - offset
> 
> If the network card uses anything other than TAI or a time stamp with a
> strict correlation to TAI for actual TX scheduling then the whole thing is
> broken to begin with.


Sure, I agree.

Thanks,
Jesus

Thomas Gleixner March 23, 2018, 8:49 a.m. UTC | #8
On Thu, 22 Mar 2018, Jesus Sanchez-Palencia wrote:
> On 03/22/2018 03:11 PM, Thomas Gleixner wrote:
> So, are you just opposing the case where sorting off + offload off is used?
> (i.e. the scheduled FIFO case)

FIFO does not make any sense if your packets have a fixed transmission
time. I have yet to see a reasonable explanation why FIFO in the context of
time-ordered transmission would be a good thing.

Thanks,

	tglx
Jesus Sanchez-Palencia March 23, 2018, 11:34 p.m. UTC | #9
Hi Thomas,


On 03/23/2018 01:49 AM, Thomas Gleixner wrote:
> On Thu, 22 Mar 2018, Jesus Sanchez-Palencia wrote:
>> On 03/22/2018 03:11 PM, Thomas Gleixner wrote:
>> So, are you just opposing the case where sorting off + offload off is used?
>> (i.e. the scheduled FIFO case)
> 
> FIFO does not make any sense if your packets have a fixed transmission
> time. I have yet to see a reasonable explanation why FIFO in the context of
> time-ordered transmission would be a good thing.


In the context of tbs, the scheduled FIFO was developed just to keep
consistency between all 4 variants (sw best-effort or hw offload vs sorting
enabled or sorting disabled).

I don't have any strong argument in favor of this mode at the moment, so I will
just remove it in the next version - unless someone else brings up a valid use
case for it, of course.

Thanks for the feedback,
Jesus
Jesus Sanchez-Palencia March 24, 2018, 12:34 a.m. UTC | #10
Hi,


On 03/22/2018 03:52 PM, Thomas Gleixner wrote:
> On Thu, 22 Mar 2018, Jesus Sanchez-Palencia wrote:
>> Our plan was to work directly with the Qbv-like scheduling (per-port) just after
>> the cbs qdisc (Qav), but the feedback here and offline was that there were use
>> cases for a more simplistic launchtime approach (per-queue) as well. We've
>> decided to invest in it first (and postpone the 'taprio' qdisc until there was
>> a NIC available with HW support for it, basically).
> 
> I missed that discussion due to other urgent stuff on my plate. Just
> skimmed through it. More below.
> 
>> You are right, and we agree, that using tbs for a per-port schedule of any sort
>> will require a SW scheduler to be developed on top of it, but we've never said
>> the contrary either. Our vision has always been that these are separate
>> mechanisms with different use-cases, so we do see the value for the kernel to
>> provide both.
>>
>> In other words, tbs is not the final solution for Qbv, and we agree that a 'TAS'
>> qdisc is still necessary. And due to the wide range of applications and hw being
>> used for those out there, we need both, especially given that one does not block
>> the other.
> 
> So what's the plan for this? Having TAS as a separate entity or TAS feeding
> into the proposed 'basic' time transmission thing?


The second one, I guess. Elaborating, the plan is to at some point have TAS as a
separate entity which can use tbs for one of its classes (and cbs for
another, and strict priority for everything else, etc).

Basically, the design would be something along the lines of 'taprio'. A root qdisc
that is both time and priority aware, and capable of running a schedule for the
port. That schedule can run inside the kernel with hrtimers, or just be
offloaded into the controller if Qbv is supported on HW.

Because it would expose the inner traffic classes in a mq / mqprio / prio style,
it would allow for other per-queue qdiscs to be attached to it. On a system
using the i210, for instance, we could then have tbs installed on traffic class
0 with hw offload enabled. The Qbv schedule would be running in SW on the TAS
entity (i.e. 'taprio'), which would be setting the packets' txtime before
dequeueing packets on a fast path -> tbs -> NIC.

Similarly, other qdiscs, like cbs, could be installed if all that traffic class
requires is traffic shaping once its 'gate' is allowed to execute the selected
tx algorithm attached to it.
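
In tc terms, that stack could look something like the sketch below
(hypothetical: 'taprio' and its options did not exist at this point; only
the tbs parameters appear in this series):

$ tc qdisc replace dev enp2s0 parent root handle 100 taprio num_tc 3 \
           map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 \
           <gate schedule options>
$ tc qdisc add dev enp2s0 parent 100:1 tbs offload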



> 
> The general objection I have with the current approach is that it creates
> the playground for all flavours of misdesigned user space implementations
> and just replaces the home-brewed and ugly user mode network adapter
> drivers.
> 
> But that's not helping the cause at all. There is enough crappy stuff out
> there already and I'd rather see a properly designed slice management which can
> be utilized and improved by all involved parties.
> 
> All variants which utilize the basic time driven packet transmission are
> based on periodic explicit plan scheduling with (local) network wide time
> slice assignment.
> 
> It does not matter whether you feed VLAN traffic into a time slice, where
> the VLAN itself does not even have to know about it, or if you have aware
> applications feeding packets to a designated timeslot. The basic principle
> of this is always the same.
> 
> So coming back to last year's discussion. It totally went into the wrong
> direction because it turned from an approach (the patches) which came from
> the big picture to a single use case and application centric view. That's
> just wrong and I regret that I didn't have the time to pay attention back
> then.
> 
> You always need to look at the big picture first and design from there, not
> the other way round. There will always be the argument:
> 
>     But my application is special and needs X
> 
> It's easy to fall for that. From long experience I know that none of
> these claims ever held. These arguments are made because the people making
> them have either never looked at the big picture or are simply refusing to
> do so because it would cause them work.
> 
> If you start from the use case and application centric view and ignore the
> big picture then you end up in a gazillion of extra magic features over
> time which could have been completely avoided if you had put your foot down
> and made everyone agree on a proper and versatile design in the first
> place.
> 
> The more low level access you hand out in the beginning the less commonly
> used, improved and maintained infrastructure you will get in the end. That
> has happened before in other areas and it will happen here as well. You
> create a user space ABI which you can't get rid of and before you come out
> with the proper interface after that a large number of involved parties
> have gone off and implemented on top of the low level ABI and they will
> never look back.
> 
> In the (not so) long run this will create a lot more issues than it
> solves. A simple example is that you cannot run two applications which
> easily could share the network in parallel without major surgery because
> both require to be the management authority.
> 
> I've not yet seen a convincing argument why this low level stuff with all
> of its weird flavours is superior to something which reflects the basic
> operating principle of TSN.


As you know, not all TSN systems are designed the same. Take AVB systems, for
example. These are not always running on networks that are aware of any time
schedule, or at least not quite like what is described by Qbv.

On those systems there is usually a certain number of streams with different
priorities that care mostly about having their bandwidth reserved along the
network. The applications running on such systems are usually based on AVTP,
thus they already have to calculate and set the "avtp presentation time"
per-packet themselves. A Qbv scheduler would probably provide very little
benefits to this domain, IMHO. For "talkers" of these AVB systems, shaping
traffic using txtime (i.e. tbs) can provide a low-jitter alternative to cbs, for
instance.


Thanks,
Jesus

Thomas Gleixner March 25, 2018, 11:46 a.m. UTC | #11
On Fri, 23 Mar 2018, Jesus Sanchez-Palencia wrote:
> On 03/22/2018 03:52 PM, Thomas Gleixner wrote:
> > So what's the plan for this? Having TAS as a separate entity or TAS feeding
> > into the proposed 'basic' time transmission thing?
> 
> The second one, I guess.

That's just wrong. It won't work. See below.

> Elaborating, the plan is at some point having TAS as a separate entity,
> but which can use tbs for one of its classes (and cbs for another, and
> strict priority for everything else, etc).
>
> Basically, the design would be something along the lines of 'taprio'. A root qdisc
> that is both time and priority aware, and capable of running a schedule for the
> port. That schedule can run inside the kernel with hrtimers, or just be
> offloaded into the controller if Qbv is supported on HW.
> 
> Because it would expose the inner traffic classes in a mq / mqprio / prio style,
> it would allow for other per-queue qdiscs to be attached to it. On a system
> using the i210, for instance, we could then have tbs installed on traffic class
> 0 with hw offload enabled. The Qbv schedule would be running in SW on the TAS
> entity (i.e. 'taprio'), which would be setting the packets' txtime before
> dequeueing packets on a fast path -> tbs -> NIC.
> 
> Similarly, other qdiscs, like cbs, could be installed if all that traffic class
> requires is traffic shaping once its 'gate' is allowed to execute the selected
> tx algorithm attached to it.
> 
> > I've not yet seen a convincing argument why this low level stuff with all
> > of its weird flavours is superior to something which reflects the basic
> > operating principle of TSN.
> 
> 
> As you know, not all TSN systems are designed the same. Take AVB systems, for
> example. These are not always running on networks that are aware of any time
> schedule, or at least not quite like what is described by Qbv.
> 
> On those systems there is usually a certain number of streams with different
> priorities that care mostly about having their bandwidth reserved along the
> network. The applications running on such systems are usually based on AVTP,
> thus they already have to calculate and set the "avtp presentation time"
> per-packet themselves. A Qbv scheduler would probably provide very little
> benefits to this domain, IMHO. For "talkers" of these AVB systems, shaping
> traffic using txtime (i.e. tbs) can provide a low-jitter alternative to cbs, for
> instance.

You're looking at it from particular use cases and trying to accommodate
them in the simplest possible way. I don't think that cuts it.

Let's take a step back and look at it from a more general POV without
trying to make it fit to any of the standards first. I'm deliberately NOT
using any of the standard defined terms.

At the (local) network level you always have an explicit plan. This plan
might range from no plan at all to a very elaborate plan which is strict
about when each node is allowed to TX a particular class of packets.

So let's assume we have the following picture:

   	       	  [NIC]
		    |
	 [ Time slice manager ]

Now in the simplest case, the time slice manager has no constraints and
exposes a single input which allows the application to say: "Send my packet
at time X". There is no restriction on 'time X' except if there is a time
collision with an already queued packet or the requested TX time has
already passed. That's close to what you implemented.

  Is the TX timestamp which you defined in the user space ABI a fixed
  scheduling point or is it a deadline?

  That's an important distinction and for this all to work across various
  use cases you need a way to express that in the ABI. It might be an
  implicit property of the socket/channel to which the application connects,
  but still you want to express it from the application side to do
  proper sanity checking.

  Just think about stuff like audio/video streaming. The point of
  transmission does not have to be fixed if you have some intelligent
  controller at the receiving end which can buffer stuff. The only relevant
  information is the deadline, i.e. the latest point in time where the
  packet needs to go out on the wire in order to keep the stream steady at
  the consumer side. Having the notion of a deadline and that's the only
  thing the provider knows about allows you proper utilization by using an
  appropriate scheduling algorithm like EDF.

  Contrary to that you want very explicit TX points for applications like
  automation control. For this kind of use case there is no wiggle room, it
  has to go out at a fixed time because that's the way control systems
  work.

  This is missing right now and you want to get that right from the very
  beginning. Duct taping it on the interface later on is a bad idea.

Now let's go one step further and create two time slices for whatever
purpose still on the single node (not network wide). You want to do that
because you want temporal separation of services. The reason might be
bandwidth guarantee, collision avoidance or whatever.

  How does the application which was written for the simple manager which
  had no restrictions learn about this?

  Does it learn it the hard way because now the packets which fall into the
  reserved timeslice are rejected? The way you created your interface, the
  answer is yes. That's patently bad as it requires changing the
  application once it runs on a partitioned node.

  So you really want a way for the application to query the timing
  constraints and perhaps other properties of the channel it connects
  to. And you want that now before the first application starts to use the
  new ABI. If the application developer does not use it, you still have to
  fix the application, but you have to fix it because the developer was a
  lazy bastard and not because the design was bad. That's a major
  difference.
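
  Purely as an illustration (every name below is invented, nothing of this
  exists in the series), such a query could be as simple as:

	struct txtime_channel_props {
		__u64 slice_start;	/* ns, absolute, channel clock */
		__u64 slice_period;	/* ns */
		__u64 slice_length;	/* ns */
		__u32 txtime_mode;	/* fixed TX point vs. deadline */
	};

	getsockopt(fd, SOL_SOCKET, SO_TXTIME_PROPS, &props, &len);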

Now that we have two time slices, I'm coming back to your idea of having
your proposed qdisc as the entity which sits right at the network
interface. Let's assume the following:

   [Slice 1: Timed traffic ] [Slice 2: Other Traffic]

  Let's assume further that 'Other traffic' has no idea about time slices at
  all. It's just stuff like ssh, http, etc. So if you keep that design

       	         [ NIC ]
  	            |
           [ Time slice manager ]
	       |          |
     [ Timed traffic ]  [ Other traffic ]

  feeding into your proposed TBS thingy, then in case of underutilization
  of the 'Timed traffic' slot you prevent utilization of the remaining time,
  because you cannot pull 'Other traffic' into the empty slots: 'Other
  traffic' is restricted to Slice 2 and 'Timed traffic' does not know about
  'Other traffic' at all. And no, you cannot make TBS magically pull packets
  from 'Other traffic' just because it's not designed for it. So your design
  becomes strictly partitioned and forces underutilization.

  That becomes even worse when you switch to the proposed full hardware
  offloading scheme. In that case the only way to do admission control is
  the TX time of the farthest out packet which is already queued. That
  might work for a single application which controls all of the network
  traffic, but it wont ever work for something more flexible. The more I
  think about it the less interesting full hardware offload becomes. It's
  nice if you have a fully strict scheduling plan for everything, but then
  your admission control is bogus once you have more than one channel as
  input. So yes, it can be used when the card supports it and you have
  other ways to enforce admission control w/o hurting utilization or if you
  don't care about utilization at all. It's also useful for channels which
  are strictly isolated and have a defined TX time. Such traffic can be
  directly fed into the hardware.

Coming back to the overall scheme. If you start upfront with a time slice
manager which is designed to:

  - Handle multiple channels

  - Expose the time constraints, properties per channel

then you can fit all kinds of use cases, whether designed by committee or
not. You can configure that thing per node or network wide. It does not
make a difference. The only difference are the resulting constraints.

We really want to accommodate everything between the 'no restrictions' and
the 'full network wide explicit plan' case. And it's not rocket science
once you realize that the 'no restrictions' case is just a subset of the
'full network wide explicit plan' simply because it exposes a single
channel where:

	slice period = slice length.

It's that easy, but at the same time you teach the application from the
very beginning to ask for the time constraints so if it runs on a more
sophisticated system/network, then it will see a different slice period and
a different slice length and can accommodate or react in a useful way
instead of just dying on the 17th packet it tries to send because it is
rejected.

We really want to design for this as we want to be able to run the video
stream on the same node and network which does robot control without
changing the video application. That's not a theoretical problem. These use
cases exist today, but they are forced to use different networks for the
two. But if you look at the utilization of both then they very well fit
into one and industry certainly wants to go for that.

That implies that you need constraint aware applications from the very
beginning and that requires a proper ABI in the first place. The proposed
ad hoc mode does not qualify. Please be aware that you are creating a user
space ABI and not a random in kernel interface which can be changed at any
given time.

So let's look once more at the picture in an abstract way:

     	       [ NIC ]
	          |
	 [ Time slice manager ]
	    |           |
         [ Ch 0 ] ... [ Ch N ]

So you have a bunch of properties here:

1) Number of Channels ranging from 1 to N

2) Start point, slice period and slice length per channel

3) Queueing modes assigned per channel. Again that might be anything from
   'feed through' over FIFO, PRIO to more complex things like EDF.

   The queueing mode can also influence properties like the meaning of the
   TX time, i.e. strict or deadline.

Please sit back and map your use cases, standards or whatever you care
about into the above and I would be very surprised if they don't fit.

Thanks,

	tglx
Jesus Sanchez-Palencia March 27, 2018, 11:26 p.m. UTC | #12
Hi Thomas,


On 03/25/2018 04:46 AM, Thomas Gleixner wrote:
> On Fri, 23 Mar 2018, Jesus Sanchez-Palencia wrote:
>> On 03/22/2018 03:52 PM, Thomas Gleixner wrote:
>>> So what's the plan for this? Having TAS as a separate entity or TAS feeding
>>> into the proposed 'basic' time transmission thing?
>>
>> The second one, I guess.
>
> That's just wrong. It won't work. See below.

Yes, our proposal does not handle the scenarios you are bringing into the
discussion.

I think we have more points of convergence than divergence already. I will just
go through some pieces of the discussion first, and then let's see if we can
agree on where we are trying to get.



>
>> Elaborating, the plan is at some point having TAS as a separate entity,
>> but which can use tbs for one of its classes (and cbs for another, and
>> strict priority for everything else, etc).
>>
>> Basically, the design would be something along the lines of 'taprio'. A root qdisc
>> that is both time and priority aware, and capable of running a schedule for the
>> port. That schedule can run inside the kernel with hrtimers, or just be
>> offloaded into the controller if Qbv is supported on HW.
>>
>> Because it would expose the inner traffic classes in a mq / mqprio / prio style,
>> it would allow for other per-queue qdiscs to be attached to it. On a system
>> using the i210, for instance, we could then have tbs installed on traffic class
>> 0 with hw offload enabled. The Qbv schedule would be running in SW on the TAS
>> entity (i.e. 'taprio'), which would be setting the packets' txtime before
>> dequeueing packets on a fast path -> tbs -> NIC.
>>
>> Similarly, other qdiscs, like cbs, could be installed if all that traffic class
>> requires is traffic shaping once its 'gate' is allowed to execute the selected
>> tx algorithm attached to it.
>>
>>> I've not yet seen a convincing argument why this low level stuff with all
>>> of its weird flavours is superior to something which reflects the basic
>>> operating principle of TSN.
>>
>>
>> As you know, not all TSN systems are designed the same. Take AVB systems, for
>> example. These are not always running on networks that are aware of any time
>> schedule, or at least not quite like what is described by Qbv.
>>
>> On those systems there is usually a certain number of streams with different
>> priorities that care mostly about having their bandwidth reserved along the
>> network. The applications running on such systems are usually based on AVTP,
>> thus they already have to calculate and set the "avtp presentation time"
>> per-packet themselves. A Qbv scheduler would probably provide very little
>> benefits to this domain, IMHO. For "talkers" of these AVB systems, shaping
>> traffic using txtime (i.e. tbs) can provide a low-jitter alternative to cbs, for
>> instance.
>
> You're looking at it from particular use cases and trying to accommodate
> them in the simplest possible way. I don't think that cuts it.
>
> Let's take a step back and look at it from a more general POV without
> trying to make it fit to any of the standards first. I'm deliberately NOT
> using any of the standard defined terms.
>
> At the (local) network level you have always an explicit plan. This plan
> might range from no plan at all to a very elaborate plan which is strict
> about when each node is allowed to TX a particular class of packets.


Ok, we are aligned here.


>
> So lets assume we have the following picture:
>
>    	       	  [NIC]
> 		    |
> 	 [ Time slice manager ]
>
> Now in the simplest case, the time slice manager has no constraints and
> exposes a single input which allows the application to say: "Send my packet
> at time X". There is no restriction on 'time X' except if there is a time
> collision with an already queued packet or the requested TX time has
> already passed. That's close to what you implemented.
>
>   Is the TX timestamp which you defined in the user space ABI a fixed
>   scheduling point or is it a deadline?
>
>   That's an important distinction and for this all to work across various
>   use cases you need a way to express that in the ABI. It might be an
>   implicit property of the socket/channel to which the application connects
>   but still you want to express it from the application side to do
>   proper sanity checking.
>
>   Just think about stuff like audio/video streaming. The point of
>   transmission does not have to be fixed if you have some intelligent
>   controller at the receiving end which can buffer stuff. The only relevant
>   information is the deadline, i.e. the latest point in time where the
>   packet needs to go out on the wire in order to keep the stream steady at
>   the consumer side. Having the notion of a deadline and that's the only
>   thing the provider knows about allows you proper utilization by using an
>   appropriate scheduling algorithm like EDF.
>
>   Contrary to that you want very explicit TX points for applications like
>   automation control. For this kind of use case there is no wiggle room, it
>   has to go out at a fixed time because that's the way control systems
>   work.
>
>   This is missing right now and you want to get that right from the very
>   beginning. Duct taping it on the interface later on is a bad idea.


Agreed that this is needed. On the SO_TXTIME + tbs proposal, I believe it's been
covered by the (per-packet) SCM_DROP_IF_LATE. Do you think we need a different
mechanism for expressing that?


>
> Now lets go one step further and create two time slices for whatever
> purpose still on the single node (not network wide). You want to do that
> because you want temporal separation of services. The reason might be
> bandwidth guarantee, collision avoidance or whatever.
>
>   How does the application which was written for the simple manager which
>   had no restrictions learn about this?
>
>   Does it learn it the hard way because now the packets which fall into the
>   reserved timeslice are rejected? The way you created your interface, the
>   answer is yes. That's patently bad as it requires to change the
>   application once it runs on a partitioned node.
>
>   So you really want a way for the application to query the timing
>   constraints and perhaps other properties of the channel it connects
>   to. And you want that now before the first application starts to use the
>   new ABI. If the application developer does not use it, you still have to
>   fix the application, but you have to fix it because the developer was a
>   lazy bastard and not because the design was bad. That's a major
>   difference.


Ok, this is something that we have considered in the past, but then the feedback
here drove us in a different direction. The overall input we got here was that
applications would have to be adjusted or that userspace would have to handle
the coordination between applications somehow (e.g.: a daemon could be developed
separately to accommodate the fully dynamic use-cases, etc).


>
> Now that we have two time slices, I'm coming back to your idea of having
> your proposed qdisc as the entity which sits right at the network
> interface. Lets assume the following:
>
>    [Slice 1: Timed traffic ] [Slice 2: Other Traffic]
>
>   Lets assume further that 'Other traffic' has no idea about time slices at
>   all. It's just stuff like ssh, http, etc. So if you keep that design
>
>        	         [ NIC ]
>   	            |
>            [ Time slice manager ]
> 	       |          |
>      [ Timed traffic ]  [ Other traffic ]
>
>   feeding into your proposed TBS thingy, then in case of underutilization
>   of the 'Timed traffic' slot you cannot reclaim the remaining time by
>   pulling 'Other traffic' into the empty slots, because 'Other traffic' is
>   restricted to Slice 2 and 'Timed traffic' does not know about 'Other
>   traffic' at all. And no, you cannot make TBS magically pull packets from
>   'Other traffic' just because it's not designed for it. So your design
>   becomes strictly partitioned and forces underutilization.
>
>   That's becoming even worse, when you switch to the proposed full hardware
>   offloading scheme. In that case the only way to do admission control is
>   the TX time of the farthest out packet which is already queued. That
>   might work for a single application which controls all of the network
>   traffic, but it won't ever work for something more flexible. The more I
>   think about it the less interesting full hardware offload becomes. It's
>   nice if you have a fully strict scheduling plan for everything, but then
>   your admission control is bogus once you have more than one channel as
>   input. So yes, it can be used when the card supports it and you have
>   other ways to enforce admission control w/o hurting utilization or if you
>   don't care about utilization at all. It's also useful for channels which
>   are strictly isolated and have a defined TX time. Such traffic can be
>   directly fed into the hardware.


This is a new requirement for the entire discussion.

If I'm not missing anything, however, underutilization of the time slots is only
a problem:

1) for the fully dynamic use-cases; and
2) because now you are designing applications in terms of time slices, right?

We have not thought of making any of the proposed qdiscs capable of (optionally)
adjusting the "time slices", but mainly because this is not a problem we had
here before. Our assumption was that per-port Tx schedules would only be used
for static systems. In other words, no, we didn't think that re-balancing the
slots was a requirement, not even for 'taprio'.


>
> Coming back to the overall scheme. If you start upfront with a time slice
> manager which is designed to:
>
>   - Handle multiple channels
>
>   - Expose the time constraints, properties per channel
>
> then you can fit all kinds of use cases, whether designed by committee or
> not. You can configure that thing per node or network wide. It does not
> make a difference. The only difference are the resulting constraints.


Ok, and I believe the above was covered by what we had proposed before, unless
what you meant by time constraints is beyond the configured port schedule.

Are you suggesting that we'll need to have a kernel entity that is not only
aware of the current traffic classes 'schedule', but also of the resources that
are still available for new streams to be accommodated into the classes? Putting
it differently, is the TAS you envision just an entity that runs a schedule, or
is it a time-aware 'orchestrator'?


>
> We really want to accommodate everything between the 'no restrictions' and
> the 'full network wide explicit plan' case. And it's not rocket science
> once you realize that the 'no restrictions' case is just a subset of the
> 'full network wide explicit plan' simply because it exposes a single
> channel where:
>
> 	slice period = slice length.
>
> It's that easy, but at the same time you teach the application from the
> very beginning to ask for the time constraints so if it runs on a more
> sophisticated system/network, then it will see a different slice period and
> a different slice length and can accommodate or react in a useful way
> instead of just dying on the 17th packet it tries to send because it is
> rejected.


Ok.


>
> We really want to design for this as we want to be able to run the video
> stream on the same node and network which does robot control without
> changing the video application. That's not a theoretical problem. These use
> cases exist today, but they are forced to use different networks for the
> two. But if you look at the utilization of both then they very well fit
> into one and industry certainly wants to go for that.
>
> That implies that you need constraint aware applications from the very
> beginning and that requires a proper ABI in the first place. The proposed
> ad hoc mode does not qualify. Please be aware, that you are creating a user
> space ABI and not a random in kernel interface which can be changed at any
> given time.
>
> So lets look once more at the picture in an abstract way:
>
>      	       [ NIC ]
> 	          |
> 	 [ Time slice manager ]
> 	    |           |
>          [ Ch 0 ] ... [ Ch N ]
>
> So you have a bunch of properties here:
>
> 1) Number of Channels ranging from 1 to N
>
> 2) Start point, slice period and slice length per channel

Ok, so we agree that a TAS entity is needed. Assuming that channels are traffic
classes, do you have something else in mind other than a new root qdisc?


>
> 3) Queueing modes assigned per channel. Again that might be anything from
>    'feed through' over FIFO, PRIO to more complex things like EDF.
>
>    The queueing mode can also influence properties like the meaning of the
>    TX time, i.e. strict or deadline.


Ok, but how are the queueing modes assigned / configured per channel?

Just to make sure we re-visit some ideas from the past:

* TAS:

   The idea we are currently exploring is to add a "time-aware", priority based
   qdisc, that also exposes the Tx queues available and provides a mechanism for
   mapping priority <-> traffic class <-> Tx queues in a similar fashion as
   mqprio. We are calling this qdisc 'taprio', and its 'tc' cmd line would be:

   $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4 \
         map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \
         queues 0 1 2 3 \
         sched-file gates.sched [base-time <interval>] \
         [cycle-time <interval>] [extension-time <interval>]

   <file> is multi-line, with each line being of the following format:
   <cmd> <gate mask> <interval in nanoseconds>

   Qbv only defines one <cmd>: "S" for 'SetGates'

   For example:

   S 0x01 300
   S 0x03 500

   This means that there are two intervals, the first will have the gate
   for traffic class 0 open for 300 nanoseconds, the second will have
   both traffic classes open for 500 nanoseconds.


It would handle multiple channels and expose their constraints / properties.
Each channel also becomes a traffic class, so other qdiscs can be attached to
them separately.


So, in summary, because our entire design is based on qdisc interfaces, what we
had proposed was a root qdisc (the time slice manager, as you put it) that allows
for other qdiscs to be attached to each channel. The inner qdiscs define the
queueing modes for each channel, and tbs is just one of those modes. I
understand now that you want to allow for fully dynamic use-cases to be
supported as well, which we hadn't covered with our TAS proposal before because
we hadn't envisioned it being used for these systems' design.

Have I missed anything?

Thanks,
Jesus



>
> Please sit back and map your use cases, standards or whatever you care
> about into the above and I would be very surprised if they don't fit.
>
> Thanks,
>
> 	tglx
>
>
>
>
Thomas Gleixner March 28, 2018, 7:48 a.m. UTC | #13
Jesus,

On Tue, 27 Mar 2018, Jesus Sanchez-Palencia wrote:
> On 03/25/2018 04:46 AM, Thomas Gleixner wrote:
> >   This is missing right now and you want to get that right from the very
> >   beginning. Duct taping it on the interface later on is a bad idea.
> 
> Agreed that this is needed. On the SO_TXTIME + tbs proposal, I believe it's been
> covered by the (per-packet) SCM_DROP_IF_LATE. Do you think we need a different
> mechanism for expressing that?

Uuurgh. No. DROP_IF_LATE is just crap to be honest.

There are two modes:

      1) Send at the given TX time (Explicit mode)

      2) Send before given TX time (Deadline mode)

There is no need to specify 'drop if late' simply because if the message is
handed in past the given TX time, it's too late by definition. What you are
trying to implement is a hybrid of TSN and general purpose (not time aware)
networking in one go. And you do that because your overall design is not
looking at the big picture. You designed from a given use case assumption
and tried to fit other things into it with duct tape.
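
Roughly, as a sketch (helper names assumed; none of this is from the
posted patches), the dequeue-time check would differ like this:

  /* Sketch only: may an skb with user-supplied 'txtime' be released
   * at 'now'? 'delta' is the qdisc's configured fudge factor.
   */
  static bool txtime_due(ktime_t txtime, ktime_t now, s32 delta,
                         bool deadline_mode)
  {
          if (deadline_mode)
                  /* deadline mode: anything up to the TX time is fine */
                  return ktime_before(now, txtime);

          /* explicit mode: release within [txtime - delta, txtime] */
          return !ktime_before(now, ktime_sub_ns(txtime, delta)) &&
                 !ktime_after(now, txtime);
  }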

> >   So you really want a way for the application to query the timing
> >   constraints and perhaps other properties of the channel it connects
> >   to. And you want that now before the first application starts to use the
> >   new ABI. If the application developer does not use it, you still have to
> >   fix the application, but you have to fix it because the developer was a
> >   lazy bastard and not because the design was bad. That's a major
> >   difference.
> 
> Ok, this is something that we have considered in the past, but then the feedback
> here drove us in a different direction. The overall input we got here was that
> applications would have to be adjusted or that userspace would have to handle
> the coordination between applications somehow (e.g.: a daemon could be developed
> separately to accommodate the fully dynamic use-cases, etc).

The only thing which will happen is that you get applications which insist on
controlling the full interface themselves because they are so important and
the only ones which get it right. Good luck with fixing them up.

That extra daemon, if it ever surfaces, will be just a PITA. Think about
20kHz control loops. Do you really want queueing, locking, several context
switches and priority configuration nightmares in such a scenario?
Definitely not! You want a fast channel directly to the root qdisc which
takes care of getting it out at the right point, which might be immediate
handover if the adapter supports hw scheduling.

> This is a new requirement for the entire discussion.
> 
> If I'm not missing anything, however, underutilization of the time slots is only
> a problem:
> 
> 1) for the fully dynamic use-cases; and
> 2) because now you are designing applications in terms of time slices, right?

No. It's a general problem. I'm not designing applications in terms of time
slices. Time slices are a fundamental property of TSN. Whether you use them
for explicit scheduling or bandwidth reservation or make them flat does not
matter.

The application does not necessarily need to know about the time
constraints at all. But if it wants to use timed scheduling then it had
better know about them.

> We have not thought of making any of the proposed qdiscs capable of (optionally)
> adjusting the "time slices", but mainly because this is not a problem we had
> here before. Our assumption was that per-port Tx schedules would only be used
> for static systems. In other words, no, we didn't think that re-balancing the
> slots was a requirement, not even for 'taprio'.

Sigh. Utilization is not something entirely new in the network space. I'm
not saying that this needs to be implemented right away, but designing it
in a way which forces underutilization is just wrong.

> > Coming back to the overall scheme. If you start upfront with a time slice
> > manager which is designed to:
> >
> >   - Handle multiple channels
> >
> >   - Expose the time constraints, properties per channel
> >
> > then you can fit all kind of use cases, whether designed by committee or
> > not. You can configure that thing per node or network wide. It does not
> > make a difference. The only difference are the resulting constraints.
> 
>
> Ok, and I believe the above was covered by what we had proposed before, unless
> what you meant by time constraints is beyond the configured port schedule.
>
> Are you suggesting that we'll need to have a kernel entity that is not only
> aware of the current traffic classes 'schedule', but also of the resources that
> are still available for new streams to be accommodated into the classes? Putting
> it differently, is the TAS you envision just an entity that runs a schedule, or
> is it a time-aware 'orchestrator'?

In the first place it's something which runs a defined schedule.

The accommodation for new streams is required, but not necessarily at the
root qdisc level. That might be a qdisc feeding into it.

Assume you have a bandwidth reservation, aka time slot, for audio. If your
audio related qdisc does deadline scheduling then you can add new streams
to it up to the point where it's no longer able to fit.
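
As a rough illustration (all names assumed, overflow handling omitted),
such an admission test can be a plain utilization check:

  /* Sketch only: admit a stream sending 'bytes' every 'period_ns' into
   * a class whose slice provides 'rate_bps'. Utilization is tracked in
   * parts per million and must not exceed 100%.
   */
  static bool class_admit(u64 *util_ppm, u64 bytes, u64 period_ns,
                          u64 rate_bps)
  {
          u64 tx_ns = div64_u64(bytes * 8 * NSEC_PER_SEC, rate_bps);
          u64 add = div64_u64(tx_ns * 1000000ULL, period_ns);

          if (*util_ppm + add > 1000000ULL)
                  return false;
          *util_ppm += add;
          return true;
  }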

The only thing which might be needed at the root qdisc is the ability to
utilize unused time slots for other purposes, but that's not required to be
there in the first place as long as it's designed in a way that it can be
added later on.

> > So lets look once more at the picture in an abstract way:
> >
> >      	       [ NIC ]
> > 	          |
> > 	 [ Time slice manager ]
> > 	    |           |
> >          [ Ch 0 ] ... [ Ch N ]
> >
> > So you have a bunch of properties here:
> >
> > 1) Number of Channels ranging from 1 to N
> >
> > 2) Start point, slice period and slice length per channel
> 
> Ok, so we agree that a TAS entity is needed. Assuming that channels are traffic
> classes, do you have something else in mind other than a new root qdisc?

Whatever you call it, the important point is that it is the gatekeeper to
the network adapter and there is no way around it. It fully controls the
timed schedule, however simple or complex it may be.

> > 3) Queueing modes assigned per channel. Again that might be anything from
> >    'feed through' over FIFO, PRIO to more complex things like EDF.
> >
> >    The queueing mode can also influence properties like the meaning of the
> >    TX time, i.e. strict or deadline.
> 
> 
> Ok, but how are the queueing modes assigned / configured per channel?
> 
> Just to make sure we re-visit some ideas from the past:
> 
> * TAS:
> 
>    The idea we are currently exploring is to add a "time-aware", priority based
>    qdisc, that also exposes the Tx queues available and provides a mechanism for
>    mapping priority <-> traffic class <-> Tx queues in a similar fashion as
>    mqprio. We are calling this qdisc 'taprio', and its 'tc' cmd line would be:
> 
>    $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4 \
>          map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \
>          queues 0 1 2 3 \
>          sched-file gates.sched [base-time <interval>] \
>          [cycle-time <interval>] [extension-time <interval>]
> 
>    <file> is multi-line, with each line being of the following format:
>    <cmd> <gate mask> <interval in nanoseconds>
> 
>    Qbv only defines one <cmd>: "S" for 'SetGates'
> 
>    For example:
> 
>    S 0x01 300
>    S 0x03 500
> 
>    This means that there are two intervals, the first will have the gate
>    for traffic class 0 open for 300 nanoseconds, the second will have
>    both traffic classes open for 500 nanoseconds.

To accommodate stuff like control systems you also need a baseline, which
is not expressed as interval. Otherwise you can't schedule network wide
explicit plans. That's either an absolute network-time (TAI) time stamp or
an offset to a well defined network-time (TAI) time stamp, e.g. start of
epoch or something else which is agreed on. The actual schedule then fast
forwards past now (TAI) and sets up the slots from there. That makes node
hotplug possible as well.
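
That fast-forward step is cheap. As a sketch (names assumed):

  /* Sketch only: first slot boundary at or after 'now_ns' for a cyclic
   * schedule anchored at the absolute TAI timestamp 'base_ns'.
   */
  static u64 sched_first_edge(u64 base_ns, u64 cycle_ns, u64 now_ns)
  {
          u64 n;

          if (now_ns <= base_ns)
                  return base_ns;

          n = div64_u64(now_ns - base_ns, cycle_ns);
          return base_ns + (n + 1) * cycle_ns;
  }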

Btw, it's not only control systems. Think about complex multi source A/V
streams. They are reality in recording and live mixing and looking at the
timing constraints of such scenarios, collision avoidance is key there. So
you want to be able to do network wide traffic orchestration.

> It would handle multiple channels and expose their constraints / properties.
> Each channel also becomes a traffic class, so other qdiscs can be attached to
> them separately.

Right.

> So, in summary, because our entire design is based on qdisc interfaces, what we
> had proposed was a root qdisc (the time slice manager, as you put) that allows
> for other qdiscs to be attached to each channel. The inner qdiscs define the
> queueing modes for each channel, and tbs is just one of those modes. I
> understand now that you want to allow for fully dynamic use-cases to be
> supported as well, which we hadn't covered with our TAS proposal before because
> we hadn't envisioned it being used for these systems' design.

Yes, you have the root qdisc, which is in charge of the overall scheduling
plan, how complex or not it is defined does not matter. It exposes traffic
classes which have properties defined by the configuration.

The qdiscs which are attached to those traffic classes can be anything
including:

 - Simple feed through (Applications are time constraints aware and set the
   exact schedule). qdisc has admission control.

 - Deadline aware qdisc to handle e.g. A/V streams. Applications are aware
   of time constraints and provide the packet deadline. qdisc has admission
   control. This can be a simple first come, first served scheduler or
   something like EDF which allows optimized utilization. The qdisc sets
   the TX time depending on the deadline and feeds into the root.

 - FIFO/PRIO/XXX for general traffic. Applications do not know anything
   about timing constraints. These qdiscs obviously have neither admission
   control nor do they set a TX time.  The root qdisc just pulls from there
   when the assigned time slot is due or if it (optionally) decides to use
   underutilized time slots from other classes.

 - .... Add your favourite scheduling mode(s).

Thanks,

	tglx
Henrik Austad March 28, 2018, 1:07 p.m. UTC | #14
On Wed, Mar 28, 2018 at 09:48:05AM +0200, Thomas Gleixner wrote:
> Jesus,

Thomas, Jesus,

> On Tue, 27 Mar 2018, Jesus Sanchez-Palencia wrote:
> > On 03/25/2018 04:46 AM, Thomas Gleixner wrote:
> > >   This is missing right now and you want to get that right from the very
> > >   beginning. Duct taping it on the interface later on is a bad idea.
> > 
> > Agreed that this is needed. On the SO_TXTIME + tbs proposal, I believe it's been
> > covered by the (per-packet) SCM_DROP_IF_LATE. Do you think we need a different
> > mechanism for expressing that?
> 
> Uuurgh. No. DROP_IF_LATE is just crap to be honest.
> 
> There are two modes:
> 
>       1) Send at the given TX time (Explicit mode)
> 
>       2) Send before given TX time (Deadline mode)
> 
> There is no need to specify 'drop if late' simply because if the message is
> handed in past the given TX time, it's too late by definition. What you are
> trying to implement is a hybrid of TSN and general purpose (not time aware)
> networking in one go. And you do that because your overall design is not
> looking at the big picture. You designed from a given use case assumption
> and tried to fit other things into it with duct tape.

Yes, +1 to this. The whole point of bandwidth reservation is to not drop
frames; you should never, ever miss a deadline, and if you do, then your
admission tests are inadequate.

> > >   So you really want a way for the application to query the timing
> > >   constraints and perhaps other properties of the channel it connects
> > >   to. And you want that now before the first application starts to use the
> > >   new ABI. If the application developer does not use it, you still have to
> > >   fix the application, but you have to fix it because the developer was a
> > >   lazy bastard and not because the design was bad. That's a major
> > >   difference.
> > 
> > Ok, this is something that we have considered in the past, but then the feedback
> > here drove us in a different direction. The overall input we got here was that
> > applications would have to be adjusted or that userspace would have to handle
> > the coordination between applications somehow (e.g.: a daemon could be developed
> > separately to accommodate the fully dynamic use-cases, etc).
> 
> The only thing which will happen is that you get applications which insist on
> controlling the full interface themselves because they are so important and
> the only ones which get it right. Good luck with fixing them up.
> 
> That extra daemon, if it ever surfaces, will be just a PITA. Think about
> 20kHz control loops. Do you really want queueing, locking, several context
> switches and priority configuration nightmares in such a scenario?
> Definitely not! You want a fast channel directly to the root qdisc which
> takes care of getting it out at the right point, which might be immediate
> handover if the adapter supports hw scheduling.
> 
> > This is a new requirement for the entire discussion.
> > If I'm not missing anything, however, underutilization of the time slots is only
> > a problem:
> > 
> > 1) for the fully dynamic use-cases; and
> > 2) because now you are designing applications in terms of time slices, right?
> 
> No. It's a general problem. I'm not designing applications in terms of time
> slices. Time slices are a fundamental property of TSN. Whether you use them
> for explicit scheduling or bandwidth reservation or make them flat does not
> matter.
> 
> The application does not necessarily need to know about the time
> constraints at all. But if it wants to use timed scheduling then it had
> better know about them.

Yep, +1. In a lot of A/V cases here the application will have to know about
presentation_time, and the delay through the network stack should be "low
and deterministic"; but apart from that, the application shouldn't have to
care about SO_TXTIME or what other applications may or may not do.

> > We have not thought of making any of the proposed qdiscs capable of (optionally)
> > adjusting the "time slices", but mainly because this is not a problem we had
> > here before. Our assumption was that per-port Tx schedules would only be used
> > for static systems. In other words, no, we didn't think that re-balancing the
> > slots was a requirement, not even for 'taprio'.
> 
> Sigh. Utilization is not something entirely new in the network space. I'm
> not saying that this needs to be implemented right away, but designing it
> in a way which forces underutilization is just wrong.
> 
> > > Coming back to the overall scheme. If you start upfront with a time slice
> > > manager which is designed to:
> > >
> > >   - Handle multiple channels
> > >
> > >   - Expose the time constraints, properties per channel
> > >
> > then you can fit all kinds of use cases, whether designed by committee or
> > > not. You can configure that thing per node or network wide. It does not
> > > make a difference. The only difference are the resulting constraints.
> > 
> >
> > Ok, and I believe the above was covered by what we had proposed before, unless
> > what you meant by time constraints is beyond the configured port schedule.
> >
> > Are you suggesting that we'll need to have a kernel entity that is not only
> > aware of the current traffic classes 'schedule', but also of the resources that
> > are still available for new streams to be accommodated into the classes? Putting
> > it differently, is the TAS you envision just an entity that runs a schedule, or
> > is it a time-aware 'orchestrator'?
> 
> In the first place it's something which runs a defined schedule.
> 
> The accommodation for new streams is required, but not necessarily at the
> root qdisc level. That might be a qdisc feeding into it.
> 
> Assume you have a bandwidth reservation, aka time slot, for audio. If your
> audio related qdisc does deadline scheduling then you can add new streams
> to it up to the point where it's no longer able to fit.
> 
> The only thing which might be needed at the root qdisc is the ability to
> utilize unused time slots for other purposes, but that's not required to be
> there in the first place as long as it's designed in a way that it can be
> added later on.
> 
> > > So lets look once more at the picture in an abstract way:
> > >
> > >      	       [ NIC ]
> > > 	          |
> > > 	 [ Time slice manager ]
> > > 	    |           |
> > >          [ Ch 0 ] ... [ Ch N ]
> > >
> > > So you have a bunch of properties here:
> > >
> > > 1) Number of Channels ranging from 1 to N
> > >
> > > 2) Start point, slice period and slice length per channel
> > 
> > Ok, so we agree that a TAS entity is needed. Assuming that channels are traffic
> > classes, do you have something else in mind other than a new root qdisc?
> 
> Whatever you call it, the important point is that it is the gatekeeper to
> the network adapter and there is no way around it. It fully controls the
> timed schedule, however simple or complex it may be.
> 
> > > 3) Queueing modes assigned per channel. Again that might be anything from
> > >    'feed through' over FIFO, PRIO to more complex things like EDF.
> > >
> > >    The queueing mode can also influence properties like the meaning of the
> > >    TX time, i.e. strict or deadline.
> > 
> > 
> > Ok, but how are the queueing modes assigned / configured per channel?
> > 
> > Just to make sure we re-visit some ideas from the past:
> > 
> > * TAS:
> > 
> >    The idea we are currently exploring is to add a "time-aware", priority based
> >    qdisc, that also exposes the Tx queues available and provides a mechanism for
> >    mapping priority <-> traffic class <-> Tx queues in a similar fashion as
> >    mqprio. We are calling this qdisc 'taprio', and its 'tc' cmd line would be:
> > 
> >    $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4 \
> >          map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \
> >          queues 0 1 2 3 \
> >          sched-file gates.sched [base-time <interval>] \
> >          [cycle-time <interval>] [extension-time <interval>]
> > 
> >    <file> is multi-line, with each line being of the following format:
> >    <cmd> <gate mask> <interval in nanoseconds>
> > 
> >    Qbv only defines one <cmd>: "S" for 'SetGates'
> > 
> >    For example:
> > 
> >    S 0x01 300
> >    S 0x03 500
> > 
> >    This means that there are two intervals, the first will have the gate
> >    for traffic class 0 open for 300 nanoseconds, the second will have
> >    both traffic classes open for 500 nanoseconds.
> 
> To accommodate stuff like control systems you also need a baseline, which
> is not expressed as interval. Otherwise you can't schedule network wide
> explicit plans. That's either an absolute network-time (TAI) time stamp or
> an offset to a well defined network-time (TAI) time stamp, e.g. start of
> epoch or something else which is agreed on. The actual schedule then fast
> forwards past now (TAI) and sets up the slots from there. That makes node
> hotplug possible as well.

Ok, so this is perhaps a bit of a sidetrack, but based on other discussions 
in this patch-series, does it really make sense to discuss anything *but* 
TAI?

If you have a TSN-stream (or any other time-sensitive way of prioritizing 
frames based on time), then the network is going to be PTP synched anyway, 
and all the rest of the network is going to operate on PTP-time. Why even 
bother adding CLOCK_REALTIME and CLOCK_MONOTONIC to the discussion? Sure, 
use CLOCK_REALTIME locally and sync that to TAI, but the kernel should 
worry about ptp-time _for_that_adapter_, and we should make it pretty 
obvious to userspace that if you want to specify tx-time, then there's this 
thing called 'PTP' and it rules this domain. My $0.02 etc

> Btw, it's not only control systems. Think about complex multi source A/V
> streams. They are reality in recording and live mixing and looking at the
> timing constraints of such scenarios, collision avoidance is key there. So
> you want to be able to do network wide traffic orchestration.

Yep, and if you are too bursty, the network is free to drop your frames, which
is not desired.

> > It would handle multiple channels and expose their constraints / properties.
> > Each channel also becomes a traffic class, so other qdiscs can be attached to
> > them separately.
> 
> Right.

I don't think you need a separate qdisc for each channel. If you describe a
channel with

- period (what AVB calls observation interval)
- max data
- deadline

you should be able to keep a sorted rb-tree and handle that pretty 
efficiently. Or perhaps I'm completely missing the mark here. If so, my 
apologies.

> > So, in summary, because our entire design is based on qdisc interfaces, what we
> > had proposed was a root qdisc (the time slice manager, as you put) that allows
> > for other qdiscs to be attached to each channel. The inner qdiscs define the
> > queueing modes for each channel, and tbs is just one of those modes. I
> > understand now that you want to allow for fully dynamic use-cases to be
> > supported as well, which we hadn't covered with our TAS proposal before because
> > we hadn't envisioned it being used for these systems' design.
> 
> Yes, you have the root qdisc, which is in charge of the overall scheduling
> plan, how complex or not it is defined does not matter. It exposes traffic
> classes which have properties defined by the configuration.
> 
> The qdiscs which are attached to those traffic classes can be anything
> including:
> 
>  - Simple feed through (Applications are time constraints aware and set the
>    exact schedule). qdisc has admission control.
> 
>  - Deadline aware qdisc to handle e.g. A/V streams. Applications are aware
>    of time constraints and provide the packet deadline. qdisc has admission
>    control. This can be a simple first come, first served scheduler or
>    something like EDF which allows optimized utilization. The qdisc sets
>    the TX time depending on the deadline and feeds into the root.

As a small nitpick, it would make more sense to do a laxity approach here,
both for explicit mode and deadline mode. We know the size of the frame to
send, we know the outgoing rate, so keep a ready-queue sorted based on 
laxity

     laxity = absolute_deadline - (size / outgoing_rate)

Also, given that we use a *single* tx-queue for time-triggered 
transmission, this boils down to a uniprocessor equivalent and we have a 
lot of real-time scheduling academia to draw from.

This could then probably handle both of the above (Direct + deadline), but 
that's implementation specific, I guess.
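
As a sketch of that ordering rule (names are illustrative; 'rate_bps' is
the outgoing link rate in bits per second):

  /* Latest point at which transmission of a frame of 'len_bytes' may
   * start and still make its absolute deadline. A ready queue sorted
   * ascending on this value serves the most urgent frame first.
   */
  static s64 pkt_laxity_ns(s64 deadline_ns, u32 len_bytes, u64 rate_bps)
  {
          u64 wire_ns = div64_u64((u64)len_bytes * 8 * NSEC_PER_SEC,
                                  rate_bps);

          return deadline_ns - (s64)wire_ns;
  }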

>  - FIFO/PRIO/XXX for general traffic. Applications do not know anything
>    about timing constraints. These qdiscs obviously have neither admission
>    control nor do they set a TX time.  The root qdisc just pulls from there
>    when the assigned time slot is due or if it (optionally) decides to use
>    underutilized time slots from other classes.
> 
>  - .... Add your favourite scheduling mode(s).

Just give it sub-qdiscs and offload enqueue/dequeue to those I suppose.
Jesus Sanchez-Palencia April 9, 2018, 4:36 p.m. UTC | #15
Hi Thomas,


On 03/28/2018 12:48 AM, Thomas Gleixner wrote:

(...)

>
> There are two modes:
>
>       1) Send at the given TX time (Explicit mode)
>
>       2) Send before given TX time (Deadline mode)
>
> There is no need to specify 'drop if late' simply because if the message is
> handed in past the given TX time, it's too late by definition. What you are
> trying to implement is a hybrid of TSN and general purpose (not time aware)
> networking in one go. And you do that because your overall design is not
> looking at the big picture. You designed from a given use case assumption
> and tried to fit other things into it with duct tape.


Ok, I see the difference now, thanks. I have just two more questions about the
deadline mode; please see below.

(...)


>
>>> Coming back to the overall scheme. If you start upfront with a time slice
>>> manager which is designed to:
>>>
>>>   - Handle multiple channels
>>>
>>>   - Expose the time constraints, properties per channel
>>>
>>> then you can fit all kinds of use cases, whether designed by committee or
>>> not. You can configure that thing per node or network wide. It does not
>>> make a difference. The only difference are the resulting constraints.
>>
>>
>> Ok, and I believe the above was covered by what we had proposed before, unless
>> what you meant by time constraints is beyond the configured port schedule.
>>
>> Are you suggesting that we'll need to have a kernel entity that is not only
>> aware of the current traffic classes 'schedule', but also of the resources that
>> are still available for new streams to be accommodated into the classes? Putting
>> it differently, is the TAS you envision just an entity that runs a schedule, or
>> is it a time-aware 'orchestrator'?
>
> In the first place it's something which runs a defined schedule.
>
> The accommodation for new streams is required, but not necessarily at the
> root qdisc level. That might be a qdisc feeding into it.
>
> Assume you have a bandwidth reservation, aka time slot, for audio. If your
> audio related qdisc does deadline scheduling then you can add new streams
> to it up to the point where it's no longer able to fit.
>
> The only thing which might be needed at the root qdisc is the ability to
> utilize unused time slots for other purposes, but that's not required to be
> there in the first place as long as it's designed in a way that it can be
> added later on.


Ok, agreed.


>
>>> So lets look once more at the picture in an abstract way:
>>>
>>>      	       [ NIC ]
>>> 	          |
>>> 	 [ Time slice manager ]
>>> 	    |           |
>>>          [ Ch 0 ] ... [ Ch N ]
>>>
>>> So you have a bunch of properties here:
>>>
>>> 1) Number of Channels ranging from 1 to N
>>>
>>> 2) Start point, slice period and slice length per channel
>>
>> Ok, so we agree that a TAS entity is needed. Assuming that channels are traffic
>> classes, do you have something else in mind other than a new root qdisc?
>
> Whatever you call it, the important point is that it is the gatekeeper to
> the network adapter and there is no way around it. It fully controls the
> timed schedule, however simple or complex it may be.


Ok, and I've finally understood the nuance between the above and what we had
planned initially.


(...)


>>
>> * TAS:
>>
>>    The idea we are currently exploring is to add a "time-aware", priority based
>>    qdisc, that also exposes the Tx queues available and provides a mechanism for
>>    mapping priority <-> traffic class <-> Tx queues in a similar fashion as
>>    mqprio. We are calling this qdisc 'taprio', and its 'tc' cmd line would be:
>>
>>    $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4 \
>>          map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \
>>          queues 0 1 2 3 \
>>          sched-file gates.sched [base-time <interval>] \
>>          [cycle-time <interval>] [extension-time <interval>]
>>
>>    <file> is multi-line, with each line being of the following format:
>>    <cmd> <gate mask> <interval in nanoseconds>
>>
>>    Qbv only defines one <cmd>: "S" for 'SetGates'
>>
>>    For example:
>>
>>    S 0x01 300
>>    S 0x03 500
>>
>>    This means that there are two intervals, the first will have the gate
>>    for traffic class 0 open for 300 nanoseconds, the second will have
>>    both traffic classes open for 500 nanoseconds.
>
> To accommodate stuff like control systems you also need a baseline, which
> is not expressed as interval. Otherwise you can't schedule network wide
> explicit plans. That's either an absolute network-time (TAI) time stamp or
> an offset to a well defined network-time (TAI) time stamp, e.g. start of
> epoch or something else which is agreed on. The actual schedule then fast
> forwards past now (TAI) and sets up the slots from there. That makes node
> hotplug possible as well.


Sure, and the [base-time <interval>] on the command line above was actually
wrong. It should have been expressed as [base-time <timestamp>].



>> It would handle multiple channels and expose their constraints / properties.
>> Each channel also becomes a traffic class, so other qdiscs can be attached to
>> them separately.
>
> Right.
>
>> So, in summary, because our entire design is based on qdisc interfaces, what we
>> had proposed was a root qdisc (the time slice manager, as you put it) that allows
>> for other qdiscs to be attached to each channel. The inner qdiscs define the
>> queueing modes for each channel, and tbs is just one of those modes. I
>> understand now that you want to allow for fully dynamic use-cases to be
>> supported as well, which we hadn't covered with our TAS proposal before because
>> we hadn't envisioned it being used for these systems' design.
>
> Yes, you have the root qdisc, which is in charge of the overall scheduling
> plan, how complex or not it is defined does not matter. It exposes traffic
> classes which have properties defined by the configuration.


Perfect. Let's see if we can agree on an overall plan, then. Hopefully I'm not
missing anything.

For the above we'll develop a new qdisc, designed along the 'taprio' ideas, thus
a Qbv style scheduler, to be used as root qdisc. It can run the schedule inside
the kernel or just offload it to the NIC if supported. Similarly to the other
multiqueue qdiscs, it will expose the HW Tx queues.

What is new here from the ideas we shared last year is that this new root qdisc
will be responsible for calling the attached qdiscs' dequeue functions during
their timeslices, making it the only entity capable of enqueueing packets into
the NIC.

This is the "global scheduler", but we still need the txtime aware qdisc. For
that, we'll modify tbs to accommodate the feedback from this thread. More below.


>
> The qdiscs which are attached to those traffic classes can be anything
> including:
>
>  - Simple feed through (Applications are time constraints aware and set the
>    exact schedule). qdisc has admission control.


This will be provided by the tbs qdisc. It will still provide a txtime sorted
list and hw offload, but now there will be a per-socket option that tells the
qdisc if the per-packet timestamp is the txtime (i.e. explicit mode, as you've
called it) or a deadline. The drop_if_late flag will be removed.

When in explicit mode, packets from that socket are dequeued from the qdisc
during its time slice if [(txtime - delta) < now].


>
>  - Deadline aware qdisc to handle e.g. A/V streams. Applications are aware
>    of time constraints and provide the packet deadline. qdisc has admission
>    control. This can be a simple first come, first served scheduler or
>    something like EDF which allows optimized utilization. The qdisc sets
>    the TX time depending on the deadline and feeds into the root.


This will be provided by tbs if the socket which is transmitting packets is
configured for deadline mode.

For the deadline -> txtime conversion, what I have in mind is: when dequeue is
called tbs will just change the skbuff's timestamp from the deadline to 'now'
(i.e. as soon as possible) and dequeue the packet. Would that be enough or
should we use the delta parameter of the qdisc in this case and make [txtime =
now + delta]? The only benefit of doing so would be to provide a configurable
'fudge' factor.

Another question for this mode (but perhaps that applies to both modes) is, what
if the qdisc misses the deadline for *any* reason? I'm assuming it should drop
the packet during dequeue.


Putting it all together, we end up with:

1) a new txtime aware qdisc, tbs, to be used per queue. Its cli will look like:
$ tc qdisc add (...) tbs clockid CLOCK_REALTIME delta 150000 offload sorting

2) a new cmsg-interface for setting a per-packet timestamp that will be used
either as a txtime or as a deadline by tbs (and further by the NIC driver for the
offload case): SCM_TXTIME (see the usage sketch after this list).

3) a new socket option: SO_TXTIME. It will be used to enable the feature for a
socket, and will have as parameters a clockid and a txtime mode (deadline or
explicit), that defines the semantics of the timestamp set on packets using
SCM_TXTIME.

4) a new #define DYNAMIC_CLOCKID 15 added to include/uapi/linux/time.h .

5) a new schedule-aware qdisc, 'tas' or 'taprio', to be used per port. Its cli
will look like what was proposed for taprio (base time being an absolute timestamp).
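
As a rough userspace sketch of how items 2 and 3 might be used together
(the constant values and the plain-u64-of-nanoseconds timestamp layout
are assumptions at this point, not settled ABI):

  #include <stdint.h>
  #include <string.h>
  #include <sys/socket.h>

  #ifndef SO_TXTIME
  #define SO_TXTIME  61            /* assumed value */
  #define SCM_TXTIME SO_TXTIME
  #endif

  /* Queue 'buf' for transmission at the absolute time 'txtime_ns' on a
   * socket where SO_TXTIME has already been enabled via setsockopt().
   */
  static ssize_t send_at(int fd, const void *buf, size_t len,
                         uint64_t txtime_ns)
  {
          union {
                  char buf[CMSG_SPACE(sizeof(uint64_t))];
                  struct cmsghdr align;
          } control = {};
          struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
          struct msghdr msg = {
                  .msg_iov = &iov, .msg_iovlen = 1,
                  .msg_control = control.buf,
                  .msg_controllen = sizeof(control.buf),
          };
          struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

          cm->cmsg_level = SOL_SOCKET;
          cm->cmsg_type = SCM_TXTIME;
          cm->cmsg_len = CMSG_LEN(sizeof(txtime_ns));
          memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

          return sendmsg(fd, &msg, 0);
  }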



If we all agree with the above, we will start by closing on 1-4 asap and will
focus on 5 next.

How does that sound?

Thanks,
Jesus



>
>  - FIFO/PRIO/XXX for general traffic. Applications do not know anything
>    about timing constraints. These qdiscs obviously have neither admission
>    control nor do they set a TX time.  The root qdisc just pulls from there
>    when the assigned time slot is due or if it (optionally) decides to use
>    underutilized time slots from other classes.
>
>  - .... Add your favourite scheduling mode(s).
>
> Thanks,
>
> 	tglx
>
Thomas Gleixner April 10, 2018, 12:37 p.m. UTC | #16
Jesus,

On Mon, 9 Apr 2018, Jesus Sanchez-Palencia wrote:
> On 03/28/2018 12:48 AM, Thomas Gleixner wrote:
> > Yes, you have the root qdisc, which is in charge of the overall scheduling
> > plan, how complex or not it is defined does not matter. It exposes traffic
> > classes which have properties defined by the configuration.
> 
> Perfect. Let's see if we can agree on an overall plan, then. Hopefully I'm not
> missing anything.
> 
> For the above we'll develop a new qdisc, designed along the 'taprio' ideas, thus
> a Qbv style scheduler, to be used as root qdisc. It can run the schedule inside
> the kernel or just offload it to the NIC if supported. Similarly to the other
> multiqueue qdiscs, it will expose the HW Tx queues.
> 
> What is new here from the ideas we shared last year is that this new root qdisc
> will be responsible for calling the attached qdiscs' dequeue functions during
> their timeslices, making it the only entity capable of enqueueing packets into
> the NIC.

Correct. Aside from that, it's the entity which is in charge of the overall
scheduling.

> This is the "global scheduler", but we still need the txtime aware
> qdisc. For that, we'll modify tbs to accommodate the feedback from this
> thread. More below.

> > The qdiscs which are attached to those traffic classes can be anything
> > including:
> >
> >  - Simple feed through (Applications are time constraints aware and set the
> >    exact schedule). qdisc has admission control.
> 
> This will be provided by the tbs qdisc. It will still provide a txtime sorted
> list and hw offload, but now there will be a per-socket option that tells the
> qdisc if the per-packet timestamp is the txtime (i.e. explicit mode, as you've
> called it) or a deadline. The drop_if_late flag will be removed.
> 
> When in explicit mode, packets from that socket are dequeued from the qdisc
> during its time slice if [(txtime - delta) < now].
> 
> >
> >  - Deadline aware qdisc to handle e.g. A/V streams. Applications are aware
> >    of time constraints and provide the packet deadline. qdisc has admission
> >    control. This can be a simple first come, first served scheduler or
> >    something like EDF which allows optimized utilization. The qdisc sets
> >    the TX time depending on the deadline and feeds into the root.
> 
> This will be provided by tbs if the socket which is transmitting packets is
> configured for deadline mode.

You don't want the socket to decide that. The qdisc into which a socket
feeds defines the mode and the qdisc rejects requests with the wrong mode.

Making a qdisc do both and letting the user decide what he wants it to be is
not really going to fly. Especially if you have different users which want
a different mode. It's clearly distinct functionality.

Please stop trying to develop Swiss army knives with integrated coffee
machines.

> For the deadline -> txtime conversion, what I have in mind is: when dequeue is
> called tbs will just change the skbuff's timestamp from the deadline to 'now'
> (i.e. as soon as possible) and dequeue the packet. Would that be enough or
> should we use the delta parameter of the qdisc in this case and make [txtime =
> now + delta]? The only benefit of doing so would be to provide a configurable
> 'fudge' factor.

Well, that really depends on how your deadline scheduler works.

> Another question for this mode (but perhaps that applies to both modes) is, what
> if the qdisc misses the deadline for *any* reason? I'm assuming it should drop
> the packet during dequeue.

There the question is how user space is notified about that issue. The
application which queued the packet on time does rightfully assume that
it's going to be on the wire on time.

This is a violation of the overall scheduling plan, so you need to have
a sane design to handle that.

> Putting it all together, we end up with:
> 
> 1) a new txtime aware qdisc, tbs, to be used per queue. Its cli will look like:
> $ tc qdisc add (...) tbs clockid CLOCK_REALTIME delta 150000 offload sorting

Why CLOCK_REALTIME? The only interesting time in a TSN network is
CLOCK_TAI, really.

> 2) a new cmsg-interface for setting a per-packet timestamp that will be used
> either as a txtime or as a deadline by tbs (and further by the NIC driver for the
> offload case): SCM_TXTIME.
> 
> 3) a new socket option: SO_TXTIME. It will be used to enable the feature for a
> socket, and will have as parameters a clockid and a txtime mode (deadline or
> explicit), that defines the semantics of the timestamp set on packets using
> SCM_TXTIME.
> 
> 4) a new #define DYNAMIC_CLOCKID 15 added to include/uapi/linux/time.h .

Can you remind me why we would need that?

> 5) a new schedule-aware qdisc, 'tas' or 'taprio', to be used per port. Its cli
> will look like what was proposed for taprio (base time being an absolute timestamp).
> 
> If we all agree with the above, we will start by closing on 1-4 asap and will
> focus on 5 next.
> 
> How does that sound?

Backwards to be honest.

You should start with the NIC facing qdisc because that's the key part of
all this and the design might have implications on how the qdiscs which
feed into it need to be designed.

Thanks,

	tglx
Jesus Sanchez-Palencia April 10, 2018, 9:24 p.m. UTC | #17
Hi Thomas,


On 04/10/2018 05:37 AM, Thomas Gleixner wrote:

(...)


>>>
>>>  - Simple feed through (Applications are time constraints aware and set the
>>>    exact schedule). qdisc has admission control.
>>
>> This will be provided by the tbs qdisc. It will still provide a txtime sorted
>> list and hw offload, but now there will be a per-socket option that tells the
>> qdisc if the per-packet timestamp is the txtime (i.e. explicit mode, as you've
>> called it) or a deadline. The drop_if_late flag will be removed.
>>
>> When in explicit mode, packets from that socket are dequeued from the qdisc
>> during its time slice if [(txtime - delta) < now].
>>
>>>
>>>  - Deadline aware qdisc to handle e.g. A/V streams. Applications are aware
>>>    of time constraints and provide the packet deadline. qdisc has admission
>>>    control. This can be a simple first come, first served scheduler or
>>>    something like EDF which allows optimized utilization. The qdisc sets
>>>    the TX time depending on the deadline and feeds into the root.
>>
>> This will be provided by tbs if the socket which is transmitting packets is
>> configured for deadline mode.
> 
> You don't want the socket to decide that. The qdisc into which a socket
> feeds defines the mode and the qdisc rejects requests with the wrong mode.
> 
> Making a qdisc do both and letting the user decide what he wants it to be is
> not really going to fly. Especially if you have different users which want
> a different mode. It's clearly distinct functionality.


Ok, so just to make sure I got this right, are you suggesting that both the
'tbs' qdisc *and* the socket (i.e. through SO_TXTIME) should have a config
parameter for specifying the txtime mode? This way if there is a mismatch,
packets from that socket are rejected by the qdisc.



(...)


> 
>> Another question for this mode (but perhaps that applies to both modes) is, what
>> if the qdisc misses the deadline for *any* reason? I'm assuming it should drop
>> the packet during dequeue.
> 
> There the question is how user space is notified about that issue. The
> application which queued the packet on time does rightfully assume that
> it's going to be on the wire on time.
> 
> This is a violation of the overall scheduling plan, so you need to have
> a sane design to handle that.


In addition to the qdisc stats, we could look into using the socket's error
queue to notify the application about that.
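
As a sketch of the application side (the exact ee_origin/ee_code values
for a missed txtime would still have to be defined, so the inspection is
left as a comment):

  #include <linux/errqueue.h>
  #include <string.h>
  #include <sys/socket.h>

  /* Sketch only: drain one notification from the socket error queue.
   * Returns 1 if an extended error was read, 0 otherwise.
   */
  static int drain_errqueue(int fd)
  {
          char data[256];
          union {
                  char buf[512];
                  struct cmsghdr align;
          } control;
          struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
          struct msghdr msg = {
                  .msg_iov = &iov, .msg_iovlen = 1,
                  .msg_control = control.buf,
                  .msg_controllen = sizeof(control.buf),
          };
          struct cmsghdr *cm;
          struct sock_extended_err err;

          if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
                  return 0;

          for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                  memcpy(&err, CMSG_DATA(cm), sizeof(err));
                  /* err.ee_origin / err.ee_code identify the drop */
                  return 1;
          }
          return 0;
  }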


> 
>> Putting it all together, we end up with:
>>
>> 1) a new txtime aware qdisc, tbs, to be used per queue. Its cli will look like:
>> $ tc qdisc add (...) tbs clockid CLOCK_REALTIME delta 150000 offload sorting
> 
> Why CLOCK_REALTIME? The only interesting time in a TSN network is
> CLOCK_TAI, really.


REALTIME was just an example here to show that the qdisc has to be configured
with a clockid parameter. Are you suggesting that instead both of the new qdiscs
(i.e. tbs and taprio) should always be using CLOCK_TAI implicitly?


> 
>> 2) a new cmsg-interface for setting a per-packet timestamp that will be used
>> either as a txtime or as a deadline by tbs (and further by the NIC driver for the
>> offload case): SCM_TXTIME.
>>
>> 3) a new socket option: SO_TXTIME. It will be used to enable the feature for a
>> socket, and will have as parameters a clockid and a txtime mode (deadline or
>> explicit), that defines the semantics of the timestamp set on packets using
>> SCM_TXTIME.
>>
>> 4) a new #define DYNAMIC_CLOCKID 15 added to include/uapi/linux/time.h .
> 
> Can you remind me why we would need that?


So there is a "clockid" that can be used for the full hw offload modes. In this
case, the txtimes are in reference to the NIC's PTP clock, and, as discussed, we
can't just use a clockid that was computed from the fd pointing to /dev/ptpX.


> 
>> 5) a new schedule-aware qdisc, 'tas' or 'taprio', to be used per port. Its cli
>> will look like what was proposed for taprio (base time being an absolute timestamp).
>>
>> If we all agree with the above, we will start by closing on 1-4 asap and will
>> focus on 5 next.
>>
>> How does that sound?
> 
> Backwards to be honest.
> 
> You should start with the NIC facing qdisc because that's the key part of
> all this and the design might have implications on how the qdiscs which
> feed into it need to be designed.


Ok, let's just try to close on the above first.


Thanks,
Jesus


> 
> Thanks,
> 
> 	tglx
>
Thomas Gleixner April 11, 2018, 8:16 p.m. UTC | #18
On Tue, 10 Apr 2018, Jesus Sanchez-Palencia wrote:
> >> This will be provided by tbs if the socket which is transmitting packets is
> >> configured for deadline mode.
> > 
> > You don't want the socket to decide that. The qdisc into which a socket
> > feeds defines the mode and the qdisc rejects requests with the wrong mode.
> > 
> > Making a qdisc do both and letting the user decide what he wants it to be is
> > not really going to fly. Especially if you have different users which want
> > a different mode. It's clearly distinct functionality.
> 
> 
> Ok, so just to make sure I got this right, are you suggesting that both the
> 'tbs' qdisc *and* the socket (i.e. through SO_TXTIME) should have a config
> parameter for specifying the txtime mode? This way if there is a mismatch,
> packets from that socket are rejected by the qdisc.

Correct. The same is true if you try to set SO_TXTIME for something which
is just routing regular traffic.

> (...)
> > 
> >> Another question for this mode (but perhaps that applies to both modes) is, what
> >> if the qdisc misses the deadline for *any* reason? I'm assuming it should drop
> >> the packet during dequeue.
> > 
> > There the question is how user space is notified about that issue. The
> > application which queued the packet on time does rightfully assume that
> > it's going to be on the wire on time.
> > 
> > This is a violation of the overall scheduling plan, so you need to have
> > a sane design to handle that.
> 
> In addition to the qdisc stats, we could look into using the socket's error
> queue to notify the application about that.

Makes sense.
 
> >> Putting it all together, we end up with:
> >>
> >> 1) a new txtime aware qdisc, tbs, to be used per queue. Its cli will look like:
> >> $ tc qdisc add (...) tbs clockid CLOCK_REALTIME delta 150000 offload sorting
> > 
> > Why CLOCK_REALTIME? The only interesting time in a TSN network is
> > CLOCK_TAI, really.
> 
> REALTIME was just an example here to show that the qdisc has to be configured
> with a clockid parameter. Are you suggesting that instead both of the new qdiscs
> (i.e. tbs and taprio) should always be using CLOCK_TAI implicitly?

I think so. It's _the_ network time on which everything is based.

> >> 2) a new cmsg-interface for setting a per-packet timestamp that will be used
> >> either as a txtime or as deadline by tbs (and further the NIC driver for the
> >> offload case): SCM_TXTIME.
> >>
> >> 3) a new socket option: SO_TXTIME. It will be used to enable the feature for a
> >> socket, and will have as parameters a clockid and a txtime mode (deadline or
> >> explicit), that defines the semantics of the timestamp set on packets using
> >> SCM_TXTIME.
> >>
> >> 4) a new #define DYNAMIC_CLOCKID 15 added to include/uapi/linux/time.h .
> > 
> > Can you remind me why we would need that?
> 
> So there is a "clockid" that can be used for the full hw offload modes. In this
> case, the txtimes are in reference to the NIC's PTP clock, and, as discussed, we
> can't just use a clockid that was computed from the fd pointing to /dev/ptpX.

And the NIC's PTP clock is CLOCK_TAI, so there should be no reason to have
yet another clock, right?

Thanks,

	tglx
Ivan Briano April 11, 2018, 8:31 p.m. UTC | #19
On 04/11/2018 01:16 PM, Thomas Gleixner wrote:
> On Tue, 10 Apr 2018, Jesus Sanchez-Palencia wrote:
>>>> This will be provided by tbs if the socket which is transmitting packets is
>>>> configured for deadline mode.
>>>
>>> You don't want the socket to decide that. The qdisc into which a socket
>>> feeds defines the mode and the qdisc rejects requests with the wrong mode.
>>>
>>> Making a qdisc do both and letting the user decide what he wants it to be is
>>> not really going to fly. Especially if you have different users who want
>>> a different mode. It's clearly distinct functionality.
>>
>>
>> Ok, so just to make sure I got this right, are you suggesting that both the
>> 'tbs' qdisc *and* the socket (i.e. through SO_TXTIME) should have a config
>> parameter for specifying the txtime mode? This way if there is a mismatch,
>> packets from that socket are rejected by the qdisc.
> 
> Correct. The same is true if you try to set SO_TXTIME for something which
> is just routing regular traffic.
> 
>> (...)
>>>
>>>> Another question for this mode (but perhaps that applies to both modes) is, what
>>>> if the qdisc misses the deadline for *any* reason? I'm assuming it should drop
>>>> the packet during dequeue.
>>>
>>> There the question is how user space is notified about that issue. The
>>> application which queued the packet on time does rightfully assume that
>>> it's going to be on the wire on time.
>>>
>>> This is a violation of the overall scheduling plan, so you need to have
>>> a sane design to handle that.
>>
>> In addition to the qdisc stats, we could look into using the socket's error
>> queue to notify the application about that.
> 
> Makes sense.
>  
>>>> Putting it all together, we end up with:
>>>>
>>>> 1) a new txtime aware qdisc, tbs, to be used per queue. Its cli will look like:
>>>> $ tc qdisc add (...) tbs clockid CLOCK_REALTIME delta 150000 offload sorting
>>>
>>> Why CLOCK_REALTIME? The only interesting time in a TSN network is
>>> CLOCK_TAI, really.
>>
>> REALTIME was just an example here to show that the qdisc has to be configured
>> with a clockid parameter. Are you suggesting that instead both of the new qdiscs
>> (i.e. tbs and taprio) should always be using CLOCK_TAI implicitly?
> 
> I think so. It's _the_ network time on which everything is based.
> 
>>>> 2) a new cmsg-interface for setting a per-packet timestamp that will be used
>>>> either as a txtime or as deadline by tbs (and further the NIC driver for the
>>>> offload case): SCM_TXTIME.
>>>>
>>>> 3) a new socket option: SO_TXTIME. It will be used to enable the feature for a
>>>> socket, and will have as parameters a clockid and a txtime mode (deadline or
>>>> explicit), that defines the semantics of the timestamp set on packets using
>>>> SCM_TXTIME.
>>>>
>>>> 4) a new #define DYNAMIC_CLOCKID 15 added to include/uapi/linux/time.h .
>>>
>>> Can you remind me why we would need that?
>>
>> So there is a "clockid" that can be used for the full hw offload modes. In this
>> case, the txtimes are in reference to the NIC's PTP clock, and, as discussed, we
>> can't just use a clockid that was computed from the fd pointing to /dev/ptpX.
> 
> And the NIC's PTP clock is CLOCK_TAI, so there should be no reason to have
> yet another clock, right?
> 

Most likely, though you can technically have a different time domain
that is not based on TAI.

> Thanks,
> 
> 	tglx
>
Jesus Sanchez-Palencia April 11, 2018, 11:38 p.m. UTC | #20
Hi,

On 04/11/2018 01:16 PM, Thomas Gleixner wrote:
>>>> Putting it all together, we end up with:
>>>>
>>>> 1) a new txtime aware qdisc, tbs, to be used per queue. Its cli will look like:
>>>> $ tc qdisc add (...) tbs clockid CLOCK_REALTIME delta 150000 offload sorting
>>>
>>> Why CLOCK_REALTIME? The only interesting time in a TSN network is
>>> CLOCK_TAI, really.
>>
>> REALTIME was just an example here to show that the qdisc has to be configured
>> with a clockid parameter. Are you suggesting that instead both of the new qdiscs
>> (i.e. tbs and taprio) should always be using CLOCK_TAI implicitly?
> 
> I think so. It's _the_ network time on which everything is based.

Yes, but more on this below.


> 
>>>> 2) a new cmsg-interface for setting a per-packet timestamp that will be used
>>>> either as a txtime or as deadline by tbs (and further the NIC driver for the
>>>> offload case): SCM_TXTIME.
>>>>
>>>> 3) a new socket option: SO_TXTIME. It will be used to enable the feature for a
>>>> socket, and will have as parameters a clockid and a txtime mode (deadline or
>>>> explicit), that defines the semantics of the timestamp set on packets using
>>>> SCM_TXTIME.
>>>>
>>>> 4) a new #define DYNAMIC_CLOCKID 15 added to include/uapi/linux/time.h .
>>>
>>> Can you remind me why we would need that?
>>
>> So there is a "clockid" that can be used for the full hw offload modes. In this
>> case, the txtimes are in reference to the NIC's PTP clock, and, as discussed, we
>> can't just use a clockid that was computed from the fd pointing to /dev/ptpX.
> 
> And the NIC's PTP clock is CLOCK_TAI, so there should be no reason to have
> yet another clock, right?

Just breaking this down a bit, yes, TAI is the network time base, and the NIC's
PTP clock uses that because PTP is (commonly) based on TAI. After the PHCs have
been synchronized over the network (e.g. with ptp4l), my understanding is that
if applications want to use the clockid_t CLOCK_TAI as a network clock reference,
it's required that something (e.g. phc2sys) is synchronizing the PHCs and the
system clock, and also that something calls adjtimex to apply the TAI vs UTC
offset to CLOCK_TAI.

If we are fine with those 'dependencies', then I agree there is no need for
another clock.

I was thinking about the full offload use-cases, i.e. when no scheduling is
happening inside the qdiscs. Applications could just read the time from the PHC
clocks directly without having to rely on any of the above. In this case,
userspace would use DYNAMIC_CLOCKID just to flag that this is the case, but I must
admit it's not clear to me how common of a use-case that is, or even if it makes
sense.
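
For completeness, reading a PHC directly from userspace already works today
through the dynamic posix clock interface; a minimal sketch using the
FD_TO_CLOCKID construction from the kernel's PTP documentation (the device
path is just an example):

#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define CLOCKFD 3
#define FD_TO_CLOCKID(fd) ((~(clockid_t)(fd) << 3) | CLOCKFD)

int main(void)
{
	int fd = open("/dev/ptp0", O_RDONLY);
	struct timespec ts;

	if (fd < 0 || clock_gettime(FD_TO_CLOCKID(fd), &ts))
		return 1;

	printf("PHC time: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	close(fd);
	return 0;
}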


Thanks,
Jesus


> 
> Thanks,
> 
> 	tglx
>
Richard Cochran April 12, 2018, 3:03 p.m. UTC | #21
On Wed, Apr 11, 2018 at 04:38:44PM -0700, Jesus Sanchez-Palencia wrote:
> Just breaking this down a bit, yes, TAI is the network time base, and the NIC's
> PTP clock uses that because PTP is (commonly) based on TAI. After the PHCs have
> been synchronized over the network (e.g. with ptp4l), my understanding is that
> if applications want to use the clockid_t CLOCK_TAI as a network clock reference,
> it's required that something (e.g. phc2sys) is synchronizing the PHCs and the
> system clock, and also that something calls adjtimex to apply the TAI vs UTC
> offset to CLOCK_TAI.

Yes.  I haven't seen any distro that sets the TAI-UTC offset after
boot, nor are there any user space tools for this.  The kernel is
ready, though.
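
For reference, the kernel interface is reachable via adjtimex(); a minimal
sketch that installs the TAI-UTC offset (37 s at the time of writing; the
value itself has to come from an external source such as the leap-seconds
list or PTP):

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = ADJ_TAI, .constant = 37 };

	if (adjtimex(&tx) == -1) {
		perror("adjtimex");
		return 1;
	}
	printf("kernel TAI offset: %d s\n", tx.tai);
	return 0;
}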

> I was thinking about the full offload use-cases, i.e. when no scheduling is
> happening inside the qdiscs. Applications could just read the time from the PHC
> clocks directly without having to rely on any of the above. In this case,
> userspace would use DYNAMIC_CLOCKID just to flag that this is the case, but I must
> admit it's not clear to me how common of a use-case that is, or even if it makes
> sense.

1588 allows only two timescales, TAI and ARB-itrary.  Although it
doesn't make too much sense to use ARB, still people will do strange
things.  Probably some people use UTC.  I am not advocating supporting
alternate timescales, just pointing out the possibility.

Thanks,
Richard
Miroslav Lichvar April 12, 2018, 3:19 p.m. UTC | #22
On Thu, Apr 12, 2018 at 08:03:49AM -0700, Richard Cochran wrote:
> On Wed, Apr 11, 2018 at 04:38:44PM -0700, Jesus Sanchez-Palencia wrote:
> > Just breaking this down a bit, yes, TAI is the network time base, and the NIC's
> > PTP clock uses that because PTP is (commonly) based on TAI. After the PHCs have
> > been synchronized over the network (e.g. with ptp4l), my understanding is that
> > if applications want to use the clockid_t CLOCK_TAI as a network clock reference,
> > it's required that something (e.g. phc2sys) is synchronizing the PHCs and the
> > system clock, and also that something calls adjtimex to apply the TAI vs UTC
> > offset to CLOCK_TAI.
> 
> Yes.  I haven't seen any distro that sets the TAI-UTC offset after
> boot, nor are there any user space tools for this.  The kernel is
> ready, though.

FWIW, the default NTP configuration in Fedora sets the kernel TAI-UTC
offset.

> > I was thinking about the full offload use-cases, i.e. when no scheduling is
> > happening inside the qdiscs. Applications could just read the time from the PHC
> > clocks directly without having to rely on any of the above. In this case,
> > userspace would use DYNAMIC_CLOCKID just to flag that this is the case, but I must
> > admit it's not clear to me how common of a use-case that is, or even if it makes
> > sense.
> 
> 1588 allows only two timescales, TAI and ARB-itrary.  Although it
> doesn't make too much sense to use ARB, still people will do strange
> things.  Probably some people use UTC.  I am not advocating supporting
> alternate timescales, just pointing out the possibility.

There is also the possibility that the NIC clock is not synchronized
to anything. For synchronization of the system clock it's easier to
leave it free running and only track its phase/frequency offset to
allow conversion between the PHC and system time.
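
As an illustration of that last point, such a conversion boils down to
applying the phase/frequency relation measured at the last sync sample; the
struct and field names below are purely illustrative, and tools like phc2sys
refresh these estimates continuously:

/* Illustrative only: map a free-running PHC timestamp to system time. */
struct phc_sys_rel {
	long long sync_phc_ns;	/* PHC time at the last measurement */
	long long sync_sys_ns;	/* system time at the last measurement */
	double freq_ratio;	/* d(sys)/d(phc), close to 1.0 */
};

static long long phc_to_sys_ns(const struct phc_sys_rel *rel,
			       long long phc_ns)
{
	return rel->sync_sys_ns +
	       (long long)((phc_ns - rel->sync_phc_ns) * rel->freq_ratio);
}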
Thomas Gleixner April 19, 2018, 10:03 a.m. UTC | #23
On Wed, 11 Apr 2018, Jesus Sanchez-Palencia wrote:
> On 04/11/2018 01:16 PM, Thomas Gleixner wrote:
> >> So there is a "clockid" that can be used for the full hw offload modes. In this
> >> case, the txtimes are in reference to the NIC's PTP clock, and, as discussed, we
> >> can't just use a clockid that was computed from the fd pointing to /dev/ptpX.
> > 
> > And the NIC's PTP clock is CLOCK_TAI, so there should be no reason to have
> > yet another clock, right?
> 
> Just breaking this down a bit, yes, TAI is the network time base, and the NIC's
> PTP clock uses that because PTP is (commonly) based on TAI. After the PHCs have
> been synchronized over the network (e.g. with ptp4l), my understanding is that
> if applications want to use the clockid_t CLOCK_TAI as a network clock reference,
> it's required that something (e.g. phc2sys) is synchronizing the PHCs and the
> system clock, and also that something calls adjtimex to apply the TAI vs UTC
> offset to CLOCK_TAI.
> 
> If we are fine with those 'dependencies', then I agree there is no need for
> another clock.
> 
> I was thinking about the full offload use-cases, i.e. when no scheduling is
> happening inside the qdiscs. Applications could just read the time from the PHC
> clocks directly without having to rely on any of the above. In this case,
> userspace would use DYNAMIC_CLOCKID just to flag that this is the case, but I must
> admit it's not clear to me how common of a use-case that is, or even if it makes
> sense.

I don't think it makes a lot of sense because the only use case for that is
a full user space scheduler which routes _ALL_ traffic. I don't think
that's something which we want to proliferate.

So I'd rather start off with the CLOCK_TAI assumption and if the need
really arises we can discuss that separately. So you can take a clockid
into account when designing the ABI, but have it CLOCK_TAI only for the
start.
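
In ABI terms that could be as simple as carrying a clockid in the SO_TXTIME
optval while the kernel accepts only CLOCK_TAI for now; a sketch with purely
illustrative field names (this layout is not part of the series):

#include <linux/types.h>

/* Illustrative only: a possible SO_TXTIME optval layout. */
struct sock_txtime {
	__kernel_clockid_t clockid;	/* CLOCK_TAI only, initially */
	__u32 flags;			/* txtime mode: deadline vs. explicit */
};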

Thanks,

	tglx
Jesus Sanchez-Palencia April 23, 2018, 6:21 p.m. UTC | #24
Hi Thomas,


On 03/21/2018 06:46 AM, Thomas Gleixner wrote:
> On Tue, 6 Mar 2018, Jesus Sanchez-Palencia wrote:
>> +struct tbs_sched_data {
>> +	bool sorting;
>> +	int clockid;
>> +	int queue;
>> +	s32 delta; /* in ns */
>> +	ktime_t last; /* The txtime of the last skb sent to the netdevice. */
>> +	struct rb_root head;
> 
> Hmm. You are reimplementing timerqueue open coded. Have you checked whether
> you could reuse the timerqueue implementation?
> 
> That requires to add a timerqueue node to struct skbuff
> 
> @@ -671,7 +671,8 @@ struct sk_buff {
>  				unsigned long		dev_scratch;
>  			};
>  		};
> -		struct rb_node	rbnode; /* used in netem & tcp stack */
> +		struct rb_node		rbnode; /* used in netem & tcp stack */
> +		struct timerqueue_node	tqnode;
>  	};
>  	struct sock		*sk;
> 
> Then you can use timerqueue_head in your scheduler data and all the open
> coded rbtree handling goes away.


I just noticed that doing the above increases the size of struct sk_buff by 8
bytes: struct timerqueue_node is 32 bytes long, while struct rb_node is only
24 bytes long.

Given the feedback we got here before against touching struct sk_buff at all for
non-generic use cases, I will keep the implementation of sch_tbs.c as is, thus
keeping the open-coded version for now, ok?

Thanks,
Jesus


(...)
Thomas Gleixner April 24, 2018, 8:50 a.m. UTC | #25
On Mon, 23 Apr 2018, Jesus Sanchez-Palencia wrote:
> On 03/21/2018 06:46 AM, Thomas Gleixner wrote:
> > On Tue, 6 Mar 2018, Jesus Sanchez-Palencia wrote:
> >> +struct tbs_sched_data {
> >> +	bool sorting;
> >> +	int clockid;
> >> +	int queue;
> >> +	s32 delta; /* in ns */
> >> +	ktime_t last; /* The txtime of the last skb sent to the netdevice. */
> >> +	struct rb_root head;
> > 
> > Hmm. You are reimplementing timerqueue open coded. Have you checked whether
> > you could reuse the timerqueue implementation?
> > 
> > That requires to add a timerqueue node to struct skbuff
> > 
> > @@ -671,7 +671,8 @@ struct sk_buff {
> >  				unsigned long		dev_scratch;
> >  			};
> >  		};
> > -		struct rb_node	rbnode; /* used in netem & tcp stack */
> > +		struct rb_node		rbnode; /* used in netem & tcp stack */
> > +		struct timerqueue_node	tqnode;
> >  	};
> >  	struct sock		*sk;
> > 
> > Then you can use timerqueue_head in your scheduler data and all the open
> > coded rbtree handling goes away.
> 
> 
> I just noticed that doing the above increases the size of struct sk_buff by 8
> bytes: struct timerqueue_node is 32 bytes long, while struct rb_node is only
> 24 bytes long.
> 
> Given the feedback we got here before against touching struct sk_buff at all for
> non-generic use cases, I will keep the implementation of sch_tbs.c as is, thus
> keeping the open-coded version for now, ok?

The size of sk_buff is 216 and the size of sk_buff_fclones is 440
bytes. The sk_buff and sk_buff_fclones kmem_caches use objects sized 256
and 512 bytes because the kmem_caches are created with SLAB_HWCACHE_ALIGN.

So adding 8 bytes to spare duplicated code will not change the kmem_cache
object size and I really doubt that anyone will notice.

Thanks,

	tglx
David Miller April 24, 2018, 1:50 p.m. UTC | #26
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 24 Apr 2018 10:50:04 +0200 (CEST)

> So adding 8 bytes to spare duplicated code will not change the kmem_cache
> object size and I really doubt that anyone will notice.

It's about where the cache lines end up when each and every byte is added
to the structure, not just the slab object size.
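
For reference, a rough sketch (not part of the posted patch) of how the
sorted enqueue/peek paths would collapse onto the timerqueue helpers if
struct sk_buff gained the tqnode member suggested above, with the head field
of tbs_sched_data becoming a timerqueue_head initialized via
timerqueue_init_head():

#include <linux/timerqueue.h>

/* Sketch only: assumes struct sk_buff carries a timerqueue_node (tqnode)
 * in the union with rbnode, which this series does not add.
 */
static int tbs_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
				      struct sk_buff **to_free)
{
	struct tbs_sched_data *q = qdisc_priv(sch);

	if (!is_packet_valid(sch, nskb))
		return qdisc_drop(nskb, sch, to_free);

	/* timerqueue keeps the rbtree sorted by ->expires and caches the
	 * leftmost node, replacing the open-coded walk in the patch below.
	 */
	nskb->tqnode.expires = nskb->tstamp;
	timerqueue_add(&q->head, &nskb->tqnode);

	qdisc_qstats_backlog_inc(sch, nskb);
	sch->q.qlen++;
	reset_watchdog(sch);

	return NET_XMIT_SUCCESS;
}

static struct sk_buff *tbs_peek_timesortedlist(struct Qdisc *sch)
{
	struct tbs_sched_data *q = qdisc_priv(sch);
	struct timerqueue_node *node = timerqueue_getnext(&q->head);

	return node ? container_of(node, struct sk_buff, tqnode) : NULL;
}
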
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7104de2bc957..09b5b2e08f04 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -781,6 +781,7 @@  enum tc_setup_type {
 	TC_SETUP_QDISC_CBS,
 	TC_SETUP_QDISC_RED,
 	TC_SETUP_QDISC_PRIO,
+	TC_SETUP_QDISC_TBS,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096ae97b..a33b5b9da81a 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -934,4 +934,21 @@  enum {
 
 #define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
 
+
+/* TBS */
+struct tc_tbs_qopt {
+	__s32 delta;
+	__s32 clockid;
+	__u32 flags;
+#define TC_TBS_SORTING_ON BIT(0)
+};
+
+enum {
+	TCA_TBS_UNSPEC,
+	TCA_TBS_PARMS,
+	__TCA_TBS_MAX,
+};
+
+#define TCA_TBS_MAX (__TCA_TBS_MAX - 1)
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169fb5325..9e68fef78d50 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -183,6 +183,17 @@  config NET_SCH_CBS
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_cbs.
 
+config NET_SCH_TBS
+	tristate "Time Based Scheduler (TBS)"
+	---help---
+	  Say Y here if you want to use the Time Based Scheduler (TBS) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_tbs.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_tbs.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d3804878..f02378a0a8f2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -54,6 +54,7 @@  obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
 obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
+obj-$(CONFIG_NET_SCH_TBS)	+= sch_tbs.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_tbs.c b/net/sched/sch_tbs.c
new file mode 100644
index 000000000000..c19eedda9bc5
--- /dev/null
+++ b/net/sched/sch_tbs.c
@@ -0,0 +1,474 @@ 
+/*
+ * net/sched/sch_tbs.c	Time Based Shaper
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
+ *		Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rbtree.h>
+#include <linux/skbuff.h>
+#include <linux/posix-timers.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#define SORTING_IS_ON(x) ((x)->flags & TC_TBS_SORTING_ON)
+
+struct tbs_sched_data {
+	bool sorting;
+	int clockid;
+	int queue;
+	s32 delta; /* in ns */
+	ktime_t last; /* The txtime of the last skb sent to the netdevice. */
+	struct rb_root head;
+	struct qdisc_watchdog watchdog;
+	struct Qdisc *qdisc;
+	int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free);
+	struct sk_buff *(*dequeue)(struct Qdisc *sch);
+	struct sk_buff *(*peek)(struct Qdisc *sch);
+};
+
+static const struct nla_policy tbs_policy[TCA_TBS_MAX + 1] = {
+	[TCA_TBS_PARMS]	= { .len = sizeof(struct tc_tbs_qopt) },
+};
+
+typedef ktime_t (*get_time_func_t)(void);
+
+static const get_time_func_t clockid_to_get_time[MAX_CLOCKS] = {
+	[CLOCK_MONOTONIC] = ktime_get,
+	[CLOCK_REALTIME] = ktime_get_real,
+	[CLOCK_BOOTTIME] = ktime_get_boottime,
+	[CLOCK_TAI] = ktime_get_clocktai,
+};
+
+static ktime_t get_time_by_clockid(clockid_t clockid)
+{
+	get_time_func_t func = clockid_to_get_time[clockid];
+
+	if (!func)
+		return 0;
+
+	return func();
+}
+
+static inline int validate_input_params(struct tc_tbs_qopt *qopt,
+					struct netlink_ext_ack *extack)
+{
+	/* Check if params comply with the following rules:
+	 *	* If SW best-effort, then clockid and delta must be valid
+	 *	  regardless of whether sorting is enabled or not.
+	 *
+	 *	* Dynamic clockids are not supported.
+	 *	* Delta must be a positive integer.
+	 */
+	if ((qopt->clockid & CLOCKID_INVALID) == CLOCKID_INVALID ||
+	    qopt->clockid >= MAX_CLOCKS) {
+		NL_SET_ERR_MSG(extack, "Invalid clockid");
+		return -EINVAL;
+	} else if (qopt->clockid < 0 ||
+		   !clockid_to_get_time[qopt->clockid]) {
+		NL_SET_ERR_MSG(extack, "Clockid is not supported");
+		return -ENOTSUPP;
+	}
+
+	if (qopt->delta < 0) {
+		NL_SET_ERR_MSG(extack, "Delta must be positive");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+	ktime_t txtime = nskb->tstamp;
+	struct sock *sk = nskb->sk;
+	ktime_t now;
+
+	if (sk && !sock_flag(sk, SOCK_TXTIME))
+		return false;
+
+	/* We don't perform crosstimestamping.
+	 * Drop if packet's clockid differs from qdisc's.
+	 */
+	if (nskb->txtime_clockid != q->clockid)
+		return false;
+
+	now = get_time_by_clockid(q->clockid);
+	if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
+		return false;
+
+	return true;
+}
+
+static struct sk_buff *tbs_peek(struct Qdisc *sch)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+
+	return q->peek(sch);
+}
+
+static struct sk_buff *tbs_peek_timesortedlist(struct Qdisc *sch)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p;
+
+	p = rb_first(&q->head);
+	if (!p)
+		return NULL;
+
+	return rb_to_skb(p);
+}
+
+static void reset_watchdog(struct Qdisc *sch)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = tbs_peek(sch);
+	ktime_t next;
+
+	if (!skb)
+		return;
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+	qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
+}
+
+static int tbs_enqueue(struct sk_buff *nskb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+
+	return q->enqueue(nskb, sch, to_free);
+}
+
+static int tbs_enqueue_scheduledfifo(struct sk_buff *nskb, struct Qdisc *sch,
+				     struct sk_buff **to_free)
+{
+	int err;
+
+	if (!is_packet_valid(sch, nskb))
+		return qdisc_drop(nskb, sch, to_free);
+
+	err = qdisc_enqueue_tail(nskb, sch);
+
+	/* If there is only 1 packet, then we must reset the watchdog. */
+	if (err >= 0 && sch->q.qlen == 1)
+		reset_watchdog(sch);
+
+	return err;
+}
+
+static int tbs_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
+				      struct sk_buff **to_free)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+	struct rb_node **p = &q->head.rb_node, *parent = NULL;
+	ktime_t txtime = nskb->tstamp;
+
+	if (!is_packet_valid(sch, nskb))
+		return qdisc_drop(nskb, sch, to_free);
+
+	while (*p) {
+		struct sk_buff *skb;
+
+		parent = *p;
+		skb = rb_to_skb(parent);
+		if (ktime_after(txtime, skb->tstamp))
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&nskb->rbnode, parent, p);
+	rb_insert_color(&nskb->rbnode, &q->head);
+
+	qdisc_qstats_backlog_inc(sch, nskb);
+	sch->q.qlen++;
+
+	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
+	reset_watchdog(sch);
+
+	return NET_XMIT_SUCCESS;
+}
+
+static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
+				 bool drop)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+
+	rb_erase(&skb->rbnode, &q->head);
+
+	qdisc_qstats_backlog_dec(sch, skb);
+
+	if (drop) {
+		struct sk_buff *to_free = NULL;
+
+		qdisc_drop(skb, sch, &to_free);
+		kfree_skb_list(to_free);
+		qdisc_qstats_overlimit(sch);
+	} else {
+		qdisc_bstats_update(sch, skb);
+
+		q->last = skb->tstamp;
+	}
+
+	sch->q.qlen--;
+
+	/* The rbnode field in the skb re-uses these fields; now that
+	 * we are done with the rbnode, reset them.
+	 */
+	skb->next = NULL;
+	skb->prev = NULL;
+	skb->dev = qdisc_dev(sch);
+}
+
+static struct sk_buff *tbs_dequeue(struct Qdisc *sch)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+
+	return q->dequeue(sch);
+}
+
+static struct sk_buff *tbs_dequeue_scheduledfifo(struct Qdisc *sch)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = tbs_peek(sch);
+	ktime_t now, next;
+
+	if (!skb)
+		return NULL;
+
+	now = get_time_by_clockid(q->clockid);
+
+	/* Drop if packet has expired while in queue and the drop_if_late
+	 * flag is set.
+	 */
+	if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) {
+		struct sk_buff *to_free = NULL;
+
+		qdisc_queue_drop_head(sch, &to_free);
+		kfree_skb_list(to_free);
+		qdisc_qstats_overlimit(sch);
+
+		skb = NULL;
+		goto out;
+	}
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+
+	/* Dequeue only if now is within the [txtime - delta, txtime] range. */
+	if (ktime_after(now, next))
+		skb = qdisc_dequeue_head(sch);
+	else
+		skb = NULL;
+
+out:
+	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
+	reset_watchdog(sch);
+
+	return skb;
+}
+
+static struct sk_buff *tbs_dequeue_timesortedlist(struct Qdisc *sch)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	ktime_t now, next;
+
+	skb = tbs_peek(sch);
+	if (!skb)
+		return NULL;
+
+	now = get_time_by_clockid(q->clockid);
+
+	/* Drop if packet has expired while in queue and the drop_if_late
+	 * flag is set.
+	 */
+	if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) {
+		timesortedlist_erase(sch, skb, true);
+		skb = NULL;
+		goto out;
+	}
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+
+	/* Dequeue only if now is within the [txtime - delta, txtime] range. */
+	if (ktime_after(now, next))
+		timesortedlist_erase(sch, skb, false);
+	else
+		skb = NULL;
+
+out:
+	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
+	reset_watchdog(sch);
+
+	return skb;
+}
+
+static inline void setup_queueing_mode(struct tbs_sched_data *q)
+{
+	if (q->sorting) {
+		q->enqueue = tbs_enqueue_timesortedlist;
+		q->dequeue = tbs_dequeue_timesortedlist;
+		q->peek = tbs_peek_timesortedlist;
+	} else {
+		q->enqueue = tbs_enqueue_scheduledfifo;
+		q->dequeue = tbs_dequeue_scheduledfifo;
+		q->peek = qdisc_peek_head;
+	}
+}
+
+static int tbs_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct nlattr *tb[TCA_TBS_MAX + 1];
+	struct tc_tbs_qopt *qopt;
+	int err;
+
+	if (!opt) {
+		NL_SET_ERR_MSG(extack, "Missing TBS qdisc options which are mandatory");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_TBS_MAX, opt, tbs_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_TBS_PARMS]) {
+		NL_SET_ERR_MSG(extack, "Missing mandatory TBS parameters");
+		return -EINVAL;
+	}
+
+	qopt = nla_data(tb[TCA_TBS_PARMS]);
+
+	pr_debug("delta %d clockid %d sorting %s\n",
+		 qopt->delta, qopt->clockid,
+		 SORTING_IS_ON(qopt) ? "on" : "off");
+
+	err = validate_input_params(qopt, extack);
+	if (err < 0)
+		return err;
+
+	q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+	/* Everything went OK, save the parameters used. */
+	q->delta = qopt->delta;
+	q->clockid = qopt->clockid;
+	q->sorting = SORTING_IS_ON(qopt);
+
+	/* Select queueing mode based on parameters. */
+	setup_queueing_mode(q);
+
+	qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
+
+	return 0;
+}
+
+static void timesortedlist_clear(struct Qdisc *sch)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p = rb_first(&q->head);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+
+		rb_erase(&skb->rbnode, &q->head);
+		rtnl_kfree_skbs(skb, skb);
+		sch->q.qlen--;
+	}
+}
+
+static void tbs_reset(struct Qdisc *sch)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+
+	/* Only cancel watchdog if it's been initialized. */
+	if (q->watchdog.qdisc == sch)
+		qdisc_watchdog_cancel(&q->watchdog);
+
+	/* No matter which mode we are on, it's safe to clear both lists. */
+	timesortedlist_clear(sch);
+	__qdisc_reset_queue(&sch->q);
+
+	sch->qstats.backlog = 0;
+	sch->q.qlen = 0;
+
+	q->last = 0;
+}
+
+static void tbs_destroy(struct Qdisc *sch)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+
+	/* Only cancel watchdog if it's been initialized. */
+	if (q->watchdog.qdisc == sch)
+		qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int tbs_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct tbs_sched_data *q = qdisc_priv(sch);
+	struct tc_tbs_qopt opt = { };
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	opt.delta = q->delta;
+	opt.clockid = q->clockid;
+	if (q->sorting)
+		opt.flags |= TC_TBS_SORTING_ON;
+
+	if (nla_put(skb, TCA_TBS_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc_ops tbs_qdisc_ops __read_mostly = {
+	.id		=	"tbs",
+	.priv_size	=	sizeof(struct tbs_sched_data),
+	.enqueue	=	tbs_enqueue,
+	.dequeue	=	tbs_dequeue,
+	.peek		=	tbs_peek,
+	.init		=	tbs_init,
+	.reset		=	tbs_reset,
+	.destroy	=	tbs_destroy,
+	.dump		=	tbs_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init tbs_module_init(void)
+{
+	return register_qdisc(&tbs_qdisc_ops);
+}
+
+static void __exit tbs_module_exit(void)
+{
+	unregister_qdisc(&tbs_qdisc_ops);
+}
+module_init(tbs_module_init)
+module_exit(tbs_module_exit)
+MODULE_LICENSE("GPL");