
[net-next-2.6,v2,3/3] net_sched: implement a root container qdisc sch_mclass

Message ID 20101221192930.9703.63791.stgit@jf-dev1-dcblab
State Changes Requested, archived
Delegated to: David Miller

Commit Message

John Fastabend Dec. 21, 2010, 7:29 p.m. UTC
This implements an mclass 'multi-class' queueing discipline that by
default creates multiple mq qdiscs, one for each traffic class. Each
mq qdisc then owns a range of queues per the netdev_tc_txq mappings.

Using the mclass qdisc, the number of tcs currently in use along
with the range of queues allotted to each class can be configured.
By default skbs are mapped to traffic classes using the skb
priority; this mapping is configurable.

Configurable parameters:

struct tc_mclass_qopt {
        __u8    num_tc;
        __u8    prio_tc_map[16];
        __u8    hw;
        __u16   count[16];
        __u16   offset[16];
};

Here each count/offset pair gives the range of queues assigned to a
traffic class and the prio_tc_map gives the mapping from
skb->priority to tc. The hw bit determines whether the hardware
should configure the count and offset values. If the hw bit is set,
the operation fails when the hardware does not implement the
ndo_setup_tc operation; this avoids undetermined states where the
hardware may or may not control the queue mapping. Minimal bounds
checking is also done on the count/offset values to verify that a
queue does not exceed num_tx_queues and that queue ranges do not
overlap. Otherwise it is left to user policy or hardware
configuration to create useful mappings.
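
For illustration, a hypothetical 4-class layout over 16 tx queues
(the values here are only an example) could be described with:

        struct tc_mclass_qopt qopt = {
                .num_tc      = 4,
                /* skb->priority (0..15) -> traffic class */
                .prio_tc_map = { 0, 1, 2, 3, 0, 1, 2, 3,
                                 1, 1, 1, 1, 1, 1, 1, 1 },
                .hw          = 0,   /* mapping stays in software */
                /* tc0 -> queues 0-1, tc1 -> 2-3, tc2 -> 4-5, tc3 -> 6-15 */
                .count       = { 2, 2, 2, 10 },
                .offset      = { 0, 2, 4, 6 },
        };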

It is expected that hardware QOS schemes can be implemented by
creating appropriate mappings of queues in ndo_setup_tc(). This
scheme can be expanded as needed, with additional qdiscs grafted
onto the root qdisc to provide per-tc queueing disciplines, allowing
software and hardware queueing disciplines to be used together.

One expected use case is for drivers to use ndo_setup_tc to map
queue ranges onto 802.1Q traffic classes. This provides a generic
mechanism to map network traffic onto these traffic classes and
removes the need for lower-layer drivers to know specifics about
traffic types.
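
As a rough sketch (the function and limit names below are made up,
and the netdev_set_num_tc()/netdev_set_tc_queue() helpers come from
the earlier patches in this series), such a driver might simply
partition its tx queues evenly:

        /* hypothetical driver hook: one contiguous queue range per tc */
        static int foo_setup_tc(struct net_device *dev, u8 num_tc)
        {
                unsigned int i, per_tc;

                if (num_tc > FOO_MAX_TCS)  /* made-up per-driver limit */
                        return -EINVAL;

                if (netdev_set_num_tc(dev, num_tc))
                        return -ENOMEM;

                per_tc = num_tc ? dev->real_num_tx_queues / num_tc : 0;
                for (i = 0; i < num_tc; i++)
                        netdev_set_tc_queue(dev, i, per_tc, i * per_tc);

                return 0;
        }

and wire it up through the new .ndo_setup_tc member of
net_device_ops.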

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/netdevice.h |    3 
 include/linux/pkt_sched.h |    9 +
 include/net/sch_generic.h |    2 
 net/sched/Makefile        |    2 
 net/sched/sch_api.c       |    1 
 net/sched/sch_generic.c   |   10 +
 net/sched/sch_mclass.c    |  376 +++++++++++++++++++++++++++++++++++++++++++++
 net/sched/sch_mq.c        |    3 
 8 files changed, 403 insertions(+), 3 deletions(-)
 create mode 100644 net/sched/sch_mclass.c



Comments

Jarek Poplawski Dec. 30, 2010, 11:37 p.m. UTC | #1
John Fastabend wrote:
> This implements a mclass 'multi-class' queueing discipline that by
> default creates multiple mq qdisc's one for each traffic class. Each
> mq qdisc then owns a range of queues per the netdev_tc_txq mappings.

Is it really necessary to add one more abstraction layer for this,
probably not most often used (or even asked by users), functionality?
Why mclass can't simply do these few things more instead of attaching
(and changing) mq?

...
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 0af57eb..723ee52 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -50,6 +50,7 @@ struct Qdisc {
>  #define TCQ_F_INGRESS		4
>  #define TCQ_F_CAN_BYPASS	8
>  #define TCQ_F_MQROOT		16
> +#define TCQ_F_MQSAFE		32

If every other qdisc added a flag for qdiscs it likes...

> @@ -709,7 +709,13 @@ static void attach_default_qdiscs(struct net_device *dev)
>  		dev->qdisc = txq->qdisc_sleeping;
>  		atomic_inc(&dev->qdisc->refcnt);
>  	} else {
> -		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
> +		if (dev->num_tc)

Actually, where this num_tc is expected to be set? I can see it inside
mclass only, with unsetting on destruction, but probably I miss something.

> +			qdisc = qdisc_create_dflt(txq, &mclass_qdisc_ops,
> +						  TC_H_ROOT);
> +		else
> +			qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops,
> +						  TC_H_ROOT);
> +
> +static int mclass_init(struct Qdisc *sch, struct nlattr *opt)
> +{
> +	struct net_device *dev = qdisc_dev(sch);
> +	struct mclass_sched *priv = qdisc_priv(sch);
> +	struct netdev_queue *dev_queue;
> +	struct Qdisc *qdisc;
> +	int i, err = -EOPNOTSUPP;
> +	struct tc_mclass_qopt *qopt = NULL;
> +
> +	/* Unwind attributes on failure */
> +	u8 unwnd_tc = dev->num_tc;
> +	u8 unwnd_map[16];

[TC_MAX_QUEUE] ?

> +	struct netdev_tc_txq unwnd_txq[16];
> +
> +	if (sch->parent != TC_H_ROOT)
> +		return -EOPNOTSUPP;
> +
> +	if (!netif_is_multiqueue(dev))
> +		return -EOPNOTSUPP;
> +
> +	if (nla_len(opt) < sizeof(*qopt))
> +		return -EINVAL;
> +	qopt = nla_data(opt);
> +
> +	memcpy(unwnd_map, dev->prio_tc_map, sizeof(unwnd_map));
> +	memcpy(unwnd_txq, dev->tc_to_txq, sizeof(unwnd_txq));
> +
> +	/* If the mclass options indicate that hardware should own
> +	 * the queue mapping then run ndo_setup_tc if this can not
> +	 * be done fail immediately.
> +	 */
> +	if (qopt->hw && dev->netdev_ops->ndo_setup_tc) {
> +		priv->hw_owned = 1;
> +		if (dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc))
> +			return -EINVAL;
> +	} else if (!qopt->hw) {
> +		if (mclass_parse_opt(dev, qopt))
> +			return -EINVAL;
> +
> +		if (netdev_set_num_tc(dev, qopt->num_tc))
> +			return -ENOMEM;
> +
> +		for (i = 0; i < qopt->num_tc; i++)
> +			netdev_set_tc_queue(dev, i,
> +					    qopt->count[i], qopt->offset[i]);
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	/* Always use supplied priority mappings */
> +	for (i = 0; i < 16; i++) {

i < qopt->num_tc ?

> +		if (netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i])) {
> +			err = -EINVAL;
> +			goto tc_err;
> +		}
> +	}
> +
> +	/* pre-allocate qdisc, attachment can't fail */
> +	priv->qdiscs = kcalloc(qopt->num_tc,
> +			       sizeof(priv->qdiscs[0]), GFP_KERNEL);
> +	if (priv->qdiscs == NULL) {
> +		err = -ENOMEM;
> +		goto tc_err;
> +	}
> +
> +	for (i = 0; i < dev->num_tc; i++) {
> +		dev_queue = netdev_get_tx_queue(dev, dev->tc_to_txq[i].offset);

Are these offsets etc. validated?

Jarek P.
Jarek Poplawski Dec. 30, 2010, 11:56 p.m. UTC | #2
Jarek Poplawski wrote:
> John Fastabend wrote:
...
>> +	for (i = 0; i < dev->num_tc; i++) {
>> +		dev_queue = netdev_get_tx_queue(dev, dev->tc_to_txq[i].offset);
> 
> Are these offsets etc. validated?

They are... Forget this last question.

Jarek P.
Jarek Poplawski Dec. 31, 2010, 9:25 a.m. UTC | #3
On 2010-12-21 20:29, John Fastabend wrote:
> This implements a mclass 'multi-class' queueing discipline that by
> default creates multiple mq qdisc's one for each traffic class. Each
> mq qdisc then owns a range of queues per the netdev_tc_txq mappings.

Btw, you could also consider better name (mqprio?) because there're
many 'multi-class' queueing disciplines around.

> +static int mclass_parse_opt(struct net_device *dev, struct tc_mclass_qopt *qopt)
> +{
> +	int i, j;
> +
> +	/* Verify TC offset and count are sane */

if (qopt->num_tc > TC_MAX_QUEUE) ?
	return -EINVAL;

> +	for (i = 0; i < qopt->num_tc; i++) {
> +		int last = qopt->offset[i] + qopt->count[i];
> +		if (last > dev->num_tx_queues)

if (last >= dev->num_tx_queues) ?

> +			return -EINVAL;
> +		for (j = i + 1; j < qopt->num_tc; j++) {
> +			if (last > qopt->offset[j])

if (last >= qopt->offset[j]) ?

Jarek P.
John Fastabend Jan. 3, 2011, 5:43 a.m. UTC | #4
On 12/30/2010 3:37 PM, Jarek Poplawski wrote:
> John Fastabend wrote:
>> This implements a mclass 'multi-class' queueing discipline that by
>> default creates multiple mq qdisc's one for each traffic class. Each
>> mq qdisc then owns a range of queues per the netdev_tc_txq mappings.
> 
> Is it really necessary to add one more abstraction layer for this,
> probably not most often used (or even asked by users), functionality?
> Why mclass can't simply do these few things more instead of attaching
> (and changing) mq?
> 

The statistics work nicely when the mq qdisc is used. 

qdisc mclass 8002: root  tc 4 map 0 1 2 3 0 1 2 3 1 1 1 1 1 1 1 1
             queues:(0:1) (2:3) (4:5) (6:15)
 Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
qdisc mq 8003: parent 8002:1
 Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
qdisc mq 8004: parent 8002:2
 Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
qdisc mq 8005: parent 8002:3
 Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
qdisc mq 8006: parent 8002:4
 Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
qdisc sfq 8007: parent 8005:1 limit 127p quantum 1514b
 Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
qdisc sfq 8008: parent 8005:2 limit 127p quantum 1514b
 Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0

The mclass qdisc gives the statistics for the interface as a whole, and the statistics on each mq qdisc give the per-traffic-class statistics. Also, when using the mq qdisc with this abstraction, other qdiscs can be grafted onto the individual queues; sch_sfq is grafted that way in the example above.

Although I am not too hung up on this use case, it does seem like a good abstraction to me. Is it strictly necessary? No; the class statistics of mclass could also be used to get stats per traffic class.

> ...
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index 0af57eb..723ee52 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -50,6 +50,7 @@ struct Qdisc {
>>  #define TCQ_F_INGRESS		4
>>  #define TCQ_F_CAN_BYPASS	8
>>  #define TCQ_F_MQROOT		16
>> +#define TCQ_F_MQSAFE		32
> 
> If every other qdisc added a flag for qdiscs it likes...
> 

then we run out of bits and get unneeded complexity. I think I will drop the MQSAFE bit completely and let user space catch this. The worst that should happen is the noop qdisc is used.

>> @@ -709,7 +709,13 @@ static void attach_default_qdiscs(struct net_device *dev)
>>  		dev->qdisc = txq->qdisc_sleeping;
>>  		atomic_inc(&dev->qdisc->refcnt);
>>  	} else {
>> -		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
>> +		if (dev->num_tc)
> 
> Actually, where this num_tc is expected to be set? I can see it inside
> mclass only, with unsetting on destruction, but probably I miss something.

Either through mclass, as you noted, or a driver could set num_tc directly. One of the RFCs I sent out has ixgbe setting num_tc when DCB is enabled.

>> +			qdisc = qdisc_create_dflt(txq, &mclass_qdisc_ops,
>> +						  TC_H_ROOT);
>> +		else
>> +			qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops,
>> +						  TC_H_ROOT);
>> +
>> +static int mclass_init(struct Qdisc *sch, struct nlattr *opt)
>> +{
>> +	struct net_device *dev = qdisc_dev(sch);
>> +	struct mclass_sched *priv = qdisc_priv(sch);
>> +	struct netdev_queue *dev_queue;
>> +	struct Qdisc *qdisc;
>> +	int i, err = -EOPNOTSUPP;
>> +	struct tc_mclass_qopt *qopt = NULL;
>> +
>> +	/* Unwind attributes on failure */
>> +	u8 unwnd_tc = dev->num_tc;
>> +	u8 unwnd_map[16];
> 
> [TC_MAX_QUEUE] ?

Actually TC_BITMASK+1 is probably more accurate. This array maps the skb priority to a traffic class after the priority is masked with TC_BITMASK.
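
Roughly, the transmit-path lookup then boils down to (illustrative
only):

        tc = dev->prio_tc_map[skb->priority & TC_BITMASK];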

> 
>> +	struct netdev_tc_txq unwnd_txq[16];
>> +

Although unwnd_txq should be TC_MAX_QUEUE.

>> +	if (sch->parent != TC_H_ROOT)
>> +		return -EOPNOTSUPP;
>> +
>> +	if (!netif_is_multiqueue(dev))
>> +		return -EOPNOTSUPP;
>> +
>> +	if (nla_len(opt) < sizeof(*qopt))
>> +		return -EINVAL;
>> +	qopt = nla_data(opt);
>> +
>> +	memcpy(unwnd_map, dev->prio_tc_map, sizeof(unwnd_map));
>> +	memcpy(unwnd_txq, dev->tc_to_txq, sizeof(unwnd_txq));
>> +
>> +	/* If the mclass options indicate that hardware should own
>> +	 * the queue mapping then run ndo_setup_tc if this can not
>> +	 * be done fail immediately.
>> +	 */
>> +	if (qopt->hw && dev->netdev_ops->ndo_setup_tc) {
>> +		priv->hw_owned = 1;
>> +		if (dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc))
>> +			return -EINVAL;
>> +	} else if (!qopt->hw) {
>> +		if (mclass_parse_opt(dev, qopt))
>> +			return -EINVAL;
>> +
>> +		if (netdev_set_num_tc(dev, qopt->num_tc))
>> +			return -ENOMEM;
>> +
>> +		for (i = 0; i < qopt->num_tc; i++)
>> +			netdev_set_tc_queue(dev, i,
>> +					    qopt->count[i], qopt->offset[i]);
>> +	} else {
>> +		return -EINVAL;
>> +	}
>> +
>> +	/* Always use supplied priority mappings */
>> +	for (i = 0; i < 16; i++) {
> 
> i < qopt->num_tc ?

Nope, TC_BITMASK+1 here. If we only have 4 tcs for example we still need to map all 16 priority values to a tc.
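
For instance, with the 4-TC setup from the dump above, the supplied
map still covers every priority value:

        priority:      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        traffic class: 0 1 2 3 0 1 2 3 1 1  1  1  1  1  1  1

A loop bounded by qopt->num_tc would leave priorities 4-15 at
whatever happened to be configured before.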

> 
>> +		if (netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i])) {
>> +			err = -EINVAL;
>> +			goto tc_err;
>> +		}
>> +	}
>> +
>> +	/* pre-allocate qdisc, attachment can't fail */
>> +	priv->qdiscs = kcalloc(qopt->num_tc,
>> +			       sizeof(priv->qdiscs[0]), GFP_KERNEL);
>> +	if (priv->qdiscs == NULL) {
>> +		err = -ENOMEM;
>> +		goto tc_err;
>> +	}
>> +
>> +	for (i = 0; i < dev->num_tc; i++) {
>> +		dev_queue = netdev_get_tx_queue(dev, dev->tc_to_txq[i].offset);
> 
> Are these offsets etc. validated?

Yes, as your next email noted.

Thanks,
John
John Fastabend Jan. 3, 2011, 5:46 a.m. UTC | #5
On 12/31/2010 1:25 AM, Jarek Poplawski wrote:
> On 2010-12-21 20:29, John Fastabend wrote:
>> This implements a mclass 'multi-class' queueing discipline that by
>> default creates multiple mq qdisc's one for each traffic class. Each
>> mq qdisc then owns a range of queues per the netdev_tc_txq mappings.
> 
> Btw, you could also consider better name (mqprio?) because there're
> many 'multi-class' queueing disciplines around.
> 

OK.

>> +static int mclass_parse_opt(struct net_device *dev, struct tc_mclass_qopt *qopt)
>> +{
>> +	int i, j;
>> +
>> +	/* Verify TC offset and count are sane */
> 
> if (qopt->num_tc > TC_MAX_QUEUE) ?
> 	return -EINVAL;

This would be caught later when netdev_set_num_tc() fails, although it is probably best to catch all failures in this function as early as possible.

> 
>> +	for (i = 0; i < qopt->num_tc; i++) {
>> +		int last = qopt->offset[i] + qopt->count[i];
>> +		if (last > dev->num_tx_queues)
> 
> if (last >= dev->num_tx_queues) ?
> 
>> +			return -EINVAL;
>> +		for (j = i + 1; j < qopt->num_tc; j++) {
>> +			if (last > qopt->offset[j])
> 
> if (last >= qopt->offset[j]) ?
	
I believe the below works as expected. The offset needs to be verified (this I missed), but offset + count can be equal to num_tx_queues, indicating the last queue is in use. With 8 tx queues and num_tc=2, a valid configuration is tc1 with an offset of 0 and a count of 7, and tc2 with an offset of 7 and a count of 1.


        /* Verify num_tc is in max range */
        if (qopt->num_tc > TC_MAX_QUEUE)
                return -EINVAL;

        for (i = 0; i < qopt->num_tc; i++) {
                /* Verify the queue offset is in the num tx range */
                if (qopt->offset[i] >= dev->num_tx_queues)
                        return -EINVAL;
                /* Verify the queue count is in tx range being equal to the
                 * num_tx_queues indicates the last queue is in use.
                 */
                else if (qopt->offset[i] + qopt->count[i] > dev->num_tx_queues)
                        return -EINVAL;

                /* Verify that the offset and counts do not overlap */
                for (j = i + 1; j < qopt->num_tc; j++) {
                        if (last > qopt->offset[j])
                                return -EINVAL;
                }
        }


Thanks for the review!

John.

> 
> Jarek P.

Jarek Poplawski Jan. 3, 2011, 5:02 p.m. UTC | #6
On Sun, Jan 02, 2011 at 09:43:27PM -0800, John Fastabend wrote:
> On 12/30/2010 3:37 PM, Jarek Poplawski wrote:
> > John Fastabend wrote:
> >> This implements a mclass 'multi-class' queueing discipline that by
> >> default creates multiple mq qdisc's one for each traffic class. Each
> >> mq qdisc then owns a range of queues per the netdev_tc_txq mappings.
> > 
> > Is it really necessary to add one more abstraction layer for this,
> > probably not most often used (or even asked by users), functionality?
> > Why mclass can't simply do these few things more instead of attaching
> > (and changing) mq?
> > 
> 
> The statistics work nicely when the mq qdisc is used. 

Well, I sometimes add leaf qdiscs only to get class stats with less
typing, too ;-)

> 
> qdisc mclass 8002: root  tc 4 map 0 1 2 3 0 1 2 3 1 1 1 1 1 1 1 1
>              queues:(0:1) (2:3) (4:5) (6:15)
>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>  backlog 0b 0p requeues 0
> qdisc mq 8003: parent 8002:1
>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>  backlog 0b 0p requeues 0
> qdisc mq 8004: parent 8002:2
>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>  backlog 0b 0p requeues 0
> qdisc mq 8005: parent 8002:3
>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>  backlog 0b 0p requeues 0
> qdisc mq 8006: parent 8002:4
>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>  backlog 0b 0p requeues 0
> qdisc sfq 8007: parent 8005:1 limit 127p quantum 1514b
>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>  backlog 0b 0p requeues 0
> qdisc sfq 8008: parent 8005:2 limit 127p quantum 1514b
>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>  backlog 0b 0p requeues 0
> 
> The mclass gives the statistics for the interface and then statistics on the mq qdisc gives statistics for each traffic class. Also, when using the 'mq qdisc' with this abstraction other qdisc can be grafted onto the queue. For example the sch_sfq is used in the above example.

IMHO, these tc offsets and counts make simply two level hierarchy
(classes with leaf subclasses) similarly (or simpler) to other
classful qdisc which manage it all inside one module. Of course,
we could think of another way of code organization, but it should
be rather done at the beginning of schedulers design. The mq qdisc
broke the design a bit adding a fake root, but I doubt we should go
deeper unless it's necessary. Doing mclass (or something) as a more
complex alternative to mq should be enough. Why couldn't mclass graft
sch_sfq the same way as mq?

> 
> Although I am not too hung up on this use case it does seem to be a good abstraction to me. Is it strictly necessary though no and looking at the class statistics of mclass could be used to get stats per traffic class.

I am not too hung up on this either, especially if it's OK to others,
especially to DaveM ;-)

> 
> > ...
> >> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> >> index 0af57eb..723ee52 100644
> >> --- a/include/net/sch_generic.h
> >> +++ b/include/net/sch_generic.h
> >> @@ -50,6 +50,7 @@ struct Qdisc {
> >>  #define TCQ_F_INGRESS		4
> >>  #define TCQ_F_CAN_BYPASS	8
> >>  #define TCQ_F_MQROOT		16
> >> +#define TCQ_F_MQSAFE		32
> > 
> > If every other qdisc added a flag for qdiscs it likes...
> > 
> 
> then we run out of bits and get unneeded complexity. I think I will drop the MQSAFE bit completely and let user space catch this. The worst that should happen is the noop qdisc is used.

Maybe you're right. On the other hand, usually flags are added for
more general purpose and the optimal/wrong configs are the matter of
documentation.

> 
> >> @@ -709,7 +709,13 @@ static void attach_default_qdiscs(struct net_device *dev)
> >>  		dev->qdisc = txq->qdisc_sleeping;
> >>  		atomic_inc(&dev->qdisc->refcnt);
> >>  	} else {
> >> -		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
> >> +		if (dev->num_tc)
> > 
> > Actually, where this num_tc is expected to be set? I can see it inside
> > mclass only, with unsetting on destruction, but probably I miss something.
> 
> Either through mclass as you noted or a driver could set the num_tc. One of the RFC's I sent out has ixgbe setting the num_tc when DCB was enabled.

OK, I probably missed this second possibility in the last version.

...
> >> +	/* Unwind attributes on failure */
> >> +	u8 unwnd_tc = dev->num_tc;
> >> +	u8 unwnd_map[16];
> > 
> > [TC_MAX_QUEUE] ?
> 
> Actually TC_BITMASK+1 is probably more accurate. This array maps the skb priority to a traffic class after the priority is masked with TC_BITMASK.
> 
> > 
> >> +	struct netdev_tc_txq unwnd_txq[16];
> >> +
> 
> Although unwnd_txq should be TC_MAX_QUEUE.
...
> >> +	/* Always use supplied priority mappings */
> >> +	for (i = 0; i < 16; i++) {
> > 
> > i < qopt->num_tc ?
> 
> Nope, TC_BITMASK+1 here. If we only have 4 tcs for example we still need to map all 16 priority values to a tc.

OK, anyway, all these '16' should be 'upgraded'.
 
Thanks,
Jarek P.
Jarek Poplawski Jan. 3, 2011, 5:04 p.m. UTC | #7
On Sun, Jan 02, 2011 at 09:46:16PM -0800, John Fastabend wrote:
> On 12/31/2010 1:25 AM, Jarek Poplawski wrote:
> > On 2010-12-21 20:29, John Fastabend wrote:
> >> +static int mclass_parse_opt(struct net_device *dev, struct tc_mclass_qopt *qopt)
> >> +{
> >> +	int i, j;
> >> +
> >> +	/* Verify TC offset and count are sane */
> > 
> > if (qopt->num_tc > TC_MAX_QUEUE) ?
> > 	return -EINVAL;
> 
> This would be caught later when netdev_set_num_tc() fails although probably best to catch all failures in this function as early as possible.

Plus reading beyond the table range wouldn't look nice.

> 
> > 
> >> +	for (i = 0; i < qopt->num_tc; i++) {
> >> +		int last = qopt->offset[i] + qopt->count[i];
> >> +		if (last > dev->num_tx_queues)
> > 
> > if (last >= dev->num_tx_queues) ?
> > 
> >> +			return -EINVAL;
> >> +		for (j = i + 1; j < qopt->num_tc; j++) {
> >> +			if (last > qopt->offset[j])
> > 
> > if (last >= qopt->offset[j]) ?
> 	
> I believe the below works as expected. The offset needs to be verified (this I missed) but offset+count can be equal to num_tx_queue indicating the last queue is in use. With 8 tx queues and num_tc=2 a valid configuration is, tc1 offset of 0 and a count of 7 with tc2 offset of 7 and count of 1.
> 
> 
>         /* Verify num_tc is in max range */
>         if (qopt->num_tc > TC_MAX_QUEUE)
>                 return -EINVAL;
> 
>         for (i = 0; i < qopt->num_tc; i++) {
>                 /* Verify the queue offset is in the num tx range */
>                 if (qopt->offset[i] >= dev->num_tx_queues)
>                         return -EINVAL;
>                 /* Verify the queue count is in tx range being equal to the
>                  * num_tx_queues indicates the last queue is in use.
>                  */
>                 else if (qopt->offset[i] + qopt->count[i] > dev->num_tx_queues)
>                         return -EINVAL;
> 
>                 /* Verify that the offset and counts do not overlap */
>                 for (j = i + 1; j < qopt->num_tc; j++) {
>                         if (last > qopt->offset[j])
>                                 return -EINVAL;
>                 }
>         }

Yes, after assigning the 'last' it should work OK ;-)
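
For completeness, a sketch of that loop with 'last' assigned,
combining both checks:

        for (i = 0; i < qopt->num_tc; i++) {
                int last;

                /* Verify the queue offset is in the num tx range */
                if (qopt->offset[i] >= dev->num_tx_queues)
                        return -EINVAL;

                /* offset + count may equal num_tx_queues (last queue used) */
                last = qopt->offset[i] + qopt->count[i];
                if (last > dev->num_tx_queues)
                        return -EINVAL;

                /* Verify that the queue ranges do not overlap */
                for (j = i + 1; j < qopt->num_tc; j++) {
                        if (last > qopt->offset[j])
                                return -EINVAL;
                }
        }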

Thanks,
Jarek P.
John Fastabend Jan. 3, 2011, 8:37 p.m. UTC | #8
On 1/3/2011 9:02 AM, Jarek Poplawski wrote:
> On Sun, Jan 02, 2011 at 09:43:27PM -0800, John Fastabend wrote:
>> On 12/30/2010 3:37 PM, Jarek Poplawski wrote:
>>> John Fastabend wrote:
>>>> This implements a mclass 'multi-class' queueing discipline that by
>>>> default creates multiple mq qdisc's one for each traffic class. Each
>>>> mq qdisc then owns a range of queues per the netdev_tc_txq mappings.
>>>
>>> Is it really necessary to add one more abstraction layer for this,
>>> probably not most often used (or even asked by users), functionality?
>>> Why mclass can't simply do these few things more instead of attaching
>>> (and changing) mq?
>>>
>>
>> The statistics work nicely when the mq qdisc is used. 
> 
> Well, I sometimes add leaf qdiscs only to get class stats with less
> typing, too ;-)
> 
>>
>> qdisc mclass 8002: root  tc 4 map 0 1 2 3 0 1 2 3 1 1 1 1 1 1 1 1
>>              queues:(0:1) (2:3) (4:5) (6:15)
>>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>>  backlog 0b 0p requeues 0
>> qdisc mq 8003: parent 8002:1
>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>  backlog 0b 0p requeues 0
>> qdisc mq 8004: parent 8002:2
>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>  backlog 0b 0p requeues 0
>> qdisc mq 8005: parent 8002:3
>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>  backlog 0b 0p requeues 0
>> qdisc mq 8006: parent 8002:4
>>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>>  backlog 0b 0p requeues 0
>> qdisc sfq 8007: parent 8005:1 limit 127p quantum 1514b
>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>  backlog 0b 0p requeues 0
>> qdisc sfq 8008: parent 8005:2 limit 127p quantum 1514b
>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>  backlog 0b 0p requeues 0
>>
>> The mclass gives the statistics for the interface and then statistics on the mq qdisc gives statistics for each traffic class. Also, when using the 'mq qdisc' with this abstraction other qdisc can be grafted onto the queue. For example the sch_sfq is used in the above example.
> 
> IMHO, these tc offsets and counts make simply two level hierarchy
> (classes with leaf subclasses) similarly (or simpler) to other
> classful qdisc which manage it all inside one module. Of course,
> we could think of another way of code organization, but it should
> be rather done at the beginning of schedulers design. The mq qdisc
> broke the design a bit adding a fake root, but I doubt we should go
> deeper unless it's necessary. Doing mclass (or something) as a more
> complex alternative to mq should be enough. Why couldn't mclass graft
> sch_sfq the same way as mq?
> 

If you also want to graft a scheduler onto a traffic class, you're now stuck. For now such a qdisc doesn't exist, but I would like to have a software implementation of the currently offloaded DCB ETS scheduler. The 802.1Qaz spec allows different scheduling algorithms to be used on each traffic class, and in the current implementation mclass could graft these scheduling schemes onto each traffic class independently.

                              mclass
                                |
    -------------------------------------------------------
    |         |        |        |     |     |     |       |
   mq_tbf   mq_tbf   mq_ets   mq_ets  mq    mq   mq_wrr greedy
   |                            |
 ---------                  ---------
 |   |   |                  |   |   |
red red red                red red red

Perhaps being concerned with hypothetical qdiscs is a bit of a stretch, but I would like to at least not preclude this work from being done in the future.

>>
>> Although I am not too hung up on this use case it does seem to be a good abstraction to me. Is it strictly necessary though no and looking at the class statistics of mclass could be used to get stats per traffic class.
> 
> I am not too hung up on this either, especially if it's OK to others,
> especially to DaveM ;-)
> 
>>
>>> ...
>>>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>>>> index 0af57eb..723ee52 100644
>>>> --- a/include/net/sch_generic.h
>>>> +++ b/include/net/sch_generic.h
>>>> @@ -50,6 +50,7 @@ struct Qdisc {
>>>>  #define TCQ_F_INGRESS		4
>>>>  #define TCQ_F_CAN_BYPASS	8
>>>>  #define TCQ_F_MQROOT		16
>>>> +#define TCQ_F_MQSAFE		32
>>>
>>> If every other qdisc added a flag for qdiscs it likes...
>>>
>>
>> then we run out of bits and get unneeded complexity. I think I will drop the MQSAFE bit completely and let user space catch this. The worst that should happen is the noop qdisc is used.
> 
> Maybe you're right. On the other hand, usually flags are added for
> more general purpose and the optimal/wrong configs are the matter of
> documentation.
> 

So we handle this with documentation.

>>
>>>> @@ -709,7 +709,13 @@ static void attach_default_qdiscs(struct net_device *dev)
>>>>  		dev->qdisc = txq->qdisc_sleeping;
>>>>  		atomic_inc(&dev->qdisc->refcnt);
>>>>  	} else {
>>>> -		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
>>>> +		if (dev->num_tc)
>>>
>>> Actually, where this num_tc is expected to be set? I can see it inside
>>> mclass only, with unsetting on destruction, but probably I miss something.
>>
>> Either through mclass as you noted or a driver could set the num_tc. One of the RFC's I sent out has ixgbe setting the num_tc when DCB was enabled.
> 
> OK, I probably missed this second possibility in the last version.
> 
> ...
>>>> +	/* Unwind attributes on failure */
>>>> +	u8 unwnd_tc = dev->num_tc;
>>>> +	u8 unwnd_map[16];
>>>
>>> [TC_MAX_QUEUE] ?
>>
>> Actually TC_BITMASK+1 is probably more accurate. This array maps the skb priority to a traffic class after the priority is masked with TC_BITMASK.
>>
>>>
>>>> +	struct netdev_tc_txq unwnd_txq[16];
>>>> +
>>
>> Although unwnd_txq should be TC_MAX_QUEUE.
> ...
>>>> +	/* Always use supplied priority mappings */
>>>> +	for (i = 0; i < 16; i++) {
>>>
>>> i < qopt->num_tc ?
>>
>> Nope, TC_BITMASK+1 here. If we only have 4 tcs for example we still need to map all 16 priority values to a tc.
> 
> OK, anyway, all these '16' should be 'upgraded'.

Yes. I'll do this in the next version.

Thanks,
John.

>  
> Thanks,
> Jarek P.

Jarek Poplawski Jan. 3, 2011, 10:59 p.m. UTC | #9
On Mon, Jan 03, 2011 at 12:37:56PM -0800, John Fastabend wrote:
> On 1/3/2011 9:02 AM, Jarek Poplawski wrote:
> > On Sun, Jan 02, 2011 at 09:43:27PM -0800, John Fastabend wrote:
> >> On 12/30/2010 3:37 PM, Jarek Poplawski wrote:
> >>> John Fastabend wrote:
> >>>> This implements a mclass 'multi-class' queueing discipline that by
> >>>> default creates multiple mq qdisc's one for each traffic class. Each
> >>>> mq qdisc then owns a range of queues per the netdev_tc_txq mappings.
> >>>
> >>> Is it really necessary to add one more abstraction layer for this,
> >>> probably not most often used (or even asked by users), functionality?
> >>> Why mclass can't simply do these few things more instead of attaching
> >>> (and changing) mq?
> >>>
> >>
> >> The statistics work nicely when the mq qdisc is used. 
> > 
> > Well, I sometimes add leaf qdiscs only to get class stats with less
> > typing, too ;-)
> > 
> >>
> >> qdisc mclass 8002: root  tc 4 map 0 1 2 3 0 1 2 3 1 1 1 1 1 1 1 1
> >>              queues:(0:1) (2:3) (4:5) (6:15)
> >>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
> >>  backlog 0b 0p requeues 0
> >> qdisc mq 8003: parent 8002:1
> >>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
> >>  backlog 0b 0p requeues 0
> >> qdisc mq 8004: parent 8002:2
> >>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
> >>  backlog 0b 0p requeues 0
> >> qdisc mq 8005: parent 8002:3
> >>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
> >>  backlog 0b 0p requeues 0
> >> qdisc mq 8006: parent 8002:4
> >>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
> >>  backlog 0b 0p requeues 0
> >> qdisc sfq 8007: parent 8005:1 limit 127p quantum 1514b
> >>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
> >>  backlog 0b 0p requeues 0
> >> qdisc sfq 8008: parent 8005:2 limit 127p quantum 1514b
> >>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
> >>  backlog 0b 0p requeues 0
> >>
> >> The mclass gives the statistics for the interface and then statistics on the mq qdisc gives statistics for each traffic class. Also, when using the 'mq qdisc' with this abstraction other qdisc can be grafted onto the queue. For example the sch_sfq is used in the above example.
> > 
> > IMHO, these tc offsets and counts make simply two level hierarchy
> > (classes with leaf subclasses) similarly (or simpler) to other
> > classful qdisc which manage it all inside one module. Of course,
> > we could think of another way of code organization, but it should
> > be rather done at the beginning of schedulers design. The mq qdisc
> > broke the design a bit adding a fake root, but I doubt we should go
> > deeper unless it's necessary. Doing mclass (or something) as a more
> > complex alternative to mq should be enough. Why couldn't mclass graft
> > sch_sfq the same way as mq?
> > 
> 
> If you also want to graft a scheduler onto a traffic class now your stuck. For now this qdisc doesn't exist, but I would like to have a software implementation of the currently offloaded DCB ETS scheduler. The 802.1Qaz spec allows different scheduling algorithms to be used on each traffic class. In the current implementation mclass could graft these scheduling schemes onto each traffic class independently.
> 
>                               mclass
>                                 |
>     -------------------------------------------------------
>     |         |        |        |     |     |     |       |
>    mq_tbf   mq_tbf   mq_ets   mq_ets  mq    mq   mq_wrr greedy
>    |                            |
>  ---------                  ---------
>  |   |   |                  |   |   |
> red red red                red red red
> 
> Perhaps, being concerned with hypothetical qdiscs is a bit of a stretch but I would like to at least not preclude this work from being done in the future.

Probably, despite this very nice figure and description, I still miss
something and can't see the problem. If you graft a qdisc/scheduler
to a traffic class you can change the way/range of grafting depending
on additional parameters or even by checking some properties of the
grafted qdisc. My main concern is adding complexity to the qdisc tree
structure (instead of hiding it at the class level) for something,
IMHO, hardly ever popular (like both mq and DCB).

Thanks,
Jarek P.
John Fastabend Jan. 4, 2011, 12:18 a.m. UTC | #10
On 1/3/2011 2:59 PM, Jarek Poplawski wrote:
> On Mon, Jan 03, 2011 at 12:37:56PM -0800, John Fastabend wrote:
>> On 1/3/2011 9:02 AM, Jarek Poplawski wrote:
>>> On Sun, Jan 02, 2011 at 09:43:27PM -0800, John Fastabend wrote:
>>>> On 12/30/2010 3:37 PM, Jarek Poplawski wrote:
>>>>> John Fastabend wrote:
>>>>>> This implements a mclass 'multi-class' queueing discipline that by
>>>>>> default creates multiple mq qdisc's one for each traffic class. Each
>>>>>> mq qdisc then owns a range of queues per the netdev_tc_txq mappings.
>>>>>
>>>>> Is it really necessary to add one more abstraction layer for this,
>>>>> probably not most often used (or even asked by users), functionality?
>>>>> Why mclass can't simply do these few things more instead of attaching
>>>>> (and changing) mq?
>>>>>
>>>>
>>>> The statistics work nicely when the mq qdisc is used. 
>>>
>>> Well, I sometimes add leaf qdiscs only to get class stats with less
>>> typing, too ;-)
>>>
>>>>
>>>> qdisc mclass 8002: root  tc 4 map 0 1 2 3 0 1 2 3 1 1 1 1 1 1 1 1
>>>>              queues:(0:1) (2:3) (4:5) (6:15)
>>>>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>>>>  backlog 0b 0p requeues 0
>>>> qdisc mq 8003: parent 8002:1
>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>  backlog 0b 0p requeues 0
>>>> qdisc mq 8004: parent 8002:2
>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>  backlog 0b 0p requeues 0
>>>> qdisc mq 8005: parent 8002:3
>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>  backlog 0b 0p requeues 0
>>>> qdisc mq 8006: parent 8002:4
>>>>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>>>>  backlog 0b 0p requeues 0
>>>> qdisc sfq 8007: parent 8005:1 limit 127p quantum 1514b
>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>  backlog 0b 0p requeues 0
>>>> qdisc sfq 8008: parent 8005:2 limit 127p quantum 1514b
>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>  backlog 0b 0p requeues 0
>>>>
>>>> The mclass gives the statistics for the interface and then statistics on the mq qdisc gives statistics for each traffic class. Also, when using the 'mq qdisc' with this abstraction other qdisc can be grafted onto the queue. For example the sch_sfq is used in the above example.
>>>
>>> IMHO, these tc offsets and counts make simply two level hierarchy
>>> (classes with leaf subclasses) similarly (or simpler) to other
>>> classful qdisc which manage it all inside one module. Of course,
>>> we could think of another way of code organization, but it should
>>> be rather done at the beginning of schedulers design. The mq qdisc
>>> broke the design a bit adding a fake root, but I doubt we should go
>>> deeper unless it's necessary. Doing mclass (or something) as a more
>>> complex alternative to mq should be enough. Why couldn't mclass graft
>>> sch_sfq the same way as mq?
>>>
>>
>> If you also want to graft a scheduler onto a traffic class now your stuck. For now this qdisc doesn't exist, but I would like to have a software implementation of the currently offloaded DCB ETS scheduler. The 802.1Qaz spec allows different scheduling algorithms to be used on each traffic class. In the current implementation mclass could graft these scheduling schemes onto each traffic class independently.
>>
>>                               mclass
>>                                 |
>>     -------------------------------------------------------
>>     |         |        |        |     |     |     |       |
>>    mq_tbf   mq_tbf   mq_ets   mq_ets  mq    mq   mq_wrr greedy
>>    |                            |
>>  ---------                  ---------
>>  |   |   |                  |   |   |
>> red red red                red red red
>>
>> Perhaps, being concerned with hypothetical qdiscs is a bit of a stretch but I would like to at least not preclude this work from being done in the future.
> 
> Probably, despite this very nice figure and description, I still miss
> something and can't see the problem. If you graft a qdisc/scheduler
> to a traffic class you can change the way/range of grafting depending
> on additional parameters or even by checking some properties of the
> grafted qdisc. My main concern is adding complexity to the qdisc tree
> structure (instead of hiding it at the class level) for something,
> IMHO, hardly ever popular (like both mq and DCB).
> 

OK, I'm convinced; I'll keep everything contained in mclass. Building this mechanism into the qdisc tree seems to be adding extra complexity that, as you noted, is most likely not needed.

Although I suspect the "additional parameter" would be something along the lines of a queue index and offset, right? Otherwise, how would an mq-like qdisc know which queues it owns?

Thanks,
John.

John Fastabend Jan. 4, 2011, 2:59 a.m. UTC | #11
On 1/3/2011 4:18 PM, John Fastabend wrote:
> On 1/3/2011 2:59 PM, Jarek Poplawski wrote:
>> On Mon, Jan 03, 2011 at 12:37:56PM -0800, John Fastabend wrote:
>>> On 1/3/2011 9:02 AM, Jarek Poplawski wrote:
>>>> On Sun, Jan 02, 2011 at 09:43:27PM -0800, John Fastabend wrote:
>>>>> On 12/30/2010 3:37 PM, Jarek Poplawski wrote:
>>>>>> John Fastabend wrote:
>>>>>>> This implements a mclass 'multi-class' queueing discipline that by
>>>>>>> default creates multiple mq qdisc's one for each traffic class. Each
>>>>>>> mq qdisc then owns a range of queues per the netdev_tc_txq mappings.
>>>>>>
>>>>>> Is it really necessary to add one more abstraction layer for this,
>>>>>> probably not most often used (or even asked by users), functionality?
>>>>>> Why mclass can't simply do these few things more instead of attaching
>>>>>> (and changing) mq?
>>>>>>
>>>>>
>>>>> The statistics work nicely when the mq qdisc is used. 
>>>>
>>>> Well, I sometimes add leaf qdiscs only to get class stats with less
>>>> typing, too ;-)
>>>>
>>>>>
>>>>> qdisc mclass 8002: root  tc 4 map 0 1 2 3 0 1 2 3 1 1 1 1 1 1 1 1
>>>>>              queues:(0:1) (2:3) (4:5) (6:15)
>>>>>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>>>>>  backlog 0b 0p requeues 0
>>>>> qdisc mq 8003: parent 8002:1
>>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>>  backlog 0b 0p requeues 0
>>>>> qdisc mq 8004: parent 8002:2
>>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>>  backlog 0b 0p requeues 0
>>>>> qdisc mq 8005: parent 8002:3
>>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>>  backlog 0b 0p requeues 0
>>>>> qdisc mq 8006: parent 8002:4
>>>>>  Sent 140 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>>>>>  backlog 0b 0p requeues 0
>>>>> qdisc sfq 8007: parent 8005:1 limit 127p quantum 1514b
>>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>>  backlog 0b 0p requeues 0
>>>>> qdisc sfq 8008: parent 8005:2 limit 127p quantum 1514b
>>>>>  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>>>>>  backlog 0b 0p requeues 0
>>>>>
>>>>> The mclass gives the statistics for the interface and then statistics on the mq qdisc gives statistics for each traffic class. Also, when using the 'mq qdisc' with this abstraction other qdisc can be grafted onto the queue. For example the sch_sfq is used in the above example.
>>>>
>>>> IMHO, these tc offsets and counts make simply two level hierarchy
>>>> (classes with leaf subclasses) similarly (or simpler) to other
>>>> classful qdisc which manage it all inside one module. Of course,
>>>> we could think of another way of code organization, but it should
>>>> be rather done at the beginning of schedulers design. The mq qdisc
>>>> broke the design a bit adding a fake root, but I doubt we should go
>>>> deeper unless it's necessary. Doing mclass (or something) as a more
>>>> complex alternative to mq should be enough. Why couldn't mclass graft
>>>> sch_sfq the same way as mq?
>>>>
>>>
>>> If you also want to graft a scheduler onto a traffic class now your stuck. For now this qdisc doesn't exist, but I would like to have a software implementation of the currently offloaded DCB ETS scheduler. The 802.1Qaz spec allows different scheduling algorithms to be used on each traffic class. In the current implementation mclass could graft these scheduling schemes onto each traffic class independently.
>>>
>>>                               mclass
>>>                                 |
>>>     -------------------------------------------------------
>>>     |         |        |        |     |     |     |       |
>>>    mq_tbf   mq_tbf   mq_ets   mq_ets  mq    mq   mq_wrr greedy
>>>    |                            |
>>>  ---------                  ---------
>>>  |   |   |                  |   |   |
>>> red red red                red red red
>>>
>>> Perhaps, being concerned with hypothetical qdiscs is a bit of a stretch but I would like to at least not preclude this work from being done in the future.
>>
>> Probably, despite this very nice figure and description, I still miss
>> something and can't see the problem. If you graft a qdisc/scheduler
>> to a traffic class you can change the way/range of grafting depending
>> on additional parameters or even by checking some properties of the
>> grafted qdisc. My main concern is adding complexity to the qdisc tree
>> structure (instead of hiding it at the class level) for something,
>> IMHO, hardly ever popular (like both mq and DCB).
>>
> 
> OK I'm convinced I'll keep everything contained in mclass. Building this mechanism into the qdisc seems to be adding extra complexity that is most likely not needed as you noted.
> 
> Although I suspect the "additional parameter" would be something along the lines of a queue index and offset? right? Otherwise how would a mq like qdisc know which queues it owns.


Perhaps something could be done with qdisc_class_ops select_queue() to make it more flexible. When I get around to implementing these hypothetical qdiscs I will have to figure it out. For now, though, hypothetical qdiscs are not a very compelling use case.

Thanks,
John.

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 453b2d7..911185b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -764,6 +764,8 @@  struct netdev_tc_txq {
  * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
  *			  struct nlattr *port[]);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
+ *
+ * int (*ndo_setup_tc)(struct net_device *dev, int tc);
  */
 #define HAVE_NET_DEVICE_OPS
 struct net_device_ops {
@@ -822,6 +824,7 @@  struct net_device_ops {
 						   struct nlattr *port[]);
 	int			(*ndo_get_vf_port)(struct net_device *dev,
 						   int vf, struct sk_buff *skb);
+	int			(*ndo_setup_tc)(struct net_device *dev, u8 tc);
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	int			(*ndo_fcoe_enable)(struct net_device *dev);
 	int			(*ndo_fcoe_disable)(struct net_device *dev);
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 2cfa4bc..0134ed4 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -481,4 +481,13 @@  struct tc_drr_stats {
 	__u32	deficit;
 };
 
+/* MCLASS */
+struct tc_mclass_qopt {
+	__u8	num_tc;
+	__u8	prio_tc_map[16];
+	__u8	hw;
+	__u16	count[16];
+	__u16	offset[16];
+};
+
 #endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 0af57eb..723ee52 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -50,6 +50,7 @@  struct Qdisc {
 #define TCQ_F_INGRESS		4
 #define TCQ_F_CAN_BYPASS	8
 #define TCQ_F_MQROOT		16
+#define TCQ_F_MQSAFE		32
 #define TCQ_F_WARN_NONWC	(1 << 16)
 	int			padded;
 	struct Qdisc_ops	*ops;
@@ -276,6 +277,7 @@  extern struct Qdisc noop_qdisc;
 extern struct Qdisc_ops noop_qdisc_ops;
 extern struct Qdisc_ops pfifo_fast_ops;
 extern struct Qdisc_ops mq_qdisc_ops;
+extern struct Qdisc_ops mclass_qdisc_ops;
 
 struct Qdisc_class_common {
 	u32			classid;
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 960f5db..76dcf5b 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -2,7 +2,7 @@ 
 # Makefile for the Linux Traffic Control Unit.
 #
 
-obj-y	:= sch_generic.o sch_mq.o
+obj-y	:= sch_generic.o sch_mq.o sch_mclass.o
 
 obj-$(CONFIG_NET_SCHED)		+= sch_api.o sch_blackhole.o
 obj-$(CONFIG_NET_CLS)		+= cls_api.o
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b22ca2d..24f40e0 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1770,6 +1770,7 @@  static int __init pktsched_init(void)
 	register_qdisc(&bfifo_qdisc_ops);
 	register_qdisc(&pfifo_head_drop_qdisc_ops);
 	register_qdisc(&mq_qdisc_ops);
+	register_qdisc(&mclass_qdisc_ops);
 
 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 34dc598..1c86ea1 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -376,7 +376,7 @@  static struct netdev_queue noop_netdev_queue = {
 struct Qdisc noop_qdisc = {
 	.enqueue	=	noop_enqueue,
 	.dequeue	=	noop_dequeue,
-	.flags		=	TCQ_F_BUILTIN,
+	.flags		=	TCQ_F_BUILTIN | TCQ_F_MQSAFE,
 	.ops		=	&noop_qdisc_ops,
 	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
@@ -709,7 +709,13 @@  static void attach_default_qdiscs(struct net_device *dev)
 		dev->qdisc = txq->qdisc_sleeping;
 		atomic_inc(&dev->qdisc->refcnt);
 	} else {
-		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
+		if (dev->num_tc)
+			qdisc = qdisc_create_dflt(txq, &mclass_qdisc_ops,
+						  TC_H_ROOT);
+		else
+			qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops,
+						  TC_H_ROOT);
+
 		if (qdisc) {
 			qdisc->ops->attach(qdisc);
 			dev->qdisc = qdisc;
diff --git a/net/sched/sch_mclass.c b/net/sched/sch_mclass.c
new file mode 100644
index 0000000..444492a
--- /dev/null
+++ b/net/sched/sch_mclass.c
@@ -0,0 +1,376 @@ 
+/*
+ * net/sched/sch_mclass.c
+ *
+ * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct mclass_sched {
+	struct Qdisc		**qdiscs;
+	int hw_owned;
+};
+
+static void mclass_destroy(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mclass_sched *priv = qdisc_priv(sch);
+	unsigned int ntc;
+
+	if (!priv->qdiscs)
+		return;
+
+	for (ntc = 0; ntc < dev->num_tc && priv->qdiscs[ntc]; ntc++)
+		qdisc_destroy(priv->qdiscs[ntc]);
+
+	if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
+		dev->netdev_ops->ndo_setup_tc(dev, 0);
+	else
+		netdev_set_num_tc(dev, 0);
+
+	kfree(priv->qdiscs);
+}
+
+static int mclass_parse_opt(struct net_device *dev, struct tc_mclass_qopt *qopt)
+{
+	int i, j;
+
+	/* Verify TC offset and count are sane */
+	for (i = 0; i < qopt->num_tc; i++) {
+		int last = qopt->offset[i] + qopt->count[i];
+		if (last > dev->num_tx_queues)
+			return -EINVAL;
+		for (j = i + 1; j < qopt->num_tc; j++) {
+			if (last > qopt->offset[j])
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int mclass_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mclass_sched *priv = qdisc_priv(sch);
+	struct netdev_queue *dev_queue;
+	struct Qdisc *qdisc;
+	int i, err = -EOPNOTSUPP;
+	struct tc_mclass_qopt *qopt = NULL;
+
+	/* Unwind attributes on failure */
+	u8 unwnd_tc = dev->num_tc;
+	u8 unwnd_map[16];
+	struct netdev_tc_txq unwnd_txq[16];
+
+	if (sch->parent != TC_H_ROOT)
+		return -EOPNOTSUPP;
+
+	if (!netif_is_multiqueue(dev))
+		return -EOPNOTSUPP;
+
+	if (nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+	qopt = nla_data(opt);
+
+	memcpy(unwnd_map, dev->prio_tc_map, sizeof(unwnd_map));
+	memcpy(unwnd_txq, dev->tc_to_txq, sizeof(unwnd_txq));
+
+	/* If the mclass options indicate that hardware should own
+	 * the queue mapping then run ndo_setup_tc if this can not
+	 * be done fail immediately.
+	 */
+	if (qopt->hw && dev->netdev_ops->ndo_setup_tc) {
+		priv->hw_owned = 1;
+		if (dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc))
+			return -EINVAL;
+	} else if (!qopt->hw) {
+		if (mclass_parse_opt(dev, qopt))
+			return -EINVAL;
+
+		if (netdev_set_num_tc(dev, qopt->num_tc))
+			return -ENOMEM;
+
+		for (i = 0; i < qopt->num_tc; i++)
+			netdev_set_tc_queue(dev, i,
+					    qopt->count[i], qopt->offset[i]);
+	} else {
+		return -EINVAL;
+	}
+
+	/* Always use supplied priority mappings */
+	for (i = 0; i < 16; i++) {
+		if (netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i])) {
+			err = -EINVAL;
+			goto tc_err;
+		}
+	}
+
+	/* pre-allocate qdisc, attachment can't fail */
+	priv->qdiscs = kcalloc(qopt->num_tc,
+			       sizeof(priv->qdiscs[0]), GFP_KERNEL);
+	if (priv->qdiscs == NULL) {
+		err = -ENOMEM;
+		goto tc_err;
+	}
+
+	for (i = 0; i < dev->num_tc; i++) {
+		dev_queue = netdev_get_tx_queue(dev, dev->tc_to_txq[i].offset);
+		qdisc = qdisc_create_dflt(dev_queue, &mq_qdisc_ops,
+					  TC_H_MAKE(TC_H_MAJ(sch->handle),
+						    TC_H_MIN(i + 1)));
+		if (qdisc == NULL) {
+			err = -ENOMEM;
+			goto err;
+		}
+		qdisc->flags |= TCQ_F_CAN_BYPASS;
+		priv->qdiscs[i] = qdisc;
+	}
+
+	sch->flags |= TCQ_F_MQROOT;
+	return 0;
+
+err:
+	mclass_destroy(sch);
+tc_err:
+	if (priv->hw_owned)
+		dev->netdev_ops->ndo_setup_tc(dev, unwnd_tc);
+	else
+		netdev_set_num_tc(dev, unwnd_tc);
+
+	memcpy(dev->prio_tc_map, unwnd_map, sizeof(unwnd_map));
+	memcpy(dev->tc_to_txq, unwnd_txq, sizeof(unwnd_txq));
+
+	return err;
+}
+
+static void mclass_attach(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mclass_sched *priv = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	unsigned int ntc;
+
+	/* Attach underlying qdisc */
+	for (ntc = 0; ntc < dev->num_tc; ntc++) {
+		qdisc = priv->qdiscs[ntc];
+		if (qdisc->ops && qdisc->ops->attach)
+			qdisc->ops->attach(qdisc);
+	}
+}
+
+static int mclass_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+		    struct Qdisc **old)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mclass_sched *priv = qdisc_priv(sch);
+	unsigned long ntc = cl - 1;
+
+	if (ntc >= dev->num_tc || (new && !(new->flags & TCQ_F_MQSAFE)))
+		return -EINVAL;
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+
+	if (new == NULL)
+		new = &noop_qdisc;
+
+	*old = priv->qdiscs[ntc];
+	priv->qdiscs[ntc] = new;
+	qdisc_reset(*old);
+
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+
+	return 0;
+}
+
+static int mclass_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mclass_sched *priv = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tc_mclass_qopt opt;
+	struct Qdisc *qdisc;
+	unsigned int i;
+
+	sch->q.qlen = 0;
+	memset(&sch->bstats, 0, sizeof(sch->bstats));
+	memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+		spin_lock_bh(qdisc_lock(qdisc));
+		sch->q.qlen		+= qdisc->q.qlen;
+		sch->bstats.bytes	+= qdisc->bstats.bytes;
+		sch->bstats.packets	+= qdisc->bstats.packets;
+		sch->qstats.qlen	+= qdisc->qstats.qlen;
+		sch->qstats.backlog	+= qdisc->qstats.backlog;
+		sch->qstats.drops	+= qdisc->qstats.drops;
+		sch->qstats.requeues	+= qdisc->qstats.requeues;
+		sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+		spin_unlock_bh(qdisc_lock(qdisc));
+	}
+
+	opt.num_tc = dev->num_tc;
+	memcpy(opt.prio_tc_map, dev->prio_tc_map, 16);
+	opt.hw = priv->hw_owned;
+
+	for (i = 0; i < dev->num_tc; i++) {
+		opt.count[i] = dev->tc_to_txq[i].count;
+		opt.offset[i] = dev->tc_to_txq[i].offset;
+	}
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+	return skb->len;
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct Qdisc *mclass_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mclass_sched *priv = qdisc_priv(sch);
+	unsigned long ntc = cl - 1;
+
+	if (ntc >= dev->num_tc)
+		return NULL;
+	return priv->qdiscs[ntc];
+}
+
+static unsigned long mclass_get(struct Qdisc *sch, u32 classid)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned int ntc = TC_H_MIN(classid);
+
+	if (ntc >= dev->num_tc)
+		return 0;
+	return ntc;
+}
+
+static void mclass_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static int mclass_dump_class(struct Qdisc *sch, unsigned long cl,
+			 struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct Qdisc *class;
+	struct net_device *dev = qdisc_dev(sch);
+	struct mclass_sched *priv = qdisc_priv(sch);
+	unsigned long ntc = cl - 1;
+
+	if (ntc >= dev->num_tc)
+		return -EINVAL;
+
+	class = priv->qdiscs[ntc];
+
+	tcm->tcm_parent = TC_H_ROOT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	tcm->tcm_info = class->handle;
+	return 0;
+}
+
+static int mclass_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+			       struct gnet_dump *d)
+{
+	struct Qdisc *class, *qdisc;
+	struct net_device *dev = qdisc_dev(sch);
+	struct mclass_sched *priv = qdisc_priv(sch);
+	unsigned long ntc = cl - 1;
+	unsigned int i;
+	u16 count, offset;
+
+	if (ntc >= dev->num_tc)
+		return -EINVAL;
+
+	class = priv->qdiscs[ntc];
+	count = dev->tc_to_txq[ntc].count;
+	offset = dev->tc_to_txq[ntc].offset;
+
+	memset(&class->bstats, 0, sizeof(class->bstats));
+	memset(&class->qstats, 0, sizeof(class->qstats));
+
+	/* Drop lock here it will be reclaimed before touching statistics
+	 * this is required because the qdisc_root_sleeping_lock we hold
+	 * here is the look on dev_queue->qdisc_sleeping also acquired
+	 * below.
+	 */
+	spin_unlock_bh(d->lock);
+
+	for (i = offset; i < offset + count; i++) {
+		qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+		spin_lock_bh(qdisc_lock(qdisc));
+		class->q.qlen		 += qdisc->q.qlen;
+		class->bstats.bytes	 += qdisc->bstats.bytes;
+		class->bstats.packets	 += qdisc->bstats.packets;
+		class->qstats.qlen	 += qdisc->qstats.qlen;
+		class->qstats.backlog	 += qdisc->qstats.backlog;
+		class->qstats.drops	 += qdisc->qstats.drops;
+		class->qstats.requeues	 += qdisc->qstats.requeues;
+		class->qstats.overlimits += qdisc->qstats.overlimits;
+		spin_unlock_bh(qdisc_lock(qdisc));
+	}
+
+	/* Reclaim root sleeping lock before completing stats */
+	spin_lock_bh(d->lock);
+
+	class->qstats.qlen = class->q.qlen;
+	if (gnet_stats_copy_basic(d, &class->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, &class->qstats) < 0)
+		return -1;
+	return 0;
+}
+
+static void mclass_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntc;
+
+	if (arg->stop)
+		return;
+
+	arg->count = arg->skip;
+	for (ntc = arg->skip; ntc < dev->num_tc; ntc++) {
+		if (arg->fn(sch, ntc + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops mclass_class_ops = {
+	.graft		= mclass_graft,
+	.leaf		= mclass_leaf,
+	.get		= mclass_get,
+	.put		= mclass_put,
+	.walk		= mclass_walk,
+	.dump		= mclass_dump_class,
+	.dump_stats	= mclass_dump_class_stats,
+};
+
+struct Qdisc_ops mclass_qdisc_ops __read_mostly = {
+	.cl_ops		= &mclass_class_ops,
+	.id		= "mclass",
+	.priv_size	= sizeof(struct mclass_sched),
+	.init		= mclass_init,
+	.destroy	= mclass_destroy,
+	.attach		= mclass_attach,
+	.dump		= mclass_dump,
+	.owner		= THIS_MODULE,
+};
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 86da74c..886cfac 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -86,6 +86,9 @@  static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 
 	if (!priv->num_tc)
 		sch->flags |= TCQ_F_MQROOT;
+	else
+		sch->flags |= TCQ_F_MQSAFE;
+
 	return 0;
 
 err: