[RFC] Don't run __qdisc_run() on a stopped TX queue

Message ID 20090720074607.32463.23013.sendpatchset@localhost.localdomain
State Accepted, archived
Delegated to: David Miller

Commit Message

Krishna Kumar July 20, 2009, 7:46 a.m. UTC
From: Krishna Kumar <krkumar2@in.ibm.com>

The driver sends an skb and stops the queue if no more space is
available on the device, then returns OK. When dev_queue_xmit
runs for the next skb, it enqueues the skb and calls qdisc_run.
qdisc_run dequeues the skb and finds the queue is stopped
after getting HARD_TX_LOCK, and puts the skb back on
gso_skb (the next iteration fails faster, at dequeue_skb).

This patch avoids calling __qdisc_run if the queue is stopped.
This also lets us remove the queue_stopped check in dequeue_skb
and in qdisc_restart. When the queue is re-enabled, it runs with
one less queue_stopped() check for every skb on the queue (we
still need to own the __QDISC_STATE_RUNNING bit to catch the
stopped condition correctly since stopped cannot be set without
holding this bit).

Results of 3-iteration testing with 1, 4, 8 and 12 netperf sessions,
each running for 1 minute:

-----------------------------------------------------------------
	       System-X			         P6
-----------------------------------------------------------------
Size	ORG BW		NEW BW		ORG BW		NEW BW
-----------------------------------------------------------------
16K	72183.05	74417.79	60775.76	63413.17
128K	69097.87	72447.12	61692.16	62251.07
256K	60456.21	61415.81	59694.03	61641.81
-----------------------------------------------------------------

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---

 include/net/pkt_sched.h |    9 +++++++--
 net/sched/sch_generic.c |    9 ++-------
 2 files changed, 9 insertions(+), 9 deletions(-)


Comments

Jarek Poplawski July 24, 2009, 8:30 a.m. UTC | #1
On 20-07-2009 09:46, Krishna Kumar wrote:
> From: Krishna Kumar <krkumar2@in.ibm.com>
> 
> Driver sends an skb and stops the queue if no more space is
> available on the device, and returns OK. When dev_queue_xmit
> runs for the next skb, it enqueue's the skb & calls qdisc_run.
> qdisc_run dequeue's the skb and finds the queue is stopped
> after getting the HARD_LOCK_TX, and puts the skb back on
> gso_skb (the next iteration fails faster at dequeue_skb).
> 
> This patch avoids calling __qdisc_run if the queue is stopped.
> This also lets us remove the queue_stopped check in dequeue_skb
> and in qdisc_restart.

It was designed mainly wrt. multiqueue handling, where such a check
isn't enough at this place.

Jarek P.

> When the queue is re-enabled, it runs with
> one less queue_stopped() check for every skb on the queue (we
> still need to own the __QDISC_STATE_RUNNING bit to catch the
> stopped condition correctly since stopped cannot be set without
> holding this bit).
> 
> Results for 3 iteration testing for 1, 4, 8, 12 netperf sessions
> running for 1 minute each:
> 
> -----------------------------------------------------------------
> 	       System-X			         P6
> -----------------------------------------------------------------
> Size	ORG BW		NEW BW		ORG BW		NEW BW
> -----------------------------------------------------------------
> 16K	72183.05	74417.79	60775.76	63413.17
> 128K	69097.87	72447.12	61692.16	62251.07
> 256K	60456.21	61415.81	59694.03	61641.81
> -----------------------------------------------------------------
> 
> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> ---
> 
>  include/net/pkt_sched.h |    9 +++++++--
>  net/sched/sch_generic.c |    9 ++-------
>  2 files changed, 9 insertions(+), 9 deletions(-)
> 
> diff -ruNp org/include/net/pkt_sched.h new/include/net/pkt_sched.h
> --- org/include/net/pkt_sched.h	2009-06-22 11:40:31.000000000 +0530
> +++ new/include/net/pkt_sched.h	2009-07-20 13:09:18.000000000 +0530
> @@ -92,8 +92,13 @@ extern void __qdisc_run(struct Qdisc *q)
>  
>  static inline void qdisc_run(struct Qdisc *q)
>  {
> -	if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
> -		__qdisc_run(q);
> +	if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
> +		if (!netif_tx_queue_stopped(q->dev_queue))
> +			__qdisc_run(q);
> +		else
> +			/* driver will wake us up anyway, just clear */
> +			clear_bit(__QDISC_STATE_RUNNING, &q->state);
> +	}
>  }
>  
>  extern int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
> diff -ruNp org/net/sched/sch_generic.c new/net/sched/sch_generic.c
> --- org/net/sched/sch_generic.c	2009-05-25 07:48:07.000000000 +0530
> +++ new/net/sched/sch_generic.c	2009-07-20 13:09:18.000000000 +0530
> @@ -56,12 +56,8 @@ static inline struct sk_buff *dequeue_sk
>  	struct sk_buff *skb = q->gso_skb;
>  
>  	if (unlikely(skb)) {
> -		struct net_device *dev = qdisc_dev(q);
> -		struct netdev_queue *txq;
> -
>  		/* check the reason of requeuing without tx lock first */
> -		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
> -		if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
> +		if (!netif_tx_queue_frozen(q->dev_queue))
>  			q->gso_skb = NULL;
>  		else
>  			skb = NULL;
> @@ -142,8 +138,7 @@ static inline int qdisc_restart(struct Q
>  	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
>  
>  	HARD_TX_LOCK(dev, txq, smp_processor_id());
> -	if (!netif_tx_queue_stopped(txq) &&
> -	    !netif_tx_queue_frozen(txq))
> +	if (!netif_tx_queue_frozen(txq))
>  		ret = dev_hard_start_xmit(skb, dev, txq);
>  	HARD_TX_UNLOCK(dev, txq);
>  
Herbert Xu July 24, 2009, 8:37 a.m. UTC | #2
On Fri, Jul 24, 2009 at 08:30:06AM +0000, Jarek Poplawski wrote:
> On 20-07-2009 09:46, Krishna Kumar wrote:
> > From: Krishna Kumar <krkumar2@in.ibm.com>
> > 
> > Driver sends an skb and stops the queue if no more space is
> > available on the device, and returns OK. When dev_queue_xmit
> > runs for the next skb, it enqueue's the skb & calls qdisc_run.
> > qdisc_run dequeue's the skb and finds the queue is stopped
> > after getting the HARD_LOCK_TX, and puts the skb back on
> > gso_skb (the next iteration fails faster at dequeue_skb).
> > 
> > This patch avoids calling __qdisc_run if the queue is stopped.
> > This also lets us remove the queue_stopped check in dequeue_skb
> > and in qdisc_restart.
> 
> It was designed wrt. multiqueue handling mainly, were such a check
> isn't enough at this place.

Well multiqueue TX was designed so that each CPU could have its
own queue.  While this isn't always the case due to resource
constraints and legacy applications, we should optimise for the
case where each CPU is using its own TX queue.

Of course we should still maintain correctness should the unexpected
occur.

So while I agree with Krishna's patch in principle, someone needs to
make sure that it still does the right thing (i.e., it doesn't
introduce potential starvation) if one CPU transmits to someone
else's queue.

Cheers,
Krishna Kumar July 24, 2009, 10:31 a.m. UTC | #3
> Herbert Xu <herbert@gondor.apana.org.au> wrote on 07/24/2009 02:07:49 PM:
> On Fri, Jul 24, 2009 at 08:30:06AM +0000, Jarek Poplawski wrote:
> > On 20-07-2009 09:46, Krishna Kumar wrote:
> > > From: Krishna Kumar <krkumar2@in.ibm.com>
> > >
> > > This patch avoids calling __qdisc_run if the queue is stopped.
> > > This also lets us remove the queue_stopped check in dequeue_skb
> > > and in qdisc_restart.
> >
> > It was designed wrt. multiqueue handling mainly, were such a check
> > isn't enough at this place.
>
> Well multiqueue TX was designed so that each CPU could have its
> own queue.  While this isn't always the case due to resource
> constraints and legacy applications, we should optimise for the
> case where each CPU is using its own TX queue.
>
> Of course we should still maintain correctness should the unexpected
> occur.
>
> So while I agree with Krishna's patch in principle, someone needs to
> make sure that it still does the right thing (i.e., it doesn't
> introduce potential starvation) if one CPU transmits to someone
> else's queue.

Hi Herbert,

Assuming many CPUs share a queue, only one can xmit due to the
RUNNING bit. And after the RUNNING bit is taken, no other CPU can
stop the queue. So the only change in behavior with this
patch is that the xmit is terminated a little earlier compared
to the current code. In the case of a stopped queue, the patch helps
a little more by removing one stopped check for each queued
skb, including those skbs that are added later while the current
xmit session (qdisc_run) is ongoing.

I hope I have addressed your concern?

Thanks,

- KK

Eric Dumazet July 24, 2009, 12:46 p.m. UTC | #4
Krishna Kumar a écrit :
> From: Krishna Kumar <krkumar2@in.ibm.com>
> diff -ruNp org/net/sched/sch_generic.c new/net/sched/sch_generic.c
> --- org/net/sched/sch_generic.c	2009-05-25 07:48:07.000000000 +0530
> +++ new/net/sched/sch_generic.c	2009-07-20 13:09:18.000000000 +0530
> @@ -56,12 +56,8 @@ static inline struct sk_buff *dequeue_sk
>  	struct sk_buff *skb = q->gso_skb;
>  
>  	if (unlikely(skb)) {
> -		struct net_device *dev = qdisc_dev(q);
> -		struct netdev_queue *txq;
> -
>  		/* check the reason of requeuing without tx lock first */
> -		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
> -		if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
> +		if (!netif_tx_queue_frozen(q->dev_queue))
>  			q->gso_skb = NULL;

This part might be a separate patch...

Are we sure these are equivalent ?

OLD)
	struct net_device *dev = qdisc_dev(q);
	struct netdev_queue *txq;
	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

NEW)
	struct netdev_queue *txq = q->dev_queue;

Herbert Xu July 24, 2009, 2:54 p.m. UTC | #5
On Fri, Jul 24, 2009 at 02:46:06PM +0200, Eric Dumazet wrote:
> Krishna Kumar a écrit :
> > From: Krishna Kumar <krkumar2@in.ibm.com>
> > diff -ruNp org/net/sched/sch_generic.c new/net/sched/sch_generic.c
> > --- org/net/sched/sch_generic.c	2009-05-25 07:48:07.000000000 +0530
> > +++ new/net/sched/sch_generic.c	2009-07-20 13:09:18.000000000 +0530
> > @@ -56,12 +56,8 @@ static inline struct sk_buff *dequeue_sk
> >  	struct sk_buff *skb = q->gso_skb;
> >  
> >  	if (unlikely(skb)) {
> > -		struct net_device *dev = qdisc_dev(q);
> > -		struct netdev_queue *txq;
> > -
> >  		/* check the reason of requeuing without tx lock first */
> > -		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
> > -		if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
> > +		if (!netif_tx_queue_frozen(q->dev_queue))
> >  			q->gso_skb = NULL;
> 
> This part might be a separate patch...
> 
> Are we sure these are equivalent ?

I think the only thing that can change this is act_skbedit.  But
really any queue that changes the skb queue mapping should physically
move the packet to that qdisc.

Cheers,
Herbert Xu July 25, 2009, 3:24 a.m. UTC | #6
On Fri, Jul 24, 2009 at 04:01:15PM +0530, Krishna Kumar2 wrote:
>
> Assuming many CPU's share a queue, only one can xmit due to the
> RUNNING bit. And after RUNNING bit is taken, no other cpu can
> stop the queue. So the only change in the behavior with this
> patch is that the xmit is terminated a little earlier compared
> to the current code. In case of a stopped queue, the patch helps
> a little bit more by removing one stopped check for each queue'd
> skb, including those skbs that are added later while the current
> xmit session (qdisc_run) is ongoing.
> 
> I hope I have addressed your concern?

That got me to actually look at your patch :)

You're essentially reverting f4ab543201992fe499bef5c406e09f23aa97b4d5,
which cannot be right since the same problem still exists.

However, I am definitely with you in that we should perform this
optimisation since it makes sense for the majority of people who
use multiqueue TX.

So the fact that our current architecture penalises the people
who actually need multiqueue TX in order to ensure correctness
for the people who cannot use multiqueue TX effectively (i.e.,
those who use non-default qdiscs) makes me uneasy.

Dave, remember our discussion about the benefits of using multiqueue
TX just for the sake of enlarging the TX queues? How about just
going back to using a single queue for non-default qdiscs (at
least until such a time when non-default qdiscs start doing
multiple queues internally)?

Yes it would mean potentially smaller queues for those non-default
qdisc users, but they're usually the same people who want the
hardware to queue as little as possible in order to enforce whatever
it is that their qdisc is designed to enforce.

Cheers,
Jarek Poplawski July 25, 2009, 11:04 a.m. UTC | #7
On Sat, Jul 25, 2009 at 11:24:36AM +0800, Herbert Xu wrote:
...
> However, I am definitely with you in that we should perform this
> optimisation since it makes sense for the majority of people who
> use multiqueue TX.
> 
> So the fact that our current architecture penalises the people
> who actually need multiqueue TX in order to ensure correctness
> for the people who cannot use multiqueue TX effectively (i.e.,
> those who use non-default qdiscs) makes me uneasy.

It would be nice to establish what difference is made by each part of
this patch. The check for stopped queue before each skb xmit was done
earlier too, so it's not exactly multiqueue penalization.

Cheers,
Jarek P.
Herbert Xu July 25, 2009, 11:12 a.m. UTC | #8
On Sat, Jul 25, 2009 at 01:04:09PM +0200, Jarek Poplawski wrote:
> 
> It would be nice to establish what difference is made by each part of
> this patch. The check for stopped queue before each skb xmit was done
> earlier too, so it's not exactly multiqueue penalization.

No, but getting into qdisc_restart when we know that the queue is
stopped is just silly.  The only reason that is done right now is
because of the pathological case where a packet on qdisc X maps to
hardware queue Y.  This is something which almost nobody uses and
yet it penalises everybody.

Cheers,
Jarek Poplawski July 25, 2009, 11:53 a.m. UTC | #9
On Sat, Jul 25, 2009 at 07:12:07PM +0800, Herbert Xu wrote:
> On Sat, Jul 25, 2009 at 01:04:09PM +0200, Jarek Poplawski wrote:
> > 
> > It would be nice to establish what difference is made by each part of
> > this patch. The check for stopped queue before each skb xmit was done
> > earlier too, so it's not exactly multiqueue penalization.
> 
> No, but getting into qdisc_restart when we know that the queue is
> stopped is just silly.  The only reason that is done right now is
> because of the pathological case where a packet on qdisc X maps to
> hardware queue Y.  This is something which almost nobody uses and
> yet it penalises everybody.

Hmm... I agree with you 100%! (I'd only like to see more digits... ;-)

Cheers,
Jarek P.
David Miller July 28, 2009, 2:28 a.m. UTC | #10
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 25 Jul 2009 11:24:36 +0800

> Dave, remember our discussion about the benefits of using multiqueue
> TX just for the sake of enarlging the TX queues? How about just
> going back to using a single queue for non-default qdiscs (at
> least until such a time when non-default qdiscs start doing
> multiple queues internally)?
> 
> Yes it would mean potentially smaller queues for those non-default
> qdisc users, but they're usually the same people who want the
> hardware to queue as little as possible in order to enforce whatever
> it is that their qdisc is designed to enforce.

There is a locking benefit even for non-default qdiscs.

Instead of two choke points (qdisc lock and queue lock) there
is now only one (qdisc lock), and considering the cost of
things like setting up IOMMU mappings and hitting chip
registers, the qdisc lock is the shorter held of the two.

So going to one queue would be a serious regression.
Herbert Xu July 28, 2009, 2:48 a.m. UTC | #11
On Mon, Jul 27, 2009 at 07:28:44PM -0700, David Miller wrote:
>
> There is a locking benefit even for non-default qdiscs.
> 
> Instead of two choke points (qdisc lock and queue lock) there
> is now only one (qdisc lock) and consdiering the cost of
> things like setting up IOMMU mappings and hitting chip
> registers the qdisc lock is the shortest held of the two.

But only one CPU can process a given qdisc at one time so I don't
see why there is a second choke point if you use a single queue
with a non-default qdisc.

Cheers,
David Miller July 28, 2009, 4:21 a.m. UTC | #12
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 28 Jul 2009 10:48:13 +0800

> On Mon, Jul 27, 2009 at 07:28:44PM -0700, David Miller wrote:
>>
>> There is a locking benefit even for non-default qdiscs.
>> 
>> Instead of two choke points (qdisc lock and queue lock) there
>> is now only one (qdisc lock) and consdiering the cost of
>> things like setting up IOMMU mappings and hitting chip
>> registers the qdisc lock is the shortest held of the two.
> 
> But only one CPU can process a given qdisc at one time so I don't
> see why there is a second choke point if you use a single queue
> with a non-default qdisc.

Good point, but this only suggests that we might want to undo that
queue runner exclusivity state bit for this case especially when we
know that we are feeding a multiqueue device.
Jarek Poplawski July 28, 2009, 6:43 a.m. UTC | #13
On Mon, Jul 27, 2009 at 09:21:07PM -0700, David Miller wrote:
> From: Herbert Xu <herbert@gondor.apana.org.au>
> Date: Tue, 28 Jul 2009 10:48:13 +0800
> 
> > On Mon, Jul 27, 2009 at 07:28:44PM -0700, David Miller wrote:
> >>
> >> There is a locking benefit even for non-default qdiscs.
> >> 
> >> Instead of two choke points (qdisc lock and queue lock) there
> >> is now only one (qdisc lock) and consdiering the cost of
> >> things like setting up IOMMU mappings and hitting chip
> >> registers the qdisc lock is the shortest held of the two.
> > 
> > But only one CPU can process a given qdisc at one time so I don't
> > see why there is a second choke point if you use a single queue
> > with a non-default qdisc.
> 
> Good point, but this only suggests that we might want to undo that
> queue runner exclusivity state bit for this case especially when we
> know that we are feeding a multiqueue device.

I guess the main sacrifice of Herbert's idea is sch_multiq, and we
could probably consider some compromise like making qdisc_run (with
qdisc_restart) a callback, or only adding an "if default qdisc" check,
depending on costs/gains.

Jarek P.
Herbert Xu July 28, 2009, 7:12 a.m. UTC | #14
On Mon, Jul 27, 2009 at 09:21:07PM -0700, David Miller wrote:
>
> Good point, but this only suggests that we might want to undo that
> queue runner exclusivity state bit for this case especially when we
> know that we are feeding a multiqueue device.

I'm not convinced that would generate a positive outcome.  If
you remove the exclusivity and go back to taking two locks in
turn (the qdisc lock and the xmit lock) then the only case where
you might win is if multiple CPUs perform qdisc_run at the same
time.

However, in that case you'll now have two locks bouncing around
instead of one and my guess would be that the cache overhead
would offset any gain that is made from the parallel processing.

But I think the fundamental question is should we treat the scenario
where each CPU is transmitting to its own TX queue as the ideal
and optimise for that over the case where the CPUs are transmitting
more or less randomly into the queues.

Cheers,
Herbert Xu July 28, 2009, 8:03 a.m. UTC | #15
On Tue, Jul 28, 2009 at 06:43:19AM +0000, Jarek Poplawski wrote:
>
> I guess the main sacrifice of Herbert's idea is sch_multiq, and we
> could probably consider some compromise like doing qdisc_run (with
> qdisc_restart) a callback or only adding a "if default qdisc" check,
> depending on costs/gains.

Indeed, getting rid of multiple queues for all non-default qdiscs
is overly drastic.  Here's a different plan.  We add the proposed
check only for the default qdisc (we still need to fix act_skbedit,
though) while all non-default qdiscs continue to behave as they
currently do.

This can be done by setting dev_queue for non-default qdiscs to
NULL and checking that before we do qdisc_run:

	if (!dev_queue || !queue_stopped(dev_queue))
		qdisc_run

This makes sense as non-default qdiscs do not map bijectively onto
dev queues so they shouldn't really have a pointer to a dev queue.

Of course we'll need to fix those places that use dev_queue for
other purposes such as getting back to the device.  If that is
too much work we could just add a new field that is identical
to dev_queue for the default qdisc but is NULL for non-default
qdiscs.
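
For concreteness, a minimal sketch of that check, written on top of
the qdisc_run() from this patch (the NULL dev_queue convention for
non-default qdiscs is only the proposal above, not current behaviour):

static inline void qdisc_run(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
		/* Non-default qdiscs would have q->dev_queue == NULL
		 * under this proposal, so they keep today's behaviour.
		 */
		if (!q->dev_queue || !netif_tx_queue_stopped(q->dev_queue))
			__qdisc_run(q);
		else
			/* driver will wake the queue up; just drop RUNNING */
			clear_bit(__QDISC_STATE_RUNNING, &q->state);
	}
}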

Cheers,
Jarek Poplawski July 28, 2009, 8:37 a.m. UTC | #16
On Tue, Jul 28, 2009 at 04:03:54PM +0800, Herbert Xu wrote:
> On Tue, Jul 28, 2009 at 06:43:19AM +0000, Jarek Poplawski wrote:
> >
> > I guess the main sacrifice of Herbert's idea is sch_multiq, and we
> > could probably consider some compromise like doing qdisc_run (with
> > qdisc_restart) a callback or only adding a "if default qdisc" check,
> > depending on costs/gains.
> 
> Indeed, getting rid of multiple queues for all non-default qdiscs
> is overly drastic.  Here's a different plan.  We add the proposed
> check only for the default qdisc (we still need to fix act_skbedit)
> though) while all non-default qdiscs continue to behave as they
> currently do.
> 
> This can be done by setting dev_queue for non-default qdiscs to
> NULL and checking that before we do qdisc_run:
> 
> 	if (!dev_queue || !queue_stopped(dev_queue))
> 		qdisc_run
> 
> This makes sense as non-default qdiscs do not map bijectively onto
> dev queues so they shouldn't really have a pointer to a dev queue.
> 
> Of course we'll need to fix those places that use dev_queue for
> other purposes such as getting back to the device.  If that is
> too much work we could just add a new field that is identical
> to dev_queue for the default qdisc but is NULL for non-default
> qdiscs.

Hmm.. would something IMHO_more_readable/less_fixing like:
(qdisc->flags & TCQ_F_DEFAULT) be too much? (Btw, IIRC there was an
idea a long time ago to treat all queues equally.)

Cheers,
Jarek P.
Herbert Xu July 28, 2009, 8:44 a.m. UTC | #17
On Tue, Jul 28, 2009 at 08:37:13AM +0000, Jarek Poplawski wrote:
>
> Hmm.. would something IMHO_more_readable/less_fixing like:
> (qdisc->flags & TCQ_F_DEFAULT) be too much? (Btw, IIRC there was an
> idea long time ago to treat all queues equally.)

Well the fact that as it stands the only true multiqueue qdisc
is the default is a mere coincidence.  There is no fundamental
reason why non-default qdiscs cannot be made multiqueue aware,
well, for some of them anyway.

For example, sfq can naturally be enhanced as a multiqueue qdisc
just like the default qdisc.

So I don't think we want to prejudice what a future non-default
qdisc may support.

Cheers,
Jarek Poplawski July 28, 2009, 9:02 a.m. UTC | #18
On Tue, Jul 28, 2009 at 04:44:51PM +0800, Herbert Xu wrote:
> On Tue, Jul 28, 2009 at 08:37:13AM +0000, Jarek Poplawski wrote:
> >
> > Hmm.. would something IMHO_more_readable/less_fixing like:
> > (qdisc->flags & TCQ_F_DEFAULT) be too much? (Btw, IIRC there was an
> > idea long time ago to treat all queues equally.)
> 
> Well the fact that as it stands the only true multiqueue qdisc
> is the default is a mere coincidence.  There is no fundamental
> reason why non-default qdiscs cannot be made multiqueue aware,
> well, for some of them anyway.
> 
> For example, sfq can naturally be enhanced as a multiqueue qdisc
> just like the default qdisc.
> 
> So I don't think we want to prejudice what a future non-default
> qdisc may support.

If it's only about the name I'm OK with: (qdisc->flags &
TCQ_F_DEFAULT_WITHOUT_PREJUDICE_MAYBE_NON_DEFAULT_IN_THE_FUTURE)
or any other, too ;-)

Cheers,
Jarek P.
Herbert Xu July 28, 2009, 10:53 a.m. UTC | #19
On Tue, Jul 28, 2009 at 09:02:29AM +0000, Jarek Poplawski wrote:
>
> If it's only about the name I'm OK with: (qdisc->flags &
> TCQ_F_DEFAULT_WITHOUT_PREJUDICE_MAYBE_NON_DEFAULT_IN_THE_FUTURE)
> or any other, too ;-)

I see.  Yes a flag should work.

Thanks,
Jarek Poplawski July 28, 2009, 11:24 a.m. UTC | #20
On Tue, Jul 28, 2009 at 06:53:29PM +0800, Herbert Xu wrote:
> On Tue, Jul 28, 2009 at 09:02:29AM +0000, Jarek Poplawski wrote:
> >
> > If it's only about the name I'm OK with: (qdisc->flags &
> > TCQ_F_DEFAULT_WITHOUT_PREJUDICE_MAYBE_NON_DEFAULT_IN_THE_FUTURE)
> > or any other, too ;-)
> 
> I see.  Yes a flag should work.

So, I hope Krishna will try if these requested comments save any gains
of the original patch...

Thanks,
Jarek P.
Krishna Kumar July 28, 2009, 11:43 a.m. UTC | #21
Sure, I will test and give my findings here. I guess setting the
flag in qdisc_create_dflt is the right place - that will cover pfifo,
bfifo and pfifo_fast (instead of attach_one_default_qdisc)?

Thanks for all the suggestions.

- KK

Jarek Poplawski <jarkao2@gmail.com> wrote on 07/28/2009 04:54:45 PM:

> On Tue, Jul 28, 2009 at 06:53:29PM +0800, Herbert Xu wrote:
> > On Tue, Jul 28, 2009 at 09:02:29AM +0000, Jarek Poplawski wrote:
> > >
> > > If it's only about the name I'm OK with: (qdisc->flags &
> > > TCQ_F_DEFAULT_WITHOUT_PREJUDICE_MAYBE_NON_DEFAULT_IN_THE_FUTURE)
> > > or any other, too ;-)
> >
> > I see.  Yes a flag should work.
>
> So, I hope Krishna will try if these requested comments save any gains
> of the original patch...
>
> Thanks,
> Jarek P.

Herbert Xu July 28, 2009, 11:48 a.m. UTC | #22
On Tue, Jul 28, 2009 at 05:13:45PM +0530, Krishna Kumar2 wrote:
> Sure, I will test and give my findings here. I guess setting the
> flag in qdisc_create_dflt is the right place - that will cover pfifo,
> bfifo and pfifo_fast (instead of attach_one_default_qdisc)?

No I think you want to put it in attach_one_default_qdisc.

Despite its name, qdisc_create_dflt is used by non-default qdiscs
too.

Cheers,
Krishna Kumar July 28, 2009, 12:24 p.m. UTC | #23
Hi Herbert,

Herbert Xu <herbert@gondor.apana.org.au> wrote on 07/28/2009 05:18:03 PM:
>
> Re: [RFC] [PATCH] Don't run __qdisc_run() on a stopped TX queue
>
> On Tue, Jul 28, 2009 at 05:13:45PM +0530, Krishna Kumar2 wrote:
> > Sure, I will test and give my findings here. I guess setting the
> > flag in qdisc_create_dflt is the right place - that will cover pfifo,
> > bfifo and pfifo_fast (instead of attach_one_default_qdisc)?
>
> No I think you want to put it in attach_one_default_qdisc.
>
> Despite its name, qdisc_create_dflt is used by non-default qdiscs
> too.

Right, thanks for pointing this out.

BTW, I am testing another patch that does "Do not enqueue skb for
default qdisc if there are no skbs already on the queue" (to avoid
enqueue followed by immediate dequeue of the same skb for most xmit
operations on default qdiscs). I am planning to use the same flag
for both. I hope there is nothing wrong with that idea (excuse me
if this has already been discussed on the list but I haven't been
following for almost a year).
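
Very roughly, that would add a fast path along these lines (the flag
and helper names here are made up; owning __QDISC_STATE_RUNNING is
needed so a bypassed skb cannot overtake packets another CPU is still
flushing from the qdisc):

/* Hypothetical helper for dev_queue_xmit(): if the default qdisc is
 * empty and nobody else is transmitting, hand the skb straight to the
 * driver instead of enqueueing it only to dequeue it again at once.
 */
static int try_qdisc_bypass(struct sk_buff *skb, struct Qdisc *q,
			    struct net_device *dev, struct netdev_queue *txq)
{
	int ret = NETDEV_TX_BUSY;

	if (!(q->flags & TCQ_F_DEFAULT) || q->q.qlen ||
	    test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
		return -1;	/* fall back to q->enqueue() + qdisc_run() */

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
		ret = dev_hard_start_xmit(skb, dev, txq);
	HARD_TX_UNLOCK(dev, txq);

	clear_bit(__QDISC_STATE_RUNNING, &q->state);

	/* on NETDEV_TX_BUSY the caller would still have to enqueue skb */
	return ret;
}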

Thanks,

- KK

David Miller July 28, 2009, 7:59 p.m. UTC | #24
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 28 Jul 2009 15:12:47 +0800

> However, in that case you'll now have two locks bouncing around
> instead of one and my guess would be that the cache overhead
> would offset any gain that is made from the parallel processing.

The premise is that there'd be only one.  The qdisc lock.

If the traffic is distributed, flow wise, the driver XMIT
lock would spread due to multiqueue.
David Miller July 28, 2009, 8:02 p.m. UTC | #25
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 28 Jul 2009 16:44:51 +0800

> Well the fact that as it stands the only true multiqueue qdisc
> is the default is a mere coincidence.  There is no fundamental
> reason why non-default qdiscs cannot be made multiqueue aware,
> well, for some of them anyway.
> 
> For example, sfq can naturally be enhanced as a multiqueue qdisc
> just like the default qdisc.
> 
> So I don't think we want to prejudice what a future non-default
> qdisc may support.

The idea is that it cannot be done as long as limits, decisions,
etc. are all currently done with "device" scope.

And I think SFQ even falls into that category.

You would need a special SFQ that makes sure to break down traffic
identically to how the TX queue selection divides traffic, and even
then I'm not even so sure how implementable that is.
Herbert Xu July 29, 2009, 12:44 a.m. UTC | #26
On Tue, Jul 28, 2009 at 12:59:19PM -0700, David Miller wrote:
>
> The premise is that there'd be only one.  The qdisc lock.
> 
> If the traffic is distributed, flow wise, the driver XMIT
> lock would spread due to multiqueue.

Suppose that we have a single large flow going through that has
filled up the hardware queue and is now backlogged in the qdisc
with qdisc_run on CPU A.   Now some other flow comes along and
sends a packet on CPU B.

So now CPU A and B will both be processing packets for the first
flow causing loads of lock contention.

But worse yet, we have introduced packet reordering.  So are you
convinced now :)

Cheers,
David Miller July 29, 2009, 1:06 a.m. UTC | #27
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 29 Jul 2009 08:44:28 +0800

> Suppose that we have a single large flow going through that has
> filled up the hardware queue and is now backlogged in the qdisc
> with qdisc_run on CPU A.   Now some other flow comes along and
> sends a packet on CPU B.
> 
> So now CPU A and B will both be processing packets for the first
> flow causing loads of lock contention.
> 
> But worse yet, we have introduced packet reordering.  So are you
> convinced now :)

More interesting to me is the case where the queue is not
filled up, or is very nearly so. :-)

But yes I do see your point.
Herbert Xu July 29, 2009, 1:25 a.m. UTC | #28
On Tue, Jul 28, 2009 at 06:06:47PM -0700, David Miller wrote:
>
> More interesting to me is the case where the queue is not
> filled up, or is very nearly so. :-)

Do you mean the hardware queue? In that case perhaps Krishna's
proposal of bypassing the qdisc would be the best.

Cheers,
David Miller July 29, 2009, 1:34 a.m. UTC | #29
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 29 Jul 2009 09:25:23 +0800

> On Tue, Jul 28, 2009 at 06:06:47PM -0700, David Miller wrote:
>>
>> More interesting to me is the case where the queue is not
>> filled up, or is very nearly so. :-)
> 
> Do you mean the hardware queue? In that case perhaps Krishna's
> proposal of bypassing the qdisc would be the best.

Wouldn't that bypass any rate limiting enforcement done by
the qdisc too?
Herbert Xu July 29, 2009, 2:12 a.m. UTC | #30
On Tue, Jul 28, 2009 at 06:34:51PM -0700, David Miller wrote:
>
> Wouldn't that bypass any rate limiting enforcement done by
> the qdisc too?

You'd only bypass it for the default qdisc (or any other qdisc
that didn't care).  It can either be achived through a flag as
Krishna proposed or perhaps we can make the qdisc's enqueue
function return a special value that indicates the packet can
be dequeued immediately and given to the hardware.

I don't have a good answer for how we can achieve CPU scalability
for rate limiting qdiscs yet.  However, that shouldn't stop us
from getting better CPU scalability for the default qdisc which
is what most people will use for the time being.

One possibility is to partition the bandwidth equally between
the queues and apply rate limiting locally within each queue.

Cheers,
David Miller July 29, 2009, 2:22 a.m. UTC | #31
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 29 Jul 2009 10:12:12 +0800

> One possibility is to partition the bandwidth equally between
> the queues and apply rate limiting locally within each queue.

Of course, you can solve the issue by redefining the problem. :-)

However, such a semantic is not what users of this feature
are after.  I doubt there would be much uptake of it even
if we did implement it.
Herbert Xu July 29, 2009, 2:38 a.m. UTC | #32
On Tue, Jul 28, 2009 at 07:22:47PM -0700, David Miller wrote:
> 
> Of course, you can solve the issue by redefining problem. :-)
> 
> However, such a semantic is not what users of this feature
> are after.  I doubt there would be much uptake of it even
> if we did implement it.

I did say it wasn't a good answer :)

My main point is that whatever we do for the limiting qdiscs
should not penalise the non-limiting qdiscs because for the
moment people are trying to get more out of their NICs rather
than less :)

Cheers,
Herbert Xu July 29, 2009, 3:15 a.m. UTC | #33
On Wed, Jul 29, 2009 at 10:38:09AM +0800, Herbert Xu wrote:
>
> I did say it wasn't a good answer :)

Here's another crazy idea:

Forget about using multiple hardware TX queues with rate limiting
qdiscs.  Why? Because these qdiscs are fundamentally serialising
so once the traffic has gone through it there is nothing to be
gained from multiplexing them into the hardware only to be merged
again onto the wire right away.

So does this mean that we give up on CPU scalability on limiting
qdiscs? No, the bottleneck is in fact in the qdisc, not in the
NIC.  So we implement multiqueue in the qdisc instead.

This can be done with a lockless ring buffer.  Each CPU would
deposit its packets in a qdisc ring buffer assigned to it.
The qdisc would then run on a single CPU harvesting all the
ring buffers in some fair manner.
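
As a toy illustration of one such per-CPU deposit ring (assumptions:
single producer CPU and single harvesting CPU per ring, made-up names
and size, and no accounting or wakeup details):

#define DEPOSIT_RING_SIZE 256			/* power of two */

struct deposit_ring {
	struct sk_buff	*slot[DEPOSIT_RING_SIZE];
	unsigned int	head;			/* written by producer only */
	unsigned int	tail;			/* written by harvester only */
};

static bool ring_put(struct deposit_ring *r, struct sk_buff *skb)
{
	unsigned int head = r->head;

	if (head - r->tail >= DEPOSIT_RING_SIZE)
		return false;			/* full: caller backs off */
	r->slot[head & (DEPOSIT_RING_SIZE - 1)] = skb;
	smp_wmb();				/* publish slot before head */
	r->head = head + 1;
	return true;
}

static struct sk_buff *ring_get(struct deposit_ring *r)
{
	unsigned int tail = r->tail;
	struct sk_buff *skb;

	if (tail == r->head)
		return NULL;			/* empty */
	smp_rmb();				/* read head before its slot */
	skb = r->slot[tail & (DEPOSIT_RING_SIZE - 1)];
	r->tail = tail + 1;
	return skb;
}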

Essentially we can treat rate limiting qdiscs as NICs.  Another
way of thinking about this is that we're essentially decoupling
a single system into two.  One that is using a default qdisc to
achieve maximum throughput.  This is then fed into a second system
that is dedicated to rate limiting.

Cheers,
Jarek Poplawski July 29, 2009, 11:04 a.m. UTC | #34
On Wed, Jul 29, 2009 at 08:44:28AM +0800, Herbert Xu wrote:
> On Tue, Jul 28, 2009 at 12:59:19PM -0700, David Miller wrote:
> >
> > The premise is that there'd be only one.  The qdisc lock.
> > 
> > If the traffic is distributed, flow wise, the driver XMIT
> > lock would spread due to multiqueue.
> 
> Suppose that we have a single large flow going through that has
> filled up the hardware queue and is now backlogged in the qdisc
> with qdisc_run on CPU A.   Now some other flow comes along and
> sends a packet on CPU B.
> 
> So now CPU A and B will both be processing packets for the first
> flow causing loads of lock contention.
> 
> But worse yet, we have introduced packet reordering.  So are you
> convinced now :)

How about this: instead of the _RUNNING flag we take tx lock while
holding qdisc lock and release qdisc lock just after (before xmit).
This should prevent reordering, and probably could improve cache use:
CPU B, which now takes the qdisc lock only for enqueuing, would use it for
dequeuing too; plus, if the next xmit happens to go to a different
tx queue, it could start before CPU A finishes. Otherwise it would
simply wait for CPU A (without tx lock contention). Of course it
needs testing... 
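
Something like this, sketched against the current qdisc_restart() in
sch_generic.c (hypothetical name; ordering only, with the requeue and
collision handling of the real function left out; called, as today,
with the qdisc root lock held):

static inline int qdisc_restart_txlock_first(struct Qdisc *q)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct net_device *dev = qdisc_dev(q);
	struct netdev_queue *txq;
	struct sk_buff *skb;
	int ret = NETDEV_TX_BUSY;

	skb = dequeue_skb(q);
	if (unlikely(!skb))
		return 0;

	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	/* take the TX lock *before* giving up the qdisc lock, so no
	 * other CPU can dequeue and transmit ahead of this skb
	 */
	HARD_TX_LOCK(dev, txq, smp_processor_id());
	spin_unlock(root_lock);

	if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
		ret = dev_hard_start_xmit(skb, dev, txq);
	HARD_TX_UNLOCK(dev, txq);

	spin_lock(root_lock);

	if (ret == NETDEV_TX_OK)
		return qdisc_qlen(q);	/* keep going while packets remain */

	return 0;		/* real code would requeue the skb here */
}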

Cheers,
Jarek P.
Herbert Xu July 29, 2009, 11:11 a.m. UTC | #35
On Wed, Jul 29, 2009 at 11:04:36AM +0000, Jarek Poplawski wrote:
>
> How about this: instead of the _RUNNING flag we take tx lock while
> holding qdisc lock and release qdisc lock just after (before xmit).
> This should prevent reordering, and probably could improve cache use:
> CPU B which takes qdisc lock only for enqueuing now, would use it for
> dequeuing too, plus if accidentally the next xmit goes to a different
> tx queue, it could start before CPU A finishes. Otherwise it would
> simply wait for CPU A (without tx lock contention). Of course it
> needs testing... 

Well reordering isn't the only problem, the lock contention brought
upon by two CPUs both trying to transmit the same flow from the
qdisc is just as bad.

Cheers,
Jarek Poplawski July 29, 2009, 11:26 a.m. UTC | #36
On Wed, Jul 29, 2009 at 07:11:34PM +0800, Herbert Xu wrote:
> On Wed, Jul 29, 2009 at 11:04:36AM +0000, Jarek Poplawski wrote:
> >
> > How about this: instead of the _RUNNING flag we take tx lock while
> > holding qdisc lock and release qdisc lock just after (before xmit).
> > This should prevent reordering, and probably could improve cache use:
> > CPU B which takes qdisc lock only for enqueuing now, would use it for
> > dequeuing too, plus if accidentally the next xmit goes to a different
> > tx queue, it could start before CPU A finishes. Otherwise it would
> > simply wait for CPU A (without tx lock contention). Of course it
> > needs testing... 
> 
> Well reordering isn't the only problem, the lock contention brought
> upon by two CPUs both trying to transmit the same flow from the
> qdisc is just as bad.

If you mean the tx lock there should be no "real" contention: only
one waiter max. The qdisc lock's contention might be higher, but its
use (during contention) is better: enqueue + dequeue together instead
of doing them separately.

Cheers,
Jarek P.
Herbert Xu July 29, 2009, 12:30 p.m. UTC | #37
On Wed, Jul 29, 2009 at 11:26:14AM +0000, Jarek Poplawski wrote:
>
> If you mean the tx lock there should be no "real" contention: only
> one waiter max. qdisc lock's contention might be higher, but it's
> use (during contention) better: enqueue + dequeue together instead
> of doing it separately.

Hmm, you will have contention if they're both transmitting a
single flow which must always go into a single physical queue.

So you'll have two CPUs doing the work of a single CPU, with one
of them always spinning on the TX lock.

Cheers,
Jarek Poplawski July 29, 2009, 12:47 p.m. UTC | #38
On Wed, Jul 29, 2009 at 08:30:41PM +0800, Herbert Xu wrote:
> On Wed, Jul 29, 2009 at 11:26:14AM +0000, Jarek Poplawski wrote:
> >
> > If you mean the tx lock there should be no "real" contention: only
> > one waiter max. qdisc lock's contention might be higher, but it's
> > use (during contention) better: enqueue + dequeue together instead
> > of doing it separately.
> 
> Hmm, you will have contention if they're both transmitting a
> single flow which must always go into a single physical queue.
> 
> So you'll have two CPUs doing the work of a single CPU, with one
> of them always spinning on the TX lock.

Hmm.. I'd call it a little waiting, but OK let's call it contention;-)
When tx is faster than queue operations there could be no contention
at all. I'm not saying I must be right: IMHO it's only worth trying.

Cheers,
Jarek P.
Krishna Kumar July 29, 2009, 1:08 p.m. UTC | #39
Hi Jarek,

Jarek Poplawski <jarkao2@gmail.com> wrote on 07/29/2009 06:17:34 PM:

> Re: [RFC] [PATCH] Don't run __qdisc_run() on a stopped TX queue
>
> On Wed, Jul 29, 2009 at 08:30:41PM +0800, Herbert Xu wrote:
> > On Wed, Jul 29, 2009 at 11:26:14AM +0000, Jarek Poplawski wrote:
> > >
> > > If you mean the tx lock there should be no "real" contention: only
> > > one waiter max. qdisc lock's contention might be higher, but it's
> > > use (during contention) better: enqueue + dequeue together instead
> > > of doing it separately.
> >
> > Hmm, you will have contention if they're both transmitting a
> > single flow which must always go into a single physical queue.
> >
> > So you'll have two CPUs doing the work of a single CPU, with one
> > of them always spinning on the TX lock.
>
> Hmm.. I'd call it a little waiting, but OK let's call it contention;-)
> When tx is faster than queue operations there could be no contention
> at all. I'm not saying I must be right: IMHO it's only worth trying.

My expectation is that tx would be much longer than a few lines of
queue operation....

thanks,

- KK

Krishna Kumar July 29, 2009, 1:28 p.m. UTC | #40
Hi Herbert,

Herbert Xu <herbert@gondor.apana.org.au> wrote on 07/29/2009 06:14:28 AM:
>
> Re: [RFC] [PATCH] Don't run __qdisc_run() on a stopped TX queue
>
> On Tue, Jul 28, 2009 at 12:59:19PM -0700, David Miller wrote:
> >
> > The premise is that there'd be only one.  The qdisc lock.
> >
> > If the traffic is distributed, flow wise, the driver XMIT
> > lock would spread due to multiqueue.
>
> Suppose that we have a single large flow going through that has
> filled up the hardware queue and is now backlogged in the qdisc
> with qdisc_run on CPU A.   Now some other flow comes along and
> sends a packet on CPU B.
>
> So now CPU A and B will both be processing packets for the first
> flow causing loads of lock contention.
>
> But worse yet, we have introduced packet reordering.  So are you
> convinced now :)

I am probably misunderstanding you, but ... For different flows, is
this an issue? The same TCP connection will always use a single TXQ,
even if it runs on different CPUs since dev_pick_tx will select the
same txq, so how will reordering happen?

thanks,

- KK

(I am off tomorrow, but will be around for the next 3 hours)

Jarek Poplawski July 29, 2009, 7:26 p.m. UTC | #41
On Wed, Jul 29, 2009 at 06:38:36PM +0530, Krishna Kumar2 wrote:
> Hi Jarek,
> 
> Jarek Poplawski <jarkao2@gmail.com> wrote on 07/29/2009 06:17:34 PM:
> 
> > Re: [RFC] [PATCH] Don't run __qdisc_run() on a stopped TX queue
> >
> > On Wed, Jul 29, 2009 at 08:30:41PM +0800, Herbert Xu wrote:
> > > On Wed, Jul 29, 2009 at 11:26:14AM +0000, Jarek Poplawski wrote:
> > > >
> > > > If you mean the tx lock there should be no "real" contention: only
> > > > one waiter max. qdisc lock's contention might be higher, but it's
> > > > use (during contention) better: enqueue + dequeue together instead
> > > > of doing it separately.
> > >
> > > Hmm, you will have contention if they're both transmitting a
> > > single flow which must always go into a single physical queue.
> > >
> > > So you'll have two CPUs doing the work of a single CPU, with one
> > > of them always spinning on the TX lock.
> >
> > Hmm.. I'd call it a little waiting, but OK let's call it contention;-)
> > When tx is faster than queue operations there could be no contention
> > at all. I'm not saying I must be right: IMHO it's only worth trying.
> 
> My expectation is that tx would be much longer than a few lines of
> queue operation....

I meant here the case of non-default qdisc, like HTB, HFSC or CBQ,
often with many classes and filters, so a few more lines than usual...

Thanks,
Jarek P.
David Miller Aug. 4, 2009, 3:15 a.m. UTC | #42
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 29 Jul 2009 11:15:48 +0800

> This can be done with a lockless ring buffer.  Each CPU would
> deposit its packets in a qdisc ringer buffer assigned to it.
> The qdisc would then run on a single CPU harvesting all the
> ring buffers in some fair manner.

There was a recent posting (I think today) on lkml of a memory barrier
based lockless queue, I think it was named kfifo, that could
facilitate this.

Patch

diff -ruNp org/include/net/pkt_sched.h new/include/net/pkt_sched.h
--- org/include/net/pkt_sched.h	2009-06-22 11:40:31.000000000 +0530
+++ new/include/net/pkt_sched.h	2009-07-20 13:09:18.000000000 +0530
@@ -92,8 +92,13 @@  extern void __qdisc_run(struct Qdisc *q)
 
 static inline void qdisc_run(struct Qdisc *q)
 {
-	if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
-		__qdisc_run(q);
+	if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
+		if (!netif_tx_queue_stopped(q->dev_queue))
+			__qdisc_run(q);
+		else
+			/* driver will wake us up anyway, just clear */
+			clear_bit(__QDISC_STATE_RUNNING, &q->state);
+	}
 }
 
 extern int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
diff -ruNp org/net/sched/sch_generic.c new/net/sched/sch_generic.c
--- org/net/sched/sch_generic.c	2009-05-25 07:48:07.000000000 +0530
+++ new/net/sched/sch_generic.c	2009-07-20 13:09:18.000000000 +0530
@@ -56,12 +56,8 @@  static inline struct sk_buff *dequeue_sk
 	struct sk_buff *skb = q->gso_skb;
 
 	if (unlikely(skb)) {
-		struct net_device *dev = qdisc_dev(q);
-		struct netdev_queue *txq;
-
 		/* check the reason of requeuing without tx lock first */
-		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
-		if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
+		if (!netif_tx_queue_frozen(q->dev_queue))
 			q->gso_skb = NULL;
 		else
 			skb = NULL;
@@ -142,8 +138,7 @@  static inline int qdisc_restart(struct Q
 	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
 
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
-	if (!netif_tx_queue_stopped(txq) &&
-	    !netif_tx_queue_frozen(txq))
+	if (!netif_tx_queue_frozen(txq))
 		ret = dev_hard_start_xmit(skb, dev, txq);
 	HARD_TX_UNLOCK(dev, txq);