
[net-next,1/2] qdisc: Allow qdiscs to provide backpressure up the stack.

Message ID 1282762851-3612-1-git-send-email-greearb@candelatech.com
State Rejected, archived
Delegated to: David Miller

Commit Message

Ben Greear Aug. 25, 2010, 7 p.m. UTC
Some qdiscs, in some instances, can reliably detect when they
are about to drop a packet in the dev_queue_xmit path.  In
this case, it would be nice to provide backpressure up the
stack, and NOT free the skb in the qdisc logic.

Signed-off-by: Ben Greear <greearb@candelatech.com>
---
:100644 100644 59962db... 20be932... M	include/linux/netdevice.h
:100644 100644 3c8728a... 146a97a... M	include/net/sch_generic.h
:100644 100644 859e30f... f360a9b... M	net/core/dev.c
:100644 100644 2aeb3a4... 0692717... M	net/sched/sch_generic.c
 include/linux/netdevice.h |    7 +++++++
 include/net/sch_generic.h |   19 +++++++++++++++++++
 net/core/dev.c            |   19 ++++++++++++++-----
 net/sched/sch_generic.c   |   20 ++++++++++++++++++++
 4 files changed, 60 insertions(+), 5 deletions(-)

Comments

stephen hemminger Aug. 25, 2010, 8:44 p.m. UTC | #1
On Wed, 25 Aug 2010 12:00:50 -0700
Ben Greear <greearb@candelatech.com> wrote:

> Some qdiscs, in some instances, can reliably detect when they
> are about to drop a packet in the dev_queue_xmit path.  In
> this case, it would be nice to provide backpressure up the
> stack, and NOT free the skb in the qdisc logic.
> 
> Signed-off-by: Ben Greear <greearb@candelatech.com>
> ---
> :100644 100644 59962db... 20be932... M	include/linux/netdevice.h
> :100644 100644 3c8728a... 146a97a... M	include/net/sch_generic.h
> :100644 100644 859e30f... f360a9b... M	net/core/dev.c
> :100644 100644 2aeb3a4... 0692717... M	net/sched/sch_generic.c
>  include/linux/netdevice.h |    7 +++++++
>  include/net/sch_generic.h |   19 +++++++++++++++++++
>  net/core/dev.c            |   19 ++++++++++++++-----
>  net/sched/sch_generic.c   |   20 ++++++++++++++++++++
>  4 files changed, 60 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 59962db..20be932 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -97,6 +97,7 @@ struct wireless_dev;
>  #define NET_XMIT_DROP		0x01	/* skb dropped			*/
>  #define NET_XMIT_CN		0x02	/* congestion notification	*/
>  #define NET_XMIT_POLICED	0x03	/* skb is shot by police	*/
> +#define NET_XMIT_BUSY	  	0x04	/* congestion, but skb was NOT freed */
>  #define NET_XMIT_MASK		0x0f	/* qdisc flags in net/sch_generic.h */
>  
>  /* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It
> @@ -1296,6 +1297,12 @@ extern int		dev_open(struct net_device *dev);
>  extern int		dev_close(struct net_device *dev);
>  extern void		dev_disable_lro(struct net_device *dev);
>  extern int		dev_queue_xmit(struct sk_buff *skb);
> +
> +/* Similar to dev_queue_xmit, but if try_no_consume != 0,
> + * it may return NET_XMIT_BUSY and NOT free the skb if it detects congestion
> + */
> +extern int		try_dev_queue_xmit(struct sk_buff *skb, int try_no_consume);
> +
>  extern int		register_netdevice(struct net_device *dev);
>  extern void		unregister_netdevice_queue(struct net_device *dev,
>  						   struct list_head *head);
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 3c8728a..146a97a 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -43,6 +43,7 @@ struct qdisc_size_table {
>  
>  struct Qdisc {
>  	int 			(*enqueue)(struct sk_buff *skb, struct Qdisc *dev);
> +	int 			(*try_enqueue)(struct sk_buff *, struct Qdisc *dev); /* May return NET_XMIT_BUSY and NOT free skb. */
>


There aren't that many qdisc modules; just fix them all and change
semantics of enqueue.  How do you expect to handle the retry? Spinning
at the higher level is a bad idea and there is no non-racy way to get
a callback to make forward progress.
Ben Greear Aug. 25, 2010, 8:56 p.m. UTC | #2
On 08/25/2010 01:44 PM, Stephen Hemminger wrote:
> On Wed, 25 Aug 2010 12:00:50 -0700

>>   struct Qdisc {
>>   	int 			(*enqueue)(struct sk_buff *skb, struct Qdisc *dev);
>> +	int 			(*try_enqueue)(struct sk_buff *, struct Qdisc *dev); /* May return NET_XMIT_BUSY and NOT free skb. */
>>
>
>
> There aren't that many qdisc modules; just fix them all and change
> semantics of enqueue.  How do you expect to handle the retry? Spinning
> at the higher level is a bad idea and there is no non-racy way to get
> a callback to make forward progress.

It may only make sense for things like macvlan at this point, since
it is already valid to return NETDEV_TX_BUSY and not free the SKB
when you call dev->hard_start_xmit().

Using the try_ logic for sockets and such would probably require a good
deal more work, and is probably something I'd not attempt any time
soon (there are others who know that code better, so maybe they'd
be able & willing to do that work).

If/when all the calling code knows how to do the retry logic properly,
then we could have a single enqueue method, but I think to allow
incremental changes, it's best to have the two methods for now.
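
For concreteness, a minimal sketch of such a macvlan-style caller, assuming
the try_dev_queue_xmit() from this patch (lower_dev() is a hypothetical
helper, not a real function):

#include <linux/netdevice.h>

/* Sketch only: forward onto the lower device and map NET_XMIT_BUSY back
 * into NETDEV_TX_BUSY so the skb survives and the core retries it.
 */
static netdev_tx_t macvlan_like_start_xmit(struct sk_buff *skb,
					   struct net_device *dev)
{
	int rc;

	skb->dev = lower_dev(dev);		/* hypothetical helper */
	rc = try_dev_queue_xmit(skb, 1);	/* 1 == try_no_consume */
	if (rc == NET_XMIT_BUSY)
		return NETDEV_TX_BUSY;		/* skb NOT freed */
	return NETDEV_TX_OK;
}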

Thanks,
Ben
David Miller Aug. 26, 2010, 10:59 p.m. UTC | #3
From: Ben Greear <greearb@candelatech.com>
Date: Wed, 25 Aug 2010 12:00:50 -0700

> Some qdiscs, in some instances, can reliably detect when they
> are about to drop a packet in the dev_queue_xmit path.  In
> this case, it would be nice to provide backpressure up the
> stack, and NOT free the skb in the qdisc logic.
> 
> Signed-off-by: Ben Greear <greearb@candelatech.com>

I agree with Stephen that, if you're going to do this, it's better to
just update all of the qdiscs to be able to handle the new semantic
than to make this special case code path.

Right now the only configuration where your scheme is going to work is
one with the default qdisc.  That is way too narrow in scope in my
opinion.

There is also the issue of what this actually accomplishes.  All I can
see happening is that instead of being limited to the child device's
backlog limits, you're limited by the combined backlog of two devices.

In reality I don't see this helping much at all.  The most downstream
device provides the TX queue limiting and that ought to be sufficient.
In fact, allowing such an increase in backlogging is something which might
be completely unexpected.

If the limit is insufficient in the most downstream device, increase
it.

If the only remaining problem is pktgen, we can look into changing it
to operate with more information than just plain NETDEV_TX_BUSY.

Also, there is the issue of waking up the queue.  If you pass back
NETDEV_TX_BUSY, the generic scheduler layer assumes that something will
absolutely trigger the queue to start transmitting again.  Usually that
is the hardware queue waking up in the device.

But as implemented, you've added no such notifications and therefore I
think it's possible for the pipeline of packets going through these
layered devices to get wedged and stop forever.

This is why you can't mix and match hardware queue status notifications
with qdisc software ones.  If you pass NETDEV_TX_BUSY back you better
have fake queue wakeup notifications to get the send engine going again
as well.  But I believe we really don't want that.

The only indication we really want to propagate back down to the callers
is NET_XMIT_CN which, as far as I can tell, does happen correctly right
now.  If not that's a bug.

You're essentially trying to pass down a hardware queue condition for
something that is caused by a software queue limit.  And therefore I
strongly disagree with what these changes are doing.

Ben Greear Aug. 27, 2010, 4:14 a.m. UTC | #4
On 08/26/2010 03:59 PM, David Miller wrote:
> From: Ben Greear<greearb@candelatech.com>
> Date: Wed, 25 Aug 2010 12:00:50 -0700
>
>> Some qdiscs, in some instances, can reliably detect when they
>> are about to drop a packet in the dev_queue_xmit path.  In
>> this case, it would be nice to provide backpressure up the
>> stack, and NOT free the skb in the qdisc logic.
>>
>> Signed-off-by: Ben Greear<greearb@candelatech.com>
>
> I agree with Stephen that, if you're going to do this, it's better to
> just update all of the qdiscs to be able to handle the new semantic
> than to make this special case code path.

Do you mean I should add a try_enqueue method to each one, or make
the existing enqueue method able to return the new error code and fix
up all callers?

> Right now the only configuration where your scheme is going to work is
> one with the default qdisc.  That is way too narrow in scope in my
> opinion.

No argument there, but no use in doing that work until we agree on
an API.

> There is also the issue of what this actually accomplishes.  All I can
> see happening is that instead of being limited to the child device's
> backlog limits, you're limited by the combined backlog of two devices.
>
> In reality I don't see this helping much at all.  The most downstream
> device provides the TX queue limiting and that ought to be sufficient.
> In fact, allowing such an increase in backlogging is something which might
> be completely unexpected.
>
> If the limit is insufficient in the most downstream device, increase
> it.
>
> If the only remaining problem is pktgen, we can look into changing it
> to operate with more information than just plain NETDEV_TX_BUSY.
>
> Also, there is the issue of waking up the queue.  If you pass back
> NETDEV_TX_BUSY, the generic scheduler layer assumes that something will
> absolutely trigger the queue to start transmitting again.  Usually that
> is the hardware queue waking up in the device.
>
> But as implemented, you've added no such notifications and therefore I
> think it's possible for the pipeline of packets going through these
> layered devices to get wedged and stop forever.
>
> This is why you can't mix and match hardware queue status notifications
> with qdisc software ones.  If you pass NETDEV_TX_BUSY back you better
> have fake queue wakeup notifications to get the send engine going again
> as well.  But I believe we really don't want that.
>
> The only indication we really want to propagate back down to the callers
> is NET_XMIT_CN which, as far as I can tell, does happen correctly right
> now.  If not that's a bug.
>
> You're essentially trying to pass down a hardware queue condition for
> something that is caused by a software queue limit.  And therefore I
> strongly disagree with what these changes are doing.

I'll look into the NET_XMIT_CN, but if that propagates backpressure up
through mac-vlan, then something must know how to re-start the tx logic.
That same logic could be used for my purposes I believe.

One thing that really bothers me about the current qdisc stuff is that
it just frees the packet when it cannot accept it.  I would like to
be able to not delete the packet, similar to how NET_XMIT_BUSY is
supposed to work.  This *should* allow us to better throttle UDP
sockets without dropping as many packets on the sending side,
and would probably help TCP as well.  And, I know qdiscs cannot
always know at acceptance time if they are going to drop the skb,
but at least some of them *do* know, so I'd like to use that
knowledge when it exists.

As for waking up queues, perhaps we could have some generic framework
for things to register for waking with a netdevice?  Pktgen could
benefit from this by not having to busy-spin on a full network device,
normal queues must already have something similar, and things like
mac-vlans and 802.1q vlans could register themselves as well.

It would be assumed that the bottom-level netdevice would have interrupts
or similar to wake it, and then the wakeup could propagate recursively to
all things registered to be notified.
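
A hedged sketch of what that registration framework might look like;
nothing here exists in the kernel (tx_wake_list would be a new field on
struct net_device, and both helpers are hypothetical):

#include <linux/list.h>
#include <linux/netdevice.h>

struct tx_wake_notifier {
	struct list_head	list;
	void			(*wake)(struct net_device *dev, void *priv);
	void			*priv;
};

/* An upper layer (pktgen, macvlan, 802.1q vlan, ...) registers interest
 * in the lower device waking up.
 */
static void tx_wake_register(struct net_device *dev,
			     struct tx_wake_notifier *n)
{
	list_add_tail(&n->list, &dev->tx_wake_list);	/* hypothetical field */
}

/* Called from the same paths that call netif_wake_queue() on the bottom
 * device; fans the wakeup out to everything registered on it.
 */
static void tx_wake_notify(struct net_device *dev)
{
	struct tx_wake_notifier *n;

	list_for_each_entry(n, &dev->tx_wake_list, list)
		n->wake(dev, n->priv);
}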

I have a patch that even I am embarrassed to post that accomplishes this
wait-queue logic for pktgen, but with a bit of work, it could be
made more generic and publishable :)

Thanks,
Ben
David Miller Aug. 27, 2010, 4:34 a.m. UTC | #5
From: Ben Greear <greearb@candelatech.com>
Date: Thu, 26 Aug 2010 21:14:39 -0700

> I'll look into the NET_XMIT_CN, but if that propagates backpressure up
> through mac-vlan, then something must know how to re-start the tx
> logic.

It doesn't need to, as it drops and frees up the packet.

> One thing that really bothers me about the current qdisc stuff is that
> it just frees the packet when it cannot accept it.

Without the drop, things like RED simply would not work.

> This *should* allow us to better throttle UDP sockets without
> dropping as many packets on the sending side, and would probably
> help TCP as well.

UDP is well throttled by the socket send buffer.

The only problematic case is pktgen and it's an aberration and
an obscure case.  It's in fact trying to bypass all of the
queueing and accounting the stack normally makes use of.
Ben Greear Aug. 27, 2010, 5:22 a.m. UTC | #6
On 08/26/2010 09:34 PM, David Miller wrote:
> From: Ben Greear<greearb@candelatech.com>
> Date: Thu, 26 Aug 2010 21:14:39 -0700
>
>> I'll look into the NET_XMIT_CN, but if that propagates backpressure up
>> through mac-vlan, then something must know how to re-start the tx
>> logic.
>
> It doesn't need to, as it drops and frees up the packet.

If there were 5 pkts in the socket buffer, and the attempt to
send the first one caused NET_XMIT_CN, then
based on your comment below about UDP being throttled, I assume
the other 4 are kept until later?

What logic wakes up the socket transmit logic for those other 4
packets?

For sch_generic, it never returns NET_XMIT_CN at all; it just returns
NET_XMIT_DROP and deletes the passed-in skb.  Others, such as sch_fifo,
drop the oldest pkt, keep the one passed in, and return NET_XMIT_CN,
so they could not benefit from my patch.

>> One thing that really bothers me about the current qdisc stuff is that
>> it just frees the packet when it cannot accept it.
>
> Without the drop, things like RED simply would not work.

Even if RED could never work this way, sch_generic can.

>> This *should* allow us to better throttle UDP sockets without
>> dropping as many packets on the sending side, and would probably
>> help TCP as well.
>
> UDP is well throttled by the socket send buffer.
>
> The only problematic case is pktgen and it's an aberration and
> an obscure case.  It's in fact trying to bypass all of the
> queueing and accounting the stack normally makes use of.

From looking at upstream code, it seems that pktgen does the
right thing when NET_XMIT_CN or NET_XMIT_DROP is returned,
as long as you don't mind busy-spinning and perhaps some
accounting errors (NET_XMIT_CN doesn't mean *your* pkt was
deleted, just that somebody's packet was).

For accounting purposes, it would still be nice to get
NET_XMIT_BUSY because then you can just retry the packet
and you know that your packet was not transmitted or dropped.
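
A hedged sketch of the accounting that becomes possible with such a code
(struct pktgen_like is a hypothetical stand-in for pktgen's pktgen_dev,
and try_dev_queue_xmit() is the API proposed by this patch):

#include <linux/netdevice.h>

struct pktgen_like {
	struct sk_buff *skb;
	unsigned long sofar, retries, errors;
};

static void pktgen_like_xmit(struct pktgen_like *p)
{
	switch (try_dev_queue_xmit(p->skb, 1)) {
	case NET_XMIT_SUCCESS:
		p->sofar++;		/* definitely handed down the stack */
		p->skb = NULL;
		break;
	case NET_XMIT_BUSY:
		p->retries++;		/* skb untouched: back off, retry later */
		break;
	default:
		p->errors++;		/* CN/DROP: the skb was consumed */
		p->skb = NULL;
		break;
	}
}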

Thanks,
Ben
David Miller Aug. 27, 2010, 5:36 a.m. UTC | #7
From: Ben Greear <greearb@candelatech.com>
Date: Thu, 26 Aug 2010 22:22:23 -0700

> On 08/26/2010 09:34 PM, David Miller wrote:
>> From: Ben Greear<greearb@candelatech.com>
>> Date: Thu, 26 Aug 2010 21:14:39 -0700
>>
>>> I'll look into the NET_XMIT_CN, but if that propagates backpressure up
>>> through mac-vlan, then something must know how to re-start the tx
>>> logic.
>>
>> It doesn't need to, as it drops and frees up the packet.
> 
> If there were 5 pkts in the socket buffer, and the attempt to
> send the first one caused NET_XMIT_CN, then
> based on your comment below about UDP being throttled, I assume
> the other 4 are kept until later?

It only has an effect on TCP; it causes TCP to decrease the
congestion window limits.

I did not state that it had any effect on UDP.  Just because I
talk about UDP later in my email doesn't mean the two things
are related.
Ben Greear Aug. 27, 2010, 5:58 a.m. UTC | #8
On 08/26/2010 10:36 PM, David Miller wrote:
> From: Ben Greear<greearb@candelatech.com>
> Date: Thu, 26 Aug 2010 22:22:23 -0700
>
>> On 08/26/2010 09:34 PM, David Miller wrote:
>>> From: Ben Greear<greearb@candelatech.com>
>>> Date: Thu, 26 Aug 2010 21:14:39 -0700
>>>
>>>> I'll look into the NET_XMIT_CN, but if that propagates backpressure up
>>>> through mac-vlan, then something must know how to re-start the tx
>>>> logic.
>>>
>>> It doesn't need to, as it drops and frees up the packet.
>>
>> If there were 5 pkts in the socket buffer, and the attempt to
>> send the first one caused NET_XMIT_CN, then
>> based on your comment below about UDP being throttled, I assume
>> the other 4 are kept until later?
>
> It only has an effect on TCP; it causes TCP to decrease the
> congestion window limits.
>
> I did not state that it had any effect on UDP.  Just because I
> talk about UDP later in my email doesn't mean the two things
> are related.

Ok, but you did state that UDP is well throttled.

To me that implies some backpressure will fill up socket transmit buffers
and slow down user-space transmit if the local network device cannot handle
the load.  And if it does that, then something must know how to restart the
transmit logic.

Maybe it's well throttled on real devices by the sched logic and
the netdev-wake-queue logic but just not on vlans?

Either way, it would be nice if we could throttle w/out having to drop
packets, so a way for certain sched types to return a BUSY code and not
delete the skb still seems useful, and not fundamentally different from
handling NET_XMIT_CN.

Thanks,
Ben
David Miller Aug. 27, 2010, 6:11 a.m. UTC | #9
From: Ben Greear <greearb@candelatech.com>
Date: Thu, 26 Aug 2010 22:58:41 -0700

> And if it does that, then something must know how to restart the
> transmit logic.

When SKBs get freed up, space opens up in the socket send buffer,
waking up the process or signalling it from poll() so it can write
more.
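
A simplified rendering of that wakeup path (the real code is
sock_wfree() in net/core/sock.c; this sketch omits the refcounting and
flag checks):

#include <net/sock.h>

static void sketch_sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* return this skb's charge to the socket send buffer ... */
	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
	/* ... and wake writers blocked in sendmsg() or sleeping in poll() */
	sk->sk_write_space(sk);
}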

pktgen eliminates this whole layer of queueing and signalling, which
is why it is continually susceptible to this behavior you dislike.
Ben Greear Aug. 27, 2010, 3:26 p.m. UTC | #10
On 08/26/2010 11:11 PM, David Miller wrote:
> From: Ben Greear<greearb@candelatech.com>
> Date: Thu, 26 Aug 2010 22:58:41 -0700
>
>> And if it does that, then something must know how to restart the
>> transmit logic.
>
> When SKBs get freed up, space opens up in the socket send buffer,
> waking up the process or signalling it from poll() so it can write
> more.

Seems there is still room for problems:

*  There may be zero frames already in the qdisc for this UDP connection,
    so aside from the one that failed to enqueue, no others will be freed.

*  If deleting that single SKB that was to be enqueued opens space, then it's
    not really throttling, just busy-spinning on deleting skbs, but I think
    that the wakeup heuristics won't wake the queue until it's about 1/2 full
    anyway, so that single skb isn't going to wake any queues in most cases.

I'll look through the code later today and see how much churn it would take
to support the BUSY return code from dev_queue_xmit, without adding any new
qdisc API methods.

For the trivial case, I can just kfree_skb when BUSY is returned, for the
same overall behaviour as today.  For something like UDP, I might be able
to poke the SKB back into the queue instead of freeing it.
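
That trivial case, sketched out against the proposed try_dev_queue_xmit()
(a caller that cannot hold the skb just frees it on BUSY, which
degenerates to today's drop behaviour):

#include <linux/netdevice.h>

static int xmit_or_drop(struct sk_buff *skb)
{
	int rc = try_dev_queue_xmit(skb, 1);

	if (rc == NET_XMIT_BUSY) {
		kfree_skb(skb);		/* legacy semantics */
		rc = NET_XMIT_DROP;
	}
	return rc;
}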

> pktgen eliminates this whole layer of queueing and signalling, which
> is why it is continually susceptible to this behavior you dislike.

Proper backoff on error codes from hard_start_xmit and some wake logic
when the underlying netdevice wakes up again fixes this.  You can make pktgen
run w/out busy spinning.  It's a big messy patch (as I coded it up)
though.

I think we could concentrate on allowing qdisc to return some sort of
congestion notification w/out always freeing the skb first, and once
that seems workable, go back to worrying about how to properly propagate
that through macvlans.

Thanks,
Ben
Eric Dumazet Aug. 27, 2010, 3:59 p.m. UTC | #11
On Friday, Aug. 27, 2010, at 08:26 -0700, Ben Greear wrote:

> For the trivial case, I can just kfree_skb when BUSY is returned, for the
> same overall behaviour as today.  For something like UDP, I might be able
> to poke the SKB back into the queue instead of freeing it.

It's not trivial at all.  Actually, I bet this is not possible.

There is no queue for UDP (apart from CORKing, of course).  There is at most
one frame per socket.  Once this frame is ready (uncork), we send it
immediately via ip_push_pending_frames(), a destructive process; there is no
way it can be rolled back in case of failure at the end of the chain.

Throttling is only done because some frames are queued in the qdisc, keeping
memory accounting refs on the socket.

Between UDP and the qdisc there are many layers (netfilter, IP
fragmentation, ...), so requeuing the skb is not possible at all.

Netfilter won't roll back all its work.

Backpressure should be evaluated another way.
Ben Greear Aug. 27, 2010, 5 p.m. UTC | #12
On 08/27/2010 08:59 AM, Eric Dumazet wrote:
> On Friday, Aug. 27, 2010, at 08:26 -0700, Ben Greear wrote:
>
>> For the trivial case, I can just kfree_skb when BUSY is returned, for the
>> same overall behaviour as today.  For something like UDP, I might be able
>> to poke the SKB back into the queue instead of freeing it.
>
> It's not trivial at all.  Actually, I bet this is not possible.

Ughh, I'll quit bothering with higher level protocols then.

Dave:  Is there any reason to pursue the qdisc BUSY return code
so that it can be used for macvlans, vlans, bonding, etc., or
is that idea just DOA?

There are calls to dev_queue_xmit all over, and many don't
check return codes at all.  I could still fix those up to
free the skb when BUSY is returned.

Or, I could stay with adding a new try_dev_queue_xmit method
so that the few callers that care can deal with error codes
properly.

Based on my slightly improved understanding of how UDP and others
do queue backoff, I don't see how the changes to macvlan that
I posted could cause any additional opportunities for queue
deadlock.

If someone manages to add a qdisc to a macvlan, there could
be issues since there is no way to wake up a macvlan queue,
but that issue exists with or without my patch.

Thanks,
Ben

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 59962db..20be932 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -97,6 +97,7 @@  struct wireless_dev;
 #define NET_XMIT_DROP		0x01	/* skb dropped			*/
 #define NET_XMIT_CN		0x02	/* congestion notification	*/
 #define NET_XMIT_POLICED	0x03	/* skb is shot by police	*/
+#define NET_XMIT_BUSY	  	0x04	/* congestion, but skb was NOT freed */
 #define NET_XMIT_MASK		0x0f	/* qdisc flags in net/sch_generic.h */
 
 /* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It
@@ -1296,6 +1297,12 @@  extern int		dev_open(struct net_device *dev);
 extern int		dev_close(struct net_device *dev);
 extern void		dev_disable_lro(struct net_device *dev);
 extern int		dev_queue_xmit(struct sk_buff *skb);
+
+/* Similar to dev_queue_xmit, but if try_no_consume != 0,
+ * it may return NET_XMIT_BUSY and NOT free the skb if it detects congestion
+ */
+extern int		try_dev_queue_xmit(struct sk_buff *skb, int try_no_consume);
+
 extern int		register_netdevice(struct net_device *dev);
 extern void		unregister_netdevice_queue(struct net_device *dev,
 						   struct list_head *head);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3c8728a..146a97a 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -43,6 +43,7 @@  struct qdisc_size_table {
 
 struct Qdisc {
 	int 			(*enqueue)(struct sk_buff *skb, struct Qdisc *dev);
+	int 			(*try_enqueue)(struct sk_buff *, struct Qdisc *dev); /* May return NET_XMIT_BUSY and NOT free skb. */
 	struct sk_buff *	(*dequeue)(struct Qdisc *dev);
 	unsigned		flags;
 #define TCQ_F_BUILTIN		1
@@ -135,6 +136,7 @@  struct Qdisc_ops {
 	int			priv_size;
 
 	int 			(*enqueue)(struct sk_buff *, struct Qdisc *);
+	int 			(*try_enqueue)(struct sk_buff *, struct Qdisc *); /* May return NET_XMIT_BUSY and NOT free skb. */
 	struct sk_buff *	(*dequeue)(struct Qdisc *);
 	struct sk_buff *	(*peek)(struct Qdisc *);
 	unsigned int		(*drop)(struct Qdisc *);
@@ -426,6 +428,23 @@  static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
 	return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
 }
 
+static inline int try_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+#ifdef CONFIG_NET_SCHED
+	if (sch->stab)
+		qdisc_calculate_pkt_len(skb, sch->stab);
+#endif
+	if (sch->try_enqueue)
+		return sch->try_enqueue(skb, sch);
+	return sch->enqueue(skb, sch);
+}
+
+static inline int try_qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
+{
+	qdisc_skb_cb(skb)->pkt_len = skb->len;
+	return try_qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
+}
+
 static inline void __qdisc_update_bstats(struct Qdisc *sch, unsigned int len)
 {
 	sch->bstats.bytes += len;
diff --git a/net/core/dev.c b/net/core/dev.c
index 859e30f..f360a9b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2087,7 +2087,8 @@  static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 
 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct net_device *dev,
-				 struct netdev_queue *txq)
+				 struct netdev_queue *txq,
+				 bool try_no_consume)
 {
 	spinlock_t *root_lock = qdisc_lock(q);
 	bool contended = qdisc_is_running(q);
@@ -2128,7 +2129,10 @@  static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		skb_dst_force(skb);
-		rc = qdisc_enqueue_root(skb, q);
+		if (try_no_consume) 
+			rc = try_qdisc_enqueue_root(skb, q);
+		else
+			rc = qdisc_enqueue_root(skb, q);
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
@@ -2168,7 +2172,12 @@  static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
  *      the BH enable code must have IRQs enabled so that it will not deadlock.
  *          --BLG
  */
-int dev_queue_xmit(struct sk_buff *skb)
+int dev_queue_xmit(struct sk_buff *skb) {
+	return try_dev_queue_xmit(skb, 0);
+}
+EXPORT_SYMBOL(dev_queue_xmit);
+
+int try_dev_queue_xmit(struct sk_buff *skb, int try_no_consume)
 {
 	struct net_device *dev = skb->dev;
 	struct netdev_queue *txq;
@@ -2187,7 +2196,7 @@  int dev_queue_xmit(struct sk_buff *skb)
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
 #endif
 	if (q->enqueue) {
-		rc = __dev_xmit_skb(skb, q, dev, txq);
+		rc = __dev_xmit_skb(skb, q, dev, txq, try_no_consume);
 		goto out;
 	}
 
@@ -2239,7 +2248,7 @@  out:
 	rcu_read_unlock_bh();
 	return rc;
 }
-EXPORT_SYMBOL(dev_queue_xmit);
+EXPORT_SYMBOL(try_dev_queue_xmit);
 
 
 /*=======================================================================
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2aeb3a4..0692717 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -460,6 +460,24 @@  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
 	return qdisc_drop(skb, qdisc);
 }
 
+static int pfifo_fast_try_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+{
+	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
+		int band = prio2band[skb->priority & TC_PRIO_MAX];
+		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+		struct sk_buff_head *list = band2list(priv, band);
+
+		priv->bitmap |= (1 << band);
+		qdisc->q.qlen++;
+		return __qdisc_enqueue_tail(skb, qdisc, list);
+	}
+
+	/* no room to enqueue, tell calling code to back off.  Do NOT free skb, that is
+	 * calling code's to deal with.
+	 */
+	return NET_XMIT_BUSY;
+}
+
 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
 {
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
@@ -533,6 +551,7 @@  struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.id		=	"pfifo_fast",
 	.priv_size	=	sizeof(struct pfifo_fast_priv),
 	.enqueue	=	pfifo_fast_enqueue,
+	.try_enqueue	=	pfifo_fast_try_enqueue,
 	.dequeue	=	pfifo_fast_dequeue,
 	.peek		=	pfifo_fast_peek,
 	.init		=	pfifo_fast_init,
@@ -564,6 +583,7 @@  struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	spin_lock_init(&sch->busylock);
 	sch->ops = ops;
 	sch->enqueue = ops->enqueue;
+	sch->try_enqueue = ops->try_enqueue;
 	sch->dequeue = ops->dequeue;
 	sch->dev_queue = dev_queue;
 	dev_hold(qdisc_dev(sch));