[RFC,1/5] net: implement support for low latency socket polling

Message ID 20130227175555.10611.42794.stgit@gitlad.jf.intel.com
State RFC, archived
Delegated to: David Miller

Commit Message

Eliezer Tamir Feb. 27, 2013, 5:55 p.m. UTC
Adds a new ndo_ll_poll method and the code that supports and uses it.
This method can be used by low latency applications to busy poll ethernet
device queues directly from the socket code. The ip_low_latency_poll sysctl
entry controls how many cycles to poll. Set to zero to disable.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
---

 include/linux/netdevice.h  |    3 ++
 include/linux/skbuff.h     |    4 ++
 include/net/ll_poll.h      |   71 ++++++++++++++++++++++++++++++++++++++++++++
 include/net/sock.h         |    3 ++
 net/core/datagram.c        |    6 ++++
 net/core/skbuff.c          |    4 ++
 net/core/sock.c            |    6 ++++
 net/ipv4/Kconfig           |   12 +++++++
 net/ipv4/sysctl_net_ipv4.c |   10 ++++++
 net/socket.c               |   25 +++++++++++++++
 10 files changed, 143 insertions(+), 1 deletions(-)
 create mode 100644 include/net/ll_poll.h



Comments

Eric Dumazet March 3, 2013, 6:35 p.m. UTC | #1
On Wed, 2013-02-27 at 09:55 -0800, Eliezer Tamir wrote:

> index 821c7f4..d1d1016 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -408,6 +408,10 @@ struct sk_buff {
>  	struct sock		*sk;
>  	struct net_device	*dev;
>  
> +#ifdef CONFIG_INET_LL_RX_POLL
> +	struct napi_struct	*dev_ref; /* where this skb came from */
> +#endif
> +
>  	/*
>  	 * This is the control buffer. It is free to use for every
>  	 * layer. Please put your private variables there. If you

Yes, that's the killer, because:

1) It adds 8 bytes per skb, and we are going to reach the 256 bytes per
sk_buff boundary. Cloned skbs will use an extra cache line.

It might make sense to union this with dma_cookie, as dma_cookie is only
used on the TX path.
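
Roughly something like this (sketch only, Kconfig guards omitted;
dma_cookie is the existing NET_DMA field, dev_ref the field added by
this patch):

	/* inside struct sk_buff */
	union {
		dma_cookie_t		dma_cookie;	/* TX completion cookie, TX path only */
		struct napi_struct	*dev_ref;	/* RX: napi this skb came from */
	};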

2) We need to reference count napi structs.

For 2), we would need to add a percpu ref counter (a bit like struct
net_device->pcpu_refcnt).

An alternative to 2) would be to use a generation id, incremented every
time a napi used in a spin-polling-enabled driver is dismantled (and freed
after an RCU grace period).

And store in sockets not only the pointer to the napi_struct, but also the
current generation id: if the generation id doesn't match, disable
the spinpoll until the next packet rebuilds the cache again.
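
Roughly (sketch only, none of these names exist in the posted patch):

	/* bumped whenever a napi used for spin polling is dismantled */
	static atomic_t napi_gen_id = ATOMIC_INIT(0);

	struct sk_ll_ref {
		struct napi_struct	*napi;		/* cached poll target */
		int			gen_id;		/* napi_gen_id at cache time */
	};

	/* RX path: (re)build the cache from the incoming skb */
	static inline void sk_ll_ref_set(struct sk_ll_ref *ref,
					 struct napi_struct *napi)
	{
		ref->napi = napi;
		ref->gen_id = atomic_read(&napi_gen_id);
	}

	/* poll path: only spin if the cached napi is still current */
	static inline bool sk_ll_ref_valid(const struct sk_ll_ref *ref)
	{
		return ref->napi && ref->gen_id == atomic_read(&napi_gen_id);
	}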



Andi Kleen March 3, 2013, 7:21 p.m. UTC | #2
> An alternative to 2) would be to use a generation id, incremented every
> time a napi used in a spin-polling-enabled driver is dismantled (and freed
> after an RCU grace period).
>
> And store in sockets not only the pointer to the napi_struct, but also the
> current generation id: if the generation id doesn't match, disable
> the spinpoll until the next packet rebuilds the cache again.

This would require rcu_read_lock, aka preempt off, during polling, right?

-Andi
Eric Dumazet March 3, 2013, 9:20 p.m. UTC | #3
On Sun, 2013-03-03 at 20:21 +0100, Andi Kleen wrote:
> > An alternative to 2) would be to use a generation id, incremented every
> > time a napi used in a spin-polling-enabled driver is dismantled (and freed
> > after an RCU grace period).
> >
> > And store in sockets not only the pointer to the napi_struct, but also the
> > current generation id: if the generation id doesn't match, disable
> > the spinpoll until the next packet rebuilds the cache again.
> 
> This would require rcu_read_lock, aka preempt off, during polling, right?
> 

Of course, polling probably needs BH disabling as well to get the
per-napi lock.
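
i.e. something like (illustrative only; sk_poll_ll() as in the patch,
with its internal BH handling hoisted out here for clarity):

	static bool sk_busy_poll_once(struct sock *sk)
	{
		bool found;

		rcu_read_lock();	/* pins the napi until the grace period */
		local_bh_disable();	/* lets the driver take its per-napi lock */
		found = sk_poll_ll(sk);
		local_bh_enable();
		rcu_read_unlock();

		return found;
	}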


Andi Kleen March 4, 2013, 3:55 a.m. UTC | #4
On Sun, Mar 03, 2013 at 01:20:01PM -0800, Eric Dumazet wrote:
> On Sun, 2013-03-03 at 20:21 +0100, Andi Kleen wrote:
> > > An alternative to 2) would be to use a generation id, incremented every
> > > time a napi used in a spin-polling-enabled driver is dismantled (and freed
> > > after an RCU grace period).
> > >
> > > And store in sockets not only the pointer to the napi_struct, but also the
> > > current generation id: if the generation id doesn't match, disable
> > > the spinpoll until the next packet rebuilds the cache again.
> > 
> > This would require rcu_read_lock, aka preempt off, during polling, right?
> > 
> 
> Of course, polling probably needs BH disabling as well to get the
> per-napi lock.

OK, maybe the cond_resched() is good enough.

-Andi
Eliezer Tamir March 4, 2013, 8:43 a.m. UTC | #5
On 03/03/2013 20:35, Eric Dumazet wrote:
> On Wed, 2013-02-27 at 09:55 -0800, Eliezer Tamir wrote:
>
>> index 821c7f4..d1d1016 100644
>> --- a/include/linux/skbuff.h
>> +++ b/include/linux/skbuff.h
>> @@ -408,6 +408,10 @@ struct sk_buff {
>>   	struct sock		*sk;
>>   	struct net_device	*dev;
>>
>> +#ifdef CONFIG_INET_LL_RX_POLL
>> +	struct napi_struct	*dev_ref; /* where this skb came from */
>> +#endif
>> +
>>   	/*
>>   	 * This is the control buffer. It is free to use for every
>>   	 * layer. Please put your private variables there. If you
>
> Yes, that's the killer, because:
>
> 1) It adds 8 bytes per skb, and we are going to reach the 256 bytes per
> sk_buff boundary. Cloned skbs will use an extra cache line.
>
> It might make sense to union this with dma_cookie, as dma_cookie is only
> used on the TX path.

I will try this out.

> 2) We need to reference count napi structs.
>
> For 2), we would need to add a percpu ref counter (a bit like struct
> net_device->pcpu_refcnt).
>
> An alternative to 2) would be to use a generation id, incremented every
> time a napi used in a spin-polling-enabled driver is dismantled (and freed
> after an RCU grace period).

I like this option, because one would assume that the life expectancy of 
a napi is rather long. We can just increment the generation id any time 
any napi is disabled, which simplifies things.
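
For example, a hypothetical wrapper like this (reusing the napi_gen_id
counter sketched above, not something in the posted patch):

	/* disable a napi and invalidate every cached per-socket reference */
	static inline void napi_disable_ll(struct napi_struct *n)
	{
		atomic_inc(&napi_gen_id);
		napi_disable(n);
	}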

There could be other configuration changes that would make our notion of
where to poll outdated; for example, someone may have reprogrammed an RX
filter. This is not as catastrophic as a napi going away, but still.

Would it make sense to make this a generic mechanism?
One could, for example, increment the generation id every time the RTNL is
taken. Or is this too much?

Thanks,
Eliezer
Eric Dumazet March 4, 2013, 2:52 p.m. UTC | #6
On Mon, 2013-03-04 at 10:43 +0200, Eliezer Tamir wrote:

> One could, for example, increment the generation id every time the RTNL is
> taken. Or is this too much?

RTNL is taken for a lot of operations; it would be better to have a
finer-grained increment.


Eliezer Tamir March 4, 2013, 3:28 p.m. UTC | #7
On 04/03/2013 16:52, Eric Dumazet wrote:
> On Mon, 2013-03-04 at 10:43 +0200, Eliezer Tamir wrote:
>
>> One could, for example, increment the generation id every time the RTNL is
>> taken. Or is this too much?
>
> RTNL is taken for a lot of operations; it would be better to have a
> finer-grained increment.

If it is taken rarely enough, it will still be worth it.

Otherwise it may be hard to know what operations need to invalidate the 
napi reference. It can very well be HW dependent, and then you end up 
adding a function for drivers to call to do the invalidation.

Or we can decide that we only care about catastrophic events and only 
worry about a napi completely going away and not worry about 
configuration changes. (Polling the wrong queue will not kill you, it's
just a waste of perfectly good CPU cycles.)
Eric Dumazet March 4, 2013, 4:15 p.m. UTC | #8
On Mon, 2013-03-04 at 17:28 +0200, Eliezer Tamir wrote:
> On 04/03/2013 16:52, Eric Dumazet wrote:
> > On Mon, 2013-03-04 at 10:43 +0200, Eliezer Tamir wrote:
> >
> >> One could, for example, increment the generation id every time the RTNL is
> >> taken. Or is this too much?
> >
> > RTNL is taken for a lot of operations; it would be better to have a
> > finer-grained increment.
> 
> If it is taken rarely enough, it will still be worth it.
> 

Yes, but eventually it makes attempts to get rid of RTNL a nightmare.

When adding new network features, just use the right semantics from the
beginning.

> Otherwise it may be hard to know what operations need to invalidate the 
> napi reference. It can very well be HW dependent, and then you end up 
> adding a function for drivers to call to do the invalidation.
> 
> Or we can decide that we only care about catastrophic events and only 
> worry about a napi completely going away and not worry about 
> configuration changes. (Polling the wrong queue will not kill you, it's
> just a waste of perfectly good CPU cycles.)

As long as the incoming packets are able to update the information, who
cares if one packet missed the poll?
 


Ben Hutchings March 5, 2013, 4:43 p.m. UTC | #9
On Wed, 2013-02-27 at 09:55 -0800, Eliezer Tamir wrote:
> Adds a new ndo_ll_poll method and the code that supports and uses it.
> This method can be used by low latency applications to busy poll ethernet
> device queues directly from the socket code. The ip_low_latency_poll sysctl
> entry controls how many cycles to poll. Set to zero to disable.
[...]
> --- /dev/null
> +++ b/include/net/ll_poll.h
> @@ -0,0 +1,71 @@
> +/*
> + * low latency device queue flush
> + */
> +
> +#ifndef _LINUX_NET_LL_POLL_H
> +#define _LINUX_NET_LL_POLL_H
> +#ifdef CONFIG_INET_LL_RX_POLL
> +#include <linux/netdevice.h>
> +struct napi_struct;
> +extern int sysctl_net_ll_poll __read_mostly;
> +
> +/* return values from ndo_ll_poll */
> +#define LL_FLUSH_DONE		0
> +#define LL_FLUSH_FAILED		1
> +#define LL_FLUSH_BUSY		2
> +
> +static inline int sk_valid_ll(struct sock *sk)

bool

> +{
> +	return sysctl_net_ll_poll && sk->dev_ref &&
> +		!need_resched() && !signal_pending(current);
> +}
> +
> +/*
> + * TODO: how do we know that we have a working get_cycles?
> + * do we limit this by a configure dependency?

In general it appears to require a run-time check.  You might need to
augment <asm/timex.h>.

> + * TODO: this is not safe when the device can be removed,
> + * but simple refcounting may prevent removal indefinitely
> + */
> +static inline int sk_poll_ll(struct sock *sk)
> +{
> +	struct napi_struct *napi = sk->dev_ref;
> +	const struct net_device_ops *ops;
> +	unsigned long end_time = sysctl_net_ll_poll + get_cycles();

ACCESS_ONCE(sysctl_net_ll_poll)

> +	if (!napi->dev || !napi->dev->netdev_ops ||
> +	    !napi->dev->netdev_ops->ndo_ll_poll)
> +		return false;
> +
> +	local_bh_disable();
> +
> +	ops = napi->dev->netdev_ops;
> +	while (skb_queue_empty(&sk->sk_receive_queue) &&
> +			!time_after((unsigned long)get_cycles(), end_time))

cycles_t may be narrower than unsigned long, in which case time_after()
will not compare correctly.  I think you need to open-code the
equivalent of time_after() but using cycles_t.
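
For example (sketch only, assuming cycles_t is an unsigned 64-bit
counter such as the x86 TSC; the signed cast of the difference handles
wrap-around the same way time_after() does):

	static inline bool ll_cycles_after(cycles_t a, cycles_t b)
	{
		return (s64)(a - b) > 0;
	}

The loop condition would then use !ll_cycles_after(get_cycles(), end_time)
with end_time kept as a cycles_t.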

> +		if (ops->ndo_ll_poll(napi) == LL_FLUSH_FAILED)
> +				break; /* permanent failure */
> +
> +	local_bh_enable();
> +
> +	return !skb_queue_empty(&sk->sk_receive_queue);
> +}
> +
> +static inline void skb_mark_ll(struct napi_struct *napi, struct sk_buff *skb)
> +{
> +	skb->dev_ref = napi;
> +}

Slightly odd - I would expect skb to be the first parameter.

[...]
> --- a/net/ipv4/Kconfig
> +++ b/net/ipv4/Kconfig
> @@ -402,6 +402,18 @@ config INET_LRO
>  
>  	  If unsure, say Y.
>  
> +config INET_LL_RX_POLL
> +	bool "Low Latency Receive Poll"
> +	default n
> +	---help---
> +	  Support Low Latency Receive Queue Poll.
> +	  (For network card drivers which support this option.)
> +	  When waiting for data in read or poll call directly into the device driver
> +	  to flush packets which may be pending on the device queues into the stack.
> +
> +
> +	  If unsure, say N.

Of course, all distributions will be expected to enable this.  So I'm
not sure what the point of the compile-time option is.  You might as
well enable it at compile-time but leave the default set to 0.

>  config INET_DIAG
>  	tristate "INET: socket monitoring interface"
>  	default y
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 960fd29..0c060c6 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -25,6 +25,7 @@
>  #include <net/inet_frag.h>
>  #include <net/ping.h>
>  #include <net/tcp_memcontrol.h>
> +#include <net/ll_poll.h>
>  
>  static int zero;
>  static int one = 1;
> @@ -326,6 +327,15 @@ static struct ctl_table ipv4_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec
>  	},
> +#ifdef CONFIG_INET_LL_RX_POLL
> +	{
> +		.procname	= "ip_low_latency_poll",
> +		.data		= &sysctl_net_ll_poll,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec
> +	},
> +#endif

This would need to be added to Documentation/networking/ip-sysctl.txt.

Should the units really be cycles or, say, microseconds?  I assume that
a sysctl setter can do a conversion to cycles so that there's no need to
multiply every time the value is used.  (If the CPU doesn't have
constant_tsc or equivalent then this conversion doesn't quite work, but
then low-latency tuning usually includes disabling frequency scaling.)
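
For instance (hypothetical sketch only; tsc_khz is x86-specific and
assumes a constant-rate TSC, and none of these names are in the patch):

	/* userspace writes microseconds, cache the cycle count once per write */
	static unsigned long net_ll_poll_cycles __read_mostly;

	static int proc_net_ll_poll_us(struct ctl_table *table, int write,
				       void __user *buffer, size_t *lenp,
				       loff_t *ppos)
	{
		int ret = proc_dointvec(table, write, buffer, lenp, ppos);

		if (!ret && write)
			net_ll_poll_cycles =
				(u64)sysctl_net_ll_poll * tsc_khz / 1000;

		return ret;
	}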

Also, this should be a per-device (or even per-NAPI-context?) setting.

>  	{
>  		.procname	= "tcp_syn_retries",
>  		.data		= &sysctl_tcp_syn_retries,
> diff --git a/net/socket.c b/net/socket.c
> index ee0d029..86da082 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -105,6 +105,12 @@
>  #include <linux/sockios.h>
>  #include <linux/atalk.h>
>  
> +#ifdef CONFIG_INET_LL_RX_POLL
> +#include <net/ll_poll.h>
> +int sysctl_net_ll_poll __read_mostly = 150000;

Nicely tuned for your specific test system, no doubt. :-)

> +EXPORT_SYMBOL_GPL(sysctl_net_ll_poll);
> +#endif
[...]

Ben.
Eliezer Tamir March 5, 2013, 5:15 p.m. UTC | #10
On 05/03/2013 18:43, Ben Hutchings wrote:
> On Wed, 2013-02-27 at 09:55 -0800, Eliezer Tamir wrote:
>
> Should the units really be cycles or, say, microseconds?  I assume that
> a sysctl setter can do a conversion to cycles so that there's no need to
> multiply every time the value is used.  (If the CPU doesn't have
> constant_tsc or equivalent then this conversion doesn't quite work, but
> then low-latency tuning usually includes disabling frequency scaling.)

We are not very sensitive to this setting; anything on the order of your
half round-trip time plus a few standard deviations works well.
We are busy waiting, so setting a higher value does not change the 
results much.

It does make sense to have this in time units, and it might not matter if a
non-constant cycle rate messes with the conversion too much.

BTW on my machines enabling frequency scaling improves performance in 
many cases.

> Also, this should be a per-device (or even per-NAPI-context?) setting.

Again, I would expect this to depend more on your workload than on the 
NIC, so I would keep this global.
User knobs should be as simple as possible.

>> +int sysctl_net_ll_poll __read_mostly = 150000;
> Nicely tuned for your specific test system, no doubt. :-)

Why don't you try this on your NIC and see ;-)

Thanks for the input,
Eliezer
David Miller March 5, 2013, 7:55 p.m. UTC | #11
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 5 Mar 2013 16:43:01 +0000

> In general it appears to require a run-time check.  You might need to
> augment <asm/timex.h>.

On the other hand, unlike get_cycles, sched_clock() is always available.
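
The deadline handling could then look something like this (sketch only;
the sysctl would hold nanoseconds rather than cycles):

	static inline u64 ll_end_time(void)
	{
		return sched_clock() + (u64)ACCESS_ONCE(sysctl_net_ll_poll);
	}

	static inline bool ll_poll_expired(u64 end_time)
	{
		return sched_clock() > end_time;
	}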
David Miller March 5, 2013, 7:57 p.m. UTC | #12
From: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Date: Tue, 05 Mar 2013 19:15:26 +0200

> We are not very sensitive to this setting; anything on the order of
> your half round-trip time plus a few standard deviations works well.
> We are busy waiting, so setting a higher value does not change the
> results much.

This makes the argument for using sched_clock() even stronger.
H. Peter Anvin March 5, 2013, 8:03 p.m. UTC | #13
On 03/05/2013 11:55 AM, David Miller wrote:
> From: Ben Hutchings <bhutchings@solarflare.com>
> Date: Tue, 5 Mar 2013 16:43:01 +0000
> 
>> In general it appears to require a run-time check.  You might need to
>> augment <asm/timex.h>.
> 
> On the other hand, unlike get_cycles, sched_clock() is always available.
> 

On the gripping hand, we need to know when it uses something like
jiffies, in which case we probably need to disable the whole interface.

	-hpa


Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b3d00fa..c6f2a9a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -945,6 +945,9 @@  struct net_device_ops {
 						     gfp_t gfp);
 	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
 #endif
+#ifdef CONFIG_INET_LL_RX_POLL
+	int			(*ndo_ll_poll)(struct napi_struct *dev);
+#endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
 	int			(*ndo_set_vf_vlan)(struct net_device *dev,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 821c7f4..d1d1016 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -408,6 +408,10 @@  struct sk_buff {
 	struct sock		*sk;
 	struct net_device	*dev;
 
+#ifdef CONFIG_INET_LL_RX_POLL
+	struct napi_struct	*dev_ref; /* where this skb came from */
+#endif
+
 	/*
 	 * This is the control buffer. It is free to use for every
 	 * layer. Please put your private variables there. If you
diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h
new file mode 100644
index 0000000..3c7bcec
--- /dev/null
+++ b/include/net/ll_poll.h
@@ -0,0 +1,71 @@ 
+/*
+ * low latency device queue flush
+ */
+
+#ifndef _LINUX_NET_LL_POLL_H
+#define _LINUX_NET_LL_POLL_H
+#ifdef CONFIG_INET_LL_RX_POLL
+#include <linux/netdevice.h>
+struct napi_struct;
+extern int sysctl_net_ll_poll __read_mostly;
+
+/* return values from ndo_ll_poll */
+#define LL_FLUSH_DONE		0
+#define LL_FLUSH_FAILED		1
+#define LL_FLUSH_BUSY		2
+
+static inline int sk_valid_ll(struct sock *sk)
+{
+	return sysctl_net_ll_poll && sk->dev_ref &&
+		!need_resched() && !signal_pending(current);
+}
+
+/*
+ * TODO: how do we know that we have a working get_cycles?
+ * do we limit this by a configure dependency?
+ * TODO: this is not safe when the device can be removed,
+ * but simple refcounting may prevent removal indefinitely
+ */
+static inline int sk_poll_ll(struct sock *sk)
+{
+	struct napi_struct *napi = sk->dev_ref;
+	const struct net_device_ops *ops;
+	unsigned long end_time = sysctl_net_ll_poll + get_cycles();
+
+	if (!napi->dev || !napi->dev->netdev_ops ||
+	    !napi->dev->netdev_ops->ndo_ll_poll)
+		return false;
+
+	local_bh_disable();
+
+	ops = napi->dev->netdev_ops;
+	while (skb_queue_empty(&sk->sk_receive_queue) &&
+			!time_after((unsigned long)get_cycles(), end_time))
+		if (ops->ndo_ll_poll(napi) == LL_FLUSH_FAILED)
+				break; /* permanent failure */
+
+	local_bh_enable();
+
+	return !skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static inline void skb_mark_ll(struct napi_struct *napi, struct sk_buff *skb)
+{
+	skb->dev_ref = napi;
+}
+
+static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
+{
+	if (skb->dev_ref)
+		sk->dev_ref = skb->dev_ref;
+
+}
+#else /* CONFIG_INET_LL_RX_FLUSH */
+
+#define sk_valid_ll(sk) 0
+#define sk_poll_ll(sk) do {} while (0)
+#define skb_mark_ll(napi, skb) do {} while (0)
+#define sk_mark_ll(sk, skb) do {} while (0)
+
+#endif /* CONFIG_INET_LL_RX_FLUSH */
+#endif /* _LINUX_NET_LL_POLL_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index a66caa2..13dd743 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -399,6 +399,9 @@  struct sock {
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
 	void                    (*sk_destruct)(struct sock *sk);
+#ifdef CONFIG_INET_LL_RX_POLL
+	struct napi_struct	*dev_ref;
+#endif
 };
 
 /*
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 368f9c3..14ad733 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -56,6 +56,7 @@ 
 #include <net/sock.h>
 #include <net/tcp_states.h>
 #include <trace/events/skb.h>
+#include <net/ll_poll.h>
 
 /*
  *	Is a socket 'connection oriented' ?
@@ -196,11 +197,16 @@  struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 			} else
 				__skb_unlink(skb, queue);
 
+			sk_mark_ll(sk, skb);
 			spin_unlock_irqrestore(&queue->lock, cpu_flags);
 			return skb;
 		}
 		spin_unlock_irqrestore(&queue->lock, cpu_flags);
 
+#ifdef CONFIG_INET_LL_RX_POLL
+		if (sk_valid_ll(sk) && sk_poll_ll(sk))
+			continue;
+#endif
 		/* User doesn't want to wait */
 		error = -EAGAIN;
 		if (!timeo)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 33245ef..3fa650e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -709,6 +709,10 @@  static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->vlan_tci		= old->vlan_tci;
 
 	skb_copy_secmark(new, old);
+
+#ifdef CONFIG_INET_LL_RX_POLL
+	new->dev_ref		= old->dev_ref;
+#endif
 }
 
 /*
diff --git a/net/core/sock.c b/net/core/sock.c
index b261a79..e752670 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,6 +139,8 @@ 
 #include <net/tcp.h>
 #endif
 
+#include <net/ll_poll.h>
+
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
@@ -2290,6 +2292,10 @@  void sock_init_data(struct socket *sock, struct sock *sk)
 
 	sk->sk_stamp = ktime_set(-1L, 0);
 
+#ifdef CONFIG_INET_LL_RX_POLL
+	sk->dev_ref	=	NULL;
+#endif
+
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7944df7..e52f011 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -402,6 +402,18 @@  config INET_LRO
 
 	  If unsure, say Y.
 
+config INET_LL_RX_POLL
+	bool "Low Latency Receive Poll"
+	default n
+	---help---
+	  Support Low Latency Receive Queue Poll.
+	  (For network card drivers which support this option.)
+	  When waiting for data in read or poll call directly into the device driver
+	  to flush packets which may be pending on the device queues into the stack.
+
+
+	  If unsure, say N.
+
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 960fd29..0c060c6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -25,6 +25,7 @@ 
 #include <net/inet_frag.h>
 #include <net/ping.h>
 #include <net/tcp_memcontrol.h>
+#include <net/ll_poll.h>
 
 static int zero;
 static int one = 1;
@@ -326,6 +327,15 @@  static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_INET_LL_RX_POLL
+	{
+		.procname	= "ip_low_latency_poll",
+		.data		= &sysctl_net_ll_poll,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
 	{
 		.procname	= "tcp_syn_retries",
 		.data		= &sysctl_tcp_syn_retries,
diff --git a/net/socket.c b/net/socket.c
index ee0d029..86da082 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -105,6 +105,12 @@ 
 #include <linux/sockios.h>
 #include <linux/atalk.h>
 
+#ifdef CONFIG_INET_LL_RX_POLL
+#include <net/ll_poll.h>
+int sysctl_net_ll_poll __read_mostly = 150000;
+EXPORT_SYMBOL_GPL(sysctl_net_ll_poll);
+#endif
+
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos);
@@ -1157,12 +1163,29 @@  EXPORT_SYMBOL(sock_create_lite);
 static unsigned int sock_poll(struct file *file, poll_table *wait)
 {
 	struct socket *sock;
+	unsigned int poll_result;
 
 	/*
 	 *      We can't return errors to poll, so it's either yes or no.
 	 */
 	sock = file->private_data;
-	return sock->ops->poll(file, sock, wait);
+
+	poll_result = sock->ops->poll(file, sock, wait);
+
+#ifdef CONFIG_INET_LL_RX_POLL
+	if (wait &&
+	    !(poll_result & (POLLRDNORM | POLLERR | POLLRDHUP | POLLHUP))) {
+
+		struct sock *sk = sock->sk;
+
+		/* only try once per poll */
+		if (sk_valid_ll(sk) && sk_poll_ll(sk))
+			poll_result = sock->ops->poll(file, sock, wait);
+
+	}
+#endif /* CONFIG_INET_LL_RX_POLL */
+
+	return poll_result;
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)