
[v4,net-next] net: poll/select low latency socket support

Message ID 20130624072803.26134.41593.stgit@ladj378.jer.intel.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Eliezer Tamir June 24, 2013, 7:28 a.m. UTC
select/poll busy-poll support.

Split the sysctl value into two separate ones, one for read and one for poll.
Updated Documentation/sysctl/net.txt accordingly.

Add a new poll flag POLL_LL. When this flag is set, sock_poll will call
sk_poll_ll if possible. sock_poll sets this flag in its return value
to indicate to select/poll when a socket that can busy poll is found.

When poll/select have nothing to report, call the low-level
sock_poll again until we are out of time or we find something.

Once the system call finds something, it stops setting POLL_LL, so it can
return the result to the user ASAP.
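
For illustration, a minimal userspace sketch of the intended usage, assuming
the SO_LL socket option constant (later renamed SO_BUSY_POLL) is visible to
userspace; the helper name below is made up for the example:

/* Sketch only: opt one socket into busy polling, then wait with poll().
 * SO_LL and the sysctl name are assumptions taken from the patch text.
 */
#include <poll.h>
#include <sys/socket.h>

static int wait_for_data(int sock)
{
        unsigned int busy_usecs = 50;   /* spin up to ~50us in the driver */
        struct pollfd pfd = { .fd = sock, .events = POLLIN };

        /* Per-socket opt-in; poll()/select() will busy poll this socket
         * only while sysctl net.core.low_latency_poll is also non-zero. */
        setsockopt(sock, SOL_SOCKET, SO_LL, &busy_usecs, sizeof(busy_usecs));

        return poll(&pfd, 1, -1);       /* revents are reported as usual */
}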

Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
---

 Documentation/sysctl/net.txt    |   18 ++++++++++++++++--
 fs/select.c                     |   34 +++++++++++++++++++++++++++++-----
 include/net/ll_poll.h           |   35 ++++++++++++++++++++++-------------
 include/uapi/asm-generic/poll.h |    2 ++
 net/core/sock.c                 |    2 +-
 net/core/sysctl_net_core.c      |    8 ++++++++
 net/socket.c                    |   14 +++++++++++++-
 7 files changed, 91 insertions(+), 22 deletions(-)



Comments

David Miller June 25, 2013, 11:36 p.m. UTC | #1
From: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Date: Mon, 24 Jun 2013 10:28:03 +0300

> select/poll busy-poll support.
> 
> Split the sysctl value into two separate ones, one for read and one for poll.
> Updated Documentation/sysctl/net.txt accordingly.
> 
> Add a new poll flag POLL_LL. When this flag is set, sock_poll will call
> sk_poll_ll if possible. sock_poll sets this flag in its return value
> to indicate to select/poll when a socket that can busy poll is found.
> 
> When poll/select have nothing to report, call the low-level
> sock_poll again until we are out of time or we find something.
> 
> Once the system call finds something, it stops setting POLL_LL, so it can
> return the result to the user ASAP.
> 
> Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>

Looks good, applied, thanks!
Cody P Schafer June 28, 2013, 12:25 a.m. UTC | #2
On 06/24/2013 12:28 AM, Eliezer Tamir wrote:
> select/poll busy-poll support.
...
> diff --git a/fs/select.c b/fs/select.c
> index 8c1c96c..79b876e 100644
> --- a/fs/select.c
> +++ b/fs/select.c
> @@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
>   	poll_table *wait;
>   	int retval, i, timed_out = 0;
>   	unsigned long slack = 0;
> +	unsigned int ll_flag = POLL_LL;
> +	u64 ll_time = ll_end_time();
>
>   	rcu_read_lock();
>   	retval = max_select_fd(n, fds);
> @@ -750,6 +768,8 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
>   	ktime_t expire, *to = NULL;
>   	int timed_out = 0, count = 0;
>   	unsigned long slack = 0;
> +	unsigned int ll_flag = POLL_LL;
> +	u64 ll_time = ll_end_time();
>
>   	/* Optimise the no-wait case */
>   	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
> diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h
> index fcc7c36..5bf2b3a 100644
> --- a/include/net/ll_poll.h
> +++ b/include/net/ll_poll.h
> @@ -38,17 +39,18 @@ extern unsigned int sysctl_net_ll_poll __read_mostly;
>
>   /* we can use sched_clock() because we don't care much about precision
>    * we only care that the average is bounded
> + * we don't mind a ~2.5% imprecision so <<10 instead of *1000
> + * sk->sk_ll_usec is a u_int so this can't overflow
>    */
> -static inline u64 ll_end_time(struct sock *sk)
> +static inline u64 ll_sk_end_time(struct sock *sk)
>   {
> -	u64 end_time = ACCESS_ONCE(sk->sk_ll_usec);
> -
> -	/* we don't mind a ~2.5% imprecision
> -	 * sk->sk_ll_usec is a u_int so this can't overflow
> -	 */
> -	end_time = (end_time << 10) + sched_clock();
> +	return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + sched_clock();
> +}
>
> -	return end_time;
> +/* in poll/select we use the global sysctl_net_ll_poll value */
> +static inline u64 ll_end_time(void)
> +{
> +	return ((u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10) + sched_clock();
>   }
>
>   static inline bool sk_valid_ll(struct sock *sk)
> @@ -62,10 +64,13 @@ static inline bool can_poll_ll(u64 end_time)
>   	return !time_after64(sched_clock(), end_time);
>   }
>
> +/* when used in sock_poll() nonblock is known at compile time to be true
> + * so the loop and end_time will be optimized out
> + */
>   static inline bool sk_poll_ll(struct sock *sk, int nonblock)
>   {
> +	u64 end_time = nonblock ? 0 : ll_sk_end_time(sk);
>   	const struct net_device_ops *ops;
> -	u64 end_time = ll_end_time(sk);
>   	struct napi_struct *napi;
>   	int rc = false;
>

I'm seeing warnings about using smp_processor_id() while preemptable 
(log included below) due to this patch. I expect the use of 
ll_end_time() -> sched_clock() here is triggering this.

Apologies if this has already been noted.
--

# [    3.114452] BUG: using smp_processor_id() in preemptible [00000000] code: sh/62
[    3.117970] caller is native_sched_clock+0x20/0x80
[    3.120303] CPU: 0 PID: 62 Comm: sh Not tainted 3.10.0-rc6-dnuma-01032-g2d48d67 #21
[    3.123710] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[    3.128616]  0000000000000000 ffff880002b6baf0 ffffffff813c07d0 ffff880002b6bb08
[    3.135055]  ffffffff811ff835 00000004d076eeed ffff880002b6bb20 ffffffff81009ac0
[    3.137359]  0000000000000000 ffff880002b6bb30 ffffffff81009b29 ffff880002b6bf40
[    3.138954] Call Trace:
[    3.139466]  [<ffffffff813c07d0>] dump_stack+0x19/0x1b
[    3.140559]  [<ffffffff811ff835>] debug_smp_processor_id+0xd5/0xf0
[    3.141831]  [<ffffffff81009ac0>] native_sched_clock+0x20/0x80
[    3.143031]  [<ffffffff81009b29>] sched_clock+0x9/0x10
[    3.144127]  [<ffffffff811033a6>] do_sys_poll+0x1f6/0x500
[    3.145239]  [<ffffffff81009b29>] ? sched_clock+0x9/0x10
[    3.146335]  [<ffffffff81009ac0>] ? native_sched_clock+0x20/0x80
[    3.147557]  [<ffffffff8106cf5d>] ? sched_clock_local+0x1d/0x90
[    3.148816]  [<ffffffff81009ac0>] ? native_sched_clock+0x20/0x80
[    3.150007]  [<ffffffff81009b29>] ? sched_clock+0x9/0x10
[    3.151090]  [<ffffffff8106cf5d>] ? sched_clock_local+0x1d/0x90
[    3.152419]  [<ffffffff81009ac0>] ? native_sched_clock+0x20/0x80
[    3.153638]  [<ffffffff81009ac0>] ? native_sched_clock+0x20/0x80
[    3.154865]  [<ffffffff81009b29>] ? sched_clock+0x9/0x10
[    3.155961]  [<ffffffff8106cf5d>] ? sched_clock_local+0x1d/0x90
[    3.157230]  [<ffffffff8106d128>] ? sched_clock_cpu+0xa8/0x100
[    3.158433]  [<ffffffff81101af0>] ? SyS_getdents64+0x110/0x110
[    3.159628]  [<ffffffff81009ac0>] ? native_sched_clock+0x20/0x80
[    3.160916]  [<ffffffff81009b29>] ? sched_clock+0x9/0x10
[    3.162003]  [<ffffffff8106cf5d>] ? sched_clock_local+0x1d/0x90
[    3.163207]  [<ffffffff8106d128>] ? sched_clock_cpu+0xa8/0x100
[    3.164427]  [<ffffffff81084b39>] ? get_lock_stats+0x19/0x60
[    3.165580]  [<ffffffff81084fbe>] ? put_lock_stats.isra.28+0xe/0x40
[    3.166856]  [<ffffffff813c2415>] ? __mutex_unlock_slowpath+0x105/0x1a0
[    3.168412]  [<ffffffff81087c55>] ? trace_hardirqs_on_caller+0x105/0x1d0
[    3.169944]  [<ffffffff81087d2d>] ? trace_hardirqs_on+0xd/0x10
[    3.171155]  [<ffffffff813c24b9>] ? mutex_unlock+0x9/0x10
[    3.172355]  [<ffffffff81251fd3>] ? tty_ioctl+0xa53/0xd40
[    3.173483]  [<ffffffff8108ae28>] ? lock_release_non_nested+0x308/0x350
[    3.174848]  [<ffffffff81089bd6>] ? __lock_acquire+0x3d6/0xb70
[    3.176087]  [<ffffffff81087c55>] ? trace_hardirqs_on_caller+0x105/0x1d0
[    3.177466]  [<ffffffff81101205>] ? do_vfs_ioctl+0x305/0x510
[    3.178629]  [<ffffffff813c6959>] ? sysret_check+0x22/0x5d
[    3.179764]  [<ffffffff81087c55>] ? trace_hardirqs_on_caller+0x105/0x1d0
[    3.181196]  [<ffffffff81103770>] SyS_poll+0x60/0xf0
[    3.182225]  [<ffffffff813c692d>] system_call_fastpath+0x1a/0x1f



Cody P Schafer June 28, 2013, 12:29 a.m. UTC | #3
On 06/27/2013 05:25 PM, Cody P Schafer wrote:
> On 06/24/2013 12:28 AM, Eliezer Tamir wrote:
>> select/poll busy-poll support.
>> ...
>
> I'm seeing warnings about using smp_processor_id() while preemptable
> (log included below) due to this patch. I expect the use of
> ll_end_time() -> sched_clock() here is triggering this.
>
> Apologies if this has already been noted.

To be clear, given how sched_clock() is used here, the BUG appears 
invalid, but we need a way to mark this as OK for the smp_processor_id() 
checks so we don't get BUG spam.

> --
>
> # [    3.114452] BUG: using smp_processor_id() in preemptible [00000000] code: sh/62
...
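
Given how it is used here, one way to quiet the check (a sketch only, not
necessarily the fix that was eventually merged) is to sample the clock with
preemption disabled, since only a rough bound on the busy-poll loop is
needed; ll_sched_clock() below is a hypothetical wrapper in the spirit of
the helpers in include/net/ll_poll.h:

/* Hypothetical wrapper: avoid the debug_smp_processor_id() splat by
 * disabling preemption around the clock read.  Precision does not matter
 * here, only that the busy-poll loop is roughly bounded.
 */
static inline u64 ll_sched_clock(void)
{
        u64 now;

        preempt_disable();
        now = sched_clock();
        preempt_enable();
        return now;
}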
Andi Kleen June 28, 2013, 4:43 a.m. UTC | #4
> @@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
>  	poll_table *wait;
>  	int retval, i, timed_out = 0;
>  	unsigned long slack = 0;
> +	unsigned int ll_flag = POLL_LL;
> +	u64 ll_time = ll_end_time();

So you're adding a sched_clock to every select call, even if it has
nothing to do with ll? 

That seems rather drastic. select can be performance critical.

-andi
Eliezer Tamir June 28, 2013, 5:32 a.m. UTC | #5
On 28/06/2013 07:43, Andi Kleen wrote:
>> @@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
>>   	poll_table *wait;
>>   	int retval, i, timed_out = 0;
>>   	unsigned long slack = 0;
>> +	unsigned int ll_flag = POLL_LL;
>> +	u64 ll_time = ll_end_time();
>
> So you're adding a sched_clock to every select call, even if it has
> nothing to do with ll?
>
> That seems rather drastic. select can be performance critical.

Would the following be acceptable?

        unsigned int ll_flag = ll_poll_enabled(); /* returns POLL_LL if on */
        u64 ll_time = ll_flag ? ll_end_time() : 0;

and on the other side:

        if (ll_flag && can_poll_ll(ll_time))
                continue;


-Eliezer
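
One possible shape for the ll_poll_enabled() helper sketched above (a
hypothetical helper, not part of the applied patch) would report POLL_LL
only when the global sysctl is non-zero, so do_select()/do_poll() never
touch sched_clock() while busy polling is disabled:

/* Hypothetical helper, mirroring ll_end_time(): return POLL_LL only when
 * net.core.low_latency_poll is non-zero so the select/poll fast path skips
 * the sched_clock() read entirely otherwise.
 */
static inline unsigned int ll_poll_enabled(void)
{
        return ACCESS_ONCE(sysctl_net_ll_poll) ? POLL_LL : 0;
}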
Eliezer Tamir June 28, 2013, 6 a.m. UTC | #6
On 28/06/2013 03:29, Cody P Schafer wrote:
> On 06/27/2013 05:25 PM, Cody P Schafer wrote:
>> On 06/24/2013 12:28 AM, Eliezer Tamir wrote:
>>> select/poll busy-poll support.
>>> ...
>>
>> I'm seeing warnings about using smp_processor_id() while preemptable
>> (log included below) due to this patch. I expect the use of
>> ll_end_time() -> sched_clock() here is triggering this.
>>
>> Apologies if this has already been noted.
>
> To be clear, given how sched_clock() is used here, the BUG appears
> invalid, but we need a way to mark this as OK for the smp_processor_id()
> checks so we don't get BUG spam.
>
Thanks for reporting this.

-Eliezer

Patch

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 5369879..e658bbf 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -50,13 +50,27 @@  The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
-low_latency_poll
+low_latency_read
 ----------------
-Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
+Low latency busy poll timeout for socket reads. (needs CONFIG_NET_LL_RX_POLL)
 Approximate time in us to spin waiting for packets on the device queue.
+This sets the default value of the SO_LL socket option.
+Can be set or overridden per socket by setting socket option SO_LL.
 Recommended value is 50. May increase power usage.
 Default: 0 (off)
 
+low_latency_poll
+----------------
+Low latency busy poll timeout for poll and select. (needs CONFIG_NET_LL_RX_POLL)
+Approximate time in us to spin waiting for packets on the device queue.
+Recommended value depends on the number of sockets you poll on.
+For several sockets 50, for several hundreds 100.
+For more than that you probably want to use epoll.
+Note that only sockets with SO_LL set will be busy polled, so you want to either
+selectively set SO_LL on those sockets or set sysctl.net.low_latency_read globally.
+May increase power usage.
+Default: 0 (off)
+
 rmem_default
 ------------
 
diff --git a/fs/select.c b/fs/select.c
index 8c1c96c..79b876e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -27,6 +27,7 @@ 
 #include <linux/rcupdate.h>
 #include <linux/hrtimer.h>
 #include <linux/sched/rt.h>
+#include <net/ll_poll.h>
 
 #include <asm/uaccess.h>
 
@@ -384,9 +385,10 @@  get_max:
 #define POLLEX_SET (POLLPRI)
 
 static inline void wait_key_set(poll_table *wait, unsigned long in,
-				unsigned long out, unsigned long bit)
+				unsigned long out, unsigned long bit,
+				unsigned int ll_flag)
 {
-	wait->_key = POLLEX_SET;
+	wait->_key = POLLEX_SET | ll_flag;
 	if (in & bit)
 		wait->_key |= POLLIN_SET;
 	if (out & bit)
@@ -400,6 +402,8 @@  int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	poll_table *wait;
 	int retval, i, timed_out = 0;
 	unsigned long slack = 0;
+	unsigned int ll_flag = POLL_LL;
+	u64 ll_time = ll_end_time();
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -422,6 +426,7 @@  int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+		bool can_ll = false;
 
 		inp = fds->in; outp = fds->out; exp = fds->ex;
 		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -449,7 +454,8 @@  int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 					f_op = f.file->f_op;
 					mask = DEFAULT_POLLMASK;
 					if (f_op && f_op->poll) {
-						wait_key_set(wait, in, out, bit);
+						wait_key_set(wait, in, out,
+							     bit, ll_flag);
 						mask = (*f_op->poll)(f.file, wait);
 					}
 					fdput(f);
@@ -468,6 +474,11 @@  int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 						retval++;
 						wait->_qproc = NULL;
 					}
+					if (mask & POLL_LL)
+						can_ll = true;
+					/* got something, stop busy polling */
+					if (retval)
+						ll_flag = 0;
 				}
 			}
 			if (res_in)
@@ -486,6 +497,9 @@  int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			break;
 		}
 
+		if (can_ll && can_poll_ll(ll_time))
+			continue;
+
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to
@@ -717,7 +731,8 @@  struct poll_list {
  * pwait poll_table will be used by the fd-provided poll handler for waiting,
  * if pwait->_qproc is non-NULL.
  */
-static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+				     bool *can_ll, unsigned int ll_flag)
 {
 	unsigned int mask;
 	int fd;
@@ -731,7 +746,10 @@  static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 			mask = DEFAULT_POLLMASK;
 			if (f.file->f_op && f.file->f_op->poll) {
 				pwait->_key = pollfd->events|POLLERR|POLLHUP;
+				pwait->_key |= ll_flag;
 				mask = f.file->f_op->poll(f.file, pwait);
+				if (mask & POLL_LL)
+					*can_ll = true;
 			}
 			/* Mask out unneeded events. */
 			mask &= pollfd->events | POLLERR | POLLHUP;
@@ -750,6 +768,8 @@  static int do_poll(unsigned int nfds,  struct poll_list *list,
 	ktime_t expire, *to = NULL;
 	int timed_out = 0, count = 0;
 	unsigned long slack = 0;
+	unsigned int ll_flag = POLL_LL;
+	u64 ll_time = ll_end_time();
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -762,6 +782,7 @@  static int do_poll(unsigned int nfds,  struct poll_list *list,
 
 	for (;;) {
 		struct poll_list *walk;
+		bool can_ll = false;
 
 		for (walk = list; walk != NULL; walk = walk->next) {
 			struct pollfd * pfd, * pfd_end;
@@ -776,9 +797,10 @@  static int do_poll(unsigned int nfds,  struct poll_list *list,
 				 * this. They'll get immediately deregistered
 				 * when we break out and return.
 				 */
-				if (do_pollfd(pfd, pt)) {
+				if (do_pollfd(pfd, pt, &can_ll, ll_flag)) {
 					count++;
 					pt->_qproc = NULL;
+					ll_flag = 0;
 				}
 			}
 		}
@@ -795,6 +817,8 @@  static int do_poll(unsigned int nfds,  struct poll_list *list,
 		if (count || timed_out)
 			break;
 
+		if (can_ll && can_poll_ll(ll_time))
+			continue;
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to
diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h
index fcc7c36..5bf2b3a 100644
--- a/include/net/ll_poll.h
+++ b/include/net/ll_poll.h
@@ -30,6 +30,7 @@ 
 #ifdef CONFIG_NET_LL_RX_POLL
 
 struct napi_struct;
+extern unsigned int sysctl_net_ll_read __read_mostly;
 extern unsigned int sysctl_net_ll_poll __read_mostly;
 
 /* return values from ndo_ll_poll */
@@ -38,17 +39,18 @@  extern unsigned int sysctl_net_ll_poll __read_mostly;
 
 /* we can use sched_clock() because we don't care much about precision
  * we only care that the average is bounded
+ * we don't mind a ~2.5% imprecision so <<10 instead of *1000
+ * sk->sk_ll_usec is a u_int so this can't overflow
  */
-static inline u64 ll_end_time(struct sock *sk)
+static inline u64 ll_sk_end_time(struct sock *sk)
 {
-	u64 end_time = ACCESS_ONCE(sk->sk_ll_usec);
-
-	/* we don't mind a ~2.5% imprecision
-	 * sk->sk_ll_usec is a u_int so this can't overflow
-	 */
-	end_time = (end_time << 10) + sched_clock();
+	return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + sched_clock();
+}
 
-	return end_time;
+/* in poll/select we use the global sysctl_net_ll_poll value */
+static inline u64 ll_end_time(void)
+{
+	return ((u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10) + sched_clock();
 }
 
 static inline bool sk_valid_ll(struct sock *sk)
@@ -62,10 +64,13 @@  static inline bool can_poll_ll(u64 end_time)
 	return !time_after64(sched_clock(), end_time);
 }
 
+/* when used in sock_poll() nonblock is known at compile time to be true
+ * so the loop and end_time will be optimized out
+ */
 static inline bool sk_poll_ll(struct sock *sk, int nonblock)
 {
+	u64 end_time = nonblock ? 0 : ll_sk_end_time(sk);
 	const struct net_device_ops *ops;
-	u64 end_time = ll_end_time(sk);
 	struct napi_struct *napi;
 	int rc = false;
 
@@ -84,7 +89,6 @@  static inline bool sk_poll_ll(struct sock *sk, int nonblock)
 		goto out;
 
 	do {
-
 		rc = ops->ndo_ll_poll(napi);
 
 		if (rc == LL_FLUSH_FAILED)
@@ -95,8 +99,8 @@  static inline bool sk_poll_ll(struct sock *sk, int nonblock)
 			NET_ADD_STATS_BH(sock_net(sk),
 					 LINUX_MIB_LOWLATENCYRXPACKETS, rc);
 
-	} while (skb_queue_empty(&sk->sk_receive_queue)
-			&& can_poll_ll(end_time) && !nonblock);
+	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
+		 can_poll_ll(end_time));
 
 	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
@@ -118,7 +122,12 @@  static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
 
 #else /* CONFIG_NET_LL_RX_POLL */
 
-static inline u64 ll_end_time(struct sock *sk)
+static inline u64 sk_ll_end_time(struct sock *sk)
+{
+	return 0;
+}
+
+static inline u64 ll_end_time(void)
 {
 	return 0;
 }
diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h
index 9ce7f44..4aee586 100644
--- a/include/uapi/asm-generic/poll.h
+++ b/include/uapi/asm-generic/poll.h
@@ -30,6 +30,8 @@ 
 
 #define POLLFREE	0x4000	/* currently only for epoll */
 
+#define POLL_LL		0x8000
+
 struct pollfd {
 	int fd;
 	short events;
diff --git a/net/core/sock.c b/net/core/sock.c
index 1e744b1..b6c619f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2307,7 +2307,7 @@  void sock_init_data(struct socket *sock, struct sock *sk)
 
 #ifdef CONFIG_NET_LL_RX_POLL
 	sk->sk_napi_id		=	0;
-	sk->sk_ll_usec		=	sysctl_net_ll_poll;
+	sk->sk_ll_usec		=	sysctl_net_ll_read;
 #endif
 
 	/*
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 62702c2..afc677e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -306,6 +306,14 @@  static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "low_latency_read",
+		.data		= &sysctl_net_ll_read,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#
 #endif
 #endif /* CONFIG_NET */
 	{
diff --git a/net/socket.c b/net/socket.c
index 3eec3f7..4da14cb 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -107,6 +107,7 @@ 
 #include <net/ll_poll.h>
 
 #ifdef CONFIG_NET_LL_RX_POLL
+unsigned int sysctl_net_ll_read __read_mostly;
 unsigned int sysctl_net_ll_poll __read_mostly;
 #endif
 
@@ -1147,13 +1148,24 @@  EXPORT_SYMBOL(sock_create_lite);
 /* No kernel lock held - perfect */
 static unsigned int sock_poll(struct file *file, poll_table *wait)
 {
+	unsigned int ll_flag = 0;
 	struct socket *sock;
 
 	/*
 	 *      We can't return errors to poll, so it's either yes or no.
 	 */
 	sock = file->private_data;
-	return sock->ops->poll(file, sock, wait);
+
+	if (sk_valid_ll(sock->sk)) {
+		/* this socket can poll_ll so tell the system call */
+		ll_flag = POLL_LL;
+
+		/* once, only if requested by syscall */
+		if (wait && (wait->_key & POLL_LL))
+			sk_poll_ll(sock->sk, 1);
+	}
+
+	return ll_flag | sock->ops->poll(file, sock, wait);
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)