
[v2,net-next] pkt_sched: fq: Fair Queue packet scheduler

Message ID 1378294029.7360.92.camel@edumazet-glaptop
State RFC, archived
Delegated to: David Miller

Commit Message

Eric Dumazet Sept. 4, 2013, 11:27 a.m. UTC
On Wed, 2013-09-04 at 03:30 -0700, Eric Dumazet wrote:
> On Wed, 2013-09-04 at 14:30 +0800, Jason Wang wrote:
> 
> > > And tcpdump would certainly help ;)
> > 
> > See attachment.
> > 
> 
> Nothing obvious on tcpdump (only that lot of frames are missing)
> 
> 1) Are you capturing part of the payload only (like tcpdump -s 128)
> 
> 2) What is the setup.
> 
> 3) tc -s -d qdisc

If you use FQ in the guest, then it could be that high resolution timers
have high latency?

So FQ arms short timers, but their effective duration could be much longer.

Here I get a smoothed latency of ~3 us:

lpq83:~# ./netperf -H lpq84 ; ./tc -s -d qd ; dmesg | tail -n1
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to lpq84.prod.google.com () port 0 AF_INET
Recv   Send    Send                          
Socket Socket  Message  Elapsed              
Size   Size    Size     Time     Throughput  
bytes  bytes   bytes    secs.    10^6bits/sec  

 87380  16384  16384    10.00    9410.82   
qdisc fq 8005: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 1024 quantum 3028 initial_quantum 15140 
 Sent 50545633991 bytes 33385894 pkt (dropped 0, overlimits 0 requeues 19) 
 rate 9258Mbit 764335pps backlog 0b 0p requeues 19 
  117 flow, 115 inactive, 0 throttled
  0 gc, 0 highprio, 0 retrans, 96861 throttled, 0 flows_plimit
[  572.551664] latency = 3035 ns
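
(For reference, the "latency" printed here is the value accumulated by the
debugging patch below: a scaled EWMA, kept the same way TCP smooths srtt, of
how late fq_check_throttled() runs relative to the first throttled flow's
scheduled departure time. A minimal user-space sketch of the same arithmetic,
with made-up sample values:)

/* Not kernel code: the smoothing the debugging patch applies.
 * slatency stores 8x the estimate, and the printed value is slatency >> 3.
 */
#include <stdio.h>
#include <stdint.h>

static int64_t slatency;	/* 8 * smoothed timer latency, in ns */

static void update_slatency(int64_t now_ns, int64_t scheduled_ns)
{
	int64_t delay = now_ns - scheduled_ns;	/* how late the flow was released */

	delay -= slatency >> 3;			/* error vs current estimate */
	slatency += delay;			/* gain of 1/8 per sample */
}

int main(void)
{
	int i;

	/* feed a constant 3000 ns lateness; the estimate settles near 3000 */
	for (i = 0; i < 64; i++)
		update_slatency(3000, 0);

	printf("latency = %lld ns\n", (long long)(slatency >> 3));
	return 0;
}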


What do you get with this debugging patch?




Comments

Daniel Borkmann Sept. 4, 2013, 11:59 a.m. UTC | #1
On 09/04/2013 01:27 PM, Eric Dumazet wrote:
> On Wed, 2013-09-04 at 03:30 -0700, Eric Dumazet wrote:
>> On Wed, 2013-09-04 at 14:30 +0800, Jason Wang wrote:
>>
>>>> And tcpdump would certainly help ;)
>>>
>>> See attachment.
>>>
>>
>> Nothing obvious on tcpdump (only that lot of frames are missing)
>>
>> 1) Are you capturing part of the payload only (like tcpdump -s 128)
>>
>> 2) What is the setup.
>>
>> 3) tc -s -d qdisc
>
> If you use FQ in the guest, then it could be that high resolution timers
> have high latency ?

Probably hrtimers internally switch to a lower resolution clock event source if
there's no hardware support available:

   The [source event] management layer provides interfaces for hrtimers to
   implement high resolution timers [...] [and it] supports these more advanced
   functions only when appropriate clock event sources have been registered,
   otherwise the traditional periodic tick based behaviour is retained. [1]

[1] https://www.kernel.org/doc/ols/2006/ols2006v1-pages-333-346.pdf
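
A quick way to check this from inside the guest (a user-space sketch only, not
part of the patch) is to query the effective CLOCK_MONOTONIC resolution and
time a short sleep:

/* clock_getres() reports ~1 ns when high resolution timers are active and
 * about 1e9/HZ ns when the periodic tick fallback is in use.  Non-realtime
 * tasks also get ~50 us of default timer slack, so treat the measured
 * overshoot as an upper bound on the hrtimer latency itself.
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec res, start, end;
	struct timespec req = { .tv_sec = 0, .tv_nsec = 10 * 1000 };	/* 10 us */
	long long slept;

	clock_getres(CLOCK_MONOTONIC, &res);
	printf("CLOCK_MONOTONIC resolution: %ld ns\n", res.tv_nsec);

	clock_gettime(CLOCK_MONOTONIC, &start);
	clock_nanosleep(CLOCK_MONOTONIC, 0, &req, NULL);
	clock_gettime(CLOCK_MONOTONIC, &end);

	slept = (end.tv_sec - start.tv_sec) * 1000000000LL +
		(end.tv_nsec - start.tv_nsec);
	printf("asked for 10000 ns, slept %lld ns\n", slept);
	return 0;
}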

> So FQ arms short timers, but effective duration could be much longer.
>
> Here I get a smooth latency of up to ~3 us
>
> lpq83:~# ./netperf -H lpq84 ; ./tc -s -d qd ; dmesg | tail -n1
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to lpq84.prod.google.com () port 0 AF_INET
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
>
>   87380  16384  16384    10.00    9410.82
> qdisc fq 8005: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 1024 quantum 3028 initial_quantum 15140
>   Sent 50545633991 bytes 33385894 pkt (dropped 0, overlimits 0 requeues 19)
>   rate 9258Mbit 764335pps backlog 0b 0p requeues 19
>    117 flow, 115 inactive, 0 throttled
>    0 gc, 0 highprio, 0 retrans, 96861 throttled, 0 flows_plimit
> [  572.551664] latency = 3035 ns
>
>
> What do you get with this debugging patch ?
>
> diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
> index 32ad015..c1312a0 100644
> --- a/net/sched/sch_fq.c
> +++ b/net/sched/sch_fq.c
> @@ -103,6 +103,7 @@ struct fq_sched_data {
>   	u64		stat_internal_packets;
>   	u64		stat_tcp_retrans;
>   	u64		stat_throttled;
> +	s64		slatency;
>   	u64		stat_flows_plimit;
>   	u64		stat_pkts_too_long;
>   	u64		stat_allocation_errors;
> @@ -393,6 +394,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>   static void fq_check_throttled(struct fq_sched_data *q, u64 now)
>   {
>   	struct rb_node *p;
> +	bool first = true;
>
>   	if (q->time_next_delayed_flow > now)
>   		return;
> @@ -405,6 +407,13 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
>   			q->time_next_delayed_flow = f->time_next_packet;
>   			break;
>   		}
> +		if (first) {
> +			s64 delay = now - f->time_next_packet;
> +
> +			first = false;
> +			delay -= q->slatency >> 3;
> +			q->slatency += delay;
> +		}
>   		rb_erase(p, &q->delayed);
>   		q->throttled_flows--;
>   		fq_flow_add_tail(&q->old_flows, f);
> @@ -711,6 +720,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
>   	if (opts == NULL)
>   		goto nla_put_failure;
>
> +	pr_err("latency = %lld ns\n", q->slatency >> 3);
>   	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
>   	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
>   	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
>
>
Eric Dumazet Sept. 5, 2013, 12:50 a.m. UTC | #2
On Wed, 2013-09-04 at 04:27 -0700, Eric Dumazet wrote:
> On Wed, 2013-09-04 at 03:30 -0700, Eric Dumazet wrote:
> > On Wed, 2013-09-04 at 14:30 +0800, Jason Wang wrote:
> > 
> > > > And tcpdump would certainly help ;)
> > > 
> > > See attachment.
> > > 
> > 
> > Nothing obvious on tcpdump (only that lot of frames are missing)
> > 
> > 1) Are you capturing part of the payload only (like tcpdump -s 128)
> > 
> > 2) What is the setup.
> > 
> > 3) tc -s -d qdisc
> 
> If you use FQ in the guest, then it could be that high resolution timers
> have high latency ?
> 
> So FQ arms short timers, but effective duration could be much longer.
> 
> Here I get a smooth latency of up to ~3 us
> 
> lpq83:~# ./netperf -H lpq84 ; ./tc -s -d qd ; dmesg | tail -n1
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to lpq84.prod.google.com () port 0 AF_INET
> Recv   Send    Send                          
> Socket Socket  Message  Elapsed              
> Size   Size    Size     Time     Throughput  
> bytes  bytes   bytes    secs.    10^6bits/sec  
> 
>  87380  16384  16384    10.00    9410.82   
> qdisc fq 8005: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 1024 quantum 3028 initial_quantum 15140 
>  Sent 50545633991 bytes 33385894 pkt (dropped 0, overlimits 0 requeues 19) 
>  rate 9258Mbit 764335pps backlog 0b 0p requeues 19 
>   117 flow, 115 inactive, 0 throttled
>   0 gc, 0 highprio, 0 retrans, 96861 throttled, 0 flows_plimit
> [  572.551664] latency = 3035 ns
> 
> 
> What do you get with this debugging patch ?
> 
> diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
> index 32ad015..c1312a0 100644
> --- a/net/sched/sch_fq.c
> +++ b/net/sched/sch_fq.c
> @@ -103,6 +103,7 @@ struct fq_sched_data {
>  	u64		stat_internal_packets;
>  	u64		stat_tcp_retrans;
>  	u64		stat_throttled;
> +	s64		slatency;
>  	u64		stat_flows_plimit;
>  	u64		stat_pkts_too_long;
>  	u64		stat_allocation_errors;
> @@ -393,6 +394,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>  static void fq_check_throttled(struct fq_sched_data *q, u64 now)
>  {
>  	struct rb_node *p;
> +	bool first = true;
>  
>  	if (q->time_next_delayed_flow > now)
>  		return;
> @@ -405,6 +407,13 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
>  			q->time_next_delayed_flow = f->time_next_packet;
>  			break;
>  		}
> +		if (first) {
> +			s64 delay = now - f->time_next_packet;
> +
> +			first = false;
> +			delay -= q->slatency >> 3;
> +			q->slatency += delay;
> +		}
>  		rb_erase(p, &q->delayed);
>  		q->throttled_flows--;
>  		fq_flow_add_tail(&q->old_flows, f);
> @@ -711,6 +720,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
>  	if (opts == NULL)
>  		goto nla_put_failure;
>  
> +	pr_err("latency = %lld ns\n", q->slatency >> 3);
>  	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
>  	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
>  	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
> 


BTW, what is your HZ value?

We have a problem in the TCP stack, because srtt is kept in HZ (jiffies) units.

Before we change it to usec units, I guess tcp_update_pacing_rate() should be
changed a bit if HZ=250.
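
A rough illustration of the issue, with made-up cwnd/mss numbers: TCP sets the
pacing rate that fq enforces to roughly 2 * mss * cwnd / srtt, and with srtt
kept in jiffies the smallest non-zero RTT it can see is one tick (4 ms at
HZ=250, 1 ms at HZ=1000), so a sub-millisecond RTT gets badly over-estimated:

/* Illustrative only: effect of jiffy-resolution srtt on the pacing rate
 * for sub-millisecond RTTs such as guest<->host paths.
 */
#include <stdio.h>

static double rate_gbps(double mss_bytes, double cwnd, double srtt_sec)
{
	return 2.0 * mss_bytes * 8.0 * cwnd / srtt_sec / 1e9;
}

int main(void)
{
	double mss = 1448.0, cwnd = 100.0;

	printf("real srtt 100 us                  : %6.2f Gbit/s\n",
	       rate_gbps(mss, cwnd, 100e-6));
	printf("srtt rounded up to 1 ms (HZ=1000) : %6.2f Gbit/s\n",
	       rate_gbps(mss, cwnd, 1e-3));
	printf("srtt rounded up to 4 ms (HZ=250)  : %6.2f Gbit/s\n",
	       rate_gbps(mss, cwnd, 4e-3));
	return 0;
}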




Jason Wang Sept. 5, 2013, 3:34 a.m. UTC | #3
On 09/04/2013 07:27 PM, Eric Dumazet wrote:
> On Wed, 2013-09-04 at 03:30 -0700, Eric Dumazet wrote:
>> On Wed, 2013-09-04 at 14:30 +0800, Jason Wang wrote:
>>
>>>> And tcpdump would certainly help ;)
>>>
>>> See attachment.
>>>
>>
>> Nothing obvious on tcpdump (only that lot of frames are missing)
>>
>> 1) Are you capturing part of the payload only (like tcpdump -s 128)
>>
>> 2) What is the setup.
>>
>> 3) tc -s -d qdisc
> If you use FQ in the guest, then it could be that high resolution timers
> have high latency ?

Not sure, but it should not have that big an effect. And I'm using kvm-clock in
the guest, whose overhead should be very small.
>
> So FQ arms short timers, but effective duration could be much longer.
>
> Here I get a smooth latency of up to ~3 us
>
> lpq83:~# ./netperf -H lpq84 ; ./tc -s -d qd ; dmesg | tail -n1
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to lpq84.prod.google.com () port 0 AF_INET
> Recv   Send    Send                          
> Socket Socket  Message  Elapsed              
> Size   Size    Size     Time     Throughput  
> bytes  bytes   bytes    secs.    10^6bits/sec  
>
>  87380  16384  16384    10.00    9410.82   
> qdisc fq 8005: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 1024 quantum 3028 initial_quantum 15140 
>  Sent 50545633991 bytes 33385894 pkt (dropped 0, overlimits 0 requeues 19) 
>  rate 9258Mbit 764335pps backlog 0b 0p requeues 19 
>   117 flow, 115 inactive, 0 throttled
>   0 gc, 0 highprio, 0 retrans, 96861 throttled, 0 flows_plimit
> [  572.551664] latency = 3035 ns
>
>
> What do you get with this debugging patch ?

I'm getting about 13us-19us; one run looks like:

 netperf -H 192.168.100.5; tc -s -d qd; dmesg | tail -n1
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
192.168.100.5 () port 0 AF_INET : demo
Recv   Send    Send                         
Socket Socket  Message  Elapsed             
Size   Size    Size     Time     Throughput 
bytes  bytes   bytes    secs.    10^6bits/sec 

 87380  16384  16384    10.00    4542.09  
qdisc fq 8001: dev eth0 root refcnt 2 [Unknown qdisc, optlen=64]
 Sent 53652327205 bytes 35580150 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
[  201.320565] latency = 14905 ns

One interesting thing: if I switch from kvm-clock to acpi_pm, which has
much more overhead, the latency increases to about 50us and the
throughput drops very quickly.
netperf -H 192.168.100.5; tc -s -d qd; dmesg | tail -n1
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
192.168.100.5 () port 0 AF_INET : demo
Recv   Send    Send                         
Socket Socket  Message  Elapsed             
Size   Size    Size     Time     Throughput 
bytes  bytes   bytes    secs.    10^6bits/sec 

 87380  16384  16384    10.00    2262.46  
qdisc fq 8001: dev eth0 root refcnt 2 [Unknown qdisc, optlen=64]
 Sent 56611533075 bytes 37550429 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
[  474.121689] latency = 51841 ns
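
One more thing worth measuring here (a user-space sketch only, the loop count
is arbitrary): the per-read cost of the guest clocksource itself, since pacing
needs a timestamp on the transmit path, acpi_pm reads are emulated by the
hypervisor, and kvm-clock is typically read through the vDSO.

/* Rough per-call cost of reading the clock with the current clocksource. */
#include <stdio.h>
#include <time.h>

int main(void)
{
	enum { N = 1000000 };
	struct timespec t0, t1, tmp;
	long long total;
	int i;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < N; i++)
		clock_gettime(CLOCK_MONOTONIC, &tmp);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	total = (t1.tv_sec - t0.tv_sec) * 1000000000LL +
		(t1.tv_nsec - t0.tv_nsec);
	printf("%d clock reads in %lld ns (%.1f ns/read)\n",
	       N, total, (double)total / N);
	return 0;
}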
Jason Wang Sept. 5, 2013, 3:39 a.m. UTC | #4
On 09/04/2013 07:59 PM, Daniel Borkmann wrote:
> On 09/04/2013 01:27 PM, Eric Dumazet wrote:
>> On Wed, 2013-09-04 at 03:30 -0700, Eric Dumazet wrote:
>>> On Wed, 2013-09-04 at 14:30 +0800, Jason Wang wrote:
>>>
>>>>> And tcpdump would certainly help ;)
>>>>
>>>> See attachment.
>>>>
>>>
>>> Nothing obvious on tcpdump (only that lot of frames are missing)
>>>
>>> 1) Are you capturing part of the payload only (like tcpdump -s 128)
>>>
>>> 2) What is the setup.
>>>
>>> 3) tc -s -d qdisc
>>
>> If you use FQ in the guest, then it could be that high resolution timers
>> have high latency ?
>
> Probably they internally switch to a lower resolution clock event
> source if
> there's no hardware support available:
>
>   The [source event] management layer provides interfaces for hrtimers to
>   implement high resolution timers [...] [and it] supports these more
> advanced
>   functions only when appropriate clock event sources have been
> registered,
>   otherwise the traditional periodic tick based behaviour is retained.
> [1]
>
> [1] https://www.kernel.org/doc/ols/2006/ols2006v1-pages-333-346.pdf 

Maybe. AFAIK, kvm-clock does not provide a clock event device; only a pv
clocksource is provided.
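
One way to double-check from inside the guest which clock event device ended up
in use, and whether hrtimers are running in high-resolution mode, is to look at
/proc/timer_list; a trivial sketch (grepping the file by hand works just as
well):

/* Print the /proc/timer_list lines showing the per-CPU hres state and the
 * clock event device in use.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/timer_list", "r");

	if (!f) {
		perror("/proc/timer_list");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "hres_active") ||
		    strstr(line, "Clock Event Device"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}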
Jason Wang Sept. 5, 2013, 3:43 a.m. UTC | #5
On 09/05/2013 08:50 AM, Eric Dumazet wrote:
> On Wed, 2013-09-04 at 04:27 -0700, Eric Dumazet wrote:
>> On Wed, 2013-09-04 at 03:30 -0700, Eric Dumazet wrote:
>>> On Wed, 2013-09-04 at 14:30 +0800, Jason Wang wrote:
>>>
>>>>> And tcpdump would certainly help ;)
>>>> See attachment.
>>>>
>>> Nothing obvious on tcpdump (only that lot of frames are missing)
>>>
>>> 1) Are you capturing part of the payload only (like tcpdump -s 128)
>>>
>>> 2) What is the setup.
>>>
>>> 3) tc -s -d qdisc
>> If you use FQ in the guest, then it could be that high resolution timers
>> have high latency ?
>>
>> So FQ arms short timers, but effective duration could be much longer.
>>
>> Here I get a smooth latency of up to ~3 us
>>
>> lpq83:~# ./netperf -H lpq84 ; ./tc -s -d qd ; dmesg | tail -n1
>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to lpq84.prod.google.com () port 0 AF_INET
>> Recv   Send    Send                          
>> Socket Socket  Message  Elapsed              
>> Size   Size    Size     Time     Throughput  
>> bytes  bytes   bytes    secs.    10^6bits/sec  
>>
>>  87380  16384  16384    10.00    9410.82   
>> qdisc fq 8005: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 1024 quantum 3028 initial_quantum 15140 
>>  Sent 50545633991 bytes 33385894 pkt (dropped 0, overlimits 0 requeues 19) 
>>  rate 9258Mbit 764335pps backlog 0b 0p requeues 19 
>>   117 flow, 115 inactive, 0 throttled
>>   0 gc, 0 highprio, 0 retrans, 96861 throttled, 0 flows_plimit
>> [  572.551664] latency = 3035 ns
>>
>>
>> What do you get with this debugging patch ?
>>
>> diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
>> index 32ad015..c1312a0 100644
>> --- a/net/sched/sch_fq.c
>> +++ b/net/sched/sch_fq.c
>> @@ -103,6 +103,7 @@ struct fq_sched_data {
>>  	u64		stat_internal_packets;
>>  	u64		stat_tcp_retrans;
>>  	u64		stat_throttled;
>> +	s64		slatency;
>>  	u64		stat_flows_plimit;
>>  	u64		stat_pkts_too_long;
>>  	u64		stat_allocation_errors;
>> @@ -393,6 +394,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>>  static void fq_check_throttled(struct fq_sched_data *q, u64 now)
>>  {
>>  	struct rb_node *p;
>> +	bool first = true;
>>  
>>  	if (q->time_next_delayed_flow > now)
>>  		return;
>> @@ -405,6 +407,13 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
>>  			q->time_next_delayed_flow = f->time_next_packet;
>>  			break;
>>  		}
>> +		if (first) {
>> +			s64 delay = now - f->time_next_packet;
>> +
>> +			first = false;
>> +			delay -= q->slatency >> 3;
>> +			q->slatency += delay;
>> +		}
>>  		rb_erase(p, &q->delayed);
>>  		q->throttled_flows--;
>>  		fq_flow_add_tail(&q->old_flows, f);
>> @@ -711,6 +720,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
>>  	if (opts == NULL)
>>  		goto nla_put_failure;
>>  
>> +	pr_err("latency = %lld ns\n", q->slatency >> 3);
>>  	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
>>  	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
>>  	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
>>
>
> BTW what is your HZ value ?

Guest HZ is 1000.
>
> We have a problem in TCP stack, because srtt is in HZ units.
>
> Before we change to us units, I guess tcp_update_pacing_rate() should be
> changed a bit if HZ=250
>
>



Patch

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 32ad015..c1312a0 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -103,6 +103,7 @@  struct fq_sched_data {
 	u64		stat_internal_packets;
 	u64		stat_tcp_retrans;
 	u64		stat_throttled;
+	s64		slatency;
 	u64		stat_flows_plimit;
 	u64		stat_pkts_too_long;
 	u64		stat_allocation_errors;
@@ -393,6 +394,7 @@  static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 {
 	struct rb_node *p;
+	bool first = true;
 
 	if (q->time_next_delayed_flow > now)
 		return;
@@ -405,6 +407,13 @@  static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 			q->time_next_delayed_flow = f->time_next_packet;
 			break;
 		}
+		if (first) {
+			s64 delay = now - f->time_next_packet;
+
+			first = false;
+			delay -= q->slatency >> 3;
+			q->slatency += delay;
+		}
 		rb_erase(p, &q->delayed);
 		q->throttled_flows--;
 		fq_flow_add_tail(&q->old_flows, f);
@@ -711,6 +720,7 @@  static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (opts == NULL)
 		goto nla_put_failure;
 
+	pr_err("latency = %lld ns\n", q->slatency >> 3);
 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||