diff mbox

iproute2 / tbf with large burst seems broken again

Message ID 20090825094120.GA11478@ff.dom.local
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Jarek Poplawski Aug. 25, 2009, 9:41 a.m. UTC
On Tue, Aug 25, 2009 at 09:00:35AM +0000, Jarek Poplawski wrote:
> On Tue, Aug 25, 2009 at 08:43:06AM +0000, Jarek Poplawski wrote:
> ...
> > since these 64 bits will be needed soon for higher rates anyway, I
> > guess we could try some change like the patch below, if you find it
> > works for you (I didn't test it yet.)

I hope this time it works...

Jarek P.

--- (take 2)

 net/sched/sch_tbf.c |   14 +++++++-------
 1 files changed, 7 insertions(+), 7 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Denys Fedoryshchenko Aug. 25, 2009, 10:29 a.m. UTC | #1
Thanks a lot for your help! 
I will try now.

On Tuesday 25 August 2009 12:41:20 Jarek Poplawski wrote:
> On Tue, Aug 25, 2009 at 09:00:35AM +0000, Jarek Poplawski wrote:
> > On Tue, Aug 25, 2009 at 08:43:06AM +0000, Jarek Poplawski wrote:
> > ...
> >
> > > since these 64 bits will be needed soon for higher rates anyway, I
> > > guess we could try some change like the patch below, if you find it
> > > works for you (I didn't test it yet.)
>
> I hope this time it works...
>
> Jarek P.
>
> --- (take 2)
>
>  net/sched/sch_tbf.c |   14 +++++++-------
>  1 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
> index e22dfe8..7d0fe69 100644
> --- a/net/sched/sch_tbf.c
> +++ b/net/sched/sch_tbf.c
> @@ -108,8 +108,8 @@ struct tbf_sched_data
>  	struct qdisc_rate_table	*P_tab;
>
>  /* Variables */
> -	long	tokens;			/* Current number of B tokens */
> -	long	ptokens;		/* Current number of P tokens */
> +	u32	tokens;			/* Current number of B tokens */
> +	u32	ptokens;		/* Current number of P tokens */
>  	psched_time_t	t_c;		/* Time check-point */
>  	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
>  	struct qdisc_watchdog watchdog;	/* Watchdog timer */
> @@ -160,21 +160,21 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
>
>  	if (skb) {
>  		psched_time_t now;
> -		long toks;
> -		long ptoks = 0;
> +		long long toks;
> +		long long ptoks = 0;
>  		unsigned int len = qdisc_pkt_len(skb);
>
>  		now = psched_get_time();
> -		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
> +		toks = min_t(u32, now - q->t_c, q->buffer);
>
>  		if (q->P_tab) {
>  			ptoks = toks + q->ptokens;
> -			if (ptoks > (long)q->mtu)
> +			if (ptoks > q->mtu)
>  				ptoks = q->mtu;
>  			ptoks -= L2T_P(q, len);
>  		}
>  		toks += q->tokens;
> -		if (toks > (long)q->buffer)
> +		if (toks > q->buffer)
>  			toks = q->buffer;
>  		toks -= L2T(q, len);


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Denys Fedoryshchenko Aug. 25, 2009, 11:16 a.m. UTC | #2
Done, tested, it doesn't stale and fix the issue but:

PPPoE_146 ~ # ./tc qdisc del dev ppp13 root;./tc qdisc add dev ppp13 root tbf 
rate 96000 burst 2048000 latency 500ms                                               

This one is ok
PPPoE_146 ~ # ./tc -s -d qdisc show dev ppp13
qdisc tbf 8005: root rate 96000bit burst 2000Kb/8 mpu 0b lat 500.0ms
 Sent 55260 bytes 65 pkt (dropped 0, overlimits 0 requeues 0)
 rate 0bit 0pps backlog 0b 0p requeues 0
qdisc ingress ffff: parent ffff:fff1 ----------------
 Sent 641055 bytes 2485 pkt (dropped 0, overlimits 0 requeues 0)
 rate 0bit 0pps backlog 0b 0p requeues 0

PPPoE_146 ~ # ./tc qdisc del dev ppp13 root;./tc qdisc add dev ppp13 root tbf 
rate 96000 burst 4096000 latency 500ms                                               

But this one maybe will overflow because of limitations in iproute2.

PPoE_146 ~ # ./tc -s -d qdisc show dev ppp13
qdisc tbf 8004: root rate 96000bit burst 797465b/8 mpu 0b lat 275.4s
 Sent 82867 bytes 123 pkt (dropped 0, overlimits 0 requeues 0)
 rate 0bit 0pps backlog 0b 0p requeues 0
qdisc ingress ffff: parent ffff:fff1 ----------------
 Sent 506821 bytes 1916 pkt (dropped 0, overlimits 0 requeues 0)
 rate 0bit 0pps backlog 0b 0p requeues 0

So maybe all of that just wrong way of using TBF.
At same time this means, if HTB and policers in filters done same way, that 
QoS in Linux cannot do similar to squid delay pools feature:

First 10Mb give with 1Mbit/s, then slow 64Kbit/s. If user use less than 64K - 
recharge with that unused bandwidth a "10 Mb / 1Mbit bucket".

On Tuesday 25 August 2009 12:41:20 Jarek Poplawski wrote:
> On Tue, Aug 25, 2009 at 09:00:35AM +0000, Jarek Poplawski wrote:
> > On Tue, Aug 25, 2009 at 08:43:06AM +0000, Jarek Poplawski wrote:
> > ...
> >
> > > since these 64 bits will be needed soon for higher rates anyway, I
> > > guess we could try some change like the patch below, if you find it
> > > works for you (I didn't test it yet.)
>
> I hope this time it works...
>
> Jarek P.
>
> --- (take 2)
>
>  net/sched/sch_tbf.c |   14 +++++++-------
>  1 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
> index e22dfe8..7d0fe69 100644
> --- a/net/sched/sch_tbf.c
> +++ b/net/sched/sch_tbf.c
> @@ -108,8 +108,8 @@ struct tbf_sched_data
>  	struct qdisc_rate_table	*P_tab;
>
>  /* Variables */
> -	long	tokens;			/* Current number of B tokens */
> -	long	ptokens;		/* Current number of P tokens */
> +	u32	tokens;			/* Current number of B tokens */
> +	u32	ptokens;		/* Current number of P tokens */
>  	psched_time_t	t_c;		/* Time check-point */
>  	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
>  	struct qdisc_watchdog watchdog;	/* Watchdog timer */
> @@ -160,21 +160,21 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
>
>  	if (skb) {
>  		psched_time_t now;
> -		long toks;
> -		long ptoks = 0;
> +		long long toks;
> +		long long ptoks = 0;
>  		unsigned int len = qdisc_pkt_len(skb);
>
>  		now = psched_get_time();
> -		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
> +		toks = min_t(u32, now - q->t_c, q->buffer);
>
>  		if (q->P_tab) {
>  			ptoks = toks + q->ptokens;
> -			if (ptoks > (long)q->mtu)
> +			if (ptoks > q->mtu)
>  				ptoks = q->mtu;
>  			ptoks -= L2T_P(q, len);
>  		}
>  		toks += q->tokens;
> -		if (toks > (long)q->buffer)
> +		if (toks > q->buffer)
>  			toks = q->buffer;
>  		toks -= L2T(q, len);


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Aug. 25, 2009, 12:13 p.m. UTC | #3
On Tue, Aug 25, 2009 at 02:16:13PM +0300, Denys Fedoryschenko wrote:
> Done, tested, it doesn't stale and fix the issue but:
> 
> PPPoE_146 ~ # ./tc qdisc del dev ppp13 root;./tc qdisc add dev ppp13 root tbf 
> rate 96000 burst 2048000 latency 500ms                                               
> 
> This one is ok
> PPPoE_146 ~ # ./tc -s -d qdisc show dev ppp13
> qdisc tbf 8005: root rate 96000bit burst 2000Kb/8 mpu 0b lat 500.0ms
>  Sent 55260 bytes 65 pkt (dropped 0, overlimits 0 requeues 0)
>  rate 0bit 0pps backlog 0b 0p requeues 0
> qdisc ingress ffff: parent ffff:fff1 ----------------
>  Sent 641055 bytes 2485 pkt (dropped 0, overlimits 0 requeues 0)
>  rate 0bit 0pps backlog 0b 0p requeues 0
> 
> PPPoE_146 ~ # ./tc qdisc del dev ppp13 root;./tc qdisc add dev ppp13 root tbf 
> rate 96000 burst 4096000 latency 500ms                                               
> 
> But this one maybe will overflow because of limitations in iproute2.

Sure, as I wrote before:
"It shouldn't be so difficult just for 2000kb buffer here, but of course
there're some limits."
I tried to optimize sch_tbf variables usage only, but this is still
mostly within 32 bits.

> 
> PPoE_146 ~ # ./tc -s -d qdisc show dev ppp13
> qdisc tbf 8004: root rate 96000bit burst 797465b/8 mpu 0b lat 275.4s
>  Sent 82867 bytes 123 pkt (dropped 0, overlimits 0 requeues 0)
>  rate 0bit 0pps backlog 0b 0p requeues 0
> qdisc ingress ffff: parent ffff:fff1 ----------------
>  Sent 506821 bytes 1916 pkt (dropped 0, overlimits 0 requeues 0)
>  rate 0bit 0pps backlog 0b 0p requeues 0
> 
> So maybe all of that just wrong way of using TBF.
> At same time this means, if HTB and policers in filters done same way, that 
> QoS in Linux cannot do similar to squid delay pools feature:
> 
> First 10Mb give with 1Mbit/s, then slow 64Kbit/s. If user use less than 64K - 
> recharge with that unused bandwidth a "10 Mb / 1Mbit bucket".

I'll need to find more time to rethink your problem (and this patch),
because it really looks like your tbf usage is very uncommon. Anyway,
I guess these changes aren't for 2.6.31, unless there're similar
requests from other users soon.

Thanks for testing,
Jarek P.

> 
> On Tuesday 25 August 2009 12:41:20 Jarek Poplawski wrote:
> > On Tue, Aug 25, 2009 at 09:00:35AM +0000, Jarek Poplawski wrote:
> > > On Tue, Aug 25, 2009 at 08:43:06AM +0000, Jarek Poplawski wrote:
> > > ...
> > >
> > > > since these 64 bits will be needed soon for higher rates anyway, I
> > > > guess we could try some change like the patch below, if you find it
> > > > works for you (I didn't test it yet.)
> >
> > I hope this time it works...
> >
> > Jarek P.
> >
> > --- (take 2)
> >
> >  net/sched/sch_tbf.c |   14 +++++++-------
> >  1 files changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
> > index e22dfe8..7d0fe69 100644
> > --- a/net/sched/sch_tbf.c
> > +++ b/net/sched/sch_tbf.c
> > @@ -108,8 +108,8 @@ struct tbf_sched_data
> >  	struct qdisc_rate_table	*P_tab;
> >
> >  /* Variables */
> > -	long	tokens;			/* Current number of B tokens */
> > -	long	ptokens;		/* Current number of P tokens */
> > +	u32	tokens;			/* Current number of B tokens */
> > +	u32	ptokens;		/* Current number of P tokens */
> >  	psched_time_t	t_c;		/* Time check-point */
> >  	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
> >  	struct qdisc_watchdog watchdog;	/* Watchdog timer */
> > @@ -160,21 +160,21 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
> >
> >  	if (skb) {
> >  		psched_time_t now;
> > -		long toks;
> > -		long ptoks = 0;
> > +		long long toks;
> > +		long long ptoks = 0;
> >  		unsigned int len = qdisc_pkt_len(skb);
> >
> >  		now = psched_get_time();
> > -		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
> > +		toks = min_t(u32, now - q->t_c, q->buffer);
> >
> >  		if (q->P_tab) {
> >  			ptoks = toks + q->ptokens;
> > -			if (ptoks > (long)q->mtu)
> > +			if (ptoks > q->mtu)
> >  				ptoks = q->mtu;
> >  			ptoks -= L2T_P(q, len);
> >  		}
> >  		toks += q->tokens;
> > -		if (toks > (long)q->buffer)
> > +		if (toks > q->buffer)
> >  			toks = q->buffer;
> >  		toks -= L2T(q, len);
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Denys Fedoryshchenko Aug. 25, 2009, 12:18 p.m. UTC | #4
On Tuesday 25 August 2009 15:13:44 Jarek Poplawski wrote:
> I'll need to find more time to rethink your problem (and this patch),
> because it really looks like your tbf usage is very uncommon. Anyway,
> I guess these changes aren't for 2.6.31, unless there're similar
> requests from other users soon.
>
>Thanks for testing,
>Jarek P.
I guess even no need for any kernel :-)

Just maybe good idea in next kernels to document this or handle overflow, that 
it will give warning in kernel. Otherwise user will have non-functional qdisc 
that will not pass traffic.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Aug. 25, 2009, 8:03 p.m. UTC | #5
Denys Fedoryschenko wrote, On 08/25/2009 01:16 PM:
...
> But this one maybe will overflow because of limitations in iproute2.
> 
> PPoE_146 ~ # ./tc -s -d qdisc show dev ppp13
> qdisc tbf 8004: root rate 96000bit burst 797465b/8 mpu 0b lat 275.4s
>  Sent 82867 bytes 123 pkt (dropped 0, overlimits 0 requeues 0)
>  rate 0bit 0pps backlog 0b 0p requeues 0
> qdisc ingress ffff: parent ffff:fff1 ----------------
>  Sent 506821 bytes 1916 pkt (dropped 0, overlimits 0 requeues 0)
>  rate 0bit 0pps backlog 0b 0p requeues 0
> 
> So maybe all of that just wrong way of using TBF.

I guess so; I've just recollected you described it some time ago. If
it were done only with TBF it would mean very large surges with line
speed and probably a lot of drops by ISP. Since you're ISP, you
probably drop this with HTB or something (then you should mention it
describing the problem) or keep very long queues which means great
latencies. Probably there is a lot of TCP resending btw. Using TBF
with HTB etc. is considered wrong idea anyway. (But if it works for
you shouldn't care.)

> At same time this means, if HTB and policers in filters done same way, that 
> QoS in Linux cannot do similar to squid delay pools feature:
> 
> First 10Mb give with 1Mbit/s, then slow 64Kbit/s. If user use less than 64K - 
> recharge with that unused bandwidth a "10 Mb / 1Mbit bucket".

Could you remind me why HFSC can't do something similar for you?

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Aug. 26, 2009, 7:03 p.m. UTC | #6
On Tue, Aug 25, 2009 at 10:03:06PM +0200, Jarek Poplawski wrote:
> Denys Fedoryschenko wrote, On 08/25/2009 01:16 PM:
> ...
> > But this one maybe will overflow because of limitations in iproute2.
> > 
> > PPoE_146 ~ # ./tc -s -d qdisc show dev ppp13
> > qdisc tbf 8004: root rate 96000bit burst 797465b/8 mpu 0b lat 275.4s
> >  Sent 82867 bytes 123 pkt (dropped 0, overlimits 0 requeues 0)
> >  rate 0bit 0pps backlog 0b 0p requeues 0
> > qdisc ingress ffff: parent ffff:fff1 ----------------
> >  Sent 506821 bytes 1916 pkt (dropped 0, overlimits 0 requeues 0)
> >  rate 0bit 0pps backlog 0b 0p requeues 0
> > 
> > So maybe all of that just wrong way of using TBF.
> 
> I guess so; I've just recollected you described it some time ago. If
> it were done only with TBF it would mean very large surges with line
> speed and probably a lot of drops by ISP. Since you're ISP, you
> probably drop this with HTB or something (then you should mention it
> describing the problem) or keep very long queues which means great
> latencies. Probably there is a lot of TCP resending btw. Using TBF
> with HTB etc. is considered wrong idea anyway. (But if it works for
> you shouldn't care.)
> 
> > At same time this means, if HTB and policers in filters done same way, that 
> > QoS in Linux cannot do similar to squid delay pools feature:
> > 
> > First 10Mb give with 1Mbit/s, then slow 64Kbit/s. If user use less than 64K - 
> > recharge with that unused bandwidth a "10 Mb / 1Mbit bucket".

So I thought about it a little more and I'm quite sure this idea with
large buckets is wrong/ineffective. I guess you could "describe" it
in HTB something like this:

tc class add dev ppp0 parent 1:3 classid 1:4 htb rate 64kbit\
   burst 10mb cburst 10mb
tc class add dev ppp0 parent 1:4 classid 1:4 htb rate 64kbit ceil 1mbit\
   cburst 10mb

(Of course, there would be this overflow problem with 2.6.31-rc and
so big buffers.)

So, the main point is: if somebody didn't send his/her 64Kbits long
time ago it usually means it's lost and can't be shared later. You
could try your luck, but e.g. if at the moment all users use their
64Kbits plus one of them 'thinks' he/she can send "saved" bits, it
means some other guy doesn't get his/her minimum (they send together
but some bytes will be dropped or queued).

It would work OK if you've reserved 1mbit per 64Kbit user but I guess
it's not what you do. So, IMHO, it should be better to use classical
methods to guarantee these 64Kbit with reasonable latency, plus
additonal borrowing with ceil and reasonable (much smaller buffers)
yet.

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index e22dfe8..7d0fe69 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -108,8 +108,8 @@  struct tbf_sched_data
 	struct qdisc_rate_table	*P_tab;
 
 /* Variables */
-	long	tokens;			/* Current number of B tokens */
-	long	ptokens;		/* Current number of P tokens */
+	u32	tokens;			/* Current number of B tokens */
+	u32	ptokens;		/* Current number of P tokens */
 	psched_time_t	t_c;		/* Time check-point */
 	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
 	struct qdisc_watchdog watchdog;	/* Watchdog timer */
@@ -160,21 +160,21 @@  static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
 
 	if (skb) {
 		psched_time_t now;
-		long toks;
-		long ptoks = 0;
+		long long toks;
+		long long ptoks = 0;
 		unsigned int len = qdisc_pkt_len(skb);
 
 		now = psched_get_time();
-		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
+		toks = min_t(u32, now - q->t_c, q->buffer);
 
 		if (q->P_tab) {
 			ptoks = toks + q->ptokens;
-			if (ptoks > (long)q->mtu)
+			if (ptoks > q->mtu)
 				ptoks = q->mtu;
 			ptoks -= L2T_P(q, len);
 		}
 		toks += q->tokens;
-		if (toks > (long)q->buffer)
+		if (toks > q->buffer)
 			toks = q->buffer;
 		toks -= L2T(q, len);