Patchwork [3/3] ifb: move tq from ifb_private

login
register
mail settings
Submitter Changli Gao
Date Dec. 4, 2010, 5:55 a.m.
Message ID <1291442121-3302-3-git-send-email-xiaosuo@gmail.com>
Download mbox | patch
Permalink /patch/74251/
State Superseded
Delegated to: David Miller
Headers show

Comments

Changli Gao - Dec. 4, 2010, 5:55 a.m.
tq is only used in ri_tasklet, so we move it from ifb_private to a
stack variable of ri_tasklet.

skb_queue_splice_tail_init() is used instead of the open coded and slow
one.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 drivers/net/ifb.c |   49 ++++++++++++-------------------------------------
 1 file changed, 12 insertions(+), 37 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski - Dec. 4, 2010, 1:15 p.m.
Changli Gao wrote:
> tq is only used in ri_tasklet, so we move it from ifb_private to a
> stack variable of ri_tasklet.
> 
> skb_queue_splice_tail_init() is used instead of the open coded and slow
> one.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ---
>  drivers/net/ifb.c |   49 ++++++++++++-------------------------------------
>  1 file changed, 12 insertions(+), 37 deletions(-)
> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
> index d1e362a..cd6e90d 100644
> --- a/drivers/net/ifb.c
> +++ b/drivers/net/ifb.c
> @@ -39,9 +39,7 @@
>  #define TX_Q_LIMIT    32
>  struct ifb_private {
>  	struct tasklet_struct   ifb_tasklet;
> -	int     tasklet_pending;
>  	struct sk_buff_head     rq;
> -	struct sk_buff_head     tq;
>  };
>  
>  static int numifbs = 2;
> @@ -53,27 +51,25 @@ static int ifb_close(struct net_device *dev);
>  
>  static void ri_tasklet(unsigned long dev)
>  {
> -
>  	struct net_device *_dev = (struct net_device *)dev;
>  	struct ifb_private *dp = netdev_priv(_dev);
>  	struct net_device_stats *stats = &_dev->stats;
>  	struct netdev_queue *txq;
>  	struct sk_buff *skb;
> +	struct sk_buff_head tq;
>  
> +	__skb_queue_head_init(&tq);
>  	txq = netdev_get_tx_queue(_dev, 0);
> -	if ((skb = skb_peek(&dp->tq)) == NULL) {
> -		if (__netif_tx_trylock(txq)) {
> -			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
> -				skb_queue_tail(&dp->tq, skb);
> -			}
> -			__netif_tx_unlock(txq);
> -		} else {
> -			/* reschedule */
> -			goto resched;
> -		}
> +	if (!__netif_tx_trylock(txq)) {
> +		tasklet_schedule(&dp->ifb_tasklet);
> +		return;
>  	}
> +	skb_queue_splice_tail_init(&dp->rq, &tq);
> +	if (netif_tx_queue_stopped(txq))
> +		netif_tx_wake_queue(txq);
> +	__netif_tx_unlock(txq);
>  
> -	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
> +	while ((skb = __skb_dequeue(&tq)) != NULL) {
>  		u32 from = G_TC_FROM(skb->tc_verd);
>  
>  		skb->tc_verd = 0;
> @@ -87,7 +83,7 @@ static void ri_tasklet(unsigned long dev)
>  			rcu_read_unlock();
>  			dev_kfree_skb(skb);
>  			stats->tx_dropped++;
> -			break;
> +			continue;

IMHO this line is a bugfix and should be a separate patch (for stable).

The rest looks OK to me but you could probably skip locking of dp->rq
similarly to tq.

Jarek P.

>  		}
>  		rcu_read_unlock();
>  		skb->skb_iif = _dev->ifindex;
> @@ -100,23 +96,6 @@ static void ri_tasklet(unsigned long dev)
>  		} else
>  			BUG();
>  	}
> -
> -	if (__netif_tx_trylock(txq)) {
> -		if ((skb = skb_peek(&dp->rq)) == NULL) {
> -			dp->tasklet_pending = 0;
> -			if (netif_queue_stopped(_dev))
> -				netif_wake_queue(_dev);
> -		} else {
> -			__netif_tx_unlock(txq);
> -			goto resched;
> -		}
> -		__netif_tx_unlock(txq);
> -	} else {
> -resched:
> -		dp->tasklet_pending = 1;
> -		tasklet_schedule(&dp->ifb_tasklet);
> -	}
> -
>  }
>  
>  static const struct net_device_ops ifb_netdev_ops = {
> @@ -162,10 +141,8 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
>  	}
>  
>  	skb_queue_tail(&dp->rq, skb);
> -	if (!dp->tasklet_pending) {
> -		dp->tasklet_pending = 1;
> +	if (skb_queue_len(&dp->rq) == 1)
>  		tasklet_schedule(&dp->ifb_tasklet);
> -	}
>  
>  	return NETDEV_TX_OK;
>  }
> @@ -177,7 +154,6 @@ static int ifb_close(struct net_device *dev)
>  	tasklet_kill(&dp->ifb_tasklet);
>  	netif_stop_queue(dev);
>  	skb_queue_purge(&dp->rq);
> -	skb_queue_purge(&dp->tq);
>  	return 0;
>  }
>  
> @@ -187,7 +163,6 @@ static int ifb_open(struct net_device *dev)
>  
>  	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
>  	skb_queue_head_init(&dp->rq);
> -	skb_queue_head_init(&dp->tq);
>  	netif_start_queue(dev);
>  
>  	return 0;
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Dec. 4, 2010, 1:29 p.m.
On Sat, Dec 4, 2010 at 9:15 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> Changli Gao wrote:
>> tq is only used in ri_tasklet, so we move it from ifb_private to a
>> stack variable of ri_tasklet.
>>
>> skb_queue_splice_tail_init() is used instead of the open coded and slow
>> one.
>>
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>> ---
>>  drivers/net/ifb.c |   49 ++++++++++++-------------------------------------
>>  1 file changed, 12 insertions(+), 37 deletions(-)
>> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
>> index d1e362a..cd6e90d 100644
>> --- a/drivers/net/ifb.c
>> +++ b/drivers/net/ifb.c
>> @@ -39,9 +39,7 @@
>>  #define TX_Q_LIMIT    32
>>  struct ifb_private {
>>       struct tasklet_struct   ifb_tasklet;
>> -     int     tasklet_pending;
>>       struct sk_buff_head     rq;
>> -     struct sk_buff_head     tq;
>>  };
>>
>>  static int numifbs = 2;
>> @@ -53,27 +51,25 @@ static int ifb_close(struct net_device *dev);
>>
>>  static void ri_tasklet(unsigned long dev)
>>  {
>> -
>>       struct net_device *_dev = (struct net_device *)dev;
>>       struct ifb_private *dp = netdev_priv(_dev);
>>       struct net_device_stats *stats = &_dev->stats;
>>       struct netdev_queue *txq;
>>       struct sk_buff *skb;
>> +     struct sk_buff_head tq;
>>
>> +     __skb_queue_head_init(&tq);
>>       txq = netdev_get_tx_queue(_dev, 0);
>> -     if ((skb = skb_peek(&dp->tq)) == NULL) {
>> -             if (__netif_tx_trylock(txq)) {
>> -                     while ((skb = skb_dequeue(&dp->rq)) != NULL) {
>> -                             skb_queue_tail(&dp->tq, skb);
>> -                     }
>> -                     __netif_tx_unlock(txq);
>> -             } else {
>> -                     /* reschedule */
>> -                     goto resched;
>> -             }
>> +     if (!__netif_tx_trylock(txq)) {
>> +             tasklet_schedule(&dp->ifb_tasklet);
>> +             return;
>>       }
>> +     skb_queue_splice_tail_init(&dp->rq, &tq);
>> +     if (netif_tx_queue_stopped(txq))
>> +             netif_tx_wake_queue(txq);
>> +     __netif_tx_unlock(txq);
>>
>> -     while ((skb = skb_dequeue(&dp->tq)) != NULL) {
>> +     while ((skb = __skb_dequeue(&tq)) != NULL) {
>>               u32 from = G_TC_FROM(skb->tc_verd);
>>
>>               skb->tc_verd = 0;
>> @@ -87,7 +83,7 @@ static void ri_tasklet(unsigned long dev)
>>                       rcu_read_unlock();
>>                       dev_kfree_skb(skb);
>>                       stats->tx_dropped++;
>> -                     break;
>> +                     continue;
>
> IMHO this line is a bugfix and should be a separate patch (for stable).

It sounds reasonable.

>
> The rest looks OK to me but you could probably skip locking of dp->rq
> similarly to tq.
>

OK. Thanks.
jamal - Dec. 4, 2010, 2:18 p.m.
On Sat, 2010-12-04 at 13:55 +0800, Changli Gao wrote:
> tq is only used in ri_tasklet, so we move it from ifb_private to a
> stack variable of ri_tasklet.
> 
> skb_queue_splice_tail_init() is used instead of the open coded and slow
> one.

I like the splice idea but this patch makes me twitch
a little. What test setup did you use to check it?

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Dec. 4, 2010, 2:20 p.m.
On Sat, Dec 4, 2010 at 10:18 PM, jamal <hadi@cyberus.ca> wrote:
>
> I like the splice idea but this patch makes me twitch
> a little. What test setup did you use to check it?
>

Just a simple test:

tc qdisc add dev eth0 ingress
tc filter add dev eth0 parent ffff: protocol ip basic action mirred
egress redirect dev ifb0
jamal - Dec. 4, 2010, 2:28 p.m.
On Sat, 2010-12-04 at 14:15 +0100, Jarek Poplawski wrote:

> > @@ -87,7 +83,7 @@ static void ri_tasklet(unsigned long dev)
> >  			rcu_read_unlock();
> >  			dev_kfree_skb(skb);
> >  			stats->tx_dropped++;
> > -			break;
> > +			continue;
> 
> IMHO this line is a bugfix and should be a separate patch (for stable).

Should be separate, yes.
Bug? no. 
Improvement, maybe ;->
The idea was to defer processing at the first error. Changli
is changing it to continue despite the error.
The initial goal was to yield whenever possible since we dont maintain
a lot of state.

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal - Dec. 4, 2010, 2:42 p.m.
On Sat, 2010-12-04 at 09:18 -0500, jamal wrote:

> I like the splice idea but this patch makes me twitch
> a little. What test setup did you use to check it?

Ok, here's one thing you changed which is important. We do:

 -->XXX-->rq-->tq-->XXX-->

rq is controlled by queue limit. 
We only load rq to tq if all of tq is empty. If it is not
we dont move things over. Essentially this is a flow
control scheme. We dont want many sources to be overwhelming
us with packets and every time we grab a txqlen number of packets.
For this reason:
I would be comfortable if all you did was to add the splice
after you skb_peek() - i think that would be a good improvement
which is not bound to break anything else.

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Dec. 4, 2010, 2:45 p.m.
On Sat, Dec 4, 2010 at 10:28 PM, jamal <hadi@cyberus.ca> wrote:
> On Sat, 2010-12-04 at 14:15 +0100, Jarek Poplawski wrote:
>
>> > @@ -87,7 +83,7 @@ static void ri_tasklet(unsigned long dev)
>> >                     rcu_read_unlock();
>> >                     dev_kfree_skb(skb);
>> >                     stats->tx_dropped++;
>> > -                   break;
>> > +                   continue;
>>
>> IMHO this line is a bugfix and should be a separate patch (for stable).
>
> Should be separate, yes.
> Bug? no.

If we breaks the loop when there are still skbs in tq and no skb in
rq, the skbs will be left in txq until new skbs are enqueued into rq.
In rare cases, no new skb is queued, then these skbs will stay in rq
forever.

-       if (__netif_tx_trylock(txq)) {
-               if ((skb = skb_peek(&dp->rq)) == NULL) {
-                       dp->tasklet_pending = 0;
-                       if (netif_queue_stopped(_dev))
-                               netif_wake_queue(_dev);
-               } else {
-                       __netif_tx_unlock(txq);
-                       goto resched;
-               }
-               __netif_tx_unlock(txq);
-       } else {
-resched:
-               dp->tasklet_pending = 1;
-               tasklet_schedule(&dp->ifb_tasklet);
-       }
-
jamal - Dec. 4, 2010, 2:48 p.m.
On Sat, 2010-12-04 at 09:28 -0500, jamal wrote:

> The idea was to defer processing at the first error. Changli
> is changing it to continue despite the error.
> The initial goal was to yield whenever possible since we dont maintain
> a lot of state.

Changli:
Other points we could defer processing is in case of packets
being dropped by dev_queue_xmit and netif_rx

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Dec. 4, 2010, 2:50 p.m.
On Sat, Dec 4, 2010 at 10:42 PM, jamal <hadi@cyberus.ca> wrote:
> On Sat, 2010-12-04 at 09:18 -0500, jamal wrote:
>
>> I like the splice idea but this patch makes me twitch
>> a little. What test setup did you use to check it?
>
> Ok, here's one thing you changed which is important. We do:
>
>  -->XXX-->rq-->tq-->XXX-->
>
> rq is controlled by queue limit.
> We only load rq to tq if all of tq is empty. If it is not
> we dont move things over. Essentially this is a flow
> control scheme. We dont want many sources to be overwhelming
> us with packets and every time we grab a txqlen number of packets.
> For this reason:
> I would be comfortable if all you did was to add the splice
> after you skb_peek() - i think that would be a good improvement
> which is not bound to break anything else.
>

Maybe you misread my patch. tq is a stack variable in ri_tasklet, and
initialized all the time. ri_tasklet() won't exits until tq is
empty().
jamal - Dec. 4, 2010, 2:55 p.m.
On Sat, 2010-12-04 at 22:45 +0800, Changli Gao wrote:

> 
> If we breaks the loop when there are still skbs in tq and no skb in
> rq, the skbs will be left in txq until new skbs are enqueued into rq.
> In rare cases, no new skb is queued, then these skbs will stay in rq
> forever.

So should we goto resched? 

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal - Dec. 4, 2010, 2:59 p.m.
On Sat, 2010-12-04 at 22:50 +0800, Changli Gao wrote:

> Maybe you misread my patch. tq is a stack variable in ri_tasklet, and
> initialized all the time. ri_tasklet() won't exits until tq is
> empty().

in your patch is a variable on the stack.
What i am saying is you should defer processing when there is an
error (note the two other spots i mentioned). 
This means you may leave dp->tq non-empty and therefore it
needs to be saved somewhere as it is before your patch.

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Dec. 4, 2010, 3:01 p.m.
On Sat, Dec 4, 2010 at 10:55 PM, jamal <hadi@cyberus.ca> wrote:
> On Sat, 2010-12-04 at 22:45 +0800, Changli Gao wrote:
>
>>
>> If we breaks the loop when there are still skbs in tq and no skb in
>> rq, the skbs will be left in txq until new skbs are enqueued into rq.
>> In rare cases, no new skb is queued, then these skbs will stay in rq
>> forever.
>
> So should we goto resched?
>

Only if we can't lock the txq or rq isn't empty, we goto resched. So
it is a bug.
Changli Gao - Dec. 4, 2010, 3:07 p.m.
On Sat, Dec 4, 2010 at 10:59 PM, jamal <hadi@cyberus.ca> wrote:
> On Sat, 2010-12-04 at 22:50 +0800, Changli Gao wrote:
>
>> Maybe you misread my patch. tq is a stack variable in ri_tasklet, and
>> initialized all the time. ri_tasklet() won't exits until tq is
>> empty().
>
> in your patch is a variable on the stack.
> What i am saying is you should defer processing when there is an
> error (note the two other spots i mentioned).
> This means you may leave dp->tq non-empty and therefore it
> needs to be saved somewhere as it is before your patch.
>

I know what you concern now. Thanks. I'll keep the old behavior in v2.
jamal - Dec. 4, 2010, 3:09 p.m.
On Sat, 2010-12-04 at 23:01 +0800, Changli Gao wrote:
> On Sat, Dec 4, 2010 at 10:55 PM, jamal <hadi@cyberus.ca> wrote:
> > On Sat, 2010-12-04 at 22:45 +0800, Changli Gao wrote:
> >
> >>
> >> If we breaks the loop when there are still skbs in tq and no skb in
> >> rq, the skbs will be left in txq until new skbs are enqueued into rq.
> >> In rare cases, no new skb is queued, then these skbs will stay in rq
> >> forever.
> >
> > So should we goto resched?
> >
> 
> Only if we can't lock the txq or rq isn't empty, we goto resched. So
> it is a bug.

And to be explicit: Yes, meant to say there is a bug if we break out 
in the scenario you described above - the fix is to jump to resched.
Why do we need the lock?

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal - Dec. 4, 2010, 3:11 p.m.
On Sat, 2010-12-04 at 23:07 +0800, Changli Gao wrote:

> 
> I know what you concern now. Thanks. I'll keep the old behavior in v2.

Ok - thanks Changli.

cheers,
jamal


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski - Dec. 4, 2010, 3:40 p.m.
On Sat, Dec 04, 2010 at 09:48:47AM -0500, jamal wrote:
> On Sat, 2010-12-04 at 09:28 -0500, jamal wrote:
> 
> > The idea was to defer processing at the first error. Changli
> > is changing it to continue despite the error.
> > The initial goal was to yield whenever possible since we dont maintain
> > a lot of state.
> 
> Changli:
> Other points we could defer processing is in case of packets
> being dropped by dev_queue_xmit and netif_rx

Hmm... But we didn't care until now... ;-) Btw. is it really very
probable (and worth bothering) that this current error of NULL dev
get fixed before we purge this tq queue with deferral one by one?

Cheers,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal - Dec. 4, 2010, 4:08 p.m.
On Sat, 2010-12-04 at 16:40 +0100, Jarek Poplawski wrote:

> Hmm... But we didn't care until now... ;-) 

Well, if Changli didnt post - there would be no discussion ;->
The message was lost in the translation somewhere; events such as
patches sometimes serve as good reminders.

> Btw. is it really very
> probable (and worth bothering) that this current error of NULL dev
> get fixed before we purge this tq queue with deferral one by one?

Indeed - this is working against something buggy. But it has happened
often in the past. And the likelihood of there being a few bad ones
in the train of packets when this occurs is high. But there are many
packets there that wont suffer this sympton - so the only fair scheme is
to check all. Note: a BUG() seems unreasonable and the deferring  serves
as a throttling scheme.
What do you have in mind?

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski - Dec. 4, 2010, 4:56 p.m.
On Sat, Dec 04, 2010 at 11:08:01AM -0500, jamal wrote:
> On Sat, 2010-12-04 at 16:40 +0100, Jarek Poplawski wrote:
> 
> > Hmm... But we didn't care until now... ;-) 
> 
> Well, if Changli didnt post - there would be no discussion ;->
> The message was lost in the translation somewhere; events such as
> patches sometimes serve as good reminders.
> 
> > Btw. is it really very
> > probable (and worth bothering) that this current error of NULL dev
> > get fixed before we purge this tq queue with deferral one by one?
> 
> Indeed - this is working against something buggy. But it has happened
> often in the past. And the likelihood of there being a few bad ones
> in the train of packets when this occurs is high. But there are many
> packets there that wont suffer this sympton - so the only fair scheme is
> to check all. Note: a BUG() seems unreasonable and the deferring  serves
> as a throttling scheme.
> What do you have in mind?

I'm simply not convinced this kind of (fast) throttling can properly
fix any of the problems (what about other flows in the queue), while
Changli's patch makes this tasklet simpler and a bit faster.

Cheers,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Dec. 5, 2010, 12:22 a.m.
On Sun, Dec 5, 2010 at 12:56 AM, Jarek Poplawski <jarkao2@gmail.com> wrote:
>
> I'm simply not convinced this kind of (fast) throttling can properly
> fix any of the problems (what about other flows in the queue), while
> Changli's patch makes this tasklet simpler and a bit faster.
>

The error case handled currently is the original netdev disappears but
the corresponding skbs are still in ifb.

I do also think checking the return value of netif_rx() and
dev_queue_xmit() can fix more 'problems'. :)
Changli Gao - Dec. 5, 2010, 1:13 a.m.
On Sun, Dec 5, 2010 at 8:22 AM, Changli Gao <xiaosuo@gmail.com> wrote:
>
> The error case handled currently is the original netdev disappears but
> the corresponding skbs are still in ifb.
>
> I do also think checking the return value of netif_rx() and
> dev_queue_xmit() can fix more 'problems'. :)
>

I have posted the V2 and split the bug fix to a separate one.

BTW: My ultimate goal is making ifb a multi-queue NIC, and the number
of queues is equal to the number of the possible CPUs.
jamal - Dec. 5, 2010, 2:27 p.m.
On Sun, 2010-12-05 at 08:22 +0800, Changli Gao wrote:
> On Sun, Dec 5, 2010 at 12:56 AM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> >
> > I'm simply not convinced this kind of (fast) throttling can properly
> > fix any of the problems (what about other flows in the queue), while
> > Changli's patch makes this tasklet simpler and a bit faster.
> >
> 
> The error case handled currently is the original netdev disappears but
> the corresponding skbs are still in ifb.
> 
> I do also think checking the return value of netif_rx() and
> dev_queue_xmit() can fix more 'problems'. :)


Please proceed with a patch for that as well..

cheers,
jamal

PS:- We forgot to thank Jarek for pointing out the bug.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal - Dec. 5, 2010, 2:30 p.m.
On Sun, 2010-12-05 at 09:13 +0800, Changli Gao wrote:

> BTW: My ultimate goal is making ifb a multi-queue NIC, and the number
> of queues is equal to the number of the possible CPUs.


My view is this is going to be tricky because:
- we use tasklets. When we  reschedule we can end up on a differrent
cpu. 
-I dont see any point in having a separate softIRQ 
- and if you do use other mechanisms it would require a lot more
testing since there are quiet a few use cases of ifb

cheers,
jamal


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Dec. 5, 2010, 2:40 p.m.
On Sun, Dec 5, 2010 at 10:30 PM, jamal <hadi@cyberus.ca> wrote:
> On Sun, 2010-12-05 at 09:13 +0800, Changli Gao wrote:
>
>> BTW: My ultimate goal is making ifb a multi-queue NIC, and the number
>> of queues is equal to the number of the possible CPUs.
>
>
> My view is this is going to be tricky because:
> - we use tasklets. When we  reschedule we can end up on a differrent
> cpu.

The tasklets always been scheduled to the current CPU unless it has
been schedule already on the other CPU.

> -I dont see any point in having a separate softIRQ
> - and if you do use other mechanisms it would require a lot more
> testing since there are quiet a few use cases of ifb
>

I keep using tasklet. The attachment is the alpha version.
Eric Dumazet - Dec. 5, 2010, 3:13 p.m.
Le dimanche 05 décembre 2010 à 22:40 +0800, Changli Gao a écrit :
> On Sun, Dec 5, 2010 at 10:30 PM, jamal <hadi@cyberus.ca> wrote:
> > On Sun, 2010-12-05 at 09:13 +0800, Changli Gao wrote:
> >
> >> BTW: My ultimate goal is making ifb a multi-queue NIC, and the number
> >> of queues is equal to the number of the possible CPUs.
> >
> >
> > My view is this is going to be tricky because:
> > - we use tasklets. When we  reschedule we can end up on a differrent
> > cpu.
> 
> The tasklets always been scheduled to the current CPU unless it has
> been schedule already on the other CPU.
> 
> > -I dont see any point in having a separate softIRQ
> > - and if you do use other mechanisms it would require a lot more
> > testing since there are quiet a few use cases of ifb
> >
> 
> I keep using tasklet. The attachment is the alpha version.
> 

        for_each_possible_cpu(cpu) {
                q = per_cpu_ptr(p->q, cpu);
...

        dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
                                  ifb_setup, num_possible_cpus());

This is a very usual error.

You can have machines with 2 possibles cpus, numbered 0 and 8

Therere, you must use nr_cpu_ids here instead of num_possible_cpus()



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal - Dec. 5, 2010, 3:13 p.m.
On Sun, 2010-12-05 at 22:40 +0800, Changli Gao wrote:
> On Sun, Dec 5, 2010 at 10:30 PM, jamal <hadi@cyberus.ca> wrote:

> > - we use tasklets. When we  reschedule we can end up on a differrent
> > cpu.
> 
> The tasklets always been scheduled to the current CPU unless it has
> been schedule already on the other CPU.

I dont think this can be guaranteed - So the potential of packet 
reordering exists.

> I keep using tasklet. The attachment is the alpha version.

>From quick glance I dont see any technicalities - just the 
reordering concern.

cheers,
jamal



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Dec. 5, 2010, 3:16 p.m.
On Sun, Dec 5, 2010 at 11:13 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le dimanche 05 décembre 2010 à 22:40 +0800, Changli Gao a écrit :
>> On Sun, Dec 5, 2010 at 10:30 PM, jamal <hadi@cyberus.ca> wrote:
>> > On Sun, 2010-12-05 at 09:13 +0800, Changli Gao wrote:
>> >
>> >> BTW: My ultimate goal is making ifb a multi-queue NIC, and the number
>> >> of queues is equal to the number of the possible CPUs.
>> >
>> >
>> > My view is this is going to be tricky because:
>> > - we use tasklets. When we  reschedule we can end up on a differrent
>> > cpu.
>>
>> The tasklets always been scheduled to the current CPU unless it has
>> been schedule already on the other CPU.
>>
>> > -I dont see any point in having a separate softIRQ
>> > - and if you do use other mechanisms it would require a lot more
>> > testing since there are quiet a few use cases of ifb
>> >
>>
>> I keep using tasklet. The attachment is the alpha version.
>>
>
>        for_each_possible_cpu(cpu) {
>                q = per_cpu_ptr(p->q, cpu);
> ...
>
>        dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
>                                  ifb_setup, num_possible_cpus());
>
> This is a very usual error.
>
> You can have machines with 2 possibles cpus, numbered 0 and 8
>
> Therere, you must use nr_cpu_ids here instead of num_possible_cpus()
>

Thanks.
Changli Gao - Dec. 5, 2010, 3:22 p.m.
On Sun, Dec 5, 2010 at 11:13 PM, jamal <hadi@cyberus.ca> wrote:
> On Sun, 2010-12-05 at 22:40 +0800, Changli Gao wrote:
>> On Sun, Dec 5, 2010 at 10:30 PM, jamal <hadi@cyberus.ca> wrote:
>
>> > - we use tasklets. When we  reschedule we can end up on a differrent
>> > cpu.
>>
>> The tasklets always been scheduled to the current CPU unless it has
>> been schedule already on the other CPU.
>
> I dont think this can be guaranteed - So the potential of packet
> reordering exists.
>

The current implementation can guarantee it. However, I can't find
documentation about this 'feature', but I think this behavior should
not be changed in future since maybe some call sites relay on it.
jamal - Dec. 5, 2010, 3:31 p.m.
On Sun, 2010-12-05 at 23:22 +0800, Changli Gao wrote:

> The current implementation can guarantee it. However, I can't find
> documentation about this 'feature', but I think this behavior should
> not be changed in future since maybe some call sites relay on it.

When you think you are in good shape - please add some debug hooks
so we can verify this especially under a huge traffic input and
I will test it (thats what end of year holidays are for).

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski - Dec. 5, 2010, 7:09 p.m.
On Sun, Dec 05, 2010 at 09:27:11AM -0500, jamal wrote:
...
> PS:- We forgot to thank Jarek for pointing out the bug.

Not at all! I only pointed out that Changli didn't point out ;-)
All credits go to him and you for pointing out the appropriate fix.

Cheers,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index d1e362a..cd6e90d 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -39,9 +39,7 @@ 
 #define TX_Q_LIMIT    32
 struct ifb_private {
 	struct tasklet_struct   ifb_tasklet;
-	int     tasklet_pending;
 	struct sk_buff_head     rq;
-	struct sk_buff_head     tq;
 };
 
 static int numifbs = 2;
@@ -53,27 +51,25 @@  static int ifb_close(struct net_device *dev);
 
 static void ri_tasklet(unsigned long dev)
 {
-
 	struct net_device *_dev = (struct net_device *)dev;
 	struct ifb_private *dp = netdev_priv(_dev);
 	struct net_device_stats *stats = &_dev->stats;
 	struct netdev_queue *txq;
 	struct sk_buff *skb;
+	struct sk_buff_head tq;
 
+	__skb_queue_head_init(&tq);
 	txq = netdev_get_tx_queue(_dev, 0);
-	if ((skb = skb_peek(&dp->tq)) == NULL) {
-		if (__netif_tx_trylock(txq)) {
-			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
-				skb_queue_tail(&dp->tq, skb);
-			}
-			__netif_tx_unlock(txq);
-		} else {
-			/* reschedule */
-			goto resched;
-		}
+	if (!__netif_tx_trylock(txq)) {
+		tasklet_schedule(&dp->ifb_tasklet);
+		return;
 	}
+	skb_queue_splice_tail_init(&dp->rq, &tq);
+	if (netif_tx_queue_stopped(txq))
+		netif_tx_wake_queue(txq);
+	__netif_tx_unlock(txq);
 
-	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
+	while ((skb = __skb_dequeue(&tq)) != NULL) {
 		u32 from = G_TC_FROM(skb->tc_verd);
 
 		skb->tc_verd = 0;
@@ -87,7 +83,7 @@  static void ri_tasklet(unsigned long dev)
 			rcu_read_unlock();
 			dev_kfree_skb(skb);
 			stats->tx_dropped++;
-			break;
+			continue;
 		}
 		rcu_read_unlock();
 		skb->skb_iif = _dev->ifindex;
@@ -100,23 +96,6 @@  static void ri_tasklet(unsigned long dev)
 		} else
 			BUG();
 	}
-
-	if (__netif_tx_trylock(txq)) {
-		if ((skb = skb_peek(&dp->rq)) == NULL) {
-			dp->tasklet_pending = 0;
-			if (netif_queue_stopped(_dev))
-				netif_wake_queue(_dev);
-		} else {
-			__netif_tx_unlock(txq);
-			goto resched;
-		}
-		__netif_tx_unlock(txq);
-	} else {
-resched:
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
-
 }
 
 static const struct net_device_ops ifb_netdev_ops = {
@@ -162,10 +141,8 @@  static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	skb_queue_tail(&dp->rq, skb);
-	if (!dp->tasklet_pending) {
-		dp->tasklet_pending = 1;
+	if (skb_queue_len(&dp->rq) == 1)
 		tasklet_schedule(&dp->ifb_tasklet);
-	}
 
 	return NETDEV_TX_OK;
 }
@@ -177,7 +154,6 @@  static int ifb_close(struct net_device *dev)
 	tasklet_kill(&dp->ifb_tasklet);
 	netif_stop_queue(dev);
 	skb_queue_purge(&dp->rq);
-	skb_queue_purge(&dp->tq);
 	return 0;
 }
 
@@ -187,7 +163,6 @@  static int ifb_open(struct net_device *dev)
 
 	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
 	skb_queue_head_init(&dp->rq);
-	skb_queue_head_init(&dp->tq);
 	netif_start_queue(dev);
 
 	return 0;