diff mbox

[RFC] gianfar: Do not call skb recycling with disabled IRQs

Message ID 20091105165738.GA31923@oksana.dev.rtsoft.ru (mailing list archive)
State Superseded
Delegated to: Kumar Gala
Headers show

Commit Message

Anton Vorontsov Nov. 5, 2009, 4:57 p.m. UTC
Before calling gfar_clean_tx_ring() we grab an irqsave spinlock, and
then try to recycle an skb, which requires IRQs to be enabled. This
leads to the following badness:

nf_conntrack version 0.5.0 (1008 buckets, 4032 max)
------------[ cut here ]------------
Badness at kernel/softirq.c:143
NIP: c003e3c4 LR: c423a528 CTR: c003e344
...
NIP [c003e3c4] local_bh_enable+0x80/0xc4
LR [c423a528] destroy_conntrack+0xd4/0x13c [nf_conntrack]
Call Trace:
[c15d1b60] [c003e32c] local_bh_disable+0x1c/0x34 (unreliable)
[c15d1b70] [c423a528] destroy_conntrack+0xd4/0x13c [nf_conntrack]
[c15d1b80] [c02c6370] nf_conntrack_destroy+0x3c/0x70
--- Exception: c428168c at 0xc15d1c50
LR = 0xc15d1c40
[c15d1ba0] [c0286f3c] skb_release_head_state+0x100/0x104 (unreliable)
[c15d1bb0] [c0288340] skb_recycle_check+0x8c/0x10c
[c15d1bc0] [c01e1688] gfar_poll+0x190/0x384
[c15d1c10] [c02935ac] net_rx_action+0xec/0x22c
[c15d1c50] [c003dd8c] __do_softirq+0xe8/0x224
[c15d1ca0] [c000624c] do_softirq+0x78/0x80
[c15d1cb0] [c003d868] irq_exit+0x60/0x78
...

We can't easily get rid of the irqsave spinlock, because we must
guard ourselves from start_xmit.

So, fix this by dropping the irqsave spinlock from both xmit_start
and clean_tx_ring routines. Instead, lock the whole tx queue via
netif_tx_lock_bh() in clean_tx_ring().

Reported-by: Jon Loeliger <jdl@jdl.com>
Not-yet-Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
---

On Thu, Nov 05, 2009 at 09:43:18AM -0600, Jon Loeliger wrote:
[...]
> This is with essentially a stock 2.6.31 kernel for an 8315.
> I've seen the problem for both task 'insmod' and 'iptables'.
> Um, conn_track is a module being loaded here.  Our brief analysis
> runs like this:
> 
>     This is an issue with the gianfar driver.  In gfar_poll(), irqs are
>     disabled for the handling of gfar_clean_tx_ring(dev) (line 1928).
>     In this call, skbs can call skb_recycle_check, which can release
>     head state (net/core/skbuff.c@506), which can cause conntrack
>     cleanup (net/core/skbuff.c@402->include/linux/skbuff.h@1923), which
>     cannot be done with IRQs disabled...that is the badness.

Ugh.

We may try to call netif_tx_lock_bh() in gfar_clean_tx_ring() and
drop the irqsave spinlock start_xmit (sky2-like scheme).

But that basically means that with skb recycling we can't safely
use KGDBoE, though we can add something like this:

| diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
| index a00ec63..4d82bd7 100644
| --- a/drivers/net/gianfar.c
| +++ b/drivers/net/gianfar.c
| @@ -1619,7 +1619,8 @@ static int gfar_clean_tx_ring(struct net_device *dev)
|  		 * If there's room in the queue (limit it to rx_buffer_size)
|  		 * we add this skb back into the pool, if it's the right size
|  		 */
| -		if (skb_queue_len(&priv->rx_recycle) < priv->rx_ring_size &&
| +		if (!irqs_disabled() &&
| +			skb_queue_len(&priv->rx_recycle) < priv->rx_ring_size &&
|  				skb_recycle_check(skb, priv->rx_buffer_size +
|  					RXBUF_ALIGNMENT))
| 			__skb_queue_head(&priv->rx_recycle, skb);

So we won't recycle skbs with irqs disabled.

Anyway, apart from KGDBoE, the following patch might fix the conntrack
issue, can you try it?

With this patch the kernel boots via NFS, and survives netperf, though
I'd like to audit changes a little more and run some netperf tests, I
might also put the netif_tx_lock_bh() stuff out of the loop.

So this patch isn't for the merge.

 drivers/net/gianfar.c |   19 +++----------------
 1 files changed, 3 insertions(+), 16 deletions(-)

Comments

Kumar Gopalpet-B05799 Nov. 5, 2009, 5:23 p.m. UTC | #1
[.....]
> drivers/net/gianfar.c |   19 +++----------------
> 1 files changed, 3 insertions(+), 16 deletions(-)
>
>diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c 
>index 197b358..a0ae604 100644
>--- a/drivers/net/gianfar.c
>+++ b/drivers/net/gianfar.c
>@@ -1899,10 +1899,8 @@ static int gfar_start_xmit(struct 
>sk_buff *skb, struct net_device *dev)
> 	u32 lstatus;
> 	int i, rq = 0;
> 	u32 bufaddr;
>-	unsigned long flags;
> 	unsigned int nr_frags, length;
> 
>-
> 	rq = skb->queue_mapping;
> 	tx_queue = priv->tx_queue[rq];
> 	txq = netdev_get_tx_queue(dev, rq);
>@@ -1928,14 +1926,11 @@ static int gfar_start_xmit(struct 
>sk_buff *skb, struct net_device *dev)
> 	/* total number of fragments in the SKB */
> 	nr_frags = skb_shinfo(skb)->nr_frags;
> 
>-	spin_lock_irqsave(&tx_queue->txlock, flags);
>-
> 	/* check if there is space to queue this packet */
> 	if ((nr_frags+1) > tx_queue->num_txbdfree) {
> 		/* no space, stop the queue */
> 		netif_tx_stop_queue(txq);
> 		dev->stats.tx_fifo_errors++;
>-		spin_unlock_irqrestore(&tx_queue->txlock, flags);
> 		return NETDEV_TX_BUSY;
> 	}
> 
>@@ -2033,9 +2028,6 @@ static int gfar_start_xmit(struct 
>sk_buff *skb, struct net_device *dev)
> 	/* Tell the DMA to go go go */
> 	gfar_write(&regs->tstat, TSTAT_CLEAR_THALT >> tx_queue->qindex);
> 
>-	/* Unlock priv */
>-	spin_unlock_irqrestore(&tx_queue->txlock, flags);
>-
> 	return NETDEV_TX_OK;
> }
> 
>@@ -2550,7 +2542,6 @@ static int gfar_poll(struct napi_struct 
>*napi, int budget)
> 	int tx_cleaned = 0, i, left_over_budget = budget;
> 	unsigned long serviced_queues = 0;
> 	int num_queues = 0;
>-	unsigned long flags;
> 
> 	num_queues = gfargrp->num_rx_queues;
> 	budget_per_queue = budget/num_queues;
>@@ -2570,13 +2561,9 @@ static int gfar_poll(struct napi_struct 
>*napi, int budget)
> 			rx_queue = priv->rx_queue[i];
> 			tx_queue = priv->tx_queue[rx_queue->qindex];
> 
>-			/* If we fail to get the lock,
>-			 * don't bother with the TX BDs */
>-			if 
>(spin_trylock_irqsave(&tx_queue->txlock, flags)) {
>-				tx_cleaned += 
>gfar_clean_tx_ring(tx_queue);
>-				
>spin_unlock_irqrestore(&tx_queue->txlock,
>-							flags);
>-			}
>+			netif_tx_lock_bh(priv->ndev);

Will this not lead to locking all the tx queues even though at this
point we are working on a "particular queue" ?

>+			tx_cleaned += gfar_clean_tx_ring(tx_queue);
>+			netif_tx_unlock_bh(priv->ndev);
> 
> 			rx_cleaned_per_queue = 
>gfar_clean_rx_ring(rx_queue,
> 							
>budget_per_queue);
>--

--

Thanks
Sandeep
Anton Vorontsov Nov. 5, 2009, 5:34 p.m. UTC | #2
On Thu, Nov 05, 2009 at 10:53:08PM +0530, Kumar Gopalpet-B05799 wrote:
[...]
> >(spin_trylock_irqsave(&tx_queue->txlock, flags)) {
> >-				tx_cleaned += 
> >gfar_clean_tx_ring(tx_queue);
> >-				
> >spin_unlock_irqrestore(&tx_queue->txlock,
> >-							flags);
> >-			}
> >+			netif_tx_lock_bh(priv->ndev);
> 
> Will this not lead to locking all the tx queues even though at this
> point we are working on a "particular queue" ?

Yeah, per-txq locking would be better (or not.. I need to netperf
it).

Thanks,
Jon Loeliger Nov. 5, 2009, 5:40 p.m. UTC | #3
> 
> [.....]
> > drivers/net/gianfar.c |   19 +++----------------
> > 1 files changed, 3 insertions(+), 16 deletions(-)
> >
> >diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c=20
> >index 197b358..a0ae604 100644
> >--- a/drivers/net/gianfar.c
> >+++ b/drivers/net/gianfar.c
> >@@ -1899,10 +1899,8 @@ static int gfar_start_xmit(struct=20
> >sk_buff *skb, struct net_device *dev)
> > 	u32 lstatus;
> > 	int i, rq = 0;
> > 	u32 bufaddr;
> >-	unsigned long flags;
> > 	unsigned int nr_frags, length;
> > 
> >-
> > 	rq = skb->queue_mapping;
> > 	tx_queue = priv->tx_queue[rq];
> > 	txq = netdev_get_tx_queue(dev, rq);
> >@@ -1928,14 +1926,11 @@ static int gfar_start_xmit(struct 
> >sk_buff *skb, struct net_device *dev)
> > 	/* total number of fragments in the SKB */
> > 	nr_frags = skb_shinfo(skb)->nr_frags;
> > 
> >-	spin_lock_irqsave(&tx_queue->txlock, flags);
> >-
> > 	/* check if there is space to queue this packet */
> > 	if ((nr_frags+1) > tx_queue->num_txbdfree) {
> > 		/* no space, stop the queue */
> > 		netif_tx_stop_queue(txq);
> > 		dev->stats.tx_fifo_errors++;
> >-		spin_unlock_irqrestore(&tx_queue->txlock, flags);
> > 		return NETDEV_TX_BUSY;
> > 	}
> > 
> >@@ -2033,9 +2028,6 @@ static int gfar_start_xmit(struct 
> >sk_buff *skb, struct net_device *dev)
> > 	/* Tell the DMA to go go go */
> > 	gfar_write(&regs->tstat, TSTAT_CLEAR_THALT >> tx_queue->qindex);
> > 
> >-	/* Unlock priv */
> >-	spin_unlock_irqrestore(&tx_queue->txlock, flags);
> >-
> > 	return NETDEV_TX_OK;
> > }
> > 
> >@@ -2550,7 +2542,6 @@ static int gfar_poll(struct napi_struct 
> >*napi, int budget)
> > 	int tx_cleaned = 0, i, left_over_budget = budget;
> > 	unsigned long serviced_queues = 0;
> > 	int num_queues = 0;
> >-	unsigned long flags;
> > 
> > 	num_queues = gfargrp->num_rx_queues;
> > 	budget_per_queue = budget/num_queues;
> >@@ -2570,13 +2561,9 @@ static int gfar_poll(struct napi_struct 
> >*napi, int budget)
> > 			rx_queue = priv->rx_queue[i];
> > 			tx_queue = priv->tx_queue[rx_queue->qindex];
> > 
> >-			/* If we fail to get the lock,
> >-			 * don't bother with the TX BDs */
> >-			if 
> >(spin_trylock_irqsave(&tx_queue->txlock, flags)) {
> >-				tx_cleaned += 
> >gfar_clean_tx_ring(tx_queue);
> >-			=09
> >spin_unlock_irqrestore(&tx_queue->txlock,
> >-							flags);
> >-			}
> >+			netif_tx_lock_bh(priv->ndev);
> 
> Will this not lead to locking all the tx queues even though at this
> point we are working on a "particular queue" ?

Similar but different question:  What version is this patch based upon?
I can't find:

    > >index 197b358..a0ae604 100644

My search of 2.6.31 up through current linux head (v2.6.32-rc6-26-g91d3f9b)
and benh's current head (94a8d5caba74211ec76dac80fc6e2d5c391530df) do not
have a version of this code with separate txqueues.

I confess I'm not too familiar with the history here, so I don't know
if that feature is in flux, or came, or went, or what.

That boils down to:  I can't directly apply your patch, but I could
hand-fudge its intent, but only on a single tx queue variant.

jdl
Kumar Gopalpet-B05799 Nov. 5, 2009, 5:53 p.m. UTC | #4
>
>Similar but different question:  What version is this patch based upon?
>I can't find:
>
>    > >index 197b358..a0ae604 100644
>
>My search of 2.6.31 up through current linux head 
>(v2.6.32-rc6-26-g91d3f9b) and benh's current head 
>(94a8d5caba74211ec76dac80fc6e2d5c391530df) do not have a 
>version of this code with separate txqueues.
>
>I confess I'm not too familiar with the history here, so I 
>don't know if that feature is in flux, or came, or went, or what.
>
>That boils down to:  I can't directly apply your patch, but I 
>could hand-fudge its intent, but only on a single tx queue variant.
>

You might want to get the code base from the top of davem/net-next-2.6
tree

git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6.git

The multi queue is a feature available in etsec2.0 version. On older
versions you might want to 
work in a single queue mode itself.


--

Thanks
Sandeep
David Miller Nov. 8, 2009, 9:08 a.m. UTC | #5
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Thu, 5 Nov 2009 19:57:38 +0300

> But that basically means that with skb recycling we can't safely
> use KGDBoE, though we can add something like this:

Please stop adding special logic only to your driver to handle these
things.

Either it's a non-issue, or it's going to potentially be an issue for
everyone using skb_recycle_check() in a NAPI driver, right?

So why not add the "in_interrupt()" or whatever check to
skb_recycle_check() and if the context is unsuitable return false (0)
ok?
Anton Vorontsov Nov. 9, 2009, 1:41 p.m. UTC | #6
On Sun, Nov 08, 2009 at 01:08:48AM -0800, David Miller wrote:
> From: Anton Vorontsov <avorontsov@ru.mvista.com>
> Date: Thu, 5 Nov 2009 19:57:38 +0300
> 
> > But that basically means that with skb recycling we can't safely
> > use KGDBoE, though we can add something like this:
> 
> Please stop adding special logic only to your driver to handle these
> things.
> 
> Either it's a non-issue, or it's going to potentially be an issue for
> everyone using skb_recycle_check() in a NAPI driver, right?
> 
> So why not add the "in_interrupt()" or whatever check to
> skb_recycle_check() and if the context is unsuitable return false (0)
> ok?

I think the check is needed, yes. But if we add just that check,
then we'll never do skb recycling in the tx path (since gianfar
always grabs irqsave spinlock during tx ring cleanup, so the check
will always return false).

So we must add the check to keep polling with disabled IRQs safe,
but we also should get rid of irqsave spinlock in the gianfar
driver (to make the skb recycling actually work in most cases).

Thanks!
diff mbox

Patch

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 197b358..a0ae604 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -1899,10 +1899,8 @@  static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	u32 lstatus;
 	int i, rq = 0;
 	u32 bufaddr;
-	unsigned long flags;
 	unsigned int nr_frags, length;
 
-
 	rq = skb->queue_mapping;
 	tx_queue = priv->tx_queue[rq];
 	txq = netdev_get_tx_queue(dev, rq);
@@ -1928,14 +1926,11 @@  static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* total number of fragments in the SKB */
 	nr_frags = skb_shinfo(skb)->nr_frags;
 
-	spin_lock_irqsave(&tx_queue->txlock, flags);
-
 	/* check if there is space to queue this packet */
 	if ((nr_frags+1) > tx_queue->num_txbdfree) {
 		/* no space, stop the queue */
 		netif_tx_stop_queue(txq);
 		dev->stats.tx_fifo_errors++;
-		spin_unlock_irqrestore(&tx_queue->txlock, flags);
 		return NETDEV_TX_BUSY;
 	}
 
@@ -2033,9 +2028,6 @@  static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Tell the DMA to go go go */
 	gfar_write(&regs->tstat, TSTAT_CLEAR_THALT >> tx_queue->qindex);
 
-	/* Unlock priv */
-	spin_unlock_irqrestore(&tx_queue->txlock, flags);
-
 	return NETDEV_TX_OK;
 }
 
@@ -2550,7 +2542,6 @@  static int gfar_poll(struct napi_struct *napi, int budget)
 	int tx_cleaned = 0, i, left_over_budget = budget;
 	unsigned long serviced_queues = 0;
 	int num_queues = 0;
-	unsigned long flags;
 
 	num_queues = gfargrp->num_rx_queues;
 	budget_per_queue = budget/num_queues;
@@ -2570,13 +2561,9 @@  static int gfar_poll(struct napi_struct *napi, int budget)
 			rx_queue = priv->rx_queue[i];
 			tx_queue = priv->tx_queue[rx_queue->qindex];
 
-			/* If we fail to get the lock,
-			 * don't bother with the TX BDs */
-			if (spin_trylock_irqsave(&tx_queue->txlock, flags)) {
-				tx_cleaned += gfar_clean_tx_ring(tx_queue);
-				spin_unlock_irqrestore(&tx_queue->txlock,
-							flags);
-			}
+			netif_tx_lock_bh(priv->ndev);
+			tx_cleaned += gfar_clean_tx_ring(tx_queue);
+			netif_tx_unlock_bh(priv->ndev);
 
 			rx_cleaned_per_queue = gfar_clean_rx_ring(rx_queue,
 							budget_per_queue);