Patchwork [RFC] e100: Fix workqueue race

login
register
mail settings
Submitter Alan Cox
Date Jan. 21, 2010, 4:48 p.m.
Message ID <20100121164801.416170b9@linux.intel.com>
Download mbox | patch
Permalink /patch/43442/
State Awaiting Upstream
Delegated to: David Miller
Headers show

Comments

Alan Cox - Jan. 21, 2010, 4:48 p.m.
(Incidentally this doesn't seem to be the only net driver that looks
suspect here)

e100: Fix the TX workqueue race

From: Alan Cox <alan@linux.intel.com>

Nothing stops the workqueue being left to run in parallel with close or a
few other operations. This causes double unmaps and the like.

See kerneloops.org #1041230 for an example

Signed-off-by: Alan Cox <alan@linux.intel.com>
---

 drivers/net/e100.c |   13 +++++++++++--
 1 files changed, 11 insertions(+), 2 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger - Jan. 21, 2010, 5:20 p.m.
On Thu, 21 Jan 2010 16:48:01 +0000
Alan Cox <alan@linux.intel.com> wrote:

> (Incidentally this doesn't seem to be the only net driver that looks
> suspect here)
> 
> e100: Fix the TX workqueue race
> 
> From: Alan Cox <alan@linux.intel.com>
> 
> Nothing stops the workqueue being left to run in parallel with close or a
> few other operations. This causes double unmaps and the like.
> 
> See kerneloops.org #1041230 for an example
> 
> Signed-off-by: Alan Cox <alan@linux.intel.com>

Most drivers solve this by getting rtnl_lock in the timeout work
function.
Jarek Poplawski - Jan. 22, 2010, 8:42 a.m.
On 21-01-2010 17:48, Alan Cox wrote:
> (Incidentally this doesn't seem to be the only net driver that looks
> suspect here)
> 
> e100: Fix the TX workqueue race
> 
> From: Alan Cox <alan@linux.intel.com>
> 
> Nothing stops the workqueue being left to run in parallel with close or a
> few other operations. This causes double unmaps and the like.
> 
> See kerneloops.org #1041230 for an example
> 
> Signed-off-by: Alan Cox <alan@linux.intel.com>
> ---
> 
>  drivers/net/e100.c |   13 +++++++++++--
>  1 files changed, 11 insertions(+), 2 deletions(-)
> 
> 
> diff --git a/drivers/net/e100.c b/drivers/net/e100.c
> index 5c7a155..5e02e4f 100644
> --- a/drivers/net/e100.c
> +++ b/drivers/net/e100.c
> @@ -2232,7 +2232,7 @@ err_rx_clean_list:
>  	return err;
>  }
>  
> -static void e100_down(struct nic *nic)
> +static void e100_do_down(struct nic *nic)
>  {
>  	/* wait here for poll to complete */
>  	napi_disable(&nic->napi);
> @@ -2245,6 +2245,15 @@ static void e100_down(struct nic *nic)
>  	e100_rx_clean_list(nic);
>  }
>  
> +/* For the non TX timeout case we want to kill the tx timeout before
> +   we do this otherwise a parallel tx timeout will make a nasty mess. */
> +
> +static void e100_down(struct nic *nic)
> +{
> +	cancel_work_sync(&nic->tx_timeout_task);

Can't tx_timeout_task be triggered just between these two calls here?

Jarek P.

> +	e100_do_down(nic);
> +}
> +
>  static void e100_tx_timeout(struct net_device *netdev)
>  {
>  	struct nic *nic = netdev_priv(netdev);
> @@ -2261,7 +2270,7 @@ static void e100_tx_timeout_task(struct work_struct *work)
>  
>  	DPRINTK(TX_ERR, DEBUG, "scb.status=0x%02X\n",
>  		ioread8(&nic->csr->scb.status));
> -	e100_down(netdev_priv(netdev));
> +	e100_do_down(netdev_priv(netdev));
>  	e100_up(netdev_priv(netdev));
>  }
>  
> --
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski - Jan. 22, 2010, 9:07 a.m.
On Fri, Jan 22, 2010 at 08:42:00AM +0000, Jarek Poplawski wrote:
> On 21-01-2010 17:48, Alan Cox wrote:
> > (Incidentally this doesn't seem to be the only net driver that looks
> > suspect here)
> > 
> > e100: Fix the TX workqueue race
> > 
> > From: Alan Cox <alan@linux.intel.com>
> > 
> > Nothing stops the workqueue being left to run in parallel with close or a
> > few other operations. This causes double unmaps and the like.
> > 
> > See kerneloops.org #1041230 for an example
> > 
> > Signed-off-by: Alan Cox <alan@linux.intel.com>
> > ---
> > 
> >  drivers/net/e100.c |   13 +++++++++++--
> >  1 files changed, 11 insertions(+), 2 deletions(-)
> > 
> > 
> > diff --git a/drivers/net/e100.c b/drivers/net/e100.c
> > index 5c7a155..5e02e4f 100644
> > --- a/drivers/net/e100.c
> > +++ b/drivers/net/e100.c
> > @@ -2232,7 +2232,7 @@ err_rx_clean_list:
> >  	return err;
> >  }
> >  
> > -static void e100_down(struct nic *nic)
> > +static void e100_do_down(struct nic *nic)
> >  {
> >  	/* wait here for poll to complete */
> >  	napi_disable(&nic->napi);
> > @@ -2245,6 +2245,15 @@ static void e100_down(struct nic *nic)
> >  	e100_rx_clean_list(nic);
> >  }
> >  
> > +/* For the non TX timeout case we want to kill the tx timeout before
> > +   we do this otherwise a parallel tx timeout will make a nasty mess. */
> > +
> > +static void e100_down(struct nic *nic)
> > +{
> > +	cancel_work_sync(&nic->tx_timeout_task);
> 
> Can't tx_timeout_task be triggered just between these two calls here?

More precisely: except when this is called from dev_close(), where it
should work OK. (At least as long as tx_timeout_task doesn't take any
lock held here - especially rtnl_lock.)

Jarek P.

> 
> > +	e100_do_down(nic);
> > +}
> > +
> >  static void e100_tx_timeout(struct net_device *netdev)
> >  {
> >  	struct nic *nic = netdev_priv(netdev);
> > @@ -2261,7 +2270,7 @@ static void e100_tx_timeout_task(struct work_struct *work)
> >  
> >  	DPRINTK(TX_ERR, DEBUG, "scb.status=0x%02X\n",
> >  		ioread8(&nic->csr->scb.status));
> > -	e100_down(netdev_priv(netdev));
> > +	e100_do_down(netdev_priv(netdev));
> >  	e100_up(netdev_priv(netdev));
> >  }
> >  
> > --
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski - Jan. 22, 2010, 9:38 a.m.
On Fri, Jan 22, 2010 at 09:07:31AM +0000, Jarek Poplawski wrote:
> On Fri, Jan 22, 2010 at 08:42:00AM +0000, Jarek Poplawski wrote:
> > On 21-01-2010 17:48, Alan Cox wrote:
> > > (Incidentally this doesn't seem to be the only net driver that looks
> > > suspect here)
> > > 
> > > e100: Fix the TX workqueue race
> > > 
> > > From: Alan Cox <alan@linux.intel.com>
> > > 
> > > Nothing stops the workqueue being left to run in parallel with close or a
> > > few other operations. This causes double unmaps and the like.
> > > 
> > > See kerneloops.org #1041230 for an example
> > > 
> > > Signed-off-by: Alan Cox <alan@linux.intel.com>
> > > ---
> > > 
> > >  drivers/net/e100.c |   13 +++++++++++--
> > >  1 files changed, 11 insertions(+), 2 deletions(-)
> > > 
> > > 
> > > diff --git a/drivers/net/e100.c b/drivers/net/e100.c
> > > index 5c7a155..5e02e4f 100644
> > > --- a/drivers/net/e100.c
> > > +++ b/drivers/net/e100.c
> > > @@ -2232,7 +2232,7 @@ err_rx_clean_list:
> > >  	return err;
> > >  }
> > >  
> > > -static void e100_down(struct nic *nic)
> > > +static void e100_do_down(struct nic *nic)
> > >  {
> > >  	/* wait here for poll to complete */
> > >  	napi_disable(&nic->napi);
> > > @@ -2245,6 +2245,15 @@ static void e100_down(struct nic *nic)
> > >  	e100_rx_clean_list(nic);
> > >  }
> > >  
> > > +/* For the non TX timeout case we want to kill the tx timeout before
> > > +   we do this otherwise a parallel tx timeout will make a nasty mess. */
> > > +
> > > +static void e100_down(struct nic *nic)
> > > +{
> > > +	cancel_work_sync(&nic->tx_timeout_task);
> > 
> > Can't tx_timeout_task be triggered just between these two calls here?
> 
> More exactly: except when this is called from dev_close(), where it
> should work OK. (At least until tx_timeout_task doesn't take any lock
> held here - especially rtnl_lock.)

Hmm... Even more precisely: since tx_timeout_task can be triggered by
more than just dev_watchdog(), the dev_close() path is suspect too.

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 5c7a155..5e02e4f 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -2232,7 +2232,7 @@  err_rx_clean_list:
 	return err;
 }
 
-static void e100_down(struct nic *nic)
+static void e100_do_down(struct nic *nic)
 {
 	/* wait here for poll to complete */
 	napi_disable(&nic->napi);
@@ -2245,6 +2245,15 @@  static void e100_down(struct nic *nic)
 	e100_rx_clean_list(nic);
 }
 
+/* For the non TX timeout case we want to kill the tx timeout before
+   we do this otherwise a parallel tx timeout will make a nasty mess. */
+
+static void e100_down(struct nic *nic)
+{
+	cancel_work_sync(&nic->tx_timeout_task);
+	e100_do_down(nic);
+}
+
 static void e100_tx_timeout(struct net_device *netdev)
 {
 	struct nic *nic = netdev_priv(netdev);
@@ -2261,7 +2270,7 @@  static void e100_tx_timeout_task(struct work_struct *work)
 
 	DPRINTK(TX_ERR, DEBUG, "scb.status=0x%02X\n",
 		ioread8(&nic->csr->scb.status));
-	e100_down(netdev_priv(netdev));
+	e100_do_down(netdev_priv(netdev));
 	e100_up(netdev_priv(netdev));
 }