diff mbox

NULL deref in bnx2 / crashes ? ( was: netconsole leads to stalled CPU task )

Message ID k14poh$7bs$1@ger.gmane.org
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Cong Wang Aug. 23, 2012, 8:31 a.m. UTC
On Thu, 23 Aug 2012 at 07:57 GMT, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Wed, 22 Aug 2012 at 14:29 GMT, Sylvain Munaut <s.munaut@whatever-company.com> wrote:
>> Hi,
>>
>>
>> The machine with the Intel card still hard-freezes (no output, nothing at all ...)
>> The machine with the bnx2 doesn't crash anymore and shows no NULL deref, but
>> the modprobe still hangs and I get this every 180 sec or so:
>
> The NULL-deref can be reproduced easily, and Eric's patch could fix it.
> So, Eric, can you resend your patch with your SOB?
>
> I can't reproduce the hang, as it is net-driver specific; it is
> probably related to my patch:
>
> commit 6bdb7fe31046ac50b47e83c35cd6c6b6160a475d
> Author: Amerigo Wang <amwang@redhat.com>
> Date:   Fri Aug 10 01:24:50 2012 +0000
>
>     netpoll: re-enable irq in poll_napi()
>

Could you test the following patch?


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Cong Wang Aug. 23, 2012, 9:12 a.m. UTC | #1
On Thu, 23 Aug 2012 at 08:31 GMT, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Thu, 23 Aug 2012 at 07:57 GMT, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> On Wed, 22 Aug 2012 at 14:29 GMT, Sylvain Munaut <s.munaut@whatever-company.com> wrote:
>>> Hi,
>>>
>>>
>>> The machine with the Intel card still hard-freezes (no output, nothing at all ...)
>>> The machine with the bnx2 doesn't crash anymore and shows no NULL deref, but
>>> the modprobe still hangs and I get this every 180 sec or so:
>>
>> The NULL-deref can be reproduced easily, and Eric's patch could fix it.
>> So, Eric, can you resend your patch with your SOB?
>>
>> I can't reproduce the hang, as it is net-driver specific; it is
>> probably related to my patch:
>>
>> commit 6bdb7fe31046ac50b47e83c35cd6c6b6160a475d
>> Author: Amerigo Wang <amwang@redhat.com>
>> Date:   Fri Aug 10 01:24:50 2012 +0000
>>
>>     netpoll: re-enable irq in poll_napi()
>>
>
> Could you test the following patch?
>
> diff --git a/net/core/netpoll.c b/net/core/netpoll.c
> index ddc453b..ed4d1e4 100644
> --- a/net/core/netpoll.c
> +++ b/net/core/netpoll.c
> @@ -166,11 +166,18 @@ static int poll_one_napi(struct netpoll_info *npinfo,
>  static void poll_napi(struct net_device *dev)
>  {
>  	struct napi_struct *napi;
> +	LIST_HEAD(napi_list);
>  	int budget = 16;
>  
>  	WARN_ON_ONCE(!irqs_disabled());
>  
> -	list_for_each_entry(napi, &dev->napi_list, dev_list) {
> +	/* After we enable the IRQ, new entries could be added
> +	 * to this list, we need to save it before re-enable
> +	 * IRQ.
> +	 */
> +	list_splice_tail(&dev->napi_list, &napi_list);
> +

This one should be list_splice_init()...


> +	list_for_each_entry(napi, &napi_list, dev_list) {
>  		local_irq_enable();
>  		if (napi->poll_owner != smp_processor_id() &&
>  		    spin_trylock(&napi->poll_lock)) {
> @@ -187,6 +194,7 @@ static void poll_napi(struct net_device *dev)
>  		}
>  		local_irq_disable();
>  	}
> +	list_splice_tail(&napi_list, &dev->napi_list);
>  }
>  
>  static void service_arp_queue(struct netpoll_info *npi)
>
>
>
>
> However, it seems we should take the rtnl lock to make sure dev->napi_list
> is really safe; I am not sure if the following one makes sense.
>
>
> diff --git a/net/core/netpoll.c b/net/core/netpoll.c
> index ddc453b..7770e2b 100644
> --- a/net/core/netpoll.c
> +++ b/net/core/netpoll.c
> @@ -170,8 +170,9 @@ static void poll_napi(struct net_device *dev)
>  
>  	WARN_ON_ONCE(!irqs_disabled());
>  
> +	local_irq_enable();
> +	rtnl_lock();
>  	list_for_each_entry(napi, &dev->napi_list, dev_list) {
> -		local_irq_enable();
>  		if (napi->poll_owner != smp_processor_id() &&
>  		    spin_trylock(&napi->poll_lock)) {
>  			rcu_read_lock_bh();
> @@ -180,13 +181,12 @@ static void poll_napi(struct net_device *dev)
>  			rcu_read_unlock_bh();
>  			spin_unlock(&napi->poll_lock);
>  
> -			if (!budget) {
> -				local_irq_disable();
> +			if (!budget)
>  				break;
> -			}
>  		}
> -		local_irq_disable();
>  	}
> +	rtnl_unlock();
> +	local_irq_disable();
>  }
>  
>  static void service_arp_queue(struct netpoll_info *npi)
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sylvain Munaut Aug. 24, 2012, 9:50 a.m. UTC | #2
Hi,

>>
>> Could you test the following patch?
>>
>> diff --git a/net/core/netpoll.c b/net/core/netpoll.c
>> index ddc453b..ed4d1e4 100644
>> --- a/net/core/netpoll.c
>> +++ b/net/core/netpoll.c
>> @@ -166,11 +166,18 @@ static int poll_one_napi(struct netpoll_info *npinfo,
>>  static void poll_napi(struct net_device *dev)
>>  {
>>       struct napi_struct *napi;
>> +     LIST_HEAD(napi_list);
>>       int budget = 16;
>>
>>       WARN_ON_ONCE(!irqs_disabled());
>>
>> -     list_for_each_entry(napi, &dev->napi_list, dev_list) {
>> +     /* After we enable the IRQ, new entries could be added
>> +      * to this list, we need to save it before re-enable
>> +      * IRQ.
>> +      */
>> +     list_splice_tail(&dev->napi_list, &napi_list);
>> +
>
> This one should be list_splice_init()...
>
>
>> +     list_for_each_entry(napi, &napi_list, dev_list) {
>>               local_irq_enable();
>>               if (napi->poll_owner != smp_processor_id() &&
>>                   spin_trylock(&napi->poll_lock)) {
>> @@ -187,6 +194,7 @@ static void poll_napi(struct net_device *dev)
>>               }
>>               local_irq_disable();
>>       }
>> +     list_splice_tail(&napi_list, &dev->napi_list);
>>  }
>>
>>  static void service_arp_queue(struct netpoll_info *npi)

I've just tested this patch on the Intel machine and the behavior didn't change.
When I do the netconsole modprobe, it sends a couple of lines, the
modprobe hangs, and then a couple of seconds later the whole machine
hangs, with nothing printed on the screen or anything.

Cheers,

    Sylvain
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cong Wang Aug. 25, 2012, 8:01 a.m. UTC | #3
On Fri, Aug 24, 2012 at 5:50 PM, Sylvain Munaut
<s.munaut@whatever-company.com> wrote:
>
> I've just tested this patch on the Intel machine and the behavior didn't change.
> When I do the netconsole modprobe, it sends a couple of lines, the
> modprobe hangs, and then a couple of seconds later the whole machine
> hangs, with nothing printed on the screen or anything.

Hi, Sylvain

I just sent a new patch:
http://marc.info/?l=linux-netdev&m=134588049503667&w=2

Please help test it.

Thanks a lot!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index ddc453b..ed4d1e4 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -166,11 +166,18 @@  static int poll_one_napi(struct netpoll_info *npinfo,
 static void poll_napi(struct net_device *dev)
 {
 	struct napi_struct *napi;
+	LIST_HEAD(napi_list);
 	int budget = 16;
 
 	WARN_ON_ONCE(!irqs_disabled());
 
-	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+	/* After we enable the IRQ, new entries could be added
+	 * to this list, we need to save it before re-enable
+	 * IRQ.
+	 */
+	list_splice_tail(&dev->napi_list, &napi_list);
+
+	list_for_each_entry(napi, &napi_list, dev_list) {
 		local_irq_enable();
 		if (napi->poll_owner != smp_processor_id() &&
 		    spin_trylock(&napi->poll_lock)) {
@@ -187,6 +194,7 @@  static void poll_napi(struct net_device *dev)
 		}
 		local_irq_disable();
 	}
+	list_splice_tail(&napi_list, &dev->napi_list);
 }
 
 static void service_arp_queue(struct netpoll_info *npi)




However, it seems we should take the rtnl lock to make sure dev->napi_list
is really safe; I am not sure if the following one makes sense.


diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index ddc453b..7770e2b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -170,8 +170,9 @@  static void poll_napi(struct net_device *dev)
 
 	WARN_ON_ONCE(!irqs_disabled());
 
+	local_irq_enable();
+	rtnl_lock();
 	list_for_each_entry(napi, &dev->napi_list, dev_list) {
-		local_irq_enable();
 		if (napi->poll_owner != smp_processor_id() &&
 		    spin_trylock(&napi->poll_lock)) {
 			rcu_read_lock_bh();
@@ -180,13 +181,12 @@  static void poll_napi(struct net_device *dev)
 			rcu_read_unlock_bh();
 			spin_unlock(&napi->poll_lock);
 
-			if (!budget) {
-				local_irq_disable();
+			if (!budget)
 				break;
-			}
 		}
-		local_irq_disable();
 	}
+	rtnl_unlock();
+	local_irq_disable();
 }
 
 static void service_arp_queue(struct netpoll_info *npi)