diff mbox series

[Bionic,v2] ixgbe/ixgbevf: Free IRQ when PCI error recovery removes the device

Message ID 1528810057-7417-1-git-send-email-maurosr@linux.vnet.ibm.com
State New
Headers show
Series [Bionic,v2] ixgbe/ixgbevf: Free IRQ when PCI error recovery removes the device | expand

Commit Message

Mauro S. M. Rodrigues June 12, 2018, 1:27 p.m. UTC
From: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>

BugLink: http://bugs.launchpad.net/bugs/1776389

Since commit f7f37e7ff2b9 ("ixgbe: handle close/suspend race with
netif_device_detach/present") ixgbe_close_suspend is called, from
ixgbe_close, only if the device is present, i.e. if it isn't detached.
That exposed a situation where IRQs weren't freed if a PCI error
recovery system opts to remove the device. For such case the pci channel
state is set to pci_channel_io_perm_failure and ixgbe_io_error_detected
was returning PCI_ERS_RESULT_DISCONNECT before calling
ixgbe_close_suspend consequentially not freeing IRQ and crashing when
the remove handler calls pci_disable_device, hitting a BUG_ON at
free_msi_irqs, which asserts that there is no non-free IRQ associated
with the device to be removed:

BUG_ON(irq_has_action(entry->irq + i));

The issue is fixed by calling the ixgbe_close_suspend before evaluate
the pci channel state.

Reported-by: Naresh Bannoth <nbannoth@in.ibm.com>
Reported-by: Abdul Haleem <abdhalee@in.ibm.com>
Signed-off-by: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>
Reviewed-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
(cherry picked from commit b212d815e77c72be921979119c715166cc8987b1)
Signed-off-by: Mauro S. M. Rodrigues <maurosr@linux.vnet.ibm.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 6 +++---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

Comments

Stefan Bader June 13, 2018, 2:55 p.m. UTC | #1
On 12.06.2018 15:27, Mauro S. M. Rodrigues wrote:
> From: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>
> 
> BugLink: http://bugs.launchpad.net/bugs/1776389
> 
> Since commit f7f37e7ff2b9 ("ixgbe: handle close/suspend race with
> netif_device_detach/present") ixgbe_close_suspend is called, from
> ixgbe_close, only if the device is present, i.e. if it isn't detached.
> That exposed a situation where IRQs weren't freed if a PCI error
> recovery system opts to remove the device. For such case the pci channel
> state is set to pci_channel_io_perm_failure and ixgbe_io_error_detected
> was returning PCI_ERS_RESULT_DISCONNECT before calling
> ixgbe_close_suspend consequentially not freeing IRQ and crashing when
> the remove handler calls pci_disable_device, hitting a BUG_ON at
> free_msi_irqs, which asserts that there is no non-free IRQ associated
> with the device to be removed:
> 
> BUG_ON(irq_has_action(entry->irq + i));
> 
> The issue is fixed by calling the ixgbe_close_suspend before evaluate
> the pci channel state.
> 
> Reported-by: Naresh Bannoth <nbannoth@in.ibm.com>
> Reported-by: Abdul Haleem <abdhalee@in.ibm.com>
> Signed-off-by: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>
> Reviewed-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> (cherry picked from commit b212d815e77c72be921979119c715166cc8987b1)
> Signed-off-by: Mauro S. M. Rodrigues <maurosr@linux.vnet.ibm.com>
Acked-by: Stefan Bader <stefan.bader@canonical.com>
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 6 +++---
>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 +++---
>  2 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index a549c4870882..e1326c0603af 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -10910,14 +10910,14 @@ static pci_ers_result_t ixgbe_io_error_detected(struct pci_dev *pdev,
>  	rtnl_lock();
>  	netif_device_detach(netdev);
>  
> +	if (netif_running(netdev))
> +		ixgbe_close_suspend(adapter);
> +
>  	if (state == pci_channel_io_perm_failure) {
>  		rtnl_unlock();
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
> -	if (netif_running(netdev))
> -		ixgbe_close_suspend(adapter);
> -
>  	if (!test_and_set_bit(__IXGBE_DISABLED, &adapter->state))
>  		pci_disable_device(pdev);
>  	rtnl_unlock();
> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> index 1f4a69134ade..64859f8318c8 100644
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> @@ -4241,14 +4241,14 @@ static pci_ers_result_t ixgbevf_io_error_detected(struct pci_dev *pdev,
>  	rtnl_lock();
>  	netif_device_detach(netdev);
>  
> +	if (netif_running(netdev))
> +		ixgbevf_close_suspend(adapter);
> +
>  	if (state == pci_channel_io_perm_failure) {
>  		rtnl_unlock();
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
> -	if (netif_running(netdev))
> -		ixgbevf_close_suspend(adapter);
> -
>  	if (!test_and_set_bit(__IXGBEVF_DISABLED, &adapter->state))
>  		pci_disable_device(pdev);
>  	rtnl_unlock();
>
Khalid Elmously June 18, 2018, 6:55 p.m. UTC | #2
On 2018-06-12 10:27:37 , Mauro S. M. Rodrigues wrote:
> From: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>
> 
> BugLink: http://bugs.launchpad.net/bugs/1776389
> 
> Since commit f7f37e7ff2b9 ("ixgbe: handle close/suspend race with
> netif_device_detach/present") ixgbe_close_suspend is called, from
> ixgbe_close, only if the device is present, i.e. if it isn't detached.
> That exposed a situation where IRQs weren't freed if a PCI error
> recovery system opts to remove the device. For such case the pci channel
> state is set to pci_channel_io_perm_failure and ixgbe_io_error_detected
> was returning PCI_ERS_RESULT_DISCONNECT before calling
> ixgbe_close_suspend consequentially not freeing IRQ and crashing when
> the remove handler calls pci_disable_device, hitting a BUG_ON at
> free_msi_irqs, which asserts that there is no non-free IRQ associated
> with the device to be removed:
> 
> BUG_ON(irq_has_action(entry->irq + i));
> 
> The issue is fixed by calling the ixgbe_close_suspend before evaluate
> the pci channel state.
> 
> Reported-by: Naresh Bannoth <nbannoth@in.ibm.com>
> Reported-by: Abdul Haleem <abdhalee@in.ibm.com>
> Signed-off-by: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>
> Reviewed-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> (cherry picked from commit b212d815e77c72be921979119c715166cc8987b1)
> Signed-off-by: Mauro S. M. Rodrigues <maurosr@linux.vnet.ibm.com>
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 6 +++---
>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 +++---
>  2 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index a549c4870882..e1326c0603af 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -10910,14 +10910,14 @@ static pci_ers_result_t ixgbe_io_error_detected(struct pci_dev *pdev,
>  	rtnl_lock();
>  	netif_device_detach(netdev);
>  
> +	if (netif_running(netdev))
> +		ixgbe_close_suspend(adapter);
> +
>  	if (state == pci_channel_io_perm_failure) {
>  		rtnl_unlock();
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
> -	if (netif_running(netdev))
> -		ixgbe_close_suspend(adapter);
> -
>  	if (!test_and_set_bit(__IXGBE_DISABLED, &adapter->state))
>  		pci_disable_device(pdev);
>  	rtnl_unlock();
> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> index 1f4a69134ade..64859f8318c8 100644
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> @@ -4241,14 +4241,14 @@ static pci_ers_result_t ixgbevf_io_error_detected(struct pci_dev *pdev,
>  	rtnl_lock();
>  	netif_device_detach(netdev);
>  
> +	if (netif_running(netdev))
> +		ixgbevf_close_suspend(adapter);
> +
>  	if (state == pci_channel_io_perm_failure) {
>  		rtnl_unlock();
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
> -	if (netif_running(netdev))
> -		ixgbevf_close_suspend(adapter);
> -
>  	if (!test_and_set_bit(__IXGBEVF_DISABLED, &adapter->state))
>  		pci_disable_device(pdev);
>  	rtnl_unlock();

Acked-by: Khalid Elmously <khalid.elmously@canonical.com>
Khalid Elmously June 19, 2018, 6:14 p.m. UTC | #3
Applied to Bionic

On 2018-06-12 10:27:37 , Mauro S. M. Rodrigues wrote:
> From: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>
> 
> BugLink: http://bugs.launchpad.net/bugs/1776389
> 
> Since commit f7f37e7ff2b9 ("ixgbe: handle close/suspend race with
> netif_device_detach/present") ixgbe_close_suspend is called, from
> ixgbe_close, only if the device is present, i.e. if it isn't detached.
> That exposed a situation where IRQs weren't freed if a PCI error
> recovery system opts to remove the device. For such case the pci channel
> state is set to pci_channel_io_perm_failure and ixgbe_io_error_detected
> was returning PCI_ERS_RESULT_DISCONNECT before calling
> ixgbe_close_suspend consequentially not freeing IRQ and crashing when
> the remove handler calls pci_disable_device, hitting a BUG_ON at
> free_msi_irqs, which asserts that there is no non-free IRQ associated
> with the device to be removed:
> 
> BUG_ON(irq_has_action(entry->irq + i));
> 
> The issue is fixed by calling the ixgbe_close_suspend before evaluate
> the pci channel state.
> 
> Reported-by: Naresh Bannoth <nbannoth@in.ibm.com>
> Reported-by: Abdul Haleem <abdhalee@in.ibm.com>
> Signed-off-by: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>
> Reviewed-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> (cherry picked from commit b212d815e77c72be921979119c715166cc8987b1)
> Signed-off-by: Mauro S. M. Rodrigues <maurosr@linux.vnet.ibm.com>
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 6 +++---
>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 +++---
>  2 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index a549c4870882..e1326c0603af 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -10910,14 +10910,14 @@ static pci_ers_result_t ixgbe_io_error_detected(struct pci_dev *pdev,
>  	rtnl_lock();
>  	netif_device_detach(netdev);
>  
> +	if (netif_running(netdev))
> +		ixgbe_close_suspend(adapter);
> +
>  	if (state == pci_channel_io_perm_failure) {
>  		rtnl_unlock();
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
> -	if (netif_running(netdev))
> -		ixgbe_close_suspend(adapter);
> -
>  	if (!test_and_set_bit(__IXGBE_DISABLED, &adapter->state))
>  		pci_disable_device(pdev);
>  	rtnl_unlock();
> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> index 1f4a69134ade..64859f8318c8 100644
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> @@ -4241,14 +4241,14 @@ static pci_ers_result_t ixgbevf_io_error_detected(struct pci_dev *pdev,
>  	rtnl_lock();
>  	netif_device_detach(netdev);
>  
> +	if (netif_running(netdev))
> +		ixgbevf_close_suspend(adapter);
> +
>  	if (state == pci_channel_io_perm_failure) {
>  		rtnl_unlock();
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
> -	if (netif_running(netdev))
> -		ixgbevf_close_suspend(adapter);
> -
>  	if (!test_and_set_bit(__IXGBEVF_DISABLED, &adapter->state))
>  		pci_disable_device(pdev);
>  	rtnl_unlock();
> -- 
> 2.11.0
> 
> 
> -- 
> kernel-team mailing list
> kernel-team@lists.ubuntu.com
> https://lists.ubuntu.com/mailman/listinfo/kernel-team
Thadeu Lima de Souza Cascardo June 25, 2018, 1:26 p.m. UTC | #4
Applied to unstable/master.

Thanks.
Cascardo.

Applied-to: unstable/master
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a549c4870882..e1326c0603af 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10910,14 +10910,14 @@  static pci_ers_result_t ixgbe_io_error_detected(struct pci_dev *pdev,
 	rtnl_lock();
 	netif_device_detach(netdev);
 
+	if (netif_running(netdev))
+		ixgbe_close_suspend(adapter);
+
 	if (state == pci_channel_io_perm_failure) {
 		rtnl_unlock();
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 
-	if (netif_running(netdev))
-		ixgbe_close_suspend(adapter);
-
 	if (!test_and_set_bit(__IXGBE_DISABLED, &adapter->state))
 		pci_disable_device(pdev);
 	rtnl_unlock();
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 1f4a69134ade..64859f8318c8 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -4241,14 +4241,14 @@  static pci_ers_result_t ixgbevf_io_error_detected(struct pci_dev *pdev,
 	rtnl_lock();
 	netif_device_detach(netdev);
 
+	if (netif_running(netdev))
+		ixgbevf_close_suspend(adapter);
+
 	if (state == pci_channel_io_perm_failure) {
 		rtnl_unlock();
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 
-	if (netif_running(netdev))
-		ixgbevf_close_suspend(adapter);
-
 	if (!test_and_set_bit(__IXGBEVF_DISABLED, &adapter->state))
 		pci_disable_device(pdev);
 	rtnl_unlock();