diff mbox series

[v1] igc: Add pcie error handler support

Message ID 20200127115831.18047-1-sasha.neftin@intel.com
State Superseded
Delegated to: Jeff Kirsher
Headers show
Series [v1] igc: Add pcie error handler support | expand

Commit Message

Sasha Neftin Jan. 27, 2020, 11:58 a.m. UTC
Add pcie error detection, slot reset and resume capability

Signed-off-by: Sasha Neftin <sasha.neftin@intel.com>
---
 drivers/net/ethernet/intel/igc/igc_main.c | 105 ++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

Comments

Paul Menzel Jan. 27, 2020, 2:53 p.m. UTC | #1
Dear Sasha,


Thank you for your patch set.


On 2020-01-27 12:58, Sasha Neftin wrote:
> Add pcie error detection, slot reset and resume capability

Tested how?

> Signed-off-by: Sasha Neftin <sasha.neftin@intel.com>
> ---
>  drivers/net/ethernet/intel/igc/igc_main.c | 105 ++++++++++++++++++++++++++++++
>  1 file changed, 105 insertions(+)
> 
> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> index fa72460e255a..b0656ae2fcb3 100644
> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> @@ -5076,6 +5076,110 @@ static void igc_shutdown(struct pci_dev *pdev)
>  	}
>  }
>  
> +/**
> + *  igc_io_error_detected - called when PCI error is detected
> + *  @pdev: Pointer to PCI device
> + *  @state: The current pci connection state

PCI

> + *
> + *  This function is called after a PCI bus error affecting
> + *  this device has been detected.

Document the two possible return codes?

> + **/
> +static pci_ers_result_t igc_io_error_detected(struct pci_dev *pdev,
> +					      pci_channel_state_t state)
> +{
> +	struct net_device *netdev = pci_get_drvdata(pdev);
> +	struct igc_adapter *adapter = netdev_priv(netdev);
> +
> +	netif_device_detach(netdev);
> +
> +	if (state == pci_channel_io_perm_failure)
> +		return PCI_ERS_RESULT_DISCONNECT;
> +
> +	if (netif_running(netdev))
> +		igc_down(adapter);
> +	pci_disable_device(pdev);
> +
> +	/* Request a slot slot reset. */

One slot?

> +	return PCI_ERS_RESULT_NEED_RESET;
> +}
> +
> +/**
> + *  igc_io_slot_reset - called after the pci bus has been reset.

PCI

> + *  @pdev: Pointer to PCI device
> + *
> + *  Restart the card from scratch, as if from a cold-boot. Implementation
> + *  resembles the first-half of the igc_resume routine.

Shouldn’t the common code be factored out then?

> + **/
> +static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev)
> +{
> +	struct net_device *netdev = pci_get_drvdata(pdev);
> +	struct igc_adapter *adapter = netdev_priv(netdev);
> +	struct igc_hw *hw = &adapter->hw;
> +	pci_ers_result_t result;
> +
> +	if (pci_enable_device_mem(pdev)) {
> +		dev_err(&pdev->dev,
> +			"Cannot re-enable PCI device after reset.\n");

*Could not*, so it’s clear, it was tried, and not a policy decision.

> +		result = PCI_ERS_RESULT_DISCONNECT;
> +	} else {
> +		pci_set_master(pdev);
> +		pci_restore_state(pdev);
> +		pci_save_state(pdev);
> +
> +		pci_enable_wake(pdev, PCI_D3hot, 0);
> +		pci_enable_wake(pdev, PCI_D3cold, 0);
> +
> +		/* In case of PCI error, adapter lose its HW address

lose*s*

> +		 * so we should re-assign it here.
> +		 */
> +		hw->hw_addr = adapter->io_addr;
> +
> +		igc_reset(adapter);
> +		wr32(IGC_WUS, ~0);
> +		result = PCI_ERS_RESULT_RECOVERED;
> +	}
> +
> +	return result;

You can get rid of the if-else statement, by returning in the if branch,
and use the else branch as the follow-on(?). Then you can return the
result directly too, and even remove the variable `result`.

> +}
> +
> +/**
> + *  igc_io_resume - called when traffic can start flowing again.

start to flow

> + *  @pdev: Pointer to PCI device
> + *
> + *  This callback is called when the error recovery driver tells us that
> + *  its OK to resume normal operation. Implementation resembles the
> + *  second-half of the igc_resume routine.
> + */
> +static void igc_io_resume(struct pci_dev *pdev)
> +{
> +	struct net_device *netdev = pci_get_drvdata(pdev);
> +	struct igc_adapter *adapter = netdev_priv(netdev);
> +	u32 err = 0;
> +
> +	rtnl_lock();
> +	if (netif_running(netdev)) {
> +		err = igc_open(netdev);
> +		if (err) {

Use `if (igc_open(netdev)) {`, and remove variable `err`?

> +			dev_err(&pdev->dev, "igic_open failed after reset\n");
> +			return;
> +		}
> +	}
> +
> +	netif_device_attach(netdev);
> +
> +	/* let the f/w know that the h/w is now under the control of the
> +	 * driver.
> +	 */
> +	igc_get_hw_control(adapter);
> +	rtnl_unlock();
> +}
> +
> +static const struct pci_error_handlers igc_err_handler = {
> +	.error_detected = igc_io_error_detected,
> +	.slot_reset = igc_io_slot_reset,
> +	.resume = igc_io_resume,
> +};
> +
>  #ifdef CONFIG_PM
>  static const struct dev_pm_ops igc_pm_ops = {
>  	SET_SYSTEM_SLEEP_PM_OPS(igc_suspend, igc_resume)
> @@ -5093,6 +5197,7 @@ static struct pci_driver igc_driver = {
>  	.driver.pm = &igc_pm_ops,
>  #endif
>  	.shutdown = igc_shutdown,
> +	.err_handler = &igc_err_handler,
>  };
>  
>  /**
>
Sasha Neftin Jan. 28, 2020, 7:31 a.m. UTC | #2
On 1/27/2020 16:53, Paul Menzel wrote:
> Dear Sasha,
> 
> 
> Thank you for your patch set.
> 
> 
> On 2020-01-27 12:58, Sasha Neftin wrote:
>> Add pcie error detection, slot reset and resume capability
> 
> Tested how?
> 
Tested on the platform with error-prone PCIe slot and system has been 
recovered. To more testing, I would recommend using peripheral 
equipment. Lecroy's PCIe test board could be a good option.
>> Signed-off-by: Sasha Neftin <sasha.neftin@intel.com>
>> ---
>>   drivers/net/ethernet/intel/igc/igc_main.c | 105 ++++++++++++++++++++++++++++++
>>   1 file changed, 105 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
>> index fa72460e255a..b0656ae2fcb3 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_main.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
>> @@ -5076,6 +5076,110 @@ static void igc_shutdown(struct pci_dev *pdev)
>>   	}
>>   }
>>   
>> +/**
>> + *  igc_io_error_detected - called when PCI error is detected
>> + *  @pdev: Pointer to PCI device
>> + *  @state: The current pci connection state
> 
> PCI
> 
good. I will change.
>> + *
>> + *  This function is called after a PCI bus error affecting
>> + *  this device has been detected.
> 
> Document the two possible return codes?
> 
I will elaborate on the comment.
>> + **/
>> +static pci_ers_result_t igc_io_error_detected(struct pci_dev *pdev,
>> +					      pci_channel_state_t state)
>> +{
>> +	struct net_device *netdev = pci_get_drvdata(pdev);
>> +	struct igc_adapter *adapter = netdev_priv(netdev);
>> +
>> +	netif_device_detach(netdev);
>> +
>> +	if (state == pci_channel_io_perm_failure)
>> +		return PCI_ERS_RESULT_DISCONNECT;
>> +
>> +	if (netif_running(netdev))
>> +		igc_down(adapter);
>> +	pci_disable_device(pdev);
>> +
>> +	/* Request a slot slot reset. */
> 
> One slot?
> 
yes, typo.
>> +	return PCI_ERS_RESULT_NEED_RESET;
>> +}
>> +
>> +/**
>> + *  igc_io_slot_reset - called after the pci bus has been reset.
> 
> PCI
> 
ok.
>> + *  @pdev: Pointer to PCI device
>> + *
>> + *  Restart the card from scratch, as if from a cold-boot. Implementation
>> + *  resembles the first-half of the igc_resume routine.
> 
> Shouldn’t the common code be factored out then?
> 
I prefer stay align with our legacy code in other drivers.
>> + **/
>> +static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev)
>> +{
>> +	struct net_device *netdev = pci_get_drvdata(pdev);
>> +	struct igc_adapter *adapter = netdev_priv(netdev);
>> +	struct igc_hw *hw = &adapter->hw;
>> +	pci_ers_result_t result;
>> +
>> +	if (pci_enable_device_mem(pdev)) {
>> +		dev_err(&pdev->dev,
>> +			"Cannot re-enable PCI device after reset.\n");
> 
> *Could not*, so it’s clear, it was tried, and not a policy decision.
> 
ok.
>> +		result = PCI_ERS_RESULT_DISCONNECT;
>> +	} else {
>> +		pci_set_master(pdev);
>> +		pci_restore_state(pdev);
>> +		pci_save_state(pdev);
>> +
>> +		pci_enable_wake(pdev, PCI_D3hot, 0);
>> +		pci_enable_wake(pdev, PCI_D3cold, 0);
>> +
>> +		/* In case of PCI error, adapter lose its HW address
> 
> lose*s*
> 
ok.
>> +		 * so we should re-assign it here.
>> +		 */
>> +		hw->hw_addr = adapter->io_addr;
>> +
>> +		igc_reset(adapter);
>> +		wr32(IGC_WUS, ~0);
>> +		result = PCI_ERS_RESULT_RECOVERED;
>> +	}
>> +
>> +	return result;
> 
> You can get rid of the if-else statement, by returning in the if branch,
> and use the else branch as the follow-on(?). Then you can return the
> result directly too, and even remove the variable `result`.
> 
I prefer stay with a legacy code to align with other ours drivers.
>> +}
>> +
>> +/**
>> + *  igc_io_resume - called when traffic can start flowing again.
> 
> start to flow
> 
ok, thanks.
>> + *  @pdev: Pointer to PCI device
>> + *
>> + *  This callback is called when the error recovery driver tells us that
>> + *  its OK to resume normal operation. Implementation resembles the
>> + *  second-half of the igc_resume routine.
>> + */
>> +static void igc_io_resume(struct pci_dev *pdev)
>> +{
>> +	struct net_device *netdev = pci_get_drvdata(pdev);
>> +	struct igc_adapter *adapter = netdev_priv(netdev);
>> +	u32 err = 0;
>> +
>> +	rtnl_lock();
>> +	if (netif_running(netdev)) {
>> +		err = igc_open(netdev);
>> +		if (err) {
> 
> Use `if (igc_open(netdev)) {`, and remove variable `err`?
> 
good idea. thanks
>> +			dev_err(&pdev->dev, "igic_open failed after reset\n");
>> +			return;
>> +		}
>> +	}
>> +
>> +	netif_device_attach(netdev);
>> +
>> +	/* let the f/w know that the h/w is now under the control of the
>> +	 * driver.
>> +	 */
>> +	igc_get_hw_control(adapter);
>> +	rtnl_unlock();
>> +}
>> +
>> +static const struct pci_error_handlers igc_err_handler = {
>> +	.error_detected = igc_io_error_detected,
>> +	.slot_reset = igc_io_slot_reset,
>> +	.resume = igc_io_resume,
>> +};
>> +
>>   #ifdef CONFIG_PM
>>   static const struct dev_pm_ops igc_pm_ops = {
>>   	SET_SYSTEM_SLEEP_PM_OPS(igc_suspend, igc_resume)
>> @@ -5093,6 +5197,7 @@ static struct pci_driver igc_driver = {
>>   	.driver.pm = &igc_pm_ops,
>>   #endif
>>   	.shutdown = igc_shutdown,
>> +	.err_handler = &igc_err_handler,
>>   };
>>   
>>   /**
>>
> 
Paul, thanks for your feedback - I will submit v2 and address your comments.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index fa72460e255a..b0656ae2fcb3 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -5076,6 +5076,110 @@  static void igc_shutdown(struct pci_dev *pdev)
 	}
 }
 
+/**
+ *  igc_io_error_detected - called when PCI error is detected
+ *  @pdev: Pointer to PCI device
+ *  @state: The current pci connection state
+ *
+ *  This function is called after a PCI bus error affecting
+ *  this device has been detected.
+ **/
+static pci_ers_result_t igc_io_error_detected(struct pci_dev *pdev,
+					      pci_channel_state_t state)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct igc_adapter *adapter = netdev_priv(netdev);
+
+	netif_device_detach(netdev);
+
+	if (state == pci_channel_io_perm_failure)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	if (netif_running(netdev))
+		igc_down(adapter);
+	pci_disable_device(pdev);
+
+	/* Request a slot slot reset. */
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ *  igc_io_slot_reset - called after the pci bus has been reset.
+ *  @pdev: Pointer to PCI device
+ *
+ *  Restart the card from scratch, as if from a cold-boot. Implementation
+ *  resembles the first-half of the igc_resume routine.
+ **/
+static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct igc_adapter *adapter = netdev_priv(netdev);
+	struct igc_hw *hw = &adapter->hw;
+	pci_ers_result_t result;
+
+	if (pci_enable_device_mem(pdev)) {
+		dev_err(&pdev->dev,
+			"Cannot re-enable PCI device after reset.\n");
+		result = PCI_ERS_RESULT_DISCONNECT;
+	} else {
+		pci_set_master(pdev);
+		pci_restore_state(pdev);
+		pci_save_state(pdev);
+
+		pci_enable_wake(pdev, PCI_D3hot, 0);
+		pci_enable_wake(pdev, PCI_D3cold, 0);
+
+		/* In case of PCI error, adapter lose its HW address
+		 * so we should re-assign it here.
+		 */
+		hw->hw_addr = adapter->io_addr;
+
+		igc_reset(adapter);
+		wr32(IGC_WUS, ~0);
+		result = PCI_ERS_RESULT_RECOVERED;
+	}
+
+	return result;
+}
+
+/**
+ *  igc_io_resume - called when traffic can start flowing again.
+ *  @pdev: Pointer to PCI device
+ *
+ *  This callback is called when the error recovery driver tells us that
+ *  its OK to resume normal operation. Implementation resembles the
+ *  second-half of the igc_resume routine.
+ */
+static void igc_io_resume(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct igc_adapter *adapter = netdev_priv(netdev);
+	u32 err = 0;
+
+	rtnl_lock();
+	if (netif_running(netdev)) {
+		err = igc_open(netdev);
+		if (err) {
+			dev_err(&pdev->dev, "igic_open failed after reset\n");
+			return;
+		}
+	}
+
+	netif_device_attach(netdev);
+
+	/* let the f/w know that the h/w is now under the control of the
+	 * driver.
+	 */
+	igc_get_hw_control(adapter);
+	rtnl_unlock();
+}
+
+static const struct pci_error_handlers igc_err_handler = {
+	.error_detected = igc_io_error_detected,
+	.slot_reset = igc_io_slot_reset,
+	.resume = igc_io_resume,
+};
+
 #ifdef CONFIG_PM
 static const struct dev_pm_ops igc_pm_ops = {
 	SET_SYSTEM_SLEEP_PM_OPS(igc_suspend, igc_resume)
@@ -5093,6 +5197,7 @@  static struct pci_driver igc_driver = {
 	.driver.pm = &igc_pm_ops,
 #endif
 	.shutdown = igc_shutdown,
+	.err_handler = &igc_err_handler,
 };
 
 /**