Message ID | 20240503152509.372-1-thinhtr@linux.ibm.com |
---|---|
State | Changes Requested |
Headers | show |
Series | [iwl-net] i40e: Fix repeated EEH reports in MSI domain | expand |
On 5/3/24 17:25, Thinh Tran wrote: > The patch fixes an issue when repeated EEH reports with a single error > on the bus of Intel X710 4-port 10G Base-T adapter, in the MSI domain > causing the devices to be permanently disabled. It fully resets and > restart the devices when handling the PCI EEH error. > > Two new functions, i40e_io_suspend() and i40e_io_resume(), have been > introduced. These functions were refactor from the existing > i40e_suspend() and i40e_resume() respectively. This refactoring was > done due to concerns about the logic of the I40E_SUSPENSED state, which > caused the device not able to recover. The functios are now used in the s/functios/functions/ > EEH handling for device suspend/resume callbacks. > > - In the PCI error detected callback, replaced i40e_prep_for_reset() > with i40e_io_suspend(). The chance is to fully suspend all I/O > operations it's only a chance? (perhaps a typo) > - In the PCI error slot reset callback, replaced pci_enable_device_mem() > with pci_enable_device(). This change enables both I/O and memory of > the device. > > - In the PCI error resume callback, replace i40e_handle_reset_warning() > with i40e_io_resume(). This change allows the system to resume I/O > operations > > Fixes: a5f3d2c17b07 ("powerpc/pseries/pci: Add MSI domains") > > Signed-off-by: Thinh Tran <thinhtr@linux.ibm.com> > Tested-by: Robert Thomas <rob.thomas@ibm.com> In general do not add a blank line after Fixes tag. Someone could complain that i40e driver had a bug prior to the cited core commit, but for more practical purposes I find it good as is. When you are a sender of the patch, it's good to place your Signed-off as a last tag. > > --- > drivers/net/ethernet/intel/i40e/i40e_main.c | 29 ++++++++++++++++----- > 1 file changed, 22 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c > index 48b9ddb2b1b3..58418aa9231e 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c > @@ -54,6 +54,9 @@ static int i40e_get_capabilities(struct i40e_pf *pf, > enum i40e_admin_queue_opc list_type); > static bool i40e_is_total_port_shutdown_enabled(struct i40e_pf *pf); > > +static int i40e_io_suspend(struct i40e_pf *pf); > +static int i40e_io_resume(struct i40e_pf *pf); I appreciate your effort to minimize changed lines to have a fix better visible, however we avoid forward declarations in .c files. I would split this into two commits - the first to just factor out/move functions w/o functional changes, then second one with a logic fix. > + > /* i40e_pci_tbl - PCI Device ID Table > * > * Last entry must be all 0s > @@ -11138,6 +11141,8 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit, > ret = i40e_reset(pf); > if (!ret) > i40e_rebuild(pf, reinit, lock_acquired); > + else > + dev_err(&pf->pdev->dev, "%s: i40e_reset() FAILED", __func__); > } > > /** > @@ -16327,7 +16332,7 @@ static pci_ers_result_t i40e_pci_error_detected(struct pci_dev *pdev, > > /* shutdown all operations */ > if (!test_bit(__I40E_SUSPENDED, pf->state)) > - i40e_prep_for_reset(pf); > + i40e_io_suspend(pf); > > /* Request a slot reset */ > return PCI_ERS_RESULT_NEED_RESET; > @@ -16349,7 +16354,8 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) > u32 reg; > > dev_dbg(&pdev->dev, "%s\n", __func__); > - if (pci_enable_device_mem(pdev)) { > + /* enable I/O and memory of the device */ > + if (pci_enable_device(pdev)) { > dev_info(&pdev->dev, > "Cannot re-enable PCI device after reset.\n"); > result = PCI_ERS_RESULT_DISCONNECT; > @@ -16411,8 +16417,7 @@ static void i40e_pci_error_resume(struct pci_dev *pdev) > dev_dbg(&pdev->dev, "%s\n", __func__); > if (test_bit(__I40E_SUSPENDED, pf->state)) > return; > - > - i40e_handle_reset_warning(pf, false); > + i40e_io_resume(pf); > } > > /** > @@ -16521,11 +16526,16 @@ static void i40e_shutdown(struct pci_dev *pdev) > static int __maybe_unused i40e_suspend(struct device *dev) > { > struct i40e_pf *pf = dev_get_drvdata(dev); > - struct i40e_hw *hw = &pf->hw; > > /* If we're already suspended, then there is nothing to do */ > if (test_and_set_bit(__I40E_SUSPENDED, pf->state)) > return 0; > + return i40e_io_suspend(pf); > +} > + > +static int i40e_io_suspend(struct i40e_pf *pf) > +{ > + struct i40e_hw *hw = &pf->hw; > > set_bit(__I40E_DOWN, pf->state); > > @@ -16572,11 +16582,16 @@ static int __maybe_unused i40e_suspend(struct device *dev) > static int __maybe_unused i40e_resume(struct device *dev) > { > struct i40e_pf *pf = dev_get_drvdata(dev); > - int err; > > /* If we're not suspended, then there is nothing to do */ > if (!test_bit(__I40E_SUSPENDED, pf->state)) > return 0; > + return i40e_io_resume(pf); > +} > + > +static int i40e_io_resume(struct i40e_pf *pf) > +{ > + int err; > > /* We need to hold the RTNL lock prior to restoring interrupt schemes, > * since we're going to be restoring queues > @@ -16588,7 +16603,7 @@ static int __maybe_unused i40e_resume(struct device *dev) > */ > err = i40e_restore_interrupt_scheme(pf); > if (err) { > - dev_err(dev, "Cannot restore interrupt scheme: %d\n", > + dev_err(&pf->pdev->dev, "Cannot restore interrupt scheme: %d\n", > err); > } >
On 5/6/2024 4:35 AM, Przemek Kitszel wrote: > In general do not add a blank line after Fixes tag. > Someone could complain that i40e driver had a bug prior to the cited > core commit, but for more practical purposes I find it good as is. > When you are a sender of the patch, it's good to place your Signed-off > as a last tag. > > I appreciate your effort to minimize changed lines to have a fix better > visible, however we avoid forward declarations in .c files. > I would split this into two commits - the first to just factor out/move > functions w/o functional changes, then second one with a logic fix. > Thanks for the review. I'll correct the typos and resubmit with two separate commits . Thinh Tran
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 48b9ddb2b1b3..58418aa9231e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -54,6 +54,9 @@ static int i40e_get_capabilities(struct i40e_pf *pf, enum i40e_admin_queue_opc list_type); static bool i40e_is_total_port_shutdown_enabled(struct i40e_pf *pf); +static int i40e_io_suspend(struct i40e_pf *pf); +static int i40e_io_resume(struct i40e_pf *pf); + /* i40e_pci_tbl - PCI Device ID Table * * Last entry must be all 0s @@ -11138,6 +11141,8 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit, ret = i40e_reset(pf); if (!ret) i40e_rebuild(pf, reinit, lock_acquired); + else + dev_err(&pf->pdev->dev, "%s: i40e_reset() FAILED", __func__); } /** @@ -16327,7 +16332,7 @@ static pci_ers_result_t i40e_pci_error_detected(struct pci_dev *pdev, /* shutdown all operations */ if (!test_bit(__I40E_SUSPENDED, pf->state)) - i40e_prep_for_reset(pf); + i40e_io_suspend(pf); /* Request a slot reset */ return PCI_ERS_RESULT_NEED_RESET; @@ -16349,7 +16354,8 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) u32 reg; dev_dbg(&pdev->dev, "%s\n", __func__); - if (pci_enable_device_mem(pdev)) { + /* enable I/O and memory of the device */ + if (pci_enable_device(pdev)) { dev_info(&pdev->dev, "Cannot re-enable PCI device after reset.\n"); result = PCI_ERS_RESULT_DISCONNECT; @@ -16411,8 +16417,7 @@ static void i40e_pci_error_resume(struct pci_dev *pdev) dev_dbg(&pdev->dev, "%s\n", __func__); if (test_bit(__I40E_SUSPENDED, pf->state)) return; - - i40e_handle_reset_warning(pf, false); + i40e_io_resume(pf); } /** @@ -16521,11 +16526,16 @@ static void i40e_shutdown(struct pci_dev *pdev) static int __maybe_unused i40e_suspend(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); - struct i40e_hw *hw = &pf->hw; /* If we're already suspended, then there is nothing to do */ if (test_and_set_bit(__I40E_SUSPENDED, pf->state)) return 0; + return i40e_io_suspend(pf); +} + +static int i40e_io_suspend(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; set_bit(__I40E_DOWN, pf->state); @@ -16572,11 +16582,16 @@ static int __maybe_unused i40e_suspend(struct device *dev) static int __maybe_unused i40e_resume(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); - int err; /* If we're not suspended, then there is nothing to do */ if (!test_bit(__I40E_SUSPENDED, pf->state)) return 0; + return i40e_io_resume(pf); +} + +static int i40e_io_resume(struct i40e_pf *pf) +{ + int err; /* We need to hold the RTNL lock prior to restoring interrupt schemes, * since we're going to be restoring queues @@ -16588,7 +16603,7 @@ static int __maybe_unused i40e_resume(struct device *dev) */ err = i40e_restore_interrupt_scheme(pf); if (err) { - dev_err(dev, "Cannot restore interrupt scheme: %d\n", + dev_err(&pf->pdev->dev, "Cannot restore interrupt scheme: %d\n", err); }