diff mbox series

[2/4] powerpc/eeh: Release EEH device state synchronously

Message ID 6b3ce475194cd3c1aefd876e311b5a218c3a627a.1585544197.git.sbobroff@linux.ibm.com (mailing list archive)
State Changes Requested
Headers show
Series powerpc/eeh: Release EEH device state synchronously | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (c6624071c338732402e8c726df6a4074473eaa0e)
snowpatch_ozlabs/checkpatch success total: 0 errors, 0 warnings, 0 checks, 40 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Sam Bobroff March 30, 2020, 4:56 a.m. UTC
EEH device state is currently removed (by eeh_remove_device()) during
the device release handler, which is invoked as the device's reference
count drops to zero. This may take some time, or forever, as other
threads may hold references.

However, the PCI device state is released synchronously by
pci_stop_and_remove_bus_device(). This mismatch causes problems, for
example the device may be re-discovered as a new device before the
release handler has been called, leaving the PCI and EEH state
mismatched.

So instead, call eeh_remove_device() from the bus device removal
handlers, which are called synchronously in the removal path.

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
---
 arch/powerpc/kernel/eeh.c         | 26 ++++++++++++++++++++++++++
 arch/powerpc/kernel/pci-hotplug.c |  2 --
 2 files changed, 26 insertions(+), 2 deletions(-)

Comments

Oliver O'Halloran April 3, 2020, 4:51 a.m. UTC | #1
On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> EEH device state is currently removed (by eeh_remove_device()) during
> the device release handler, which is invoked as the device's reference
> count drops to zero. This may take some time, or forever, as other
> threads may hold references.
> 
> However, the PCI device state is released synchronously by
> pci_stop_and_remove_bus_device(). This mismatch causes problems, for
> example the device may be re-discovered as a new device before the
> release handler has been called, leaving the PCI and EEH state
> mismatched.
> 
> So instead, call eeh_remove_device() from the bus device removal
> handlers, which are called synchronously in the removal path.
> 
> Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
> ---
>  arch/powerpc/kernel/eeh.c         | 26 ++++++++++++++++++++++++++
>  arch/powerpc/kernel/pci-hotplug.c |  2 --
>  2 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 17cb3e9b5697..c36c5a7db5ca 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -1106,6 +1106,32 @@ static int eeh_init(void)
>  
>  core_initcall_sync(eeh_init);
>  
> +static int eeh_device_notifier(struct notifier_block *nb,
> +			       unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		eeh_remove_device(to_pci_dev(dev));
> +		break;
> +	default:
> +		break;
> +	}

A comment briefly explaining why we're not doing anything in the add
case might be nice.

Reviewed-by: Oliver O'Halloran <oohall@gmail.com>

> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block eeh_device_nb = {
> +	.notifier_call = eeh_device_notifier,
> +};
> +
> +static __init int eeh_set_bus_notifier(void)
> +{
> +	bus_register_notifier(&pci_bus_type, &eeh_device_nb);
> +	return 0;
> +}
> +arch_initcall(eeh_set_bus_notifier);
> +
>  /**
>   * eeh_add_device_early - Enable EEH for the indicated device node
>   * @pdn: PCI device node for which to set up EEH
> diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
> index d6a67f814983..28e9aa274f64 100644
> --- a/arch/powerpc/kernel/pci-hotplug.c
> +++ b/arch/powerpc/kernel/pci-hotplug.c
> @@ -57,8 +57,6 @@ void pcibios_release_device(struct pci_dev *dev)
>  	struct pci_controller *phb = pci_bus_to_host(dev->bus);
>  	struct pci_dn *pdn = pci_get_pdn(dev);
>  
> -	eeh_remove_device(dev);
> -
>  	if (phb->controller_ops.release_device)
>  		phb->controller_ops.release_device(dev);
>
Sam Bobroff April 8, 2020, 6:15 a.m. UTC | #2
On Fri, Apr 03, 2020 at 03:51:18PM +1100, Oliver O'Halloran wrote:
> On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> > EEH device state is currently removed (by eeh_remove_device()) during
> > the device release handler, which is invoked as the device's reference
> > count drops to zero. This may take some time, or forever, as other
> > threads may hold references.
> > 
> > However, the PCI device state is released synchronously by
> > pci_stop_and_remove_bus_device(). This mismatch causes problems, for
> > example the device may be re-discovered as a new device before the
> > release handler has been called, leaving the PCI and EEH state
> > mismatched.
> > 
> > So instead, call eeh_remove_device() from the bus device removal
> > handlers, which are called synchronously in the removal path.
> > 
> > Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
> > ---
> >  arch/powerpc/kernel/eeh.c         | 26 ++++++++++++++++++++++++++
> >  arch/powerpc/kernel/pci-hotplug.c |  2 --
> >  2 files changed, 26 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> > index 17cb3e9b5697..c36c5a7db5ca 100644
> > --- a/arch/powerpc/kernel/eeh.c
> > +++ b/arch/powerpc/kernel/eeh.c
> > @@ -1106,6 +1106,32 @@ static int eeh_init(void)
> >  
> >  core_initcall_sync(eeh_init);
> >  
> > +static int eeh_device_notifier(struct notifier_block *nb,
> > +			       unsigned long action, void *data)
> > +{
> > +	struct device *dev = data;
> > +
> > +	switch (action) {
> > +	case BUS_NOTIFY_DEL_DEVICE:
> > +		eeh_remove_device(to_pci_dev(dev));
> > +		break;
> > +	default:
> > +		break;
> > +	}
> 
> A comment briefly explaining why we're not doing anything in the add
> case might be nice.

Good point, I'll add something for v2.
> 
> Reviewed-by: Oliver O'Halloran <oohall@gmail.com>
> 
> > +	return NOTIFY_DONE;
> > +}
> > +
> > +static struct notifier_block eeh_device_nb = {
> > +	.notifier_call = eeh_device_notifier,
> > +};
> > +
> > +static __init int eeh_set_bus_notifier(void)
> > +{
> > +	bus_register_notifier(&pci_bus_type, &eeh_device_nb);
> > +	return 0;
> > +}
> > +arch_initcall(eeh_set_bus_notifier);
> > +
> >  /**
> >   * eeh_add_device_early - Enable EEH for the indicated device node
> >   * @pdn: PCI device node for which to set up EEH
> > diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
> > index d6a67f814983..28e9aa274f64 100644
> > --- a/arch/powerpc/kernel/pci-hotplug.c
> > +++ b/arch/powerpc/kernel/pci-hotplug.c
> > @@ -57,8 +57,6 @@ void pcibios_release_device(struct pci_dev *dev)
> >  	struct pci_controller *phb = pci_bus_to_host(dev->bus);
> >  	struct pci_dn *pdn = pci_get_pdn(dev);
> >  
> > -	eeh_remove_device(dev);
> > -
> >  	if (phb->controller_ops.release_device)
> >  		phb->controller_ops.release_device(dev);
> >  
>
diff mbox series

Patch

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 17cb3e9b5697..c36c5a7db5ca 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1106,6 +1106,32 @@  static int eeh_init(void)
 
 core_initcall_sync(eeh_init);
 
+static int eeh_device_notifier(struct notifier_block *nb,
+			       unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_DEL_DEVICE:
+		eeh_remove_device(to_pci_dev(dev));
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block eeh_device_nb = {
+	.notifier_call = eeh_device_notifier,
+};
+
+static __init int eeh_set_bus_notifier(void)
+{
+	bus_register_notifier(&pci_bus_type, &eeh_device_nb);
+	return 0;
+}
+arch_initcall(eeh_set_bus_notifier);
+
 /**
  * eeh_add_device_early - Enable EEH for the indicated device node
  * @pdn: PCI device node for which to set up EEH
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index d6a67f814983..28e9aa274f64 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -57,8 +57,6 @@  void pcibios_release_device(struct pci_dev *dev)
 	struct pci_controller *phb = pci_bus_to_host(dev->bus);
 	struct pci_dn *pdn = pci_get_pdn(dev);
 
-	eeh_remove_device(dev);
-
 	if (phb->controller_ops.release_device)
 		phb->controller_ops.release_device(dev);