Patchwork [25/30] ACPI / hotplug / PCI: Check for new devices on enabled slots

login
register
mail settings
Submitter Rafael J. Wysocki
Date July 17, 2013, 11:32 p.m.
Message ID <1818424.8fNkf5pBy3@vostro.rjw.lan>
Download mbox | patch
Permalink /patch/259965/
State Not Applicable
Headers show

Comments

Rafael J. Wysocki - July 17, 2013, 11:32 p.m.
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

The current implementation of acpiphp_check_bridge() is pretty dumb:
 - It enables a slot if it's not enabled and the slot status is
   ACPI_STA_ALL.
 - It disables a slot if it's enabled and the slot status is not
   ACPI_STA_ALL.

This behavior is not sufficient to handle the Thunderbolt daisy
chaining case properly, however, because in that case the bus
behind the already enabled slot needs to be rescanned for new
devices.

For this reason, modify acpiphp_check_bridge() so that slots are
disabled and stopped if they are not in the ACPI_STA_ALL state.

For slots in the ACPI_STA_ALL state, devices behind them that don't
respond are trimmed using a new function, trim_stale_devices(),
introduced specifically for this purpose.  That function walks
the given bus and checks each device on it.  If the device doesn't
respond, it is assumed to be gone and is removed.

Once all of the stale devices directy behind the slot have been
removed, acpiphp_check_bridge() will start looking for new devices
that might have appeared on the given bus.  It will do that even if
the slot is already enabled (SLOT_ENABLED is set for it).

In addition to that, make the bus check notification ignore
SLOT_ENABLED and go for enable_device() directly if bridge is NULL,
so that devices behind the slot are re-enumerated in that case too.

This change is based on earlier patches from Kirill A Shutemov
and Mika Westerberg.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/pci/hotplug/acpiphp_glue.c |   87 +++++++++++++++++++++++++------------
 1 file changed, 60 insertions(+), 27 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson - Sept. 4, 2013, 8:36 p.m.
On Thu, 2013-07-18 at 01:32 +0200, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> The current implementation of acpiphp_check_bridge() is pretty dumb:
>  - It enables a slot if it's not enabled and the slot status is
>    ACPI_STA_ALL.
>  - It disables a slot if it's enabled and the slot status is not
>    ACPI_STA_ALL.
> 
> This behavior is not sufficient to handle the Thunderbolt daisy
> chaining case properly, however, because in that case the bus
> behind the already enabled slot needs to be rescanned for new
> devices.
> 
> For this reason, modify acpiphp_check_bridge() so that slots are
> disabled and stopped if they are not in the ACPI_STA_ALL state.
> 
> For slots in the ACPI_STA_ALL state, devices behind them that don't
> respond are trimmed using a new function, trim_stale_devices(),
> introduced specifically for this purpose.  That function walks
> the given bus and checks each device on it.  If the device doesn't
> respond, it is assumed to be gone and is removed.
> 
> Once all of the stale devices directy behind the slot have been
> removed, acpiphp_check_bridge() will start looking for new devices
> that might have appeared on the given bus.  It will do that even if
> the slot is already enabled (SLOT_ENABLED is set for it).
> 
> In addition to that, make the bus check notification ignore
> SLOT_ENABLED and go for enable_device() directly if bridge is NULL,
> so that devices behind the slot are re-enumerated in that case too.
> 
> This change is based on earlier patches from Kirill A Shutemov
> and Mika Westerberg.
> 
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> ---

FYI, git bisect landed on this patch as the cause of my serial console
dying on current upstream.  Further debugging to come...  Thanks,

Alex


>  drivers/pci/hotplug/acpiphp_glue.c |   87 +++++++++++++++++++++++++------------
>  1 file changed, 60 insertions(+), 27 deletions(-)
> 
> Index: linux-pm/drivers/pci/hotplug/acpiphp_glue.c
> ===================================================================
> --- linux-pm.orig/drivers/pci/hotplug/acpiphp_glue.c
> +++ linux-pm/drivers/pci/hotplug/acpiphp_glue.c
> @@ -46,6 +46,7 @@
>  #include <linux/pci.h>
>  #include <linux/pci_hotplug.h>
>  #include <linux/pci-acpi.h>
> +#include <linux/pm_runtime.h>
>  #include <linux/mutex.h>
>  #include <linux/slab.h>
>  #include <linux/acpi.h>
> @@ -687,47 +688,75 @@ static unsigned int get_slot_status(stru
>  }
>  
>  /**
> + * trim_stale_devices - remove PCI devices that are not responding.
> + * @dev: PCI device to start walking the hierarchy from.
> + */
> +static void trim_stale_devices(struct pci_dev *dev)
> +{
> +	acpi_handle handle = ACPI_HANDLE(&dev->dev);
> +	struct pci_bus *bus = dev->subordinate;
> +	bool alive = false;
> +
> +	if (handle) {
> +		acpi_status status;
> +		unsigned long long sta;
> +
> +		status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
> +		alive = ACPI_SUCCESS(status) && sta == ACPI_STA_ALL;
> +	}
> +	if (!alive) {
> +		u32 v;
> +
> +		/* Check if the device responds. */
> +		alive = pci_bus_read_dev_vendor_id(dev->bus, dev->devfn, &v, 0);
> +	}
> +	if (!alive) {
> +		pci_stop_and_remove_bus_device(dev);
> +		if (handle)
> +			acpiphp_bus_trim(handle);
> +	} else if (bus) {
> +		struct pci_dev *child, *tmp;
> +
> +		/* The device is a bridge. so check the bus below it. */
> +		pm_runtime_get_sync(&dev->dev);
> +		list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
> +			trim_stale_devices(child);
> +
> +		pm_runtime_put(&dev->dev);
> +	}
> +}
> +
> +/**
>   * acpiphp_check_bridge - re-enumerate devices
>   * @bridge: where to begin re-enumeration
>   *
>   * Iterate over all slots under this bridge and make sure that if a
>   * card is present they are enabled, and if not they are disabled.
>   */
> -static int acpiphp_check_bridge(struct acpiphp_bridge *bridge)
> +static void acpiphp_check_bridge(struct acpiphp_bridge *bridge)
>  {
>  	struct acpiphp_slot *slot;
> -	int retval = 0;
> -	int enabled, disabled;
> -
> -	enabled = disabled = 0;
>  
>  	list_for_each_entry(slot, &bridge->slots, node) {
> -		unsigned int status = get_slot_status(slot);
> -		if (slot->flags & SLOT_ENABLED) {
> -			if (status == ACPI_STA_ALL)
> -				continue;
> +		struct pci_bus *bus = slot->bus;
> +		struct pci_dev *dev, *tmp;
>  
> -			retval = acpiphp_disable_and_eject_slot(slot);
> -			if (retval)
> -				goto err_exit;
> +		mutex_lock(&slot->crit_sect);
> +		/* wake up all functions */
> +		if (get_slot_status(slot) == ACPI_STA_ALL) {
> +			/* remove stale devices if any */
> +			list_for_each_entry_safe(dev, tmp, &bus->devices,
> +						 bus_list)
> +				if (PCI_SLOT(dev->devfn) == slot->device)
> +					trim_stale_devices(dev);
>  
> -			disabled++;
> +			/* configure all functions */
> +			enable_device(slot);
>  		} else {
> -			if (status != ACPI_STA_ALL)
> -				continue;
> -			retval = acpiphp_enable_slot(slot);
> -			if (retval) {
> -				err("Error occurred in enabling\n");
> -				goto err_exit;
> -			}
> -			enabled++;
> +			disable_device(slot);
>  		}
> +		mutex_unlock(&slot->crit_sect);
>  	}
> -
> -	dbg("%s: %d enabled, %d disabled\n", __func__, enabled, disabled);
> -
> - err_exit:
> -	return retval;
>  }
>  
>  static void acpiphp_set_hpp_values(struct pci_bus *bus)
> @@ -828,7 +857,11 @@ static void hotplug_event(acpi_handle ha
>  					    ACPI_UINT32_MAX, check_sub_bridges,
>  					    NULL, NULL, NULL);
>  		} else {
> -			acpiphp_enable_slot(func->slot);
> +			struct acpiphp_slot *slot = func->slot;
> +
> +			mutex_lock(&slot->crit_sect);
> +			enable_device(slot);
> +			mutex_unlock(&slot->crit_sect);
>  		}
>  		break;
>  
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki - Sept. 4, 2013, 10:54 p.m.
On Wednesday, September 04, 2013 02:36:34 PM Alex Williamson wrote:
> On Thu, 2013-07-18 at 01:32 +0200, Rafael J. Wysocki wrote:
> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > 
> > The current implementation of acpiphp_check_bridge() is pretty dumb:
> >  - It enables a slot if it's not enabled and the slot status is
> >    ACPI_STA_ALL.
> >  - It disables a slot if it's enabled and the slot status is not
> >    ACPI_STA_ALL.
> > 
> > This behavior is not sufficient to handle the Thunderbolt daisy
> > chaining case properly, however, because in that case the bus
> > behind the already enabled slot needs to be rescanned for new
> > devices.
> > 
> > For this reason, modify acpiphp_check_bridge() so that slots are
> > disabled and stopped if they are not in the ACPI_STA_ALL state.
> > 
> > For slots in the ACPI_STA_ALL state, devices behind them that don't
> > respond are trimmed using a new function, trim_stale_devices(),
> > introduced specifically for this purpose.  That function walks
> > the given bus and checks each device on it.  If the device doesn't
> > respond, it is assumed to be gone and is removed.
> > 
> > Once all of the stale devices directy behind the slot have been
> > removed, acpiphp_check_bridge() will start looking for new devices
> > that might have appeared on the given bus.  It will do that even if
> > the slot is already enabled (SLOT_ENABLED is set for it).
> > 
> > In addition to that, make the bus check notification ignore
> > SLOT_ENABLED and go for enable_device() directly if bridge is NULL,
> > so that devices behind the slot are re-enumerated in that case too.
> > 
> > This change is based on earlier patches from Kirill A Shutemov
> > and Mika Westerberg.
> > 
> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > ---
> 
> FYI, git bisect landed on this patch as the cause of my serial console
> dying on current upstream.  Further debugging to come...  Thanks,

Well, sorry about that.

What exactly do you mean by "dying"?

Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson - Sept. 4, 2013, 11:12 p.m.
On Thu, 2013-09-05 at 00:54 +0200, Rafael J. Wysocki wrote:
> On Wednesday, September 04, 2013 02:36:34 PM Alex Williamson wrote:
> > On Thu, 2013-07-18 at 01:32 +0200, Rafael J. Wysocki wrote:
> > > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > 
> > > The current implementation of acpiphp_check_bridge() is pretty dumb:
> > >  - It enables a slot if it's not enabled and the slot status is
> > >    ACPI_STA_ALL.
> > >  - It disables a slot if it's enabled and the slot status is not
> > >    ACPI_STA_ALL.
> > > 
> > > This behavior is not sufficient to handle the Thunderbolt daisy
> > > chaining case properly, however, because in that case the bus
> > > behind the already enabled slot needs to be rescanned for new
> > > devices.
> > > 
> > > For this reason, modify acpiphp_check_bridge() so that slots are
> > > disabled and stopped if they are not in the ACPI_STA_ALL state.
> > > 
> > > For slots in the ACPI_STA_ALL state, devices behind them that don't
> > > respond are trimmed using a new function, trim_stale_devices(),
> > > introduced specifically for this purpose.  That function walks
> > > the given bus and checks each device on it.  If the device doesn't
> > > respond, it is assumed to be gone and is removed.
> > > 
> > > Once all of the stale devices directy behind the slot have been
> > > removed, acpiphp_check_bridge() will start looking for new devices
> > > that might have appeared on the given bus.  It will do that even if
> > > the slot is already enabled (SLOT_ENABLED is set for it).
> > > 
> > > In addition to that, make the bus check notification ignore
> > > SLOT_ENABLED and go for enable_device() directly if bridge is NULL,
> > > so that devices behind the slot are re-enumerated in that case too.
> > > 
> > > This change is based on earlier patches from Kirill A Shutemov
> > > and Mika Westerberg.
> > > 
> > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > ---
> > 
> > FYI, git bisect landed on this patch as the cause of my serial console
> > dying on current upstream.  Further debugging to come...  Thanks,
> 
> Well, sorry about that.
> 
> What exactly do you mean by "dying"?

Sorry, I was hoping to have more details quickly, but it's been a pain
to debug.  By dying I mean serial console output suddenly stops during
kernel boot and nothing more comes out of it until after the system is
rebooted.  The problem happens when acpiphp_check_bridge() calls
enable_slot().  The serial console dies somewhere down in
acpiphp_bus_trim().  I think this is happening on the 00:1f ISA bridge,
so there's a good chance the serial ports are described as somewhere
under there.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki - Sept. 4, 2013, 11:35 p.m.
On Wednesday, September 04, 2013 05:12:14 PM Alex Williamson wrote:
> On Thu, 2013-09-05 at 00:54 +0200, Rafael J. Wysocki wrote:
> > On Wednesday, September 04, 2013 02:36:34 PM Alex Williamson wrote:
> > > On Thu, 2013-07-18 at 01:32 +0200, Rafael J. Wysocki wrote:
> > > > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > > 
> > > > The current implementation of acpiphp_check_bridge() is pretty dumb:
> > > >  - It enables a slot if it's not enabled and the slot status is
> > > >    ACPI_STA_ALL.
> > > >  - It disables a slot if it's enabled and the slot status is not
> > > >    ACPI_STA_ALL.
> > > > 
> > > > This behavior is not sufficient to handle the Thunderbolt daisy
> > > > chaining case properly, however, because in that case the bus
> > > > behind the already enabled slot needs to be rescanned for new
> > > > devices.
> > > > 
> > > > For this reason, modify acpiphp_check_bridge() so that slots are
> > > > disabled and stopped if they are not in the ACPI_STA_ALL state.
> > > > 
> > > > For slots in the ACPI_STA_ALL state, devices behind them that don't
> > > > respond are trimmed using a new function, trim_stale_devices(),
> > > > introduced specifically for this purpose.  That function walks
> > > > the given bus and checks each device on it.  If the device doesn't
> > > > respond, it is assumed to be gone and is removed.
> > > > 
> > > > Once all of the stale devices directy behind the slot have been
> > > > removed, acpiphp_check_bridge() will start looking for new devices
> > > > that might have appeared on the given bus.  It will do that even if
> > > > the slot is already enabled (SLOT_ENABLED is set for it).
> > > > 
> > > > In addition to that, make the bus check notification ignore
> > > > SLOT_ENABLED and go for enable_device() directly if bridge is NULL,
> > > > so that devices behind the slot are re-enumerated in that case too.
> > > > 
> > > > This change is based on earlier patches from Kirill A Shutemov
> > > > and Mika Westerberg.
> > > > 
> > > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > > Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > ---
> > > 
> > > FYI, git bisect landed on this patch as the cause of my serial console
> > > dying on current upstream.  Further debugging to come...  Thanks,
> > 
> > Well, sorry about that.
> > 
> > What exactly do you mean by "dying"?
> 
> Sorry, I was hoping to have more details quickly, but it's been a pain
> to debug.  By dying I mean serial console output suddenly stops during
> kernel boot and nothing more comes out of it until after the system is
> rebooted.  The problem happens when acpiphp_check_bridge() calls
> enable_slot().  The serial console dies somewhere down in
> acpiphp_bus_trim().  I think this is happening on the 00:1f ISA bridge,
> so there's a good chance the serial ports are described as somewhere
> under there.

Can you please check if that is the acpiphp_bus_trim() called by
acpiphp_bus_add() or the other one called from trim_stale_devices()?

Just add a dump_stack() or WARN_ON(1) to trim_stale_devices() next to
the acpiphp_bus_trim() call and see if that triggers.  I *think* it's the one
in acpiphp_bus_add(), but it won't hurt to verify that.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson - Sept. 5, 2013, 3:37 a.m.
On Thu, 2013-09-05 at 01:35 +0200, Rafael J. Wysocki wrote:
> On Wednesday, September 04, 2013 05:12:14 PM Alex Williamson wrote:
> > On Thu, 2013-09-05 at 00:54 +0200, Rafael J. Wysocki wrote:
> > > On Wednesday, September 04, 2013 02:36:34 PM Alex Williamson wrote:
> > > > On Thu, 2013-07-18 at 01:32 +0200, Rafael J. Wysocki wrote:
> > > > > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > > > 
> > > > > The current implementation of acpiphp_check_bridge() is pretty dumb:
> > > > >  - It enables a slot if it's not enabled and the slot status is
> > > > >    ACPI_STA_ALL.
> > > > >  - It disables a slot if it's enabled and the slot status is not
> > > > >    ACPI_STA_ALL.
> > > > > 
> > > > > This behavior is not sufficient to handle the Thunderbolt daisy
> > > > > chaining case properly, however, because in that case the bus
> > > > > behind the already enabled slot needs to be rescanned for new
> > > > > devices.
> > > > > 
> > > > > For this reason, modify acpiphp_check_bridge() so that slots are
> > > > > disabled and stopped if they are not in the ACPI_STA_ALL state.
> > > > > 
> > > > > For slots in the ACPI_STA_ALL state, devices behind them that don't
> > > > > respond are trimmed using a new function, trim_stale_devices(),
> > > > > introduced specifically for this purpose.  That function walks
> > > > > the given bus and checks each device on it.  If the device doesn't
> > > > > respond, it is assumed to be gone and is removed.
> > > > > 
> > > > > Once all of the stale devices directy behind the slot have been
> > > > > removed, acpiphp_check_bridge() will start looking for new devices
> > > > > that might have appeared on the given bus.  It will do that even if
> > > > > the slot is already enabled (SLOT_ENABLED is set for it).
> > > > > 
> > > > > In addition to that, make the bus check notification ignore
> > > > > SLOT_ENABLED and go for enable_device() directly if bridge is NULL,
> > > > > so that devices behind the slot are re-enumerated in that case too.
> > > > > 
> > > > > This change is based on earlier patches from Kirill A Shutemov
> > > > > and Mika Westerberg.
> > > > > 
> > > > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > > > Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > > ---
> > > > 
> > > > FYI, git bisect landed on this patch as the cause of my serial console
> > > > dying on current upstream.  Further debugging to come...  Thanks,
> > > 
> > > Well, sorry about that.
> > > 
> > > What exactly do you mean by "dying"?
> > 
> > Sorry, I was hoping to have more details quickly, but it's been a pain
> > to debug.  By dying I mean serial console output suddenly stops during
> > kernel boot and nothing more comes out of it until after the system is
> > rebooted.  The problem happens when acpiphp_check_bridge() calls
> > enable_slot().  The serial console dies somewhere down in
> > acpiphp_bus_trim().  I think this is happening on the 00:1f ISA bridge,
> > so there's a good chance the serial ports are described as somewhere
> > under there.
> 
> Can you please check if that is the acpiphp_bus_trim() called by
> acpiphp_bus_add() or the other one called from trim_stale_devices()?
> 
> Just add a dump_stack() or WARN_ON(1) to trim_stale_devices() next to
> the acpiphp_bus_trim() call and see if that triggers.  I *think* it's the one
> in acpiphp_bus_add(), but it won't hurt to verify that.

Here's the call path:

[   16.120824]  [<ffffffff81627e6c>] dump_stack+0x55/0x76
[   16.125979]  [<ffffffff8162132e>] enable_slot+0x4ee/0x5e0
[   16.131396]  [<ffffffff813418fb>] ? trim_stale_devices+0x5b/0xf0
[   16.137420]  [<ffffffff81341b35>] acpiphp_check_bridge+0xd5/0x110
[   16.143531]  [<ffffffff81342acb>] hotplug_event+0x16b/0x260
[   16.149115]  [<ffffffff81072cd9>] ? process_one_work+0x189/0x540
[   16.155136]  [<ffffffff81342bf0>] hotplug_event_work+0x30/0x70
[   16.160978]  [<ffffffff81072d3b>] process_one_work+0x1eb/0x540
[   16.166819]  [<ffffffff81072cd9>] ? process_one_work+0x189/0x540
[   16.172836]  [<ffffffff8107353c>] worker_thread+0x11c/0x370
[   16.178426]  [<ffffffff81073420>] ? rescuer_thread+0x350/0x350
[   16.184276]  [<ffffffff8107b0ea>] kthread+0xea/0xf0
[   16.189165]  [<ffffffff8107b000>] ? kthread_create_on_node+0x160/0x160
[   16.195700]  [<ffffffff816395dc>] ret_from_fork+0x7c/0xb0
[   16.201109]  [<ffffffff8107b000>] ? kthread_create_on_node+0x160/0x160

The actual death of the serial console occurs in acpi_device_set_power()
called from:

enable_slot()
 acpiphp_bus_add()
  acpiphp_bus_trim()
   acpi_bus_trim()
    acpi_walk_namespace()
     acpi_bus_remove()
      acpi_device_unregister()
       acpi_device_set_power()

I can't seem to get a path from the acpi devices in question there, so I
have no idea what's getting trimmed here.  It worries me quite a bit by
introducing this trimming that apparently wasn't happening before
though.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson - Sept. 5, 2013, 4:06 a.m.
On Wed, 2013-09-04 at 21:37 -0600, Alex Williamson wrote:
> On Thu, 2013-09-05 at 01:35 +0200, Rafael J. Wysocki wrote:
> > On Wednesday, September 04, 2013 05:12:14 PM Alex Williamson wrote:
> > > On Thu, 2013-09-05 at 00:54 +0200, Rafael J. Wysocki wrote:
> > > > On Wednesday, September 04, 2013 02:36:34 PM Alex Williamson wrote:
> > > > > On Thu, 2013-07-18 at 01:32 +0200, Rafael J. Wysocki wrote:
> > > > > > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > > > > 
> > > > > > The current implementation of acpiphp_check_bridge() is pretty dumb:
> > > > > >  - It enables a slot if it's not enabled and the slot status is
> > > > > >    ACPI_STA_ALL.
> > > > > >  - It disables a slot if it's enabled and the slot status is not
> > > > > >    ACPI_STA_ALL.
> > > > > > 
> > > > > > This behavior is not sufficient to handle the Thunderbolt daisy
> > > > > > chaining case properly, however, because in that case the bus
> > > > > > behind the already enabled slot needs to be rescanned for new
> > > > > > devices.
> > > > > > 
> > > > > > For this reason, modify acpiphp_check_bridge() so that slots are
> > > > > > disabled and stopped if they are not in the ACPI_STA_ALL state.
> > > > > > 
> > > > > > For slots in the ACPI_STA_ALL state, devices behind them that don't
> > > > > > respond are trimmed using a new function, trim_stale_devices(),
> > > > > > introduced specifically for this purpose.  That function walks
> > > > > > the given bus and checks each device on it.  If the device doesn't
> > > > > > respond, it is assumed to be gone and is removed.
> > > > > > 
> > > > > > Once all of the stale devices directy behind the slot have been
> > > > > > removed, acpiphp_check_bridge() will start looking for new devices
> > > > > > that might have appeared on the given bus.  It will do that even if
> > > > > > the slot is already enabled (SLOT_ENABLED is set for it).
> > > > > > 
> > > > > > In addition to that, make the bus check notification ignore
> > > > > > SLOT_ENABLED and go for enable_device() directly if bridge is NULL,
> > > > > > so that devices behind the slot are re-enumerated in that case too.
> > > > > > 
> > > > > > This change is based on earlier patches from Kirill A Shutemov
> > > > > > and Mika Westerberg.
> > > > > > 
> > > > > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > > > > Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > > > ---
> > > > > 
> > > > > FYI, git bisect landed on this patch as the cause of my serial console
> > > > > dying on current upstream.  Further debugging to come...  Thanks,
> > > > 
> > > > Well, sorry about that.
> > > > 
> > > > What exactly do you mean by "dying"?
> > > 
> > > Sorry, I was hoping to have more details quickly, but it's been a pain
> > > to debug.  By dying I mean serial console output suddenly stops during
> > > kernel boot and nothing more comes out of it until after the system is
> > > rebooted.  The problem happens when acpiphp_check_bridge() calls
> > > enable_slot().  The serial console dies somewhere down in
> > > acpiphp_bus_trim().  I think this is happening on the 00:1f ISA bridge,
> > > so there's a good chance the serial ports are described as somewhere
> > > under there.
> > 
> > Can you please check if that is the acpiphp_bus_trim() called by
> > acpiphp_bus_add() or the other one called from trim_stale_devices()?
> > 
> > Just add a dump_stack() or WARN_ON(1) to trim_stale_devices() next to
> > the acpiphp_bus_trim() call and see if that triggers.  I *think* it's the one
> > in acpiphp_bus_add(), but it won't hurt to verify that.
> 
> Here's the call path:
> 
> [   16.120824]  [<ffffffff81627e6c>] dump_stack+0x55/0x76
> [   16.125979]  [<ffffffff8162132e>] enable_slot+0x4ee/0x5e0
> [   16.131396]  [<ffffffff813418fb>] ? trim_stale_devices+0x5b/0xf0
> [   16.137420]  [<ffffffff81341b35>] acpiphp_check_bridge+0xd5/0x110
> [   16.143531]  [<ffffffff81342acb>] hotplug_event+0x16b/0x260
> [   16.149115]  [<ffffffff81072cd9>] ? process_one_work+0x189/0x540
> [   16.155136]  [<ffffffff81342bf0>] hotplug_event_work+0x30/0x70
> [   16.160978]  [<ffffffff81072d3b>] process_one_work+0x1eb/0x540
> [   16.166819]  [<ffffffff81072cd9>] ? process_one_work+0x189/0x540
> [   16.172836]  [<ffffffff8107353c>] worker_thread+0x11c/0x370
> [   16.178426]  [<ffffffff81073420>] ? rescuer_thread+0x350/0x350
> [   16.184276]  [<ffffffff8107b0ea>] kthread+0xea/0xf0
> [   16.189165]  [<ffffffff8107b000>] ? kthread_create_on_node+0x160/0x160
> [   16.195700]  [<ffffffff816395dc>] ret_from_fork+0x7c/0xb0
> [   16.201109]  [<ffffffff8107b000>] ? kthread_create_on_node+0x160/0x160
> 
> The actual death of the serial console occurs in acpi_device_set_power()
> called from:
> 
> enable_slot()
>  acpiphp_bus_add()
>   acpiphp_bus_trim()
>    acpi_bus_trim()
>     acpi_walk_namespace()
>      acpi_bus_remove()
>       acpi_device_unregister()
>        acpi_device_set_power()
> 
> I can't seem to get a path from the acpi devices in question there, so I
> have no idea what's getting trimmed here.  It worries me quite a bit by
> introducing this trimming that apparently wasn't happening before
> though.  Thanks,

As suspected, the pnp.bus_id/id of the last device before the serial
console dies is COM1/PNP0501.  I also see all of these being trimmed
out:

 MBRD/PNP0C02
 DMAC/PNP0200
 MATH/PNP0C04
 PIC/PNP0000
 HPET/PNP0103
 RTC/PNP0B00
 SPKR/PNP0800
 TIME/PNP0100
 LNK{A-H}/PNP0C0F

This seems like a bad idea.  I forgot to mention, the original
hotplug_event is called with a device check on \_SB_.PCI0.PEX2.  The box
where I'm seeing this is a pretty generic X58 based Nehalem workstation
(Lenovo S20).  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

Index: linux-pm/drivers/pci/hotplug/acpiphp_glue.c
===================================================================
--- linux-pm.orig/drivers/pci/hotplug/acpiphp_glue.c
+++ linux-pm/drivers/pci/hotplug/acpiphp_glue.c
@@ -46,6 +46,7 @@ 
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
 #include <linux/pci-acpi.h>
+#include <linux/pm_runtime.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/acpi.h>
@@ -687,47 +688,75 @@  static unsigned int get_slot_status(stru
 }
 
 /**
+ * trim_stale_devices - remove PCI devices that are not responding.
+ * @dev: PCI device to start walking the hierarchy from.
+ */
+static void trim_stale_devices(struct pci_dev *dev)
+{
+	acpi_handle handle = ACPI_HANDLE(&dev->dev);
+	struct pci_bus *bus = dev->subordinate;
+	bool alive = false;
+
+	if (handle) {
+		acpi_status status;
+		unsigned long long sta;
+
+		status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
+		alive = ACPI_SUCCESS(status) && sta == ACPI_STA_ALL;
+	}
+	if (!alive) {
+		u32 v;
+
+		/* Check if the device responds. */
+		alive = pci_bus_read_dev_vendor_id(dev->bus, dev->devfn, &v, 0);
+	}
+	if (!alive) {
+		pci_stop_and_remove_bus_device(dev);
+		if (handle)
+			acpiphp_bus_trim(handle);
+	} else if (bus) {
+		struct pci_dev *child, *tmp;
+
+		/* The device is a bridge. so check the bus below it. */
+		pm_runtime_get_sync(&dev->dev);
+		list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
+			trim_stale_devices(child);
+
+		pm_runtime_put(&dev->dev);
+	}
+}
+
+/**
  * acpiphp_check_bridge - re-enumerate devices
  * @bridge: where to begin re-enumeration
  *
  * Iterate over all slots under this bridge and make sure that if a
  * card is present they are enabled, and if not they are disabled.
  */
-static int acpiphp_check_bridge(struct acpiphp_bridge *bridge)
+static void acpiphp_check_bridge(struct acpiphp_bridge *bridge)
 {
 	struct acpiphp_slot *slot;
-	int retval = 0;
-	int enabled, disabled;
-
-	enabled = disabled = 0;
 
 	list_for_each_entry(slot, &bridge->slots, node) {
-		unsigned int status = get_slot_status(slot);
-		if (slot->flags & SLOT_ENABLED) {
-			if (status == ACPI_STA_ALL)
-				continue;
+		struct pci_bus *bus = slot->bus;
+		struct pci_dev *dev, *tmp;
 
-			retval = acpiphp_disable_and_eject_slot(slot);
-			if (retval)
-				goto err_exit;
+		mutex_lock(&slot->crit_sect);
+		/* wake up all functions */
+		if (get_slot_status(slot) == ACPI_STA_ALL) {
+			/* remove stale devices if any */
+			list_for_each_entry_safe(dev, tmp, &bus->devices,
+						 bus_list)
+				if (PCI_SLOT(dev->devfn) == slot->device)
+					trim_stale_devices(dev);
 
-			disabled++;
+			/* configure all functions */
+			enable_device(slot);
 		} else {
-			if (status != ACPI_STA_ALL)
-				continue;
-			retval = acpiphp_enable_slot(slot);
-			if (retval) {
-				err("Error occurred in enabling\n");
-				goto err_exit;
-			}
-			enabled++;
+			disable_device(slot);
 		}
+		mutex_unlock(&slot->crit_sect);
 	}
-
-	dbg("%s: %d enabled, %d disabled\n", __func__, enabled, disabled);
-
- err_exit:
-	return retval;
 }
 
 static void acpiphp_set_hpp_values(struct pci_bus *bus)
@@ -828,7 +857,11 @@  static void hotplug_event(acpi_handle ha
 					    ACPI_UINT32_MAX, check_sub_bridges,
 					    NULL, NULL, NULL);
 		} else {
-			acpiphp_enable_slot(func->slot);
+			struct acpiphp_slot *slot = func->slot;
+
+			mutex_lock(&slot->crit_sect);
+			enable_device(slot);
+			mutex_unlock(&slot->crit_sect);
 		}
 		break;