diff mbox series

hack to debug acpiphp crash

Message ID 20230724135902.2217991-1-imammedo@redhat.com
State New
Headers show
Series hack to debug acpiphp crash | expand

Commit Message

Igor Mammedov July 24, 2023, 1:59 p.m. UTC
Woody thanks for testing,

can you try following patch which will try to workaround NULL bus->self if it's
a really cuplrit and print an extra debug information.
Add following to kernel command line(make sure that CONFIG_DYNAMIC_DEBUG is enabled):

dyndbg="file drivers/pci/access.c +p; file drivers/pci/hotplug/acpiphp_glue.c +p; file drivers/pci/bus.c +p; file drivers/pci/pci.c +p; file drivers/pci/setup-bus.c +p" ignore_loglevel

What I find odd in you logs is that enable_slot() is called while native PCIe
should be used. Additional info might help to understand what's going on:
  1: 'lspci' output
  2:  DSDT and all SSDT ACPI tables (you can use 'acpidump -b' to get them).

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
 drivers/pci/hotplug/acpiphp_glue.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

Comments

Woody Suwalski July 25, 2023, 1:52 a.m. UTC | #1
Igor Mammedov wrote:
> Woody thanks for testing,
>
> can you try following patch which will try to workaround NULL bus->self if it's
> a really cuplrit and print an extra debug information.
> Add following to kernel command line(make sure that CONFIG_DYNAMIC_DEBUG is enabled):
>
> dyndbg="file drivers/pci/access.c +p; file drivers/pci/hotplug/acpiphp_glue.c +p; file drivers/pci/bus.c +p; file drivers/pci/pci.c +p; file drivers/pci/setup-bus.c +p" ignore_loglevel
>
> What I find odd in you logs is that enable_slot() is called while native PCIe
> should be used. Additional info might help to understand what's going on:
>    1: 'lspci' output
>    2:  DSDT and all SSDT ACPI tables (you can use 'acpidump -b' to get them).
>
> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> ---
>   drivers/pci/hotplug/acpiphp_glue.c | 18 +++++++++++++++++-
>   1 file changed, 17 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
> index 328d1e416014..9ce3fd9d72a9 100644
> --- a/drivers/pci/hotplug/acpiphp_glue.c
> +++ b/drivers/pci/hotplug/acpiphp_glue.c
> @@ -485,7 +485,10 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
>   	struct pci_bus *bus = slot->bus;
>   	struct acpiphp_func *func;
>   
> +WARN(1, "enable_slot");
> +pci_info(bus, "enable_slot bus\n");
>   	if (bridge && bus->self && hotplug_is_native(bus->self)) {
> +pr_err("enable_slot: bridge branch\n");
>   		/*
>   		 * If native hotplug is used, it will take care of hotplug
>   		 * slot management and resource allocation for hotplug
> @@ -498,8 +501,10 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
>   				acpiphp_native_scan_bridge(dev);
>   		}
>   	} else {
> +		LIST_HEAD(add_list);
>   		int max, pass;
>   
> +pr_err("enable_slot: acpiphp_rescan_slot branch\n");
>   		acpiphp_rescan_slot(slot);
>   		max = acpiphp_max_busnr(bus);
>   		for (pass = 0; pass < 2; pass++) {
> @@ -508,13 +513,23 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
>   					continue;
>   
>   				max = pci_scan_bridge(bus, dev, max, pass);
> +pci_info(dev, "enable_slot: pci_scan_bridge: max: %d\n", max);
>   				if (pass && dev->subordinate) {
>   					check_hotplug_bridge(slot, dev);
>   					pcibios_resource_survey_bus(dev->subordinate);
> +                                        if (bus->self)
> +						__pci_bus_size_bridges(dev->subordinate,
> +								       &add_list);
>   				}
>   			}
>   		}
> -		pci_assign_unassigned_bridge_resources(bus->self);
> +                if (bus->self) {
> +pci_info(bus->self, "enable_slot: pci_assign_unassigned_bridge_resources:\n");
> +			pci_assign_unassigned_bridge_resources(bus->self);
> +                } else {
> +pci_info(bus, "enable_slot: __pci_bus_assign_resources:\n");
> +			__pci_bus_assign_resources(bus, &add_list, NULL);
> +		}
>   	}
>   
>   	acpiphp_sanitize_bus(bus);
> @@ -541,6 +556,7 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
>   		}
>   		pci_dev_put(dev);
>   	}
> +pr_err("enable_slot: end\n");
>   }
>   
>   /**
Unfortunately the patch above does not seem to prevent the kernel crash.
Here comes the requested diagnostic info: dmesg's before and after, 
choice of lspci's and acpi tables. Hope that will help :-)

Thanks, Woody
Igor Mammedov July 25, 2023, 8:06 a.m. UTC | #2
On Mon, 24 Jul 2023 21:52:34 -0400
Woody Suwalski <terraluna977@gmail.com> wrote:

> Igor Mammedov wrote:
> > Woody thanks for testing,
> >
> > can you try following patch which will try to workaround NULL bus->self if it's
> > a really cuplrit and print an extra debug information.
> > Add following to kernel command line(make sure that CONFIG_DYNAMIC_DEBUG is enabled):
> >
> > dyndbg="file drivers/pci/access.c +p; file drivers/pci/hotplug/acpiphp_glue.c +p; file drivers/pci/bus.c +p; file drivers/pci/pci.c +p; file drivers/pci/setup-bus.c +p" ignore_loglevel
> >
> > What I find odd in you logs is that enable_slot() is called while native PCIe
> > should be used. Additional info might help to understand what's going on:
> >    1: 'lspci' output
> >    2:  DSDT and all SSDT ACPI tables (you can use 'acpidump -b' to get them).
> >
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
[...]
> >   
> >   /**  
> Unfortunately the patch above does not seem to prevent the kernel crash.
> Here comes the requested diagnostic info: dmesg's before and after, 
> choice of lspci's and acpi tables. Hope that will help :-)

Looking at dmesg-6.5-debug_after.txt
there aren't "BUG: kernel NULL pointer dereference" line anymore
The call traces you see are induced by WARN(), which purpose is
to show call path that calls enable_slot().

Let me split potential fix from debug and repost that as separate
patches for you to try.
I'd like to see debug output without 'fix' to track down which
root port/device causes NULL pointer dereference. And hopefully
in a few roundtrips figure out why old code doesn't crash.

PS:
What happens is that on resume firmware (likely EC),
issues ACPI bus check on root ports which (bus check) is
wired to acpiphp module (though pciehp module was initialized
at boot to manage root ports), it's likely firmware bug.

I'd guess the intent behind this was to check if PCIe devices
were hotplugged while laptop has been asleep, and for
some reason they didn't use native PCIe hotplug to handle that. 
However looking at laptop specs you can't hotplug PCIe
devices via external ports. Given how old laptop is
it isn't going to be fixed, so we would need a workaround
or fixup DSDT to skip buscheck.

The options I see is to keep old kernel as for such case,
or bail out early from bus check/enable_slot since root port
is managed by pciehp module (and let it handle hotplug).

> Thanks, Woody
> 
>
Igor Mammedov July 25, 2023, 8:42 a.m. UTC | #3
On Tue, 25 Jul 2023 10:06:44 +0200
Igor Mammedov <imammedo@redhat.com> wrote:

> PS:
> What happens is that on resume firmware (likely EC),
> issues ACPI bus check on root ports which (bus check) is
> wired to acpiphp module (though pciehp module was initialized
> at boot to manage root ports), it's likely firmware bug.
> 
> I'd guess the intent behind this was to check if PCIe devices
> were hotplugged while laptop has been asleep, and for
> some reason they didn't use native PCIe hotplug to handle that. 
> However looking at laptop specs you can't hotplug PCIe
> devices via external ports. Given how old laptop is
> it isn't going to be fixed, so we would need a workaround
> or fixup DSDT to skip buscheck.
> 
> The options I see is to keep old kernel as for such case,
> or bail out early from bus check/enable_slot since root port
> is managed by pciehp module (and let it handle hotplug).

scratch all of above out (it's wrong). Looking at DSDT
firmware sends Notify(rpxx, 2 /* Wake */) event. Which
according to spec needs to be handed down to the native
device driver.
Woody Suwalski July 25, 2023, 11:45 a.m. UTC | #4
Igor Mammedov wrote:
> On Tue, 25 Jul 2023 10:06:44 +0200
> Igor Mammedov <imammedo@redhat.com> wrote:
>
>> PS:
>> What happens is that on resume firmware (likely EC),
>> issues ACPI bus check on root ports which (bus check) is
>> wired to acpiphp module (though pciehp module was initialized
>> at boot to manage root ports), it's likely firmware bug.
>>
>> I'd guess the intent behind this was to check if PCIe devices
>> were hotplugged while laptop has been asleep, and for
>> some reason they didn't use native PCIe hotplug to handle that.
>> However looking at laptop specs you can't hotplug PCIe
>> devices via external ports. Given how old laptop is
>> it isn't going to be fixed, so we would need a workaround
>> or fixup DSDT to skip buscheck.
>>
>> The options I see is to keep old kernel as for such case,
>> or bail out early from bus check/enable_slot since root port
>> is managed by pciehp module (and let it handle hotplug).
> scratch all of above out (it's wrong). Looking at DSDT
> firmware sends Notify(rpxx, 2 /* Wake */) event. Which
> according to spec needs to be handed down to the native
> device driver.
>
>
I agree that this laptop is a tricky one. I had to adjust my kernel 
config NOHZ just to make it suspend to ram, otherwise it was waking back 
right after going to sleep (and the same nohz kernel worked on all my 
other machines)...
Igor Mammedov July 25, 2023, 11:58 a.m. UTC | #5
On Tue, 25 Jul 2023 07:45:08 -0400
Woody Suwalski <terraluna977@gmail.com> wrote:

> Igor Mammedov wrote:
> > On Tue, 25 Jul 2023 10:06:44 +0200
> > Igor Mammedov <imammedo@redhat.com> wrote:
> >  
> >> PS:
> >> What happens is that on resume firmware (likely EC),
> >> issues ACPI bus check on root ports which (bus check) is
> >> wired to acpiphp module (though pciehp module was initialized
> >> at boot to manage root ports), it's likely firmware bug.
> >>
> >> I'd guess the intent behind this was to check if PCIe devices
> >> were hotplugged while laptop has been asleep, and for
> >> some reason they didn't use native PCIe hotplug to handle that.
> >> However looking at laptop specs you can't hotplug PCIe
> >> devices via external ports. Given how old laptop is
> >> it isn't going to be fixed, so we would need a workaround
> >> or fixup DSDT to skip buscheck.
> >>
> >> The options I see is to keep old kernel as for such case,
> >> or bail out early from bus check/enable_slot since root port
> >> is managed by pciehp module (and let it handle hotplug).  
> > scratch all of above out (it's wrong). Looking at DSDT
> > firmware sends Notify(rpxx, 2 /* Wake */) event. Which
> > according to spec needs to be handed down to the native
> > device driver.
> >
> >  
> I agree that this laptop is a tricky one. I had to adjust my kernel 
> config NOHZ just to make it suspend to ram, otherwise it was waking back 
> right after going to sleep (and the same nohz kernel worked on all my 
> other machines)...

Blaming laptop is likely red herring in this case after some more reading.
Anyways I've just sent a new round of patches to test.
diff mbox series

Patch

diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 328d1e416014..9ce3fd9d72a9 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -485,7 +485,10 @@  static void enable_slot(struct acpiphp_slot *slot, bool bridge)
 	struct pci_bus *bus = slot->bus;
 	struct acpiphp_func *func;
 
+WARN(1, "enable_slot");
+pci_info(bus, "enable_slot bus\n");
 	if (bridge && bus->self && hotplug_is_native(bus->self)) {
+pr_err("enable_slot: bridge branch\n");
 		/*
 		 * If native hotplug is used, it will take care of hotplug
 		 * slot management and resource allocation for hotplug
@@ -498,8 +501,10 @@  static void enable_slot(struct acpiphp_slot *slot, bool bridge)
 				acpiphp_native_scan_bridge(dev);
 		}
 	} else {
+		LIST_HEAD(add_list);
 		int max, pass;
 
+pr_err("enable_slot: acpiphp_rescan_slot branch\n");
 		acpiphp_rescan_slot(slot);
 		max = acpiphp_max_busnr(bus);
 		for (pass = 0; pass < 2; pass++) {
@@ -508,13 +513,23 @@  static void enable_slot(struct acpiphp_slot *slot, bool bridge)
 					continue;
 
 				max = pci_scan_bridge(bus, dev, max, pass);
+pci_info(dev, "enable_slot: pci_scan_bridge: max: %d\n", max);
 				if (pass && dev->subordinate) {
 					check_hotplug_bridge(slot, dev);
 					pcibios_resource_survey_bus(dev->subordinate);
+                                        if (bus->self)
+						__pci_bus_size_bridges(dev->subordinate,
+								       &add_list);
 				}
 			}
 		}
-		pci_assign_unassigned_bridge_resources(bus->self);
+                if (bus->self) {
+pci_info(bus->self, "enable_slot: pci_assign_unassigned_bridge_resources:\n");
+			pci_assign_unassigned_bridge_resources(bus->self);
+                } else {
+pci_info(bus, "enable_slot: __pci_bus_assign_resources:\n");
+			__pci_bus_assign_resources(bus, &add_list, NULL);
+		}
 	}
 
 	acpiphp_sanitize_bus(bus);
@@ -541,6 +556,7 @@  static void enable_slot(struct acpiphp_slot *slot, bool bridge)
 		}
 		pci_dev_put(dev);
 	}
+pr_err("enable_slot: end\n");
 }
 
 /**