diff mbox series

PCI: pciehp: Make sure pciehp_isr clears interrupt events

Message ID 20191112215938.1145-1-stuart.w.hayes@gmail.com
State Changes Requested
Delegated to: Bjorn Helgaas
Headers show
Series PCI: pciehp: Make sure pciehp_isr clears interrupt events | expand

Commit Message

Stuart Hayes Nov. 12, 2019, 9:59 p.m. UTC
The pciehp interrupt handler pciehp_isr() will read the slot status
register and then write back to it to clear just the bits that caused the
interrupt. If a different interrupt event bit gets set between the read and
the write, pciehp_isr() will exit without having cleared all of the
interrupt event bits, so we will never get another hotplug interrupt from
that device.

That is expected behavior according to the PCI Express spec (v.5.0, section
6.7.3.4, "Software Notification of Hot-Plug Events").

Because the "presence detect changed" and "data link layer state changed"
event bits are both getting set at nearly the same time when a device is
added or removed, this is more likely to happen than it might seem. The
issue can be reproduced rather easily by connecting and disconnecting an
NVMe device on at least one system model.

This patch fixes the issue by modifying pciehp_isr() to loop back and
re-read the slot status register immediately after writing to it, until
it sees that all of the event status bits have been cleared.

Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
---
 drivers/pci/hotplug/pciehp_hpc.c | 39 +++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 6 deletions(-)

Comments

Lukas Wunner Nov. 13, 2019, 4:59 a.m. UTC | #1
On Tue, Nov 12, 2019 at 04:59:38PM -0500, Stuart Hayes wrote:
> The pciehp interrupt handler pciehp_isr() will read the slot status
> register and then write back to it to clear just the bits that caused the
> interrupt. If a different interrupt event bit gets set between the read and
> the write, pciehp_isr() will exit without having cleared all of the
> interrupt event bits, so we will never get another hotplug interrupt from
> that device.

The IRQ is masked when it occurs and unmasked after it's been handled.
See the invocation of mask_ack_irq() in handle_edge_irq() and
handle_level_irq() in kernel/irq/chip.c.

If the IRQ has fired in-between, the IRQ chip is expected to invoke
the IRQ handler again.  There is some support for IRQ chips not
capable of replaying interrupts in kernel/irq/resend.c, but in general,
if you do not get another hotplug interrupt, the hardware is broken.
What kind of IRQ chip are you using and what kind of chip is the
hotplug port built into?

I'm not opposed to quirks to support such broken hardware but the
commit message shouldn't purport that it's normal behavior and the
quirk should only be executed where necessary and be made explicit
in the code to be a quirk.

Thanks,

Lukas

> 
> That is expected behavior according to the PCI Express spec (v.5.0, section
> 6.7.3.4, "Software Notification of Hot-Plug Events").
> 
> Because the "presence detect changed" and "data link layer state changed"
> event bits are both getting set at nearly the same time when a device is
> added or removed, this is more likely to happen than it might seem. The
> issue can be reproduced rather easily by connecting and disconnecting an
> NVMe device on at least one system model.
> 
> This patch fixes the issue by modifying pciehp_isr() to loop back and
> re-read the slot status register immediately after writing to it, until
> it sees that all of the event status bits have been cleared.
> 
> Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
> ---
>  drivers/pci/hotplug/pciehp_hpc.c | 39 +++++++++++++++++++++++++++-----
>  1 file changed, 33 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
> index 1a522c1c4177..8ec22a872b28 100644
> --- a/drivers/pci/hotplug/pciehp_hpc.c
> +++ b/drivers/pci/hotplug/pciehp_hpc.c
> @@ -487,12 +487,21 @@ void pciehp_power_off_slot(struct controller *ctrl)
>  		 PCI_EXP_SLTCTL_PWR_OFF);
>  }
>  
> +/*
> + * We should never need to re-read the slot status register this many times,
> + * because there are only six possible events that could generate this
> + * interrupt.  If we still see events after this many reads, there's a stuck
> + * bit.
> + */
> +#define MAX_ISR_STATUS_READS 6
> +
>  static irqreturn_t pciehp_isr(int irq, void *dev_id)
>  {
>  	struct controller *ctrl = (struct controller *)dev_id;
>  	struct pci_dev *pdev = ctrl_dev(ctrl);
>  	struct device *parent = pdev->dev.parent;
> -	u16 status, events;
> +	u16 status, events = 0;
> +	int status_reads = 0;
>  
>  	/*
>  	 * Interrupts only occur in D3hot or shallower and only if enabled
> @@ -517,6 +526,7 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
>  		}
>  	}
>  
> +read_status:
>  	pcie_capability_read_word(pdev, PCI_EXP_SLTSTA, &status);
>  	if (status == (u16) ~0) {
>  		ctrl_info(ctrl, "%s: no response from device\n", __func__);
> @@ -529,16 +539,34 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
>  	 * Slot Status contains plain status bits as well as event
>  	 * notification bits; right now we only want the event bits.
>  	 */
> -	events = status & (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD |
> -			   PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC |
> -			   PCI_EXP_SLTSTA_DLLSC);
> +	status &= (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD |
> +		   PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC |
> +		   PCI_EXP_SLTSTA_DLLSC);
>  
>  	/*
>  	 * If we've already reported a power fault, don't report it again
>  	 * until we've done something to handle it.
>  	 */
>  	if (ctrl->power_fault_detected)
> -		events &= ~PCI_EXP_SLTSTA_PFD;
> +		status &= ~PCI_EXP_SLTSTA_PFD;
> +
> +	if (status) {
> +		pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, status);
> +		events |= status;
> +	}
> +
> +	/*
> +	 * All of the event bits must be zero before the port will send
> +	 * a new interrupt.  If an event bit gets set between the read
> +	 * and the write, we'll never get another interrupt, so loop until
> +	 * we see no event bits set.
> +	 */
> +	if (status && status_reads++ < MAX_ISR_STATUS_READS)
> +		goto read_status;
> +
> +	if (status_reads == MAX_ISR_STATUS_READS)
> +		ctrl_warn(ctrl, "Slot(%s): Hot plug event bit stuck, reading %x\n",
> +			  status, slot_name(ctrl));
>  
>  	if (!events) {
>  		if (parent)
> @@ -546,7 +574,6 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
>  		return IRQ_NONE;
>  	}
>  
> -	pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, events);
>  	ctrl_dbg(ctrl, "pending interrupts %#06x from Slot Status\n", events);
>  	if (parent)
>  		pm_runtime_put(parent);
> -- 
> 2.18.1
>
Stuart Hayes Nov. 13, 2019, 9:58 p.m. UTC | #2
On 11/12/19 10:59 PM, Lukas Wunner wrote:
> On Tue, Nov 12, 2019 at 04:59:38PM -0500, Stuart Hayes wrote:
>> The pciehp interrupt handler pciehp_isr() will read the slot status
>> register and then write back to it to clear just the bits that caused the
>> interrupt. If a different interrupt event bit gets set between the read and
>> the write, pciehp_isr() will exit without having cleared all of the
>> interrupt event bits, so we will never get another hotplug interrupt from
>> that device.
> 
> The IRQ is masked when it occurs and unmasked after it's been handled.
> See the invocation of mask_ack_irq() in handle_edge_irq() and
> handle_level_irq() in kernel/irq/chip.c.
> 
> If the IRQ has fired in-between, the IRQ chip is expected to invoke
> the IRQ handler again.  There is some support for IRQ chips not
> capable of replaying interrupts in kernel/irq/resend.c, but in general,
> if you do not get another hotplug interrupt, the hardware is broken.
> What kind of IRQ chip are you using and what kind of chip is the
> hotplug port built into?
> 
> I'm not opposed to quirks to support such broken hardware but the
> commit message shouldn't purport that it's normal behavior and the
> quirk should only be executed where necessary and be made explicit
> in the code to be a quirk.
> 
> Thanks,
> 
> Lukas
 
Thank you for the feedback!

The hotplug port I'm seeing the issue with is an AMD "Starship/Matisse GPP
Bridge" (1022/1483), which uses an MSI interrupt (PCI-MSI chip).

I don't think that the masking and unmasking will make any difference in this
case, because this pciehp port does not support MSI per-vector masking.

The PCI spec says:

"If the Port is enabled for edge-triggered interrupt signaling using MSI or MSI-X,
an interrupt message must be sent every time the logical AND of the following
conditions transitions from FALSE to TRUE:

• The associated vector is unmasked (not applicable if MSI does not support PVM).

• The Hot-Plug Interrupt Enable bit in the Slot Control register is set to 1b.

• At least one hot-plug event status bit in the Slot Status register and its
associated enable bit in the Slot Control register are both set to 1b."

Because the AMD port does not support PVM (per vector masking), the first
condition will always be true.

Because the hot-plug interrupt enable bit isn't cleared on each interrupt, the
second condition is true.

And because individual event enable bits in the slot control register aren't
cleared on each interrupt, I interpret this to mean that an interrupt message
will be sent every time that the event status bits in the slot status register
transition from all zeros to at least one event status bit being 1.  Once one
of those event status bits is 1, the logical AND of the three conditions above
will not transition from FALSE to TRUE again until all of the (enabled) event
status bits in the slot status register all go to zero, which is what my patch
is intended to ensure.

(I noticed too late that I have a compile warning with the ctrl_warn() call, so
I'll have to make a V2 of the patch for that, at least.)

Thanks,
Stuart
Lukas Wunner Nov. 14, 2019, 2:50 a.m. UTC | #3
On Wed, Nov 13, 2019 at 03:58:51PM -0600, Stuart Hayes wrote:
> The hotplug port I'm seeing the issue with is an AMD "Starship/Matisse GPP
> Bridge" (1022/1483), which uses an MSI interrupt (PCI-MSI chip).
[...]
> And because individual event enable bits in the slot control register aren't
> cleared on each interrupt, I interpret this to mean that an interrupt message
> will be sent every time that the event status bits in the slot status register
> transition from all zeros to at least one event status bit being 1.  Once one
> of those event status bits is 1, the logical AND of the three conditions above
> will not transition from FALSE to TRUE again until all of the (enabled) event
> status bits in the slot status register all go to zero, which is what my patch
> is intended to ensure.

Understood now, thanks.

I'd suggest adding a flag "unsigned int pvm_capable;" to struct controller
below "u32 slot_cap" (in the "capabilities and quirks" section),
setting that flag in pcie_init() from dev->msi_cap + PCI_MSI_FLAGS
(& PCI_MSI_FLAGS_MASKBIT) and amending pciehp_isr() to check for that
flag and re-read / re-write the Slot Status register until it's all zeroes.
That would make the reason for the modifications to pciehp_isr() explicit.
Please try to make the modifications to pciehp_isr() as small and simple
as possible.  Maybe it's worthwhile to put them in a separate static
function which is called from pciehp_isr(), I don't know.

Please mention the PCI vendor / device IDs in the commit message
and provide a reference to the PCIe Base Spec section you've quoted
above.

Thanks,

Lukas
Stuart Hayes Nov. 25, 2019, 9:03 p.m. UTC | #4
On 11/12/19 3:59 PM, Stuart Hayes wrote:
> The pciehp interrupt handler pciehp_isr() will read the slot status
> register and then write back to it to clear just the bits that caused the
> interrupt. If a different interrupt event bit gets set between the read and
> the write, pciehp_isr() will exit without having cleared all of the
> interrupt event bits, so we will never get another hotplug interrupt from
> that device.
> 
> That is expected behavior according to the PCI Express spec (v.5.0, section
> 6.7.3.4, "Software Notification of Hot-Plug Events").
> 
> Because the "presence detect changed" and "data link layer state changed"
> event bits are both getting set at nearly the same time when a device is
> added or removed, this is more likely to happen than it might seem. The
> issue can be reproduced rather easily by connecting and disconnecting an
> NVMe device on at least one system model.
> 
> This patch fixes the issue by modifying pciehp_isr() to loop back and
> re-read the slot status register immediately after writing to it, until
> it sees that all of the event status bits have been cleared.
> 
> Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>

Bjorn,

Do you have any comments or issues with this patch set?  Anything I can do?

Thanks!
Stuart
Bjorn Helgaas Nov. 26, 2019, 11:37 p.m. UTC | #5
On Mon, Nov 25, 2019 at 03:03:23PM -0600, Stuart Hayes wrote:
> 
> On 11/12/19 3:59 PM, Stuart Hayes wrote:
> > The pciehp interrupt handler pciehp_isr() will read the slot status
> > register and then write back to it to clear just the bits that caused the
> > interrupt. If a different interrupt event bit gets set between the read and
> > the write, pciehp_isr() will exit without having cleared all of the
> > interrupt event bits, so we will never get another hotplug interrupt from
> > that device.
> > 
> > That is expected behavior according to the PCI Express spec (v.5.0, section
> > 6.7.3.4, "Software Notification of Hot-Plug Events").
> > 
> > Because the "presence detect changed" and "data link layer state changed"
> > event bits are both getting set at nearly the same time when a device is
> > added or removed, this is more likely to happen than it might seem. The
> > issue can be reproduced rather easily by connecting and disconnecting an
> > NVMe device on at least one system model.
> > 
> > This patch fixes the issue by modifying pciehp_isr() to loop back and
> > re-read the slot status register immediately after writing to it, until
> > it sees that all of the event status bits have been cleared.
> > 
> > Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
> 
> Bjorn,
> 
> Do you have any comments or issues with this patch set?  Anything I can do?

Were you planning to address Lukas' comments?

https://lore.kernel.org/r/20191114025022.wz3gchr7w67fjtzn@wunner.de
Stuart Hayes Nov. 27, 2019, 12:06 a.m. UTC | #6
On Tue, Nov 26, 2019 at 5:37 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Mon, Nov 25, 2019 at 03:03:23PM -0600, Stuart Hayes wrote:
> >
> > On 11/12/19 3:59 PM, Stuart Hayes wrote:
> > > The pciehp interrupt handler pciehp_isr() will read the slot status
> > > register and then write back to it to clear just the bits that caused the
> > > interrupt. If a different interrupt event bit gets set between the read and
> > > the write, pciehp_isr() will exit without having cleared all of the
> > > interrupt event bits, so we will never get another hotplug interrupt from
> > > that device.
> > >
> > > That is expected behavior according to the PCI Express spec (v.5.0, section
> > > 6.7.3.4, "Software Notification of Hot-Plug Events").
> > >
> > > Because the "presence detect changed" and "data link layer state changed"
> > > event bits are both getting set at nearly the same time when a device is
> > > added or removed, this is more likely to happen than it might seem. The
> > > issue can be reproduced rather easily by connecting and disconnecting an
> > > NVMe device on at least one system model.
> > >
> > > This patch fixes the issue by modifying pciehp_isr() to loop back and
> > > re-read the slot status register immediately after writing to it, until
> > > it sees that all of the event status bits have been cleared.
> > >
> > > Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
> >
> > Bjorn,
> >
> > Do you have any comments or issues with this patch set?  Anything I can do?
>
> Were you planning to address Lukas' comments?
>
> https://lore.kernel.org/r/20191114025022.wz3gchr7w67fjtzn@wunner.de

Yes, I submitted a V2 for this patch (https://lkml.org/lkml/2019/11/20/1147).

But--I'm very sorry, I didn't mean to ask if you had any comments on
this patch--I meant to ask about an earlier patch set, and
accidentally replied to the wrong thread.

I meant to ask you about this patch set:
https://lore.kernel.org/lkml/20191025190047.38130-1-stuart.w.hayes@gmail.com/

Thank you!  --Stuart
diff mbox series

Patch

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 1a522c1c4177..8ec22a872b28 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -487,12 +487,21 @@  void pciehp_power_off_slot(struct controller *ctrl)
 		 PCI_EXP_SLTCTL_PWR_OFF);
 }
 
+/*
+ * We should never need to re-read the slot status register this many times,
+ * because there are only six possible events that could generate this
+ * interrupt.  If we still see events after this many reads, there's a stuck
+ * bit.
+ */
+#define MAX_ISR_STATUS_READS 6
+
 static irqreturn_t pciehp_isr(int irq, void *dev_id)
 {
 	struct controller *ctrl = (struct controller *)dev_id;
 	struct pci_dev *pdev = ctrl_dev(ctrl);
 	struct device *parent = pdev->dev.parent;
-	u16 status, events;
+	u16 status, events = 0;
+	int status_reads = 0;
 
 	/*
 	 * Interrupts only occur in D3hot or shallower and only if enabled
@@ -517,6 +526,7 @@  static irqreturn_t pciehp_isr(int irq, void *dev_id)
 		}
 	}
 
+read_status:
 	pcie_capability_read_word(pdev, PCI_EXP_SLTSTA, &status);
 	if (status == (u16) ~0) {
 		ctrl_info(ctrl, "%s: no response from device\n", __func__);
@@ -529,16 +539,34 @@  static irqreturn_t pciehp_isr(int irq, void *dev_id)
 	 * Slot Status contains plain status bits as well as event
 	 * notification bits; right now we only want the event bits.
 	 */
-	events = status & (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD |
-			   PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC |
-			   PCI_EXP_SLTSTA_DLLSC);
+	status &= (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD |
+		   PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC |
+		   PCI_EXP_SLTSTA_DLLSC);
 
 	/*
 	 * If we've already reported a power fault, don't report it again
 	 * until we've done something to handle it.
 	 */
 	if (ctrl->power_fault_detected)
-		events &= ~PCI_EXP_SLTSTA_PFD;
+		status &= ~PCI_EXP_SLTSTA_PFD;
+
+	if (status) {
+		pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, status);
+		events |= status;
+	}
+
+	/*
+	 * All of the event bits must be zero before the port will send
+	 * a new interrupt.  If an event bit gets set between the read
+	 * and the write, we'll never get another interrupt, so loop until
+	 * we see no event bits set.
+	 */
+	if (status && status_reads++ < MAX_ISR_STATUS_READS)
+		goto read_status;
+
+	if (status_reads == MAX_ISR_STATUS_READS)
+		ctrl_warn(ctrl, "Slot(%s): Hot plug event bit stuck, reading %x\n",
+			  status, slot_name(ctrl));
 
 	if (!events) {
 		if (parent)
@@ -546,7 +574,6 @@  static irqreturn_t pciehp_isr(int irq, void *dev_id)
 		return IRQ_NONE;
 	}
 
-	pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, events);
 	ctrl_dbg(ctrl, "pending interrupts %#06x from Slot Status\n", events);
 	if (parent)
 		pm_runtime_put(parent);