diff mbox series

[1/5] PCI/switchtec: Error out MRPC execution when no GAS access

Message ID 20210924110842.6323-2-kelvin.cao@microchip.com
State New
Headers show
Series Switchtec Fixes and Improvements | expand

Commit Message

Kelvin Cao Sept. 24, 2021, 11:08 a.m. UTC
From: Kelvin Cao <kelvin.cao@microchip.com>

After a firmware hard reset, MRPC command executions, which are based
on the PCI BAR (which Microchip refers to as GAS) read/write, will hang
indefinitely. This is because after a reset, the host will fail all GAS
reads (get all 1s), in which case the driver won't get a valid MRPC
status.

Add a read check to GAS access when a MRPC command execution doesn't
response timely, error out if the check fails.

Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
---
 drivers/pci/switch/switchtec.c | 59 ++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 7 deletions(-)

Comments

Bjorn Helgaas Oct. 1, 2021, 8:18 p.m. UTC | #1
On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com wrote:
> From: Kelvin Cao <kelvin.cao@microchip.com>
> 
> After a firmware hard reset, MRPC command executions, which are based
> on the PCI BAR (which Microchip refers to as GAS) read/write, will hang
> indefinitely. This is because after a reset, the host will fail all GAS
> reads (get all 1s), in which case the driver won't get a valid MRPC
> status.

Trying to write a merge commit log for this, but having a hard time
summarizing it.  It sounds like it covers both Switchtec-specific
(firmware and MRPC commands) and generic PCIe behavior (MMIO read
failures).

This has something to do with a firmware hard reset.  What is that?
Is that like a firmware reboot?  A device reset, e.g., FLR or
secondary bus reset, that causes a firmware reboot?  A device reset
initiated by firmware?

Anyway, apparently when that happens, MMIO reads to the switch fail
(timeout or error completion on PCIe) for a while.  If a device reset
is involved, that much is standard PCIe behavior.  And the driver sees
~0 data from those failed reads.  That's not part of the PCIe spec,
but is typical root complex behavior.

But you said the MRPC commands hang indefinitely.  Presumably MMIO
reads would start succeeding eventually when the device becomes ready,
so I don't know how that translates to "indefinitely."

Weird to refer to a PCI BAR as "GAS".  Maybe expanding the acronym
would help it make sense.

What does "host" refer to?  I guess it's the switch (the
switchtec_dev), since you say it fails MMIO reads?

Naming comment below.

> Add a read check to GAS access when a MRPC command execution doesn't
> response timely, error out if the check fails.
> 
> Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
> ---
>  drivers/pci/switch/switchtec.c | 59 ++++++++++++++++++++++++++++++----
>  1 file changed, 52 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
> index 0b301f8be9ed..092653487021 100644
> --- a/drivers/pci/switch/switchtec.c
> +++ b/drivers/pci/switch/switchtec.c
> @@ -45,6 +45,7 @@ enum mrpc_state {
>  	MRPC_QUEUED,
>  	MRPC_RUNNING,
>  	MRPC_DONE,
> +	MRPC_IO_ERROR,
>  };
>  
>  struct switchtec_user {
> @@ -66,6 +67,13 @@ struct switchtec_user {
>  	int event_cnt;
>  };
>  
> +static int check_access(struct switchtec_dev *stdev)
> +{
> +	u32 device = ioread32(&stdev->mmio_sys_info->device_id);
> +
> +	return stdev->pdev->device == device;
> +}

Didn't notice this before, but the "check_access()" name is not very
helpful because it doesn't give a clue about what the return value
means.  Does 0 mean no error?  Does 1 mean no error?  From reading the
implementation, I can see that 0 is actually the error case, but I
can't tell from the name.

>  static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
>  {
>  	struct switchtec_user *stuser;
> @@ -113,6 +121,7 @@ static void stuser_set_state(struct switchtec_user *stuser,
>  		[MRPC_QUEUED] = "QUEUED",
>  		[MRPC_RUNNING] = "RUNNING",
>  		[MRPC_DONE] = "DONE",
> +		[MRPC_IO_ERROR] = "IO_ERROR",
>  	};
>  
>  	stuser->state = state;
> @@ -184,6 +193,21 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser)
>  	return 0;
>  }
>  
> +static void mrpc_cleanup_cmd(struct switchtec_dev *stdev)
> +{
> +	/* requires the mrpc_mutex to already be held when called */
> +	struct switchtec_user *stuser = list_entry(stdev->mrpc_queue.next,
> +						   struct switchtec_user, list);
> +
> +	stuser->cmd_done = true;
> +	wake_up_interruptible(&stuser->cmd_comp);
> +	list_del_init(&stuser->list);
> +	stuser_put(stuser);
> +	stdev->mrpc_busy = 0;
> +
> +	mrpc_cmd_submit(stdev);
> +}
> +
>  static void mrpc_complete_cmd(struct switchtec_dev *stdev)
>  {
>  	/* requires the mrpc_mutex to already be held when called */
> @@ -223,13 +247,7 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev)
>  		memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data,
>  			      stuser->read_len);
>  out:
> -	stuser->cmd_done = true;
> -	wake_up_interruptible(&stuser->cmd_comp);
> -	list_del_init(&stuser->list);
> -	stuser_put(stuser);
> -	stdev->mrpc_busy = 0;
> -
> -	mrpc_cmd_submit(stdev);
> +	mrpc_cleanup_cmd(stdev);
>  }
>  
>  static void mrpc_event_work(struct work_struct *work)
> @@ -246,6 +264,23 @@ static void mrpc_event_work(struct work_struct *work)
>  	mutex_unlock(&stdev->mrpc_mutex);
>  }
>  
> +static void mrpc_error_complete_cmd(struct switchtec_dev *stdev)
> +{
> +	/* requires the mrpc_mutex to already be held when called */
> +
> +	struct switchtec_user *stuser;
> +
> +	if (list_empty(&stdev->mrpc_queue))
> +		return;
> +
> +	stuser = list_entry(stdev->mrpc_queue.next,
> +			    struct switchtec_user, list);
> +
> +	stuser_set_state(stuser, MRPC_IO_ERROR);
> +
> +	mrpc_cleanup_cmd(stdev);
> +}
> +
>  static void mrpc_timeout_work(struct work_struct *work)
>  {
>  	struct switchtec_dev *stdev;
> @@ -257,6 +292,11 @@ static void mrpc_timeout_work(struct work_struct *work)
>  
>  	mutex_lock(&stdev->mrpc_mutex);
>  
> +	if (!check_access(stdev)) {
> +		mrpc_error_complete_cmd(stdev);
> +		goto out;
> +	}
> +
>  	if (stdev->dma_mrpc)
>  		status = stdev->dma_mrpc->status;
>  	else
> @@ -544,6 +584,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
>  	if (rc)
>  		return rc;
>  
> +	if (stuser->state == MRPC_IO_ERROR) {
> +		mutex_unlock(&stdev->mrpc_mutex);
> +		return -EIO;
> +	}
> +
>  	if (stuser->state != MRPC_DONE) {
>  		mutex_unlock(&stdev->mrpc_mutex);
>  		return -EBADE;
> -- 
> 2.25.1
>
Logan Gunthorpe Oct. 1, 2021, 8:29 p.m. UTC | #2
On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com wrote:
>> From: Kelvin Cao <kelvin.cao@microchip.com>
>>
>> After a firmware hard reset, MRPC command executions, which are based
>> on the PCI BAR (which Microchip refers to as GAS) read/write, will hang
>> indefinitely. This is because after a reset, the host will fail all GAS
>> reads (get all 1s), in which case the driver won't get a valid MRPC
>> status.
> 
> Trying to write a merge commit log for this, but having a hard time
> summarizing it.  It sounds like it covers both Switchtec-specific
> (firmware and MRPC commands) and generic PCIe behavior (MMIO read
> failures).
> 
> This has something to do with a firmware hard reset.  What is that?
> Is that like a firmware reboot?  A device reset, e.g., FLR or
> secondary bus reset, that causes a firmware reboot?  A device reset
> initiated by firmware?
> 
> Anyway, apparently when that happens, MMIO reads to the switch fail
> (timeout or error completion on PCIe) for a while.  If a device reset
> is involved, that much is standard PCIe behavior.  And the driver sees
> ~0 data from those failed reads.  That's not part of the PCIe spec,
> but is typical root complex behavior.
> 
> But you said the MRPC commands hang indefinitely.  Presumably MMIO
> reads would start succeeding eventually when the device becomes ready,
> so I don't know how that translates to "indefinitely."

I suspect Kelvin can expand on this and fix the issue below. But in my
experience, the MMIO will read ~0 forever after a firmware reset, until
the system is rebooted. Presumably on systems that have good hot plug
support they are supposed to recover. Though I've never seen that.

The MMIO read that signals the MRPC status always returns ~0 and the
userspace request will eventually time out.

> Weird to refer to a PCI BAR as "GAS".  Maybe expanding the acronym
> would help it make sense.
GAS is the term used by the firmware developers and is in all their
documentation. It stands for Global Address Space.

> What does "host" refer to?  I guess it's the switch (the
> switchtec_dev), since you say it fails MMIO reads?

Yes, a bit confusing. The firmware is dead or not setup right so MMIO
reads are not succeeding and the root complex is returning ~0 to the
driver on reads.

Logan
Kelvin Cao Oct. 1, 2021, 10:58 p.m. UTC | #3
On Fri, 2021-10-01 at 15:18 -0500, Bjorn Helgaas wrote:
> On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com
> wrote:
> > From: Kelvin Cao <kelvin.cao@microchip.com>
> > 
> > After a firmware hard reset, MRPC command executions, which are
> > based
> > on the PCI BAR (which Microchip refers to as GAS) read/write, will
> > hang
> > indefinitely. This is because after a reset, the host will fail all
> > GAS
> > reads (get all 1s), in which case the driver won't get a valid MRPC
> > status.
> 
> Trying to write a merge commit log for this, but having a hard time
> summarizing it.  It sounds like it covers both Switchtec-specific
> (firmware and MRPC commands) and generic PCIe behavior (MMIO read
> failures).
> 
> This has something to do with a firmware hard reset.  What is that?
> Is that like a firmware reboot?  A device reset, e.g., FLR or
> secondary bus reset, that causes a firmware reboot?  A device reset
> initiated by firmware?
> 
> Anyway, apparently when that happens, MMIO reads to the switch fail
> (timeout or error completion on PCIe) for a while.  If a device reset
> is involved, that much is standard PCIe behavior.  And the driver
> sees
> ~0 data from those failed reads.  That's not part of the PCIe spec,
> but is typical root complex behavior.
> 
> But you said the MRPC commands hang indefinitely.  Presumably MMIO
> reads would start succeeding eventually when the device becomes
> ready,
> so I don't know how that translates to "indefinitely."
> 
> Weird to refer to a PCI BAR as "GAS".  Maybe expanding the acronym
> would help it make sense.
> 
> What does "host" refer to?  I guess it's the switch (the
> switchtec_dev), since you say it fails MMIO reads?
> 
> Naming comment below.
> 
> > Add a read check to GAS access when a MRPC command execution
> > doesn't
> > response timely, error out if the check fails.
> > 
> > Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
> > ---
> >  drivers/pci/switch/switchtec.c | 59
> > ++++++++++++++++++++++++++++++----
> >  1 file changed, 52 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/pci/switch/switchtec.c
> > b/drivers/pci/switch/switchtec.c
> > index 0b301f8be9ed..092653487021 100644
> > --- a/drivers/pci/switch/switchtec.c
> > +++ b/drivers/pci/switch/switchtec.c
> > @@ -45,6 +45,7 @@ enum mrpc_state {
> >       MRPC_QUEUED,
> >       MRPC_RUNNING,
> >       MRPC_DONE,
> > +     MRPC_IO_ERROR,
> >  };
> > 
> >  struct switchtec_user {
> > @@ -66,6 +67,13 @@ struct switchtec_user {
> >       int event_cnt;
> >  };
> > 
> > +static int check_access(struct switchtec_dev *stdev)
> > +{
> > +     u32 device = ioread32(&stdev->mmio_sys_info->device_id);
> > +
> > +     return stdev->pdev->device == device;
> > +}
> 
> Didn't notice this before, but the "check_access()" name is not very
> helpful because it doesn't give a clue about what the return value
> means.  Does 0 mean no error?  Does 1 mean no error?  From reading
> the
> implementation, I can see that 0 is actually the error case, but I
> can't tell from the name.

Yes, will improve the naming, like change it to "has_gas_access()" in
v2 if a v2 patchset is preferred.
> 
> >  static struct switchtec_user *stuser_create(struct switchtec_dev
> > *stdev)
> >  {
> >       struct switchtec_user *stuser;
> > @@ -113,6 +121,7 @@ static void stuser_set_state(struct
> > switchtec_user *stuser,
> >               [MRPC_QUEUED] = "QUEUED",
> >               [MRPC_RUNNING] = "RUNNING",
> >               [MRPC_DONE] = "DONE",
> > +             [MRPC_IO_ERROR] = "IO_ERROR",
> >       };
> > 
> >       stuser->state = state;
> > @@ -184,6 +193,21 @@ static int mrpc_queue_cmd(struct
> > switchtec_user *stuser)
> >       return 0;
> >  }
> > 
> > +static void mrpc_cleanup_cmd(struct switchtec_dev *stdev)
> > +{
> > +     /* requires the mrpc_mutex to already be held when called */
> > +     struct switchtec_user *stuser = list_entry(stdev-
> > >mrpc_queue.next,
> > +                                                struct
> > switchtec_user, list);
> > +
> > +     stuser->cmd_done = true;
> > +     wake_up_interruptible(&stuser->cmd_comp);
> > +     list_del_init(&stuser->list);
> > +     stuser_put(stuser);
> > +     stdev->mrpc_busy = 0;
> > +
> > +     mrpc_cmd_submit(stdev);
> > +}
> > +
> >  static void mrpc_complete_cmd(struct switchtec_dev *stdev)
> >  {
> >       /* requires the mrpc_mutex to already be held when called */
> > @@ -223,13 +247,7 @@ static void mrpc_complete_cmd(struct
> > switchtec_dev *stdev)
> >               memcpy_fromio(stuser->data, &stdev->mmio_mrpc-
> > >output_data,
> >                             stuser->read_len);
> >  out:
> > -     stuser->cmd_done = true;
> > -     wake_up_interruptible(&stuser->cmd_comp);
> > -     list_del_init(&stuser->list);
> > -     stuser_put(stuser);
> > -     stdev->mrpc_busy = 0;
> > -
> > -     mrpc_cmd_submit(stdev);
> > +     mrpc_cleanup_cmd(stdev);
> >  }
> > 
> >  static void mrpc_event_work(struct work_struct *work)
> > @@ -246,6 +264,23 @@ static void mrpc_event_work(struct work_struct
> > *work)
> >       mutex_unlock(&stdev->mrpc_mutex);
> >  }
> > 
> > +static void mrpc_error_complete_cmd(struct switchtec_dev *stdev)
> > +{
> > +     /* requires the mrpc_mutex to already be held when called */
> > +
> > +     struct switchtec_user *stuser;
> > +
> > +     if (list_empty(&stdev->mrpc_queue))
> > +             return;
> > +
> > +     stuser = list_entry(stdev->mrpc_queue.next,
> > +                         struct switchtec_user, list);
> > +
> > +     stuser_set_state(stuser, MRPC_IO_ERROR);
> > +
> > +     mrpc_cleanup_cmd(stdev);
> > +}
> > +
> >  static void mrpc_timeout_work(struct work_struct *work)
> >  {
> >       struct switchtec_dev *stdev;
> > @@ -257,6 +292,11 @@ static void mrpc_timeout_work(struct
> > work_struct *work)
> > 
> >       mutex_lock(&stdev->mrpc_mutex);
> > 
> > +     if (!check_access(stdev)) {
> > +             mrpc_error_complete_cmd(stdev);
> > +             goto out;
> > +     }
> > +
> >       if (stdev->dma_mrpc)
> >               status = stdev->dma_mrpc->status;
> >       else
> > @@ -544,6 +584,11 @@ static ssize_t switchtec_dev_read(struct file
> > *filp, char __user *data,
> >       if (rc)
> >               return rc;
> > 
> > +     if (stuser->state == MRPC_IO_ERROR) {
> > +             mutex_unlock(&stdev->mrpc_mutex);
> > +             return -EIO;
> > +     }
> > +
> >       if (stuser->state != MRPC_DONE) {
> >               mutex_unlock(&stdev->mrpc_mutex);
> >               return -EBADE;
> > --
> > 2.25.1
> >
Kelvin Cao Oct. 1, 2021, 11:49 p.m. UTC | #4
On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you
> know the content is safe
> 
> On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com
> > wrote:
> > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > 
> > > After a firmware hard reset, MRPC command executions, which are
> > > based
> > > on the PCI BAR (which Microchip refers to as GAS) read/write,
> > > will hang
> > > indefinitely. This is because after a reset, the host will fail
> > > all GAS
> > > reads (get all 1s), in which case the driver won't get a valid
> > > MRPC
> > > status.
> > 
> > Trying to write a merge commit log for this, but having a hard time
> > summarizing it.  It sounds like it covers both Switchtec-specific
> > (firmware and MRPC commands) and generic PCIe behavior (MMIO read
> > failures).
> > 
> > This has something to do with a firmware hard reset.  What is that?
> > Is that like a firmware reboot?  A device reset, e.g., FLR or
> > secondary bus reset, that causes a firmware reboot?  A device reset
> > initiated by firmware?
A firmware reset can be triggered by a reset command issued to the
firmware to reboot it.
> > Anyway, apparently when that happens, MMIO reads to the switch fail
> > (timeout or error completion on PCIe) for a while.  If a device
> > reset
> > is involved, that much is standard PCIe behavior.  And the driver
> > sees
> > ~0 data from those failed reads.  That's not part of the PCIe spec,
> > but is typical root complex behavior.
> > 
> > But you said the MRPC commands hang indefinitely.  Presumably MMIO
> > reads would start succeeding eventually when the device becomes
> > ready,
> > so I don't know how that translates to "indefinitely."
> 
> I suspect Kelvin can expand on this and fix the issue below. But in
> my
> experience, the MMIO will read ~0 forever after a firmware reset,
> until
> the system is rebooted. Presumably on systems that have good hot plug
> support they are supposed to recover. Though I've never seen that.

This is also my observation, all MMIO read will fail (~0 returned)
until the system is rebooted or a PCI rescan is performed.

> The MMIO read that signals the MRPC status always returns ~0 and the
> userspace request will eventually time out.

The problem in this case is that, in DMA MRPC mode, the status (in host
memory) is always initialized to 'in progress', and it's up to the
firmware to update it to 'done' after the command is executed in the
firmware. After a firmware reset is performed, the firmware cannot be
triggered to start a MRPC command, therefore the status in host memory
remains 'in progress' in the driver, which prevents a MRPC from timing
out. I should have included this in the message.
> 
> > Weird to refer to a PCI BAR as "GAS".  Maybe expanding the acronym
> > would help it make sense.
> GAS is the term used by the firmware developers and is in all their
> documentation. It stands for Global Address Space.
> 
> > What does "host" refer to?  I guess it's the switch (the
> > switchtec_dev), since you say it fails MMIO reads?
> 
> Yes, a bit confusing. The firmware is dead or not setup right so MMIO
> reads are not succeeding and the root complex is returning ~0 to the
> driver on reads.
Ditto. Will update in v2.
> 
> Logan
Logan Gunthorpe Oct. 1, 2021, 11:52 p.m. UTC | #5
On 2021-10-01 4:58 p.m., Kelvin.Cao@microchip.com wrote:
> On Fri, 2021-10-01 at 15:18 -0500, Bjorn Helgaas wrote:
>> Didn't notice this before, but the "check_access()" name is not very
>> helpful because it doesn't give a clue about what the return value
>> means.  Does 0 mean no error?  Does 1 mean no error?  From reading
>> the
>> implementation, I can see that 0 is actually the error case, but I
>> can't tell from the name.
> 
> Yes, will improve the naming, like change it to "has_gas_access()" in
> v2 if a v2 patchset is preferred.

I'd keep the GAS name out of the kernel. How about something along the
lines of is_firmware_running()? Maybe a comment for the function would
be good as well.

Logan
Kelvin Cao Oct. 2, 2021, 12:05 a.m. UTC | #6
On Fri, 2021-10-01 at 17:52 -0600, Logan Gunthorpe wrote:
> On 2021-10-01 4:58 p.m., Kelvin.Cao@microchip.com wrote:
> > On Fri, 2021-10-01 at 15:18 -0500, Bjorn Helgaas wrote:
> > > Didn't notice this before, but the "check_access()" name is not
> > > very
> > > helpful because it doesn't give a clue about what the return
> > > value
> > > means.  Does 0 mean no error?  Does 1 mean no error?  From
> > > reading
> > > the
> > > implementation, I can see that 0 is actually the error case, but
> > > I
> > > can't tell from the name.
> > 
> > Yes, will improve the naming, like change it to "has_gas_access()"
> > in
> > v2 if a v2 patchset is preferred.
> 
> I'd keep the GAS name out of the kernel. How about something along
> the
> lines of is_firmware_running()? Maybe a comment for the function
> would
> be good as well.
> 
Yes, that'll be an improvement.
> Logan
Bjorn Helgaas Oct. 2, 2021, 3:11 p.m. UTC | #7
On Fri, Oct 01, 2021 at 11:49:18PM +0000, Kelvin.Cao@microchip.com wrote:
> On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > know the content is safe
> > 
> > On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > > On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com
> > > wrote:
> > > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > > 
> > > > After a firmware hard reset, MRPC command executions, which
> > > > are based on the PCI BAR (which Microchip refers to as GAS)
> > > > read/write, will hang indefinitely. This is because after a
> > > > reset, the host will fail all GAS reads (get all 1s), in which
> > > > case the driver won't get a valid MRPC status.
> > > 
> > > Trying to write a merge commit log for this, but having a hard
> > > time summarizing it.  It sounds like it covers both
> > > Switchtec-specific (firmware and MRPC commands) and generic PCIe
> > > behavior (MMIO read failures).
> > > 
> > > This has something to do with a firmware hard reset.  What is
> > > that?  Is that like a firmware reboot?  A device reset, e.g.,
> > > FLR or secondary bus reset, that causes a firmware reboot?  A
> > > device reset initiated by firmware?
>
> A firmware reset can be triggered by a reset command issued to the
> firmware to reboot it.

So I guess this reset command was issued by the driver?

> > > Anyway, apparently when that happens, MMIO reads to the switch
> > > fail (timeout or error completion on PCIe) for a while.  If a
> > > device reset is involved, that much is standard PCIe behavior.
> > > And the driver sees ~0 data from those failed reads.  That's not
> > > part of the PCIe spec, but is typical root complex behavior.
> > > 
> > > But you said the MRPC commands hang indefinitely.  Presumably
> > > MMIO reads would start succeeding eventually when the device
> > > becomes ready, so I don't know how that translates to
> > > "indefinitely."
> > 
> > I suspect Kelvin can expand on this and fix the issue below. But
> > in my experience, the MMIO will read ~0 forever after a firmware
> > reset, until the system is rebooted. Presumably on systems that
> > have good hot plug support they are supposed to recover. Though
> > I've never seen that.
> 
> This is also my observation, all MMIO read will fail (~0 returned)
> until the system is rebooted or a PCI rescan is performed.

This made sense until you said MMIO reads would start working after a
PCI rescan.  A rescan doesn't really do anything special other than
doing config accesses to the device.  Two things come to mind:

1) Rescan does a config read of the Vendor ID, and devices may
respond with "Configuration Request Retry Status" if they are not
ready.  In this event, Linux retries this for a while.  This scenario
doesn't quite fit because it sounds like this is a device-specific
reset initiated by the driver, and CRS is not permited in this case.
PCIe r5.0, sec 2.3.1, says:

  A device Function is explicitly not permitted to return CRS
  following a software-initiated reset (other than an FLR) of the
  device, e.g., by the device's software driver writing to a
  device-specific reset bit.

2) The device may lose its bus and device number configuration after a
reset, so it must capture bus and device numbers from config writes.
I don't think Linux does this explicitly, but a rescan does do config
writes, which could accidentally fix something (PCIe r5.0, sec 2.2.9).

> > The MMIO read that signals the MRPC status always returns ~0 and the
> > userspace request will eventually time out.
> 
> The problem in this case is that, in DMA MRPC mode, the status (in host
> memory) is always initialized to 'in progress', and it's up to the
> firmware to update it to 'done' after the command is executed in the
> firmware. After a firmware reset is performed, the firmware cannot be
> triggered to start a MRPC command, therefore the status in host memory
> remains 'in progress' in the driver, which prevents a MRPC from timing
> out. I should have included this in the message.

I *thought* the problem was that the PCIe Memory Read failed and the
Root Complex fabricated ~0 data to complete the CPU read.  But now I'm
not sure, because it sounds like it might be that the PCIe transaction
succeeds, but it reads data that hasn't been updated by the firmware,
i.e., it reads 'in progress' because firmware hasn't updated it to
'done'.

Bjorn
Kelvin Cao Oct. 4, 2021, 8:51 p.m. UTC | #8
On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> On Fri, Oct 01, 2021 at 11:49:18PM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> > > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > > know the content is safe
> > > 
> > > On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > > > On Fri, Sep 24, 2021 at 11:08:38AM +0000, 
> > > > kelvin.cao@microchip.com
> > > > wrote:
> > > > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > > > 
> > > > > After a firmware hard reset, MRPC command executions, which
> > > > > are based on the PCI BAR (which Microchip refers to as GAS)
> > > > > read/write, will hang indefinitely. This is because after a
> > > > > reset, the host will fail all GAS reads (get all 1s), in
> > > > > which
> > > > > case the driver won't get a valid MRPC status.
> > > > 
> > > > Trying to write a merge commit log for this, but having a hard
> > > > time summarizing it.  It sounds like it covers both
> > > > Switchtec-specific (firmware and MRPC commands) and generic
> > > > PCIe
> > > > behavior (MMIO read failures).
> > > > 
> > > > This has something to do with a firmware hard reset.  What is
> > > > that?  Is that like a firmware reboot?  A device reset, e.g.,
> > > > FLR or secondary bus reset, that causes a firmware reboot?  A
> > > > device reset initiated by firmware?
> > 
> > A firmware reset can be triggered by a reset command issued to the
> > firmware to reboot it.
> 
> So I guess this reset command was issued by the driver?
Yes, the reset command can be issued by a userspace utility to the
firmware via the driver. In some other cases, user can also issue the
reset command, via a sideband interface (like UART), to the firmware. 
> 
> > > > Anyway, apparently when that happens, MMIO reads to the switch
> > > > fail (timeout or error completion on PCIe) for a while.  If a
> > > > device reset is involved, that much is standard PCIe behavior.
> > > > And the driver sees ~0 data from those failed reads.  That's
> > > > not
> > > > part of the PCIe spec, but is typical root complex behavior.
> > > > 
> > > > But you said the MRPC commands hang indefinitely.  Presumably
> > > > MMIO reads would start succeeding eventually when the device
> > > > becomes ready, so I don't know how that translates to
> > > > "indefinitely."
> > > 
> > > I suspect Kelvin can expand on this and fix the issue below. But
> > > in my experience, the MMIO will read ~0 forever after a firmware
> > > reset, until the system is rebooted. Presumably on systems that
> > > have good hot plug support they are supposed to recover. Though
> > > I've never seen that.
> > 
> > This is also my observation, all MMIO read will fail (~0 returned)
> > until the system is rebooted or a PCI rescan is performed.
> 
> This made sense until you said MMIO reads would start working after a
> PCI rescan.  A rescan doesn't really do anything special other than
> doing config accesses to the device.  Two things come to mind:
> 
> 1) Rescan does a config read of the Vendor ID, and devices may
> respond with "Configuration Request Retry Status" if they are not
> ready.  In this event, Linux retries this for a while.  This scenario
> doesn't quite fit because it sounds like this is a device-specific
> reset initiated by the driver, and CRS is not permited in this case.
> PCIe r5.0, sec 2.3.1, says:
> 
>   A device Function is explicitly not permitted to return CRS
>   following a software-initiated reset (other than an FLR) of the
>   device, e.g., by the device's software driver writing to a
>   device-specific reset bit.
> 
> 2) The device may lose its bus and device number configuration after
> a
> reset, so it must capture bus and device numbers from config writes.
> I don't think Linux does this explicitly, but a rescan does do config
> writes, which could accidentally fix something (PCIe r5.0, sec
> 2.2.9).

Thanks Bjorn. It makes perfect sense!
> 
> > > The MMIO read that signals the MRPC status always returns ~0 and
> > > the
> > > userspace request will eventually time out.
> > 
> > The problem in this case is that, in DMA MRPC mode, the status (in
> > host
> > memory) is always initialized to 'in progress', and it's up to the
> > firmware to update it to 'done' after the command is executed in
> > the
> > firmware. After a firmware reset is performed, the firmware cannot
> > be
> > triggered to start a MRPC command, therefore the status in host
> > memory
> > remains 'in progress' in the driver, which prevents a MRPC from
> > timing
> > out. I should have included this in the message.
> 
> I *thought* the problem was that the PCIe Memory Read failed and the
> Root Complex fabricated ~0 data to complete the CPU read.  But now
> I'm
> not sure, because it sounds like it might be that the PCIe
> transaction
> succeeds, but it reads data that hasn't been updated by the firmware,
> i.e., it reads 'in progress' because firmware hasn't updated it to
> 'done'.
> 
> Bjorn

The original message was sort of misleading. After a firmware reset,
CPU getting ~0 for the PCIe Memory Read doesn't explain the hang. In a
MRPC execution (DMA MRPC mode), the MRPC status which is located in the
host moemory, gets initialized by the CPU and updated/finilized by the
firmware. In the situation of a firmware reset, any MRPC initiated
afterwards will not get the status updated by the firmware per the
reason you pointed out above (or similar, to my understanding, firmware
can no longer DMA data to host memory in such cases), therefore the
MRPC execution will never end.


Kelvin
Bjorn Helgaas Oct. 5, 2021, 8:11 p.m. UTC | #9
On Mon, Oct 04, 2021 at 08:51:06PM +0000, Kelvin.Cao@microchip.com wrote:
> On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > On Fri, Oct 01, 2021 at 11:49:18PM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> > > > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > > > know the content is safe
> > > > 
> > > > On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > > > > On Fri, Sep 24, 2021 at 11:08:38AM +0000, 
> > > > > kelvin.cao@microchip.com
> > > > > wrote:
> > > > > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > > > > 
> > > > > > After a firmware hard reset, MRPC command executions,
> > > > > > which are based on the PCI BAR (which Microchip refers to
> > > > > > as GAS) read/write, will hang indefinitely. This is
> > > > > > because after a reset, the host will fail all GAS reads
> > > > > > (get all 1s), in which case the driver won't get a valid
> > > > > > MRPC status.
> > > > > 
> > > > > Trying to write a merge commit log for this, but having a
> > > > > hard time summarizing it.  It sounds like it covers both
> > > > > Switchtec-specific (firmware and MRPC commands) and generic
> > > > > PCIe behavior (MMIO read failures).
> > > > > 
> > > > > This has something to do with a firmware hard reset.  What
> > > > > is that?  Is that like a firmware reboot?  A device reset,
> > > > > e.g., FLR or secondary bus reset, that causes a firmware
> > > > > reboot?  A device reset initiated by firmware?
> > > 
> > > A firmware reset can be triggered by a reset command issued to
> > > the firmware to reboot it.
> > 
> > So I guess this reset command was issued by the driver?
>
> Yes, the reset command can be issued by a userspace utility to the
> firmware via the driver. In some other cases, user can also issue
> the reset command, via a sideband interface (like UART), to the
> firmware. 

OK, thanks.  That means CRS is not a factor here because this is not
an FLR or similar reset.

> > > > > Anyway, apparently when that happens, MMIO reads to the switch
> > > > > fail (timeout or error completion on PCIe) for a while.  If a
> > > > > device reset is involved, that much is standard PCIe behavior.
> > > > > And the driver sees ~0 data from those failed reads.  That's
> > > > > not
> > > > > part of the PCIe spec, but is typical root complex behavior.
> > > > > 
> > > > > But you said the MRPC commands hang indefinitely.  Presumably
> > > > > MMIO reads would start succeeding eventually when the device
> > > > > becomes ready, so I don't know how that translates to
> > > > > "indefinitely."
> > > > 
> > > > I suspect Kelvin can expand on this and fix the issue below. But
> > > > in my experience, the MMIO will read ~0 forever after a firmware
> > > > reset, until the system is rebooted. Presumably on systems that
> > > > have good hot plug support they are supposed to recover. Though
> > > > I've never seen that.
> > > 
> > > This is also my observation, all MMIO read will fail (~0 returned)
> > > until the system is rebooted or a PCI rescan is performed.
> > 
> > This made sense until you said MMIO reads would start working after a
> > PCI rescan.  A rescan doesn't really do anything special other than
> > doing config accesses to the device.  Two things come to mind:
> > 
> > 1) Rescan does a config read of the Vendor ID, and devices may
> > respond with "Configuration Request Retry Status" if they are not
> > ready.  In this event, Linux retries this for a while.  This scenario
> > doesn't quite fit because it sounds like this is a device-specific
> > reset initiated by the driver, and CRS is not permited in this case.
> > PCIe r5.0, sec 2.3.1, says:
> > 
> >   A device Function is explicitly not permitted to return CRS
> >   following a software-initiated reset (other than an FLR) of the
> >   device, e.g., by the device's software driver writing to a
> >   device-specific reset bit.
> > 
> > 2) The device may lose its bus and device number configuration
> > after a reset, so it must capture bus and device numbers from
> > config writes.  I don't think Linux does this explicitly, but a
> > rescan does do config writes, which could accidentally fix
> > something (PCIe r5.0, sec 2.2.9).
> 
> Thanks Bjorn. It makes perfect sense!
> > 
> > > > The MMIO read that signals the MRPC status always returns ~0
> > > > and the userspace request will eventually time out.
> > > 
> > > The problem in this case is that, in DMA MRPC mode, the status
> > > (in host memory) is always initialized to 'in progress', and
> > > it's up to the firmware to update it to 'done' after the command
> > > is executed in the firmware. After a firmware reset is
> > > performed, the firmware cannot be triggered to start a MRPC
> > > command, therefore the status in host memory remains 'in
> > > progress' in the driver, which prevents a MRPC from timing out.
> > > I should have included this in the message.
> > 
> > I *thought* the problem was that the PCIe Memory Read failed and
> > the Root Complex fabricated ~0 data to complete the CPU read.  But
> > now I'm not sure, because it sounds like it might be that the PCIe
> > transaction succeeds, but it reads data that hasn't been updated
> > by the firmware, i.e., it reads 'in progress' because firmware
> > hasn't updated it to 'done'.
> 
> The original message was sort of misleading. After a firmware reset,
> CPU getting ~0 for the PCIe Memory Read doesn't explain the hang. In
> a MRPC execution (DMA MRPC mode), the MRPC status which is located
> in the host memory, gets initialized by the CPU and
> updated/finalized by the firmware. In the situation of a firmware
> reset, any MRPC initiated afterwards will not get the status updated
> by the firmware per the reason you pointed out above (or similar, to
> my understanding, firmware can no longer DMA data to host memory in
> such cases), therefore the MRPC execution will never end.

I'm glad this makes sense to you, because it still doesn't to me.

check_access() does an MMIO read to something in BAR0.  If that read
returns ~0, it means either the PCIe Memory Read was successful and
the Switchtec device supplied ~0 data (maybe because firmware has not
initialized that part of the BAR) or the PCIe Memory Read failed and
the root complex fabricated the ~0 data.

I'd like to know which one is happening so we can clarify the commit
log text about "MRPC command executions hang indefinitely" and "host
wil fail all GAS reads."  It's not clear whether these are PCIe
protocol issues or driver/firmware interaction issues.

Bjorn
Kelvin Cao Oct. 6, 2021, 12:37 a.m. UTC | #10
On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> On Mon, Oct 04, 2021 at 08:51:06PM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > On Fri, Oct 01, 2021 at 11:49:18PM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > > On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> > > > > EXTERNAL EMAIL: Do not click links or open attachments unless
> > > > > you
> > > > > know the content is safe
> > > > > 
> > > > > On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > > > > > On Fri, Sep 24, 2021 at 11:08:38AM +0000,
> > > > > > kelvin.cao@microchip.com
> > > > > > wrote:
> > > > > > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > > > > > 
> > > > > > > After a firmware hard reset, MRPC command executions,
> > > > > > > which are based on the PCI BAR (which Microchip refers to
> > > > > > > as GAS) read/write, will hang indefinitely. This is
> > > > > > > because after a reset, the host will fail all GAS reads
> > > > > > > (get all 1s), in which case the driver won't get a valid
> > > > > > > MRPC status.
> > > > > > 
> > > > > > Trying to write a merge commit log for this, but having a
> > > > > > hard time summarizing it.  It sounds like it covers both
> > > > > > Switchtec-specific (firmware and MRPC commands) and generic
> > > > > > PCIe behavior (MMIO read failures).
> > > > > > 
> > > > > > This has something to do with a firmware hard reset.  What
> > > > > > is that?  Is that like a firmware reboot?  A device reset,
> > > > > > e.g., FLR or secondary bus reset, that causes a firmware
> > > > > > reboot?  A device reset initiated by firmware?
> > > > 
> > > > A firmware reset can be triggered by a reset command issued to
> > > > the firmware to reboot it.
> > > 
> > > So I guess this reset command was issued by the driver?
> > 
> > Yes, the reset command can be issued by a userspace utility to the
> > firmware via the driver. In some other cases, user can also issue
> > the reset command, via a sideband interface (like UART), to the
> > firmware.
> 
> OK, thanks.  That means CRS is not a factor here because this is not
> an FLR or similar reset.
> 
> > > > > > Anyway, apparently when that happens, MMIO reads to the
> > > > > > switch
> > > > > > fail (timeout or error completion on PCIe) for a while.  If
> > > > > > a
> > > > > > device reset is involved, that much is standard PCIe
> > > > > > behavior.
> > > > > > And the driver sees ~0 data from those failed
> > > > > > reads.  That's
> > > > > > not
> > > > > > part of the PCIe spec, but is typical root complex
> > > > > > behavior.
> > > > > > 
> > > > > > But you said the MRPC commands hang
> > > > > > indefinitely.  Presumably
> > > > > > MMIO reads would start succeeding eventually when the
> > > > > > device
> > > > > > becomes ready, so I don't know how that translates to
> > > > > > "indefinitely."
> > > > > 
> > > > > I suspect Kelvin can expand on this and fix the issue below.
> > > > > But
> > > > > in my experience, the MMIO will read ~0 forever after a
> > > > > firmware
> > > > > reset, until the system is rebooted. Presumably on systems
> > > > > that
> > > > > have good hot plug support they are supposed to recover.
> > > > > Though
> > > > > I've never seen that.
> > > > 
> > > > This is also my observation, all MMIO read will fail (~0
> > > > returned)
> > > > until the system is rebooted or a PCI rescan is performed.
> > > 
> > > This made sense until you said MMIO reads would start working
> > > after a
> > > PCI rescan.  A rescan doesn't really do anything special other
> > > than
> > > doing config accesses to the device.  Two things come to mind:
> > > 
> > > 1) Rescan does a config read of the Vendor ID, and devices may
> > > respond with "Configuration Request Retry Status" if they are not
> > > ready.  In this event, Linux retries this for a while.  This
> > > scenario
> > > doesn't quite fit because it sounds like this is a device-
> > > specific
> > > reset initiated by the driver, and CRS is not permited in this
> > > case.
> > > PCIe r5.0, sec 2.3.1, says:
> > > 
> > >   A device Function is explicitly not permitted to return CRS
> > >   following a software-initiated reset (other than an FLR) of the
> > >   device, e.g., by the device's software driver writing to a
> > >   device-specific reset bit.
> > > 
> > > 2) The device may lose its bus and device number configuration
> > > after a reset, so it must capture bus and device numbers from
> > > config writes.  I don't think Linux does this explicitly, but a
> > > rescan does do config writes, which could accidentally fix
> > > something (PCIe r5.0, sec 2.2.9).
> > 
> > Thanks Bjorn. It makes perfect sense!
> > > > > The MMIO read that signals the MRPC status always returns ~0
> > > > > and the userspace request will eventually time out.
> > > > 
> > > > The problem in this case is that, in DMA MRPC mode, the status
> > > > (in host memory) is always initialized to 'in progress', and
> > > > it's up to the firmware to update it to 'done' after the
> > > > command
> > > > is executed in the firmware. After a firmware reset is
> > > > performed, the firmware cannot be triggered to start a MRPC
> > > > command, therefore the status in host memory remains 'in
> > > > progress' in the driver, which prevents a MRPC from timing out.
> > > > I should have included this in the message.
> > > 
> > > I *thought* the problem was that the PCIe Memory Read failed and
> > > the Root Complex fabricated ~0 data to complete the CPU
> > > read.  But
> > > now I'm not sure, because it sounds like it might be that the
> > > PCIe
> > > transaction succeeds, but it reads data that hasn't been updated
> > > by the firmware, i.e., it reads 'in progress' because firmware
> > > hasn't updated it to 'done'.
> > 
> > The original message was sort of misleading. After a firmware
> > reset,
> > CPU getting ~0 for the PCIe Memory Read doesn't explain the hang.
> > In
> > a MRPC execution (DMA MRPC mode), the MRPC status which is located
> > in the host memory, gets initialized by the CPU and
> > updated/finalized by the firmware. In the situation of a firmware
> > reset, any MRPC initiated afterwards will not get the status
> > updated
> > by the firmware per the reason you pointed out above (or similar,
> > to
> > my understanding, firmware can no longer DMA data to host memory in
> > such cases), therefore the MRPC execution will never end.
> 
> I'm glad this makes sense to you, because it still doesn't to me.
> 
> check_access() does an MMIO read to something in BAR0.  If that read
> returns ~0, it means either the PCIe Memory Read was successful and
> the Switchtec device supplied ~0 data (maybe because firmware has not
> initialized that part of the BAR) or the PCIe Memory Read failed and
> the root complex fabricated the ~0 data.
> 
> I'd like to know which one is happening so we can clarify the commit
> log text about "MRPC command executions hang indefinitely" and "host
> wil fail all GAS reads."  It's not clear whether these are PCIe
> protocol issues or driver/firmware interaction issues.
> 
> Bjorn

I think it's the latter case, the ~0 data was fabricated by the root
complex, as the MMIO read in check_access() always returns ~0 until a
reboot or a rescan happens. 

Kelvin
Bjorn Helgaas Oct. 6, 2021, 2:33 a.m. UTC | #11
On Wed, Oct 06, 2021 at 12:37:02AM +0000, Kelvin.Cao@microchip.com wrote:
> On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > On Mon, Oct 04, 2021 at 08:51:06PM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:

> > > > I *thought* the problem was that the PCIe Memory Read failed
> > > > and the Root Complex fabricated ~0 data to complete the CPU
> > > > read.  But now I'm not sure, because it sounds like it might
> > > > be that the PCIe transaction succeeds, but it reads data that
> > > > hasn't been updated by the firmware, i.e., it reads 'in
> > > > progress' because firmware hasn't updated it to 'done'.
> > > 
> > > The original message was sort of misleading. After a firmware
> > > reset, CPU getting ~0 for the PCIe Memory Read doesn't explain
> > > the hang.  In a MRPC execution (DMA MRPC mode), the MRPC status
> > > which is located in the host memory, gets initialized by the CPU
> > > and updated/finalized by the firmware. In the situation of a
> > > firmware reset, any MRPC initiated afterwards will not get the
> > > status updated by the firmware per the reason you pointed out
> > > above (or similar, to my understanding, firmware can no longer
> > > DMA data to host memory in such cases), therefore the MRPC
> > > execution will never end.
> > 
> > I'm glad this makes sense to you, because it still doesn't to me.
> > 
> > check_access() does an MMIO read to something in BAR0.  If that
> > read returns ~0, it means either the PCIe Memory Read was
> > successful and the Switchtec device supplied ~0 data (maybe
> > because firmware has not initialized that part of the BAR) or the
> > PCIe Memory Read failed and the root complex fabricated the ~0
> > data.
> > 
> > I'd like to know which one is happening so we can clarify the
> > commit log text about "MRPC command executions hang indefinitely"
> > and "host wil fail all GAS reads."  It's not clear whether these
> > are PCIe protocol issues or driver/firmware interaction issues.
> 
> I think it's the latter case, the ~0 data was fabricated by the root
> complex, as the MMIO read in check_access() always returns ~0 until
> a reboot or a rescan happens. 

If the root complex fabricates ~0, that means a PCIe transaction
failed, i.e., the device didn't respond.  Rescan only does config
reads and writes.  Why should that cause the PCIe transactions to
magically start working?
Kelvin Cao Oct. 6, 2021, 5:49 a.m. UTC | #12
On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2021 at 12:37:02AM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > On Mon, Oct 04, 2021 at 08:51:06PM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > I *thought* the problem was that the PCIe Memory Read failed
> > > > > and the Root Complex fabricated ~0 data to complete the CPU
> > > > > read.  But now I'm not sure, because it sounds like it might
> > > > > be that the PCIe transaction succeeds, but it reads data that
> > > > > hasn't been updated by the firmware, i.e., it reads 'in
> > > > > progress' because firmware hasn't updated it to 'done'.
> > > > 
> > > > The original message was sort of misleading. After a firmware
> > > > reset, CPU getting ~0 for the PCIe Memory Read doesn't explain
> > > > the hang.  In a MRPC execution (DMA MRPC mode), the MRPC status
> > > > which is located in the host memory, gets initialized by the
> > > > CPU
> > > > and updated/finalized by the firmware. In the situation of a
> > > > firmware reset, any MRPC initiated afterwards will not get the
> > > > status updated by the firmware per the reason you pointed out
> > > > above (or similar, to my understanding, firmware can no longer
> > > > DMA data to host memory in such cases), therefore the MRPC
> > > > execution will never end.
> > > 
> > > I'm glad this makes sense to you, because it still doesn't to me.
> > > 
> > > check_access() does an MMIO read to something in BAR0.  If that
> > > read returns ~0, it means either the PCIe Memory Read was
> > > successful and the Switchtec device supplied ~0 data (maybe
> > > because firmware has not initialized that part of the BAR) or the
> > > PCIe Memory Read failed and the root complex fabricated the ~0
> > > data.
> > > 
> > > I'd like to know which one is happening so we can clarify the
> > > commit log text about "MRPC command executions hang indefinitely"
> > > and "host wil fail all GAS reads."  It's not clear whether these
> > > are PCIe protocol issues or driver/firmware interaction issues.
> > 
> > I think it's the latter case, the ~0 data was fabricated by the
> > root
> > complex, as the MMIO read in check_access() always returns ~0 until
> > a reboot or a rescan happens.
> 
> If the root complex fabricates ~0, that means a PCIe transaction
> failed, i.e., the device didn't respond.  Rescan only does config
> reads and writes.  Why should that cause the PCIe transactions to
> magically start working?

I took a closer look. What I observed was like this. A firmware reset
cleared some CSR settings including the MSE and MBE bits and the Base
Address Registers. With a rescan (removing the switch to which the
management EP was binded from root port and rescan), the management EP
was re-enumerated and driver was re-probed, so that the settings
cleared by the firmware reset was properly setup again, therefore PCIe
transactions start working.

Kelvin
Bjorn Helgaas Oct. 6, 2021, 2:19 p.m. UTC | #13
On Wed, Oct 06, 2021 at 05:49:29AM +0000, Kelvin.Cao@microchip.com wrote:
> On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2021 at 12:37:02AM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > > On Mon, Oct 04, 2021 at 08:51:06PM +0000, 
> > > > Kelvin.Cao@microchip.com
> > > > wrote:
> > > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > > I *thought* the problem was that the PCIe Memory Read
> > > > > > failed and the Root Complex fabricated ~0 data to complete
> > > > > > the CPU read.  But now I'm not sure, because it sounds
> > > > > > like it might be that the PCIe transaction succeeds, but
> > > > > > it reads data that hasn't been updated by the firmware,
> > > > > > i.e., it reads 'in progress' because firmware hasn't
> > > > > > updated it to 'done'.
> > > > > 
> > > > > The original message was sort of misleading. After a
> > > > > firmware reset, CPU getting ~0 for the PCIe Memory Read
> > > > > doesn't explain the hang.  In a MRPC execution (DMA MRPC
> > > > > mode), the MRPC status which is located in the host memory,
> > > > > gets initialized by the CPU and updated/finalized by the
> > > > > firmware. In the situation of a firmware reset, any MRPC
> > > > > initiated afterwards will not get the status updated by the
> > > > > firmware per the reason you pointed out above (or similar,
> > > > > to my understanding, firmware can no longer DMA data to host
> > > > > memory in such cases), therefore the MRPC execution will
> > > > > never end.
> > > > 
> > > > I'm glad this makes sense to you, because it still doesn't to
> > > > me.
> > > > 
> > > > check_access() does an MMIO read to something in BAR0.  If
> > > > that read returns ~0, it means either the PCIe Memory Read was
> > > > successful and the Switchtec device supplied ~0 data (maybe
> > > > because firmware has not initialized that part of the BAR) or
> > > > the PCIe Memory Read failed and the root complex fabricated
> > > > the ~0 data.
> > > > 
> > > > I'd like to know which one is happening so we can clarify the
> > > > commit log text about "MRPC command executions hang
> > > > indefinitely" and "host wil fail all GAS reads."  It's not
> > > > clear whether these are PCIe protocol issues or
> > > > driver/firmware interaction issues.
> > > 
> > > I think it's the latter case, the ~0 data was fabricated by the
> > > root complex, as the MMIO read in check_access() always returns
> > > ~0 until a reboot or a rescan happens.
> > 
> > If the root complex fabricates ~0, that means a PCIe transaction
> > failed, i.e., the device didn't respond.  Rescan only does config
> > reads and writes.  Why should that cause the PCIe transactions to
> > magically start working?
> 
> I took a closer look. What I observed was like this. A firmware
> reset cleared some CSR settings including the MSE and MBE bits and
> the Base Address Registers. With a rescan (removing the switch to
> which the management EP was binded from root port and rescan), the
> management EP was re-enumerated and driver was re-probed, so that
> the settings cleared by the firmware reset was properly setup again,
> therefore PCIe transactions start working.

I think what you just said is that 

  - the driver asked the firmware to reset the device

  - the firmware did reset the device, which cleared Memory Space
    Enable

  - nothing restored the device config after the reset, so Memory
    Space Enable remains cleared

  - the driver does MMIO reads to figure out when the reset has
    completed

  - the device doesn't respond to the PCIe Memory Reads because Memory
    Space Enable is cleared

  - the root complex sees a timeout or error completion and fabricates
    ~0 data for the CPU read

  - the driver sees ~0 data from the MMIO read and thinks the device
    or firmware is hung

If that's all true, I think the patch is sort of a band-aid that
doesn't fix the problem at all but only makes the driver's response to
it marginally better.  But the device is still unusable until a rescan
or reboot.

So I think we should drop this patch and do something to restore the
device state after the reset.

Bjorn
Kelvin Cao Oct. 6, 2021, 7 p.m. UTC | #14
On Wed, 2021-10-06 at 09:19 -0500, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2021 at 05:49:29AM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> > > On Wed, Oct 06, 2021 at 12:37:02AM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > > > On Mon, Oct 04, 2021 at 08:51:06PM +0000,
> > > > > Kelvin.Cao@microchip.com
> > > > > wrote:
> > > > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > > > I *thought* the problem was that the PCIe Memory Read
> > > > > > > failed and the Root Complex fabricated ~0 data to
> > > > > > > complete
> > > > > > > the CPU read.  But now I'm not sure, because it sounds
> > > > > > > like it might be that the PCIe transaction succeeds, but
> > > > > > > it reads data that hasn't been updated by the firmware,
> > > > > > > i.e., it reads 'in progress' because firmware hasn't
> > > > > > > updated it to 'done'.
> > > > > > 
> > > > > > The original message was sort of misleading. After a
> > > > > > firmware reset, CPU getting ~0 for the PCIe Memory Read
> > > > > > doesn't explain the hang.  In a MRPC execution (DMA MRPC
> > > > > > mode), the MRPC status which is located in the host memory,
> > > > > > gets initialized by the CPU and updated/finalized by the
> > > > > > firmware. In the situation of a firmware reset, any MRPC
> > > > > > initiated afterwards will not get the status updated by the
> > > > > > firmware per the reason you pointed out above (or similar,
> > > > > > to my understanding, firmware can no longer DMA data to
> > > > > > host
> > > > > > memory in such cases), therefore the MRPC execution will
> > > > > > never end.
> > > > > 
> > > > > I'm glad this makes sense to you, because it still doesn't to
> > > > > me.
> > > > > 
> > > > > check_access() does an MMIO read to something in BAR0.  If
> > > > > that read returns ~0, it means either the PCIe Memory Read
> > > > > was
> > > > > successful and the Switchtec device supplied ~0 data (maybe
> > > > > because firmware has not initialized that part of the BAR) or
> > > > > the PCIe Memory Read failed and the root complex fabricated
> > > > > the ~0 data.
> > > > > 
> > > > > I'd like to know which one is happening so we can clarify the
> > > > > commit log text about "MRPC command executions hang
> > > > > indefinitely" and "host wil fail all GAS reads."  It's not
> > > > > clear whether these are PCIe protocol issues or
> > > > > driver/firmware interaction issues.
> > > > 
> > > > I think it's the latter case, the ~0 data was fabricated by the
> > > > root complex, as the MMIO read in check_access() always returns
> > > > ~0 until a reboot or a rescan happens.
> > > 
> > > If the root complex fabricates ~0, that means a PCIe transaction
> > > failed, i.e., the device didn't respond.  Rescan only does config
> > > reads and writes.  Why should that cause the PCIe transactions to
> > > magically start working?
> > 
> > I took a closer look. What I observed was like this. A firmware
> > reset cleared some CSR settings including the MSE and MBE bits and
> > the Base Address Registers. With a rescan (removing the switch to
> > which the management EP was binded from root port and rescan), the
> > management EP was re-enumerated and driver was re-probed, so that
> > the settings cleared by the firmware reset was properly setup
> > again,
> > therefore PCIe transactions start working.
> 
> I think what you just said is that
> 
>   - the driver asked the firmware to reset the device
> 
>   - the firmware did reset the device, which cleared Memory Space
>     Enable
> 
>   - nothing restored the device config after the reset, so Memory
>     Space Enable remains cleared
> 
>   - the driver does MMIO reads to figure out when the reset has
>     completed
> 
>   - the device doesn't respond to the PCIe Memory Reads because
> Memory
>     Space Enable is cleared
> 
>   - the root complex sees a timeout or error completion and
> fabricates
>     ~0 data for the CPU read
> 
>   - the driver sees ~0 data from the MMIO read and thinks the device
>     or firmware is hung
> 
> If that's all true, I think the patch is sort of a band-aid that
> doesn't fix the problem at all but only makes the driver's response
> to
> it marginally better.  But the device is still unusable until a
> rescan
> or reboot.
> 
> So I think we should drop this patch and do something to restore the
> device state after the reset.

Do you mean we should do something at the driver level to automatically
try to restore the device state after the reset? I was thinking it's up
to the user to make the call to restore the device state or take other
actions, so that returning an error code from MRPC to indicate what
happened would be good enough for the driver. 

Can you possibly shed light on what might be a reasonable way to
restore the device state in the driver if applicable? I was just doing
it by leveraging the remove and rescan interfaces in the sysfs.

> 
> Bjorn

That's all true. I lean towards keeping the patch as I think making the
response better under the following situations might not be bad.

  - The firmware reset case, as we discussed. I'd think it's still
useful for users to get a fast error return which indicates something
bad happened and some actions need to be taken either to abort or try
to recover. In this case, we are assuming that a firmware reset will
boot the firmware successfully.

  - The firwmare crashes and doesn't respond, which normally is the
reason for users to issue a firmware reset command to try to recover it
via either the driver or a sideband interface. The firmware may not be
able to recover by a reset in some extream situations like hardware
errors, so that an error return is probably all the users can get
before another level of recovery happens.

So I'd think this patch is still making the driver better in some way.

Kelvin
Bjorn Helgaas Oct. 6, 2021, 8:20 p.m. UTC | #15
On Wed, Oct 06, 2021 at 07:00:55PM +0000, Kelvin.Cao@microchip.com wrote:
> On Wed, 2021-10-06 at 09:19 -0500, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2021 at 05:49:29AM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> > > > On Wed, Oct 06, 2021 at 12:37:02AM +0000, 
> > > > Kelvin.Cao@microchip.com
> > > > wrote:
> > > > > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > > > > On Mon, Oct 04, 2021 at 08:51:06PM +0000,
> > > > > > Kelvin.Cao@microchip.com
> > > > > > wrote:
> > > > > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > > > > I *thought* the problem was that the PCIe Memory Read
> > > > > > > > failed and the Root Complex fabricated ~0 data to
> > > > > > > > complete
> > > > > > > > the CPU read.  But now I'm not sure, because it sounds
> > > > > > > > like it might be that the PCIe transaction succeeds, but
> > > > > > > > it reads data that hasn't been updated by the firmware,
> > > > > > > > i.e., it reads 'in progress' because firmware hasn't
> > > > > > > > updated it to 'done'.
> > > > > > > 
> > > > > > > The original message was sort of misleading. After a
> > > > > > > firmware reset, CPU getting ~0 for the PCIe Memory Read
> > > > > > > doesn't explain the hang.  In a MRPC execution (DMA MRPC
> > > > > > > mode), the MRPC status which is located in the host memory,
> > > > > > > gets initialized by the CPU and updated/finalized by the
> > > > > > > firmware. In the situation of a firmware reset, any MRPC
> > > > > > > initiated afterwards will not get the status updated by the
> > > > > > > firmware per the reason you pointed out above (or similar,
> > > > > > > to my understanding, firmware can no longer DMA data to
> > > > > > > host
> > > > > > > memory in such cases), therefore the MRPC execution will
> > > > > > > never end.
> > > > > > 
> > > > > > I'm glad this makes sense to you, because it still doesn't to
> > > > > > me.
> > > > > > 
> > > > > > check_access() does an MMIO read to something in BAR0.  If
> > > > > > that read returns ~0, it means either the PCIe Memory Read
> > > > > > was
> > > > > > successful and the Switchtec device supplied ~0 data (maybe
> > > > > > because firmware has not initialized that part of the BAR) or
> > > > > > the PCIe Memory Read failed and the root complex fabricated
> > > > > > the ~0 data.
> > > > > > 
> > > > > > I'd like to know which one is happening so we can clarify the
> > > > > > commit log text about "MRPC command executions hang
> > > > > > indefinitely" and "host wil fail all GAS reads."  It's not
> > > > > > clear whether these are PCIe protocol issues or
> > > > > > driver/firmware interaction issues.
> > > > > 
> > > > > I think it's the latter case, the ~0 data was fabricated by the
> > > > > root complex, as the MMIO read in check_access() always returns
> > > > > ~0 until a reboot or a rescan happens.
> > > > 
> > > > If the root complex fabricates ~0, that means a PCIe transaction
> > > > failed, i.e., the device didn't respond.  Rescan only does config
> > > > reads and writes.  Why should that cause the PCIe transactions to
> > > > magically start working?
> > > 
> > > I took a closer look. What I observed was like this. A firmware
> > > reset cleared some CSR settings including the MSE and MBE bits and
> > > the Base Address Registers. With a rescan (removing the switch to
> > > which the management EP was binded from root port and rescan), the
> > > management EP was re-enumerated and driver was re-probed, so that
> > > the settings cleared by the firmware reset was properly setup
> > > again,
> > > therefore PCIe transactions start working.
> > 
> > I think what you just said is that
> > 
> >   - the driver asked the firmware to reset the device
> > 
> >   - the firmware did reset the device, which cleared Memory Space
> >     Enable
> > 
> >   - nothing restored the device config after the reset, so Memory
> >     Space Enable remains cleared
> > 
> >   - the driver does MMIO reads to figure out when the reset has
> >     completed
> > 
> >   - the device doesn't respond to the PCIe Memory Reads because
> > Memory
> >     Space Enable is cleared
> > 
> >   - the root complex sees a timeout or error completion and
> > fabricates
> >     ~0 data for the CPU read
> > 
> >   - the driver sees ~0 data from the MMIO read and thinks the device
> >     or firmware is hung
> > 
> > If that's all true, I think the patch is sort of a band-aid that
> > doesn't fix the problem at all but only makes the driver's response
> > to
> > it marginally better.  But the device is still unusable until a
> > rescan
> > or reboot.
> > 
> > So I think we should drop this patch and do something to restore the
> > device state after the reset.
> 
> Do you mean we should do something at the driver level to automatically
> try to restore the device state after the reset? I was thinking it's up
> to the user to make the call to restore the device state or take other
> actions, so that returning an error code from MRPC to indicate what
> happened would be good enough for the driver. 

It sounds like this is a completely predictable situation.  Why would
you want manual user intervention?

I'm pretty sure there are drivers that save state *before* the reset
and restore it afterwards.

> Can you possibly shed light on what might be a reasonable way to
> restore the device state in the driver if applicable? I was just doing
> it by leveraging the remove and rescan interfaces in the sysfs.
> 
> That's all true. I lean towards keeping the patch as I think making the
> response better under the following situations might not be bad.
> 
>   - The firmware reset case, as we discussed. I'd think it's still
> useful for users to get a fast error return which indicates something
> bad happened and some actions need to be taken either to abort or try
> to recover. In this case, we are assuming that a firmware reset will
> boot the firmware successfully.

So wait, you mean you just intentionally ask the firmware to reset,
knowing that the device will be unusable until the user reboots or
does a manual rescan?  And the way to improve this is for the driver
to report an error to the user instead of hanging?  I *guess* that
might be some sort of improvement, but seems like a pretty small one.

>   - The firwmare crashes and doesn't respond, which normally is the
> reason for users to issue a firmware reset command to try to recover it
> via either the driver or a sideband interface. The firmware may not be
> able to recover by a reset in some extream situations like hardware
> errors, so that an error return is probably all the users can get
> before another level of recovery happens.
> 
> So I'd think this patch is still making the driver better in some way.
> 
> Kelvin
>
Kelvin Cao Oct. 6, 2021, 9:27 p.m. UTC | #16
On Wed, 2021-10-06 at 15:20 -0500, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2021 at 07:00:55PM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Wed, 2021-10-06 at 09:19 -0500, Bjorn Helgaas wrote:
> > > On Wed, Oct 06, 2021 at 05:49:29AM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > > On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> > > > > On Wed, Oct 06, 2021 at 12:37:02AM +0000,
> > > > > Kelvin.Cao@microchip.com
> > > > > wrote:
> > > > > > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > > > > > On Mon, Oct 04, 2021 at 08:51:06PM +0000,
> > > > > > > Kelvin.Cao@microchip.com
> > > > > > > wrote:
> > > > > > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > > > > > I *thought* the problem was that the PCIe Memory Read
> > > > > > > > > failed and the Root Complex fabricated ~0 data to
> > > > > > > > > complete
> > > > > > > > > the CPU read.  But now I'm not sure, because it
> > > > > > > > > sounds
> > > > > > > > > like it might be that the PCIe transaction succeeds,
> > > > > > > > > but
> > > > > > > > > it reads data that hasn't been updated by the
> > > > > > > > > firmware,
> > > > > > > > > i.e., it reads 'in progress' because firmware hasn't
> > > > > > > > > updated it to 'done'.
> > > > > > > > 
> > > > > > > > The original message was sort of misleading. After a
> > > > > > > > firmware reset, CPU getting ~0 for the PCIe Memory Read
> > > > > > > > doesn't explain the hang.  In a MRPC execution (DMA
> > > > > > > > MRPC
> > > > > > > > mode), the MRPC status which is located in the host
> > > > > > > > memory,
> > > > > > > > gets initialized by the CPU and updated/finalized by
> > > > > > > > the
> > > > > > > > firmware. In the situation of a firmware reset, any
> > > > > > > > MRPC
> > > > > > > > initiated afterwards will not get the status updated by
> > > > > > > > the
> > > > > > > > firmware per the reason you pointed out above (or
> > > > > > > > similar,
> > > > > > > > to my understanding, firmware can no longer DMA data to
> > > > > > > > host
> > > > > > > > memory in such cases), therefore the MRPC execution
> > > > > > > > will
> > > > > > > > never end.
> > > > > > > 
> > > > > > > I'm glad this makes sense to you, because it still
> > > > > > > doesn't to
> > > > > > > me.
> > > > > > > 
> > > > > > > check_access() does an MMIO read to something in
> > > > > > > BAR0.  If
> > > > > > > that read returns ~0, it means either the PCIe Memory
> > > > > > > Read
> > > > > > > was
> > > > > > > successful and the Switchtec device supplied ~0 data
> > > > > > > (maybe
> > > > > > > because firmware has not initialized that part of the
> > > > > > > BAR) or
> > > > > > > the PCIe Memory Read failed and the root complex
> > > > > > > fabricated
> > > > > > > the ~0 data.
> > > > > > > 
> > > > > > > I'd like to know which one is happening so we can clarify
> > > > > > > the
> > > > > > > commit log text about "MRPC command executions hang
> > > > > > > indefinitely" and "host wil fail all GAS reads."  It's
> > > > > > > not
> > > > > > > clear whether these are PCIe protocol issues or
> > > > > > > driver/firmware interaction issues.
> > > > > > 
> > > > > > I think it's the latter case, the ~0 data was fabricated by
> > > > > > the
> > > > > > root complex, as the MMIO read in check_access() always
> > > > > > returns
> > > > > > ~0 until a reboot or a rescan happens.
> > > > > 
> > > > > If the root complex fabricates ~0, that means a PCIe
> > > > > transaction
> > > > > failed, i.e., the device didn't respond.  Rescan only does
> > > > > config
> > > > > reads and writes.  Why should that cause the PCIe
> > > > > transactions to
> > > > > magically start working?
> > > > 
> > > > I took a closer look. What I observed was like this. A firmware
> > > > reset cleared some CSR settings including the MSE and MBE bits
> > > > and
> > > > the Base Address Registers. With a rescan (removing the switch
> > > > to
> > > > which the management EP was binded from root port and rescan),
> > > > the
> > > > management EP was re-enumerated and driver was re-probed, so
> > > > that
> > > > the settings cleared by the firmware reset was properly setup
> > > > again,
> > > > therefore PCIe transactions start working.
> > > 
> > > I think what you just said is that
> > > 
> > >   - the driver asked the firmware to reset the device
> > > 
> > >   - the firmware did reset the device, which cleared Memory Space
> > >     Enable
> > > 
> > >   - nothing restored the device config after the reset, so Memory
> > >     Space Enable remains cleared
> > > 
> > >   - the driver does MMIO reads to figure out when the reset has
> > >     completed
> > > 
> > >   - the device doesn't respond to the PCIe Memory Reads because
> > > Memory
> > >     Space Enable is cleared
> > > 
> > >   - the root complex sees a timeout or error completion and
> > > fabricates
> > >     ~0 data for the CPU read
> > > 
> > >   - the driver sees ~0 data from the MMIO read and thinks the
> > > device
> > >     or firmware is hung
> > > 
> > > If that's all true, I think the patch is sort of a band-aid that
> > > doesn't fix the problem at all but only makes the driver's
> > > response
> > > to
> > > it marginally better.  But the device is still unusable until a
> > > rescan
> > > or reboot.
> > > 
> > > So I think we should drop this patch and do something to restore
> > > the
> > > device state after the reset.
> > 
> > Do you mean we should do something at the driver level to
> > automatically
> > try to restore the device state after the reset? I was thinking
> > it's up
> > to the user to make the call to restore the device state or take
> > other
> > actions, so that returning an error code from MRPC to indicate what
> > happened would be good enough for the driver.
> 
> It sounds like this is a completely predictable situation.  Why would
> you want manual user intervention?
> 
> I'm pretty sure there are drivers that save state *before* the reset
> and restore it afterwards.

Sometimes it's not predicatable. We have various interfaces to talk to
the firmware, like in-band, UART, TWI, etc. If a firmware reset is
issued to the firmware via the TWI sideband interface from BMC, neither
the host utility which might be periodically issueing MRPC commands to
monitor the switch nor the driver will be aware of this.

If the reset command is issued via the driver, as it's just another
MRPC command initiated by a user space process, the driver doesn't
actually know what will be happening unless it decodes each MRPC it
forwards to the firmware and checks for the reset command, but I doubt
it's something the driver should do.

> 
> > Can you possibly shed light on what might be a reasonable way to
> > restore the device state in the driver if applicable? I was just
> > doing
> > it by leveraging the remove and rescan interfaces in the sysfs.
> > 
> > That's all true. I lean towards keeping the patch as I think making
> > the
> > response better under the following situations might not be bad.
> > 
> >   - The firmware reset case, as we discussed. I'd think it's still
> > useful for users to get a fast error return which indicates
> > something
> > bad happened and some actions need to be taken either to abort or
> > try
> > to recover. In this case, we are assuming that a firmware reset
> > will
> > boot the firmware successfully.
> 
> So wait, you mean you just intentionally ask the firmware to reset,
> knowing that the device will be unusable until the user reboots or
> does a manual rescan?  And the way to improve this is for the driver
> to report an error to the user instead of hanging?  I *guess* that
> might be some sort of improvement, but seems like a pretty small one.

Yes, however, I believe it's something our users really like to have...
With this, they can do their user space programming/scripting more
easily in a synchronous fashion.

> 
> >   - The firwmare crashes and doesn't respond, which normally is the
> > reason for users to issue a firmware reset command to try to
> > recover it
> > via either the driver or a sideband interface. The firmware may not
> > be
> > able to recover by a reset in some extream situations like hardware
> > errors, so that an error return is probably all the users can get
> > before another level of recovery happens.
> > 
> > So I'd think this patch is still making the driver better in some
> > way.
> > 
> > Kelvin
> >
Bjorn Helgaas Oct. 7, 2021, 9:23 p.m. UTC | #17
On Wed, Oct 06, 2021 at 09:27:49PM +0000, Kelvin.Cao@microchip.com wrote:
> On Wed, 2021-10-06 at 15:20 -0500, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2021 at 07:00:55PM +0000, Kelvin.Cao@microchip.com
> > wrote:

> > So wait, you mean you just intentionally ask the firmware to
> > reset, knowing that the device will be unusable until the user
> > reboots or does a manual rescan?  And the way to improve this is
> > for the driver to report an error to the user instead of hanging?
> > I *guess* that might be some sort of improvement, but seems like a
> > pretty small one.
> 
> Yes, however, I believe it's something our users really like to
> have...  With this, they can do their user space
> programming/scripting more easily in a synchronous fashion.
> 
> > >   - The firwmare crashes and doesn't respond, which normally is
> > >   the reason for users to issue a firmware reset command to try
> > >   to recover it via either the driver or a sideband interface.
> > >   The firmware may not be able to recover by a reset in some
> > >   extream situations like hardware errors, so that an error
> > >   return is probably all the users can get before another level
> > >   of recovery happens.
> > > 
> > > So I'd think this patch is still making the driver better in
> > > some way.

OK.  I still think the fact that all these different mechanisms can
reset the device behind your back and make the switch and anything on
the other side of it just stop working is ..., well, let's just say
it's quite surprising to me.

Well, at least this isn't quite so much a mystery anymore and maybe we
can improve the commit log.  E.g., maybe something like this:

  A firmware hard reset may be initiated by various mechanisms
  including a UART interface, TWI sideband interface from BMC, MRPC
  command from userspace, etc.  The switchtec management driver is
  unaware of these resets.

  The reset clears PCI state including the BARs and Memory Space
  Enable bits, so the device no longer responds to the MMIO accesses
  the driver uses to operate it.

  MMIO reads to the device will fail with a PCIe error.  When the root
  complex handles that error, it typically fabricates ~0 data to
  complete the CPU read.

  Check for this sort of error by reading the device ID from MMIO
  space.  This ID can never be ~0, so if we see that value, it
  probably means the PCIe Memory Read failed and we should return an
  error indication to the application using the switchtec driver.
Kelvin Cao Oct. 8, 2021, 12:06 a.m. UTC | #18
On Thu, 2021-10-07 at 16:23 -0500, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2021 at 09:27:49PM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Wed, 2021-10-06 at 15:20 -0500, Bjorn Helgaas wrote:
> > > On Wed, Oct 06, 2021 at 07:00:55PM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > So wait, you mean you just intentionally ask the firmware to
> > > reset, knowing that the device will be unusable until the user
> > > reboots or does a manual rescan?  And the way to improve this is
> > > for the driver to report an error to the user instead of hanging?
> > > I *guess* that might be some sort of improvement, but seems like
> > > a
> > > pretty small one.
> > 
> > Yes, however, I believe it's something our users really like to
> > have...  With this, they can do their user space
> > programming/scripting more easily in a synchronous fashion.
> > 
> > > >   - The firwmare crashes and doesn't respond, which normally is
> > > >   the reason for users to issue a firmware reset command to try
> > > >   to recover it via either the driver or a sideband interface.
> > > >   The firmware may not be able to recover by a reset in some
> > > >   extream situations like hardware errors, so that an error
> > > >   return is probably all the users can get before another level
> > > >   of recovery happens.
> > > > 
> > > > So I'd think this patch is still making the driver better in
> > > > some way.
> 
> OK.  I still think the fact that all these different mechanisms can
> reset the device behind your back and make the switch and anything on
> the other side of it just stop working is ..., well, let's just say
> it's quite surprising to me.

Actually there're mechanisms like permission control to limit what
people can do in the firmware, so I guess it's not as bad as it sounds
like. 
> 
> Well, at least this isn't quite so much a mystery anymore and maybe
> we
> can improve the commit log.  E.g., maybe something like this:
> 
>   A firmware hard reset may be initiated by various mechanisms
>   including a UART interface, TWI sideband interface from BMC, MRPC
>   command from userspace, etc.  The switchtec management driver is
>   unaware of these resets.
> 
>   The reset clears PCI state including the BARs and Memory Space
>   Enable bits, so the device no longer responds to the MMIO accesses
>   the driver uses to operate it.
> 
>   MMIO reads to the device will fail with a PCIe error.  When the
> root
>   complex handles that error, it typically fabricates ~0 data to
>   complete the CPU read.
> 
>   Check for this sort of error by reading the device ID from MMIO
>   space.  This ID can never be ~0, so if we see that value, it
>   probably means the PCIe Memory Read failed and we should return an
>   error indication to the application using the switchtec driver.

It looks good to me, the commit log removes the ambiguity. Let me know
if you prefer a v2 patchset with the updated commit log and naming
issue fix.

Thank you Bjorn for your patience and time! 

Kelvin
Bjorn Helgaas Oct. 8, 2021, 11:03 a.m. UTC | #19
On Fri, Oct 08, 2021 at 12:06:18AM +0000, Kelvin.Cao@microchip.com wrote:
> On Thu, 2021-10-07 at 16:23 -0500, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2021 at 09:27:49PM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Wed, 2021-10-06 at 15:20 -0500, Bjorn Helgaas wrote:
> > > > On Wed, Oct 06, 2021 at 07:00:55PM +0000, 
> > > > Kelvin.Cao@microchip.com
> > > > wrote:
> > > > So wait, you mean you just intentionally ask the firmware to
> > > > reset, knowing that the device will be unusable until the user
> > > > reboots or does a manual rescan?  And the way to improve this is
> > > > for the driver to report an error to the user instead of hanging?
> > > > I *guess* that might be some sort of improvement, but seems like
> > > > a
> > > > pretty small one.
> > > 
> > > Yes, however, I believe it's something our users really like to
> > > have...  With this, they can do their user space
> > > programming/scripting more easily in a synchronous fashion.
> > > 
> > > > >   - The firwmare crashes and doesn't respond, which normally is
> > > > >   the reason for users to issue a firmware reset command to try
> > > > >   to recover it via either the driver or a sideband interface.
> > > > >   The firmware may not be able to recover by a reset in some
> > > > >   extream situations like hardware errors, so that an error
> > > > >   return is probably all the users can get before another level
> > > > >   of recovery happens.
> > > > > 
> > > > > So I'd think this patch is still making the driver better in
> > > > > some way.
> > 
> > OK.  I still think the fact that all these different mechanisms can
> > reset the device behind your back and make the switch and anything on
> > the other side of it just stop working is ..., well, let's just say
> > it's quite surprising to me.
> 
> Actually there're mechanisms like permission control to limit what
> people can do in the firmware, so I guess it's not as bad as it sounds
> like. 
> > 
> > Well, at least this isn't quite so much a mystery anymore and maybe
> > we
> > can improve the commit log.  E.g., maybe something like this:
> > 
> >   A firmware hard reset may be initiated by various mechanisms
> >   including a UART interface, TWI sideband interface from BMC, MRPC
> >   command from userspace, etc.  The switchtec management driver is
> >   unaware of these resets.
> > 
> >   The reset clears PCI state including the BARs and Memory Space
> >   Enable bits, so the device no longer responds to the MMIO accesses
> >   the driver uses to operate it.
> > 
> >   MMIO reads to the device will fail with a PCIe error.  When the
> > root
> >   complex handles that error, it typically fabricates ~0 data to
> >   complete the CPU read.
> > 
> >   Check for this sort of error by reading the device ID from MMIO
> >   space.  This ID can never be ~0, so if we see that value, it
> >   probably means the PCIe Memory Read failed and we should return an
> >   error indication to the application using the switchtec driver.
> 
> It looks good to me, the commit log removes the ambiguity. Let me know
> if you prefer a v2 patchset with the updated commit log and naming
> issue fix.

Yes, if you post a v2 of this patch, I'll update my pci/switchtec
branch with it.  Thanks for your patience!

Bjorn
diff mbox series

Patch

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 0b301f8be9ed..092653487021 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -45,6 +45,7 @@  enum mrpc_state {
 	MRPC_QUEUED,
 	MRPC_RUNNING,
 	MRPC_DONE,
+	MRPC_IO_ERROR,
 };
 
 struct switchtec_user {
@@ -66,6 +67,13 @@  struct switchtec_user {
 	int event_cnt;
 };
 
+static int check_access(struct switchtec_dev *stdev)
+{
+	u32 device = ioread32(&stdev->mmio_sys_info->device_id);
+
+	return stdev->pdev->device == device;
+}
+
 static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
 {
 	struct switchtec_user *stuser;
@@ -113,6 +121,7 @@  static void stuser_set_state(struct switchtec_user *stuser,
 		[MRPC_QUEUED] = "QUEUED",
 		[MRPC_RUNNING] = "RUNNING",
 		[MRPC_DONE] = "DONE",
+		[MRPC_IO_ERROR] = "IO_ERROR",
 	};
 
 	stuser->state = state;
@@ -184,6 +193,21 @@  static int mrpc_queue_cmd(struct switchtec_user *stuser)
 	return 0;
 }
 
+static void mrpc_cleanup_cmd(struct switchtec_dev *stdev)
+{
+	/* requires the mrpc_mutex to already be held when called */
+	struct switchtec_user *stuser = list_entry(stdev->mrpc_queue.next,
+						   struct switchtec_user, list);
+
+	stuser->cmd_done = true;
+	wake_up_interruptible(&stuser->cmd_comp);
+	list_del_init(&stuser->list);
+	stuser_put(stuser);
+	stdev->mrpc_busy = 0;
+
+	mrpc_cmd_submit(stdev);
+}
+
 static void mrpc_complete_cmd(struct switchtec_dev *stdev)
 {
 	/* requires the mrpc_mutex to already be held when called */
@@ -223,13 +247,7 @@  static void mrpc_complete_cmd(struct switchtec_dev *stdev)
 		memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data,
 			      stuser->read_len);
 out:
-	stuser->cmd_done = true;
-	wake_up_interruptible(&stuser->cmd_comp);
-	list_del_init(&stuser->list);
-	stuser_put(stuser);
-	stdev->mrpc_busy = 0;
-
-	mrpc_cmd_submit(stdev);
+	mrpc_cleanup_cmd(stdev);
 }
 
 static void mrpc_event_work(struct work_struct *work)
@@ -246,6 +264,23 @@  static void mrpc_event_work(struct work_struct *work)
 	mutex_unlock(&stdev->mrpc_mutex);
 }
 
+static void mrpc_error_complete_cmd(struct switchtec_dev *stdev)
+{
+	/* requires the mrpc_mutex to already be held when called */
+
+	struct switchtec_user *stuser;
+
+	if (list_empty(&stdev->mrpc_queue))
+		return;
+
+	stuser = list_entry(stdev->mrpc_queue.next,
+			    struct switchtec_user, list);
+
+	stuser_set_state(stuser, MRPC_IO_ERROR);
+
+	mrpc_cleanup_cmd(stdev);
+}
+
 static void mrpc_timeout_work(struct work_struct *work)
 {
 	struct switchtec_dev *stdev;
@@ -257,6 +292,11 @@  static void mrpc_timeout_work(struct work_struct *work)
 
 	mutex_lock(&stdev->mrpc_mutex);
 
+	if (!check_access(stdev)) {
+		mrpc_error_complete_cmd(stdev);
+		goto out;
+	}
+
 	if (stdev->dma_mrpc)
 		status = stdev->dma_mrpc->status;
 	else
@@ -544,6 +584,11 @@  static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
 	if (rc)
 		return rc;
 
+	if (stuser->state == MRPC_IO_ERROR) {
+		mutex_unlock(&stdev->mrpc_mutex);
+		return -EIO;
+	}
+
 	if (stuser->state != MRPC_DONE) {
 		mutex_unlock(&stdev->mrpc_mutex);
 		return -EBADE;