diff mbox series

PCI: qcom: Implement shutdown() callback to properly reset the endpoint devices

Message ID 20240313-pci-qcom-shutdown-v1-1-fb1515334bfa@linaro.org
State New
Headers show
Series PCI: qcom: Implement shutdown() callback to properly reset the endpoint devices | expand

Commit Message

Manivannan Sadhasivam March 13, 2024, 12:09 p.m. UTC
PCIe host controller drivers are supposed to properly reset the endpoint
devices during host shutdown/reboot. Currently, the Qcom driver doesn't do
anything during host shutdown/reboot, resulting in both PERST# and refclk
getting disabled at the same time. This prevents the endpoint device
firmware from properly resetting the state machine because, if the refclk is
cut off immediately along with PERST#, access to device-specific registers
within the endpoint will result in a firmware crash.

To address this issue, let's call qcom_pcie_host_deinit() inside the
shutdown callback, which asserts PERST# and then cuts off the refclk after a
delay of 1ms, thus allowing the endpoint device firmware to properly
clean up the state machine.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/pci/controller/dwc/pcie-qcom.c | 8 ++++++++
 1 file changed, 8 insertions(+)


---
base-commit: 51459eb30f88651d3688b9e95fed0f97767ececb
change-id: 20240313-pci-qcom-shutdown-d86298186560

Best regards,

Comments

Bjorn Helgaas March 13, 2024, 2:36 p.m. UTC | #1
On Wed, Mar 13, 2024 at 05:39:22PM +0530, Manivannan Sadhasivam wrote:
> PCIe host controller drivers are supposed to properly reset the endpoint
> devices during host shutdown/reboot. Currently, Qcom driver doesn't do
> anything during host shutdown/reboot, resulting in both PERST# and refclk
> getting disabled at the same time. This prevents the endpoint device
> firmware to properly reset the state machine. Because, if the refclk is
> cutoff immediately along with PERST#, access to device specific registers
> within the endpoint will result in a firmware crash.
> 
> To address this issue, let's call qcom_pcie_host_deinit() inside the
> shutdown callback, that asserts PERST# and then cuts off the refclk with a
> delay of 1ms, thus allowing the endpoint device firmware to properly
> cleanup the state machine.

I guess this 1ms delay is the PERST_DELAY_US hidden inside
qcom_ep_reset_assert()?  I assume the refclk disable is done by
clk_bulk_disable_unprepare()?

  #define PERST_DELAY_US 1000

  qcom_pcie_shutdown
    qcom_pcie_host_deinit
      qcom_ep_reset_assert
        gpiod_set_value_cansleep(pcie->reset, 1);
        usleep_range(PERST_DELAY_US, PERST_DELAY_US + 500);  <--
      phy_power_off(pcie->phy)
      pcie->cfg->ops->deinit()
        qcom_pcie_deinit_...
          clk_bulk_disable_unprepare                         <--

Is there a spec citation for this delay requirement?  If not, how do
we know 1ms is enough for whatever the firmware needs to do?

Do other drivers require similar changes?

> Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> ---
>  drivers/pci/controller/dwc/pcie-qcom.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
> index 2ce2a3bd932b..41434bc4761a 100644
> --- a/drivers/pci/controller/dwc/pcie-qcom.c
> +++ b/drivers/pci/controller/dwc/pcie-qcom.c
> @@ -1618,6 +1618,13 @@ static int qcom_pcie_resume_noirq(struct device *dev)
>  	return 0;
>  }
>  
> +static void qcom_pcie_shutdown(struct platform_device *pdev)
> +{
> +	struct qcom_pcie *pcie = platform_get_drvdata(pdev);
> +
> +	qcom_pcie_host_deinit(&pcie->pci->pp);
> +}
> +
>  static const struct of_device_id qcom_pcie_match[] = {
>  	{ .compatible = "qcom,pcie-apq8064", .data = &cfg_2_1_0 },
>  	{ .compatible = "qcom,pcie-apq8084", .data = &cfg_1_0_0 },
> @@ -1670,5 +1677,6 @@ static struct platform_driver qcom_pcie_driver = {
>  		.pm = &qcom_pcie_pm_ops,
>  		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
>  	},
> +	.shutdown = qcom_pcie_shutdown,
>  };
>  builtin_platform_driver(qcom_pcie_driver);
> 
> ---
> base-commit: 51459eb30f88651d3688b9e95fed0f97767ececb
> change-id: 20240313-pci-qcom-shutdown-d86298186560
> 
> Best regards,
> -- 
> Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
>
Manivannan Sadhasivam March 13, 2024, 3:02 p.m. UTC | #2
On Wed, Mar 13, 2024 at 09:36:14AM -0500, Bjorn Helgaas wrote:
> On Wed, Mar 13, 2024 at 05:39:22PM +0530, Manivannan Sadhasivam wrote:
> > PCIe host controller drivers are supposed to properly reset the endpoint
> > devices during host shutdown/reboot. Currently, Qcom driver doesn't do
> > anything during host shutdown/reboot, resulting in both PERST# and refclk
> > getting disabled at the same time. This prevents the endpoint device
> > firmware to properly reset the state machine. Because, if the refclk is
> > cutoff immediately along with PERST#, access to device specific registers
> > within the endpoint will result in a firmware crash.
> > 
> > To address this issue, let's call qcom_pcie_host_deinit() inside the
> > shutdown callback, that asserts PERST# and then cuts off the refclk with a
> > delay of 1ms, thus allowing the endpoint device firmware to properly
> > cleanup the state machine.
> 
> I guess this 1ms delay is the PERST_DELAY_US hidden inside
> qcom_ep_reset_assert()?  I assume the refclk disable is done by
> clk_bulk_disable_unprepare()?
> 

Yes to both.

>   #define PERST_DELAY_US 1000
> 
>   qcom_pcie_shutdown
>     qcom_pcie_host_deinit
>       qcom_ep_reset_assert
>         gpiod_set_value_cansleep(pcie->reset, 1);
>         usleep_range(PERST_DELAY_US, PERST_DELAY_US + 500);  <--
>       phy_power_off(pcie->phy)
>       pcie->cfg->ops->deinit()
>         qcom_pcie_deinit_...
>           clk_bulk_disable_unprepare                         <--
> 
> Is there a spec citation for this delay requirement?  If not, how do
> we know 1ms is enough for whatever the firmware needs to do?
> 

Both the PCIe base spec and the Electromechanical spec only mention Tperst, which
is the minimum time PERST# should remain asserted. But there is no mention of
how long refclk should remain active.

So I used the existing delay post PERST# assert in the driver. I do not know if
that is enough for all the endpoints out in the wild, but it at least satisfies
the requirement of the endpoint I'm working on (which is another Qcom SoC in EP
mode).

We can change the delay if someone reports any issue with the existing one.
At least, that's the best we could do in this situation.

> Do other drivers require similar changes?
> 

Most likely yes, but that also depends on when the drivers are cutting off the
refclk. Not all drivers implement the shutdown callback, and even a few of
the ones that do implement it do not assert PERST#, since it is optional.

- Mani

> > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > ---
> >  drivers/pci/controller/dwc/pcie-qcom.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> > 
> > diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
> > index 2ce2a3bd932b..41434bc4761a 100644
> > --- a/drivers/pci/controller/dwc/pcie-qcom.c
> > +++ b/drivers/pci/controller/dwc/pcie-qcom.c
> > @@ -1618,6 +1618,13 @@ static int qcom_pcie_resume_noirq(struct device *dev)
> >  	return 0;
> >  }
> >  
> > +static void qcom_pcie_shutdown(struct platform_device *pdev)
> > +{
> > +	struct qcom_pcie *pcie = platform_get_drvdata(pdev);
> > +
> > +	qcom_pcie_host_deinit(&pcie->pci->pp);
> > +}
> > +
> >  static const struct of_device_id qcom_pcie_match[] = {
> >  	{ .compatible = "qcom,pcie-apq8064", .data = &cfg_2_1_0 },
> >  	{ .compatible = "qcom,pcie-apq8084", .data = &cfg_1_0_0 },
> > @@ -1670,5 +1677,6 @@ static struct platform_driver qcom_pcie_driver = {
> >  		.pm = &qcom_pcie_pm_ops,
> >  		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
> >  	},
> > +	.shutdown = qcom_pcie_shutdown,
> >  };
> >  builtin_platform_driver(qcom_pcie_driver);
> > 
> > ---
> > base-commit: 51459eb30f88651d3688b9e95fed0f97767ececb
> > change-id: 20240313-pci-qcom-shutdown-d86298186560
> > 
> > Best regards,
> > -- 
> > Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> >
Bjorn Helgaas March 13, 2024, 7:16 p.m. UTC | #3
On Wed, Mar 13, 2024 at 08:32:42PM +0530, Manivannan Sadhasivam wrote:
> On Wed, Mar 13, 2024 at 09:36:14AM -0500, Bjorn Helgaas wrote:
> > On Wed, Mar 13, 2024 at 05:39:22PM +0530, Manivannan Sadhasivam wrote:
> > > PCIe host controller drivers are supposed to properly reset the endpoint
> > > devices during host shutdown/reboot.

Where does this requirement to reset endpoints during host shutdown
come from?  My working assumption is that .shutdown() needs to stop
DMA and interrupts, based on this old thread:
https://lore.kernel.org/all/61f70fd6-52fd-da07-ce73-303f95132131@codeaurora.org/

> > > Currently, Qcom driver doesn't do
> > > anything during host shutdown/reboot, resulting in both PERST# and refclk
> > > getting disabled at the same time. This prevents the endpoint device
> > > firmware to properly reset the state machine. Because, if the refclk is
> > > cutoff immediately along with PERST#, access to device specific registers
> > > within the endpoint will result in a firmware crash.

Does "PERST# getting disabled" mean PERST# is asserted or deasserted?

> > > To address this issue, let's call qcom_pcie_host_deinit() inside the
> > > shutdown callback, that asserts PERST# and then cuts off the refclk with a
> > > delay of 1ms, thus allowing the endpoint device firmware to properly
> > > cleanup the state machine.

This *adds* the qcom_pcie_shutdown() callback, right?

> > I guess this 1ms delay is the PERST_DELAY_US hidden inside
> > qcom_ep_reset_assert()?  I assume the refclk disable is done by
> > clk_bulk_disable_unprepare()?
> 
> Yes to both.
> 
> >   #define PERST_DELAY_US 1000
> > 
> >   qcom_pcie_shutdown
> >     qcom_pcie_host_deinit
> >       qcom_ep_reset_assert
> >         gpiod_set_value_cansleep(pcie->reset, 1);
> >         usleep_range(PERST_DELAY_US, PERST_DELAY_US + 500);  <--
> >       phy_power_off(pcie->phy)
> >       pcie->cfg->ops->deinit()
> >         qcom_pcie_deinit_...
> >           clk_bulk_disable_unprepare                         <--
> > 
> > Is there a spec citation for this delay requirement?  If not, how do
> > we know 1ms is enough for whatever the firmware needs to do?
> 
> Both PCIe base spec and Electromechanical spec only mentions Tperst,
> which is the minimum time PERST# should remain asserted. But there
> is no mention about the time, refclk should be active.

I see Tperst mentioned in PCIe r6.0, sec 6.6.1, but AFAICS the value
is only defined in PCIe CEM (r5.0, sec 2.9.2), which says 100us, and
maybe other form factor specs.

If PERST_DELAY_US is enforcing Tperst, why is it 1000us instead of
100us?

> So I used the existing delay post PERST# assert in the driver. I do
> not know if that is enough for all the endpoints out in the wild,
> but atleast satisfies the requirement of the endpoint I'm working on
> (which is another Qcom SoC in EP mode).
> 
> We can change the delay if someone reports any issue with the
> existing one.  Atleast, that's the best we could do in this
> situation.

I'm dubious about this.  If endpoints require a delay here to work
properly, the spec should specify a minimum delay.  We can't make a
reliable system based on "here's a guess and we'll update it if people
report issues."  That makes me think this endpoint mode Qcom SoC
dependency on a delay might itself be non spec-compliant.

> > Do other drivers require similar changes?
> 
> Most likely yes, but that also depends on when the drivers are
> cutting off the refclk. Not all drivers are implementing the
> shutdown callback, and even few of the ones implementing, do not
> assert PERST# since it is optional.

> > > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > > ---
> > >  drivers/pci/controller/dwc/pcie-qcom.c | 8 ++++++++
> > >  1 file changed, 8 insertions(+)
> > > 
> > > diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
> > > index 2ce2a3bd932b..41434bc4761a 100644
> > > --- a/drivers/pci/controller/dwc/pcie-qcom.c
> > > +++ b/drivers/pci/controller/dwc/pcie-qcom.c
> > > @@ -1618,6 +1618,13 @@ static int qcom_pcie_resume_noirq(struct device *dev)
> > >  	return 0;
> > >  }
> > >  
> > > +static void qcom_pcie_shutdown(struct platform_device *pdev)
> > > +{
> > > +	struct qcom_pcie *pcie = platform_get_drvdata(pdev);
> > > +
> > > +	qcom_pcie_host_deinit(&pcie->pci->pp);
> > > +}
> > > +
> > >  static const struct of_device_id qcom_pcie_match[] = {
> > >  	{ .compatible = "qcom,pcie-apq8064", .data = &cfg_2_1_0 },
> > >  	{ .compatible = "qcom,pcie-apq8084", .data = &cfg_1_0_0 },
> > > @@ -1670,5 +1677,6 @@ static struct platform_driver qcom_pcie_driver = {
> > >  		.pm = &qcom_pcie_pm_ops,
> > >  		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > >  	},
> > > +	.shutdown = qcom_pcie_shutdown,
> > >  };
> > >  builtin_platform_driver(qcom_pcie_driver);
> > > 
> > > ---
> > > base-commit: 51459eb30f88651d3688b9e95fed0f97767ececb
> > > change-id: 20240313-pci-qcom-shutdown-d86298186560
> > > 
> > > Best regards,
> > > -- 
> > > Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > > 
> 
> -- 
> மணிவண்ணன் சதாசிவம்
Manivannan Sadhasivam March 14, 2024, 5:35 a.m. UTC | #4
On Wed, Mar 13, 2024 at 02:16:56PM -0500, Bjorn Helgaas wrote:
> On Wed, Mar 13, 2024 at 08:32:42PM +0530, Manivannan Sadhasivam wrote:
> > On Wed, Mar 13, 2024 at 09:36:14AM -0500, Bjorn Helgaas wrote:
> > > On Wed, Mar 13, 2024 at 05:39:22PM +0530, Manivannan Sadhasivam wrote:
> > > > PCIe host controller drivers are supposed to properly reset the endpoint
> > > > devices during host shutdown/reboot.
> 
> Where does this requirement to reset endpoints during host shutdown
> come from?  My working assumption is that .shutdown() needs to stop
> DMA and interrupts, based on this old thread:
> https://lore.kernel.org/all/61f70fd6-52fd-da07-ce73-303f95132131@codeaurora.org/
> 

Yes, it indeed needs to stop DMA and interrupts since the endpoint is going to
a dormant state. But not everyone cares about PERST# since it will be asserted
by the hw automatically once the SoC goes to a powerdown state. That's what is
happening even without this patch. Also, in most cases, during host
shutdown/reboot, the power to the endpoint will be cut off and reapplied
(on reboot). So the endpoint will undergo a cold boot and work as usual.

But, in some rare cases like the one I'm dealing with, power to the endpoint
device is not coming from the host. The endpoint here is another SoC that works
on its own. It just receives refclk from the host. So in this case, during host
shutdown/reboot, only PERST# will be asserted and refclk will be cut off, but the
device will be kept powered on. Due to this, when the device tries to
clean up the state machine post PERST# assertion, it will crash because refclk
will be cut off immediately.

And that is what is being addressed with this patch.

I should admit that this issue is not very common and that's the reason no one
has cared about it so far, because on PCs/laptops the endpoint device will most
likely be power cycled during reboot.

> > > > Currently, Qcom driver doesn't do
> > > > anything during host shutdown/reboot, resulting in both PERST# and refclk
> > > > getting disabled at the same time. This prevents the endpoint device
> > > > firmware to properly reset the state machine. Because, if the refclk is
> > > > cutoff immediately along with PERST#, access to device specific registers
> > > > within the endpoint will result in a firmware crash.
> 
> Does "PERST# getting disabled" mean PERST# is asserted or deasserted?
> 

PERST# assertion I meant. Will change the wording.

> > > > To address this issue, let's call qcom_pcie_host_deinit() inside the
> > > > shutdown callback, that asserts PERST# and then cuts off the refclk with a
> > > > delay of 1ms, thus allowing the endpoint device firmware to properly
> > > > cleanup the state machine.
> 
> This *adds* the qcom_pcie_shutdown() callback, right?
> 

Yeah. Will make it explicit.

> > > I guess this 1ms delay is the PERST_DELAY_US hidden inside
> > > qcom_ep_reset_assert()?  I assume the refclk disable is done by
> > > clk_bulk_disable_unprepare()?
> > 
> > Yes to both.
> > 
> > >   #define PERST_DELAY_US 1000
> > > 
> > >   qcom_pcie_shutdown
> > >     qcom_pcie_host_deinit
> > >       qcom_ep_reset_assert
> > >         gpiod_set_value_cansleep(pcie->reset, 1);
> > >         usleep_range(PERST_DELAY_US, PERST_DELAY_US + 500);  <--
> > >       phy_power_off(pcie->phy)
> > >       pcie->cfg->ops->deinit()
> > >         qcom_pcie_deinit_...
> > >           clk_bulk_disable_unprepare                         <--
> > > 
> > > Is there a spec citation for this delay requirement?  If not, how do
> > > we know 1ms is enough for whatever the firmware needs to do?
> > 
> > Both PCIe base spec and Electromechanical spec only mentions Tperst,
> > which is the minimum time PERST# should remain asserted. But there
> > is no mention about the time, refclk should be active.
> 
> I see Tperst mentioned in PCIe r6.0, sec 6.6.1, but AFAICS the value
> is only defined in PCIe CEM (r5.0, sec 2.9.2), which says 100us, and
> maybe other form factor specs.
> 

I'm not 100% sure that Tperst represents the minimum time to keep refclk active.

> If PERST_DELAY_US is enforcing Tperst, why is it 1000us instead of
> 100us?
> 

As I said above, I'm not sure if PERST_DELAY_US corresponds to Tperst. It
predates my work on this driver, but I'll check internally.

> > So I used the existing delay post PERST# assert in the driver. I do
> > not know if that is enough for all the endpoints out in the wild,
> > but atleast satisfies the requirement of the endpoint I'm working on
> > (which is another Qcom SoC in EP mode).
> > 
> > We can change the delay if someone reports any issue with the
> > existing one.  Atleast, that's the best we could do in this
> > situation.
> 
> I'm dubious about this.  If endpoints require a delay here to work
> properly, the spec should specify a minimum delay.  We can't make a
> reliable system based on "here's a guess and we'll update it if people
> report issues."  That makes me think this endpoint mode Qcom SoC
> dependency on a delay might itself be non spec-compliant.
> 

I wouldn't say "non spec compliant", but rather a use case not addressed properly
in the spec. This chip-to-chip use case (connecting two SoCs over a PCIe bus, where
one acts as host and the other as endpoint, each with its own power supply) is not
very popular IMO.

- Mani

> > > Do other drivers require similar changes?
> > 
> > Most likely yes, but that also depends on when the drivers are
> > cutting off the refclk. Not all drivers are implementing the
> > shutdown callback, and even few of the ones implementing, do not
> > assert PERST# since it is optional.
> 
> > > > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > > > ---
> > > >  drivers/pci/controller/dwc/pcie-qcom.c | 8 ++++++++
> > > >  1 file changed, 8 insertions(+)
> > > > 
> > > > diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
> > > > index 2ce2a3bd932b..41434bc4761a 100644
> > > > --- a/drivers/pci/controller/dwc/pcie-qcom.c
> > > > +++ b/drivers/pci/controller/dwc/pcie-qcom.c
> > > > @@ -1618,6 +1618,13 @@ static int qcom_pcie_resume_noirq(struct device *dev)
> > > >  	return 0;
> > > >  }
> > > >  
> > > > +static void qcom_pcie_shutdown(struct platform_device *pdev)
> > > > +{
> > > > +	struct qcom_pcie *pcie = platform_get_drvdata(pdev);
> > > > +
> > > > +	qcom_pcie_host_deinit(&pcie->pci->pp);
> > > > +}
> > > > +
> > > >  static const struct of_device_id qcom_pcie_match[] = {
> > > >  	{ .compatible = "qcom,pcie-apq8064", .data = &cfg_2_1_0 },
> > > >  	{ .compatible = "qcom,pcie-apq8084", .data = &cfg_1_0_0 },
> > > > @@ -1670,5 +1677,6 @@ static struct platform_driver qcom_pcie_driver = {
> > > >  		.pm = &qcom_pcie_pm_ops,
> > > >  		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > > >  	},
> > > > +	.shutdown = qcom_pcie_shutdown,
> > > >  };
> > > >  builtin_platform_driver(qcom_pcie_driver);
> > > > 
> > > > ---
> > > > base-commit: 51459eb30f88651d3688b9e95fed0f97767ececb
> > > > change-id: 20240313-pci-qcom-shutdown-d86298186560
> > > > 
> > > > Best regards,
> > > > -- 
> > > > Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > > > 
> > 
> > -- 
> > மணிவண்ணன் சதாசிவம்
Manivannan Sadhasivam March 14, 2024, 6:25 a.m. UTC | #5
On Thu, Mar 14, 2024 at 11:05:32AM +0530, Manivannan Sadhasivam wrote:
> On Wed, Mar 13, 2024 at 02:16:56PM -0500, Bjorn Helgaas wrote:
> > On Wed, Mar 13, 2024 at 08:32:42PM +0530, Manivannan Sadhasivam wrote:
> > > On Wed, Mar 13, 2024 at 09:36:14AM -0500, Bjorn Helgaas wrote:
> > > > On Wed, Mar 13, 2024 at 05:39:22PM +0530, Manivannan Sadhasivam wrote:
> > > > > PCIe host controller drivers are supposed to properly reset the endpoint
> > > > > devices during host shutdown/reboot.
> > 
> > Where does this requirement to reset endpoints during host shutdown
> > come from?  My working assumption is that .shutdown() needs to stop
> > DMA and interrupts, based on this old thread:
> > https://lore.kernel.org/all/61f70fd6-52fd-da07-ce73-303f95132131@codeaurora.org/
> > 
> 
> Yes, it indeed need to stop DMA and interrupts since the endpoint is going to
> a dormant state. But not everyone care about PERST# since it will be asserted
> by the hw automaticallyt once the SoC goes to a powerdown state. That's what
> happening even without this patch. Also, in most of the cases, during host
> shutdown/reboot, the power to the endpoint will be cutoff and reapplied
> (on reboot). So the endpoint will undergo cold boot and it would work as usual.
> 
> But, in some rare cases like the one I'm dealing with, power to the endpoint
> device is not coming from the host. The endpoint here is another SoC that works
> on its own. It just receives refclk from the host. So in this case, during host
> shutdown/reboot, only PERST# will be asserted and refclk will be cutoff, but the
> device will be kept powered on. Due to this, the device when it tries to
> cleanup the state machine post PERST# assertion, it will crash because refclk
> will be cutoff immediately.
> 
> And that is what being addressed with this patch.
> 
> I should admit that this issue is not very common and that's the reason no one
> cared about it so far. Because, on PCs/Laptops, most likely the endpoint device
> will be powercycled during reboot.
> 
> > > > > Currently, Qcom driver doesn't do
> > > > > anything during host shutdown/reboot, resulting in both PERST# and refclk
> > > > > getting disabled at the same time. This prevents the endpoint device
> > > > > firmware to properly reset the state machine. Because, if the refclk is
> > > > > cutoff immediately along with PERST#, access to device specific registers
> > > > > within the endpoint will result in a firmware crash.
> > 
> > Does "PERST# getting disabled" mean PERST# is asserted or deasserted?
> > 
> 
> PERST# assertion I meant. Will change the wording.
> 
> > > > > To address this issue, let's call qcom_pcie_host_deinit() inside the
> > > > > shutdown callback, that asserts PERST# and then cuts off the refclk with a
> > > > > delay of 1ms, thus allowing the endpoint device firmware to properly
> > > > > cleanup the state machine.
> > 
> > This *adds* the qcom_pcie_shutdown() callback, right?
> > 
> 
> Yeah. Will make it explicit.
> 
> > > > I guess this 1ms delay is the PERST_DELAY_US hidden inside
> > > > qcom_ep_reset_assert()?  I assume the refclk disable is done by
> > > > clk_bulk_disable_unprepare()?
> > > 
> > > Yes to both.
> > > 
> > > >   #define PERST_DELAY_US 1000
> > > > 
> > > >   qcom_pcie_shutdown
> > > >     qcom_pcie_host_deinit
> > > >       qcom_ep_reset_assert
> > > >         gpiod_set_value_cansleep(pcie->reset, 1);
> > > >         usleep_range(PERST_DELAY_US, PERST_DELAY_US + 500);  <--
> > > >       phy_power_off(pcie->phy)
> > > >       pcie->cfg->ops->deinit()
> > > >         qcom_pcie_deinit_...
> > > >           clk_bulk_disable_unprepare                         <--
> > > > 
> > > > Is there a spec citation for this delay requirement?  If not, how do
> > > > we know 1ms is enough for whatever the firmware needs to do?
> > > 
> > > Both PCIe base spec and Electromechanical spec only mentions Tperst,
> > > which is the minimum time PERST# should remain asserted. But there
> > > is no mention about the time, refclk should be active.
> > 
> > I see Tperst mentioned in PCIe r6.0, sec 6.6.1, but AFAICS the value
> > is only defined in PCIe CEM (r5.0, sec 2.9.2), which says 100us, and
> > maybe other form factor specs.
> > 
> 
> I'm not 100% sure that Tperst represents the minimum time to keep refclk active.
> 
> > If PERST_DELAY_US is enforcing Tperst, why is it 1000us instead of
> > 100us?
> > 
> 
> As I said above, I'm not sure if PERST_DELAY_US corresponds to Tperst. It
> predates my work on this driver, but I'll check internally.
> 

Ok, got the answer. This delay indeed corresponds to Tperst, but on Qcom
platforms the spec minimum was found to be insufficient, so it was increased to
1000us. This shouldn't be an issue since the PCIe spec mandates only a minimum delay.

But regarding the refclk active time, the spec doesn't mandate any minimum time. And
as I mentioned before, that could be because the spec doesn't consider this use case.

- Mani

> > > So I used the existing delay post PERST# assert in the driver. I do
> > > not know if that is enough for all the endpoints out in the wild,
> > > but atleast satisfies the requirement of the endpoint I'm working on
> > > (which is another Qcom SoC in EP mode).
> > > 
> > > We can change the delay if someone reports any issue with the
> > > existing one.  Atleast, that's the best we could do in this
> > > situation.
> > 
> > I'm dubious about this.  If endpoints require a delay here to work
> > properly, the spec should specify a minimum delay.  We can't make a
> > reliable system based on "here's a guess and we'll update it if people
> > report issues."  That makes me think this endpoint mode Qcom SoC
> > dependency on a delay might itself be non spec-compliant.
> > 
> 
> I wouldn't say "non spec compliant", but a usecase not addressed properly in the
> spec. This Chip to Chip usecase (connecting two SoCs over PCIe bus where one
> acting as host and another as endpoint with their own power supply), is not very
> popular IMO.
> 
> - Mani
> 
> > > > Do other drivers require similar changes?
> > > 
> > > Most likely yes, but that also depends on when the drivers are
> > > cutting off the refclk. Not all drivers are implementing the
> > > shutdown callback, and even few of the ones implementing, do not
> > > assert PERST# since it is optional.
> > 
> > > > > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > > > > ---
> > > > >  drivers/pci/controller/dwc/pcie-qcom.c | 8 ++++++++
> > > > >  1 file changed, 8 insertions(+)
> > > > > 
> > > > > diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
> > > > > index 2ce2a3bd932b..41434bc4761a 100644
> > > > > --- a/drivers/pci/controller/dwc/pcie-qcom.c
> > > > > +++ b/drivers/pci/controller/dwc/pcie-qcom.c
> > > > > @@ -1618,6 +1618,13 @@ static int qcom_pcie_resume_noirq(struct device *dev)
> > > > >  	return 0;
> > > > >  }
> > > > >  
> > > > > +static void qcom_pcie_shutdown(struct platform_device *pdev)
> > > > > +{
> > > > > +	struct qcom_pcie *pcie = platform_get_drvdata(pdev);
> > > > > +
> > > > > +	qcom_pcie_host_deinit(&pcie->pci->pp);
> > > > > +}
> > > > > +
> > > > >  static const struct of_device_id qcom_pcie_match[] = {
> > > > >  	{ .compatible = "qcom,pcie-apq8064", .data = &cfg_2_1_0 },
> > > > >  	{ .compatible = "qcom,pcie-apq8084", .data = &cfg_1_0_0 },
> > > > > @@ -1670,5 +1677,6 @@ static struct platform_driver qcom_pcie_driver = {
> > > > >  		.pm = &qcom_pcie_pm_ops,
> > > > >  		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > > > >  	},
> > > > > +	.shutdown = qcom_pcie_shutdown,
> > > > >  };
> > > > >  builtin_platform_driver(qcom_pcie_driver);
> > > > > 
> > > > > ---
> > > > > base-commit: 51459eb30f88651d3688b9e95fed0f97767ececb
> > > > > change-id: 20240313-pci-qcom-shutdown-d86298186560
> > > > > 
> > > > > Best regards,
> > > > > -- 
> > > > > Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > > > > 
> > > 
> > > -- 
> > > மணிவண்ணன் சதாசிவம்
> 
> -- 
> மணிவண்ணன் சதாசிவம்
diff mbox series

Patch

diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index 2ce2a3bd932b..41434bc4761a 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -1618,6 +1618,13 @@  static int qcom_pcie_resume_noirq(struct device *dev)
 	return 0;
 }
 
+static void qcom_pcie_shutdown(struct platform_device *pdev)
+{
+	struct qcom_pcie *pcie = platform_get_drvdata(pdev);
+
+	qcom_pcie_host_deinit(&pcie->pci->pp);
+}
+
 static const struct of_device_id qcom_pcie_match[] = {
 	{ .compatible = "qcom,pcie-apq8064", .data = &cfg_2_1_0 },
 	{ .compatible = "qcom,pcie-apq8084", .data = &cfg_1_0_0 },
@@ -1670,5 +1677,6 @@  static struct platform_driver qcom_pcie_driver = {
 		.pm = &qcom_pcie_pm_ops,
 		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
 	},
+	.shutdown = qcom_pcie_shutdown,
 };
 builtin_platform_driver(qcom_pcie_driver);