
[v1,8/9] PCI: PLDA: starfive: Add JH7110 PCIe controller

Message ID 20230719102057.22329-9-minda.chen@starfivetech.com
State New
Series Refactoring Microchip PolarFire PCIe driver

Commit Message

Minda Chen July 19, 2023, 10:20 a.m. UTC
Add StarFive JH7110 SoC PCIe controller platform
driver codes.

Signed-off-by: Minda Chen <minda.chen@starfivetech.com>
Reviewed-by: Mason Huo <mason.huo@starfivetech.com>
---
 MAINTAINERS                                 |   7 +
 drivers/pci/controller/plda/Kconfig         |   8 +
 drivers/pci/controller/plda/Makefile        |   1 +
 drivers/pci/controller/plda/pcie-plda.h     |  58 ++-
 drivers/pci/controller/plda/pcie-starfive.c | 415 ++++++++++++++++++++
 5 files changed, 487 insertions(+), 2 deletions(-)
 create mode 100644 drivers/pci/controller/plda/pcie-starfive.c

Comments

Bjorn Helgaas July 19, 2023, 4:48 p.m. UTC | #1
On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
> Add StarFive JH7110 SoC PCIe controller platform
> driver codes.

Rewrap all the commit logs to fill 75 columns or so.

>  #define PCIE_PCI_IDS_DW1		0x9c
> -
> +#define  IDS_CLASS_CODE_SHIFT		16
> +#define PCI_MISC			0xB4

Surrounding code uses lower-case hex.  Make it all match.

> +#define STG_SYSCON_AXI4_SLVL_ARFUNC_MASK	GENMASK(22, 8)
> +#define STG_SYSCON_AXI4_SLVL_ARFUNC_SHIFT	8

When practical, use FIELD_GET() and FIELD_PREP() to avoid the need for
*_SHIFT macros.
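
For illustration, a minimal sketch of this with FIELD_PREP() (the mask macro is
from the patch; the regmap call site and "value" are assumptions):

  #include <linux/bitfield.h>

  #define STG_SYSCON_AXI4_SLVL_ARFUNC_MASK	GENMASK(22, 8)

  /* Program the ARFUNC field without a separate *_SHIFT macro */
  regmap_update_bits(pcie->reg_syscon, pcie->stg_arfun,
                     STG_SYSCON_AXI4_SLVL_ARFUNC_MASK,
                     FIELD_PREP(STG_SYSCON_AXI4_SLVL_ARFUNC_MASK, value));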

> +struct starfive_jh7110_pcie {
> +	struct plda_pcie	plda;
> +	struct reset_control *resets;
> +	struct clk_bulk_data *clks;
> +	struct regmap *reg_syscon;
> +	struct gpio_desc *power_gpio;
> +	struct gpio_desc *reset_gpio;
> +
> +	u32 stg_arfun;
> +	u32 stg_awfun;
> +	u32 stg_rp_nep;
> +	u32 stg_lnksta;
> +
> +	int num_clks;

If you indent one member with tabs, e.g., "struct plda_pcie        plda",
they should all be indented to match.

> + * The BAR0/1 of bridge should be hidden during enumeration to
> + * avoid the sizing and resource allocation by PCIe core.
> + */
> +static bool starfive_pcie_hide_rc_bar(struct pci_bus *bus, unsigned int  devfn,
> +				      int offset)
> +{
> +	if (pci_is_root_bus(bus) && !devfn &&
> +	    (offset == PCI_BASE_ADDRESS_0 || offset == PCI_BASE_ADDRESS_1))
> +		return true;
> +
> +	return false;
> +}
> +
> +int starfive_pcie_config_write(struct pci_bus *bus, unsigned int devfn,
> +			       int where, int size, u32 value)
> +{
> +	if (starfive_pcie_hide_rc_bar(bus, devfn, where))
> +		return PCIBIOS_BAD_REGISTER_NUMBER;

I think you are trying to present BARs 0 & 1 as unimplemented.  Such BARs
are hardwired to zero, so you should make them behave that way (both
read and write).  Many callers of config accessors don't check the
return value, so I don't think it's reliable to just return
PCIBIOS_BAD_REGISTER_NUMBER.
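
A minimal sketch of that behavior, assuming the driver otherwise uses the
generic config accessors (the read-path helper is hypothetical; only
starfive_pcie_hide_rc_bar() and the write accessor come from the patch):

  static int starfive_pcie_config_read(struct pci_bus *bus, unsigned int devfn,
                                       int where, int size, u32 *value)
  {
          /* Emulate an unimplemented BAR: reads return 0 */
          if (starfive_pcie_hide_rc_bar(bus, devfn, where)) {
                  *value = 0;
                  return PCIBIOS_SUCCESSFUL;
          }

          return pci_generic_config_read(bus, devfn, where, size, value);
  }

  static int starfive_pcie_config_write(struct pci_bus *bus, unsigned int devfn,
                                        int where, int size, u32 value)
  {
          /* Emulate an unimplemented BAR: writes are silently discarded */
          if (starfive_pcie_hide_rc_bar(bus, devfn, where))
                  return PCIBIOS_SUCCESSFUL;

          return pci_generic_config_write(bus, devfn, where, size, value);
  }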

> +static int starfive_pcie_is_link_up(struct starfive_jh7110_pcie *pcie)
> +{
> +	struct device *dev = pcie->plda.dev;
> +	int ret;
> +	u32 stg_reg_val;
> +
> +	/* 100ms timeout value should be enough for Gen1/2 training */
> +	ret = regmap_read_poll_timeout(pcie->reg_syscon,
> +				       pcie->stg_lnksta,
> +				       stg_reg_val,
> +				       stg_reg_val & DATA_LINK_ACTIVE,
> +				       10 * 1000, 100 * 1000);
> +
> +	/* If the link is down (no device in slot), then exit. */
> +	if (ret == -ETIMEDOUT) {
> +		dev_info(dev, "Port link down, exit.\n");
> +		return 0;
> +	} else if (ret == 0) {
> +		dev_info(dev, "Port link up.\n");
> +		return 1;
> +	}

Please copy the naming and style of the "*_pcie_link_up()" functions
in other drivers.  These are boolean functions with no side effects,
including no timeouts.

Some drivers have "*wait_for_link()" functions if polling is needed.
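
For reference, a sketch in that style, using the syscon field from the patch
(the LINK_WAIT_* constants are assumptions):

  static bool starfive_pcie_link_up(struct starfive_jh7110_pcie *pcie)
  {
          u32 val;

          if (regmap_read(pcie->reg_syscon, pcie->stg_lnksta, &val))
                  return false;

          return !!(val & DATA_LINK_ACTIVE);
  }

  static int starfive_pcie_host_wait_for_link(struct starfive_jh7110_pcie *pcie)
  {
          int retries;

          /* The link-up check has no side effects; only this wrapper polls */
          for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) {
                  if (starfive_pcie_link_up(pcie))
                          return 0;
                  usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
          }

          return -ETIMEDOUT;
  }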

> +		return dev_err_probe(dev, ret,
> +			"failed to initialize pcie phy\n");

Driver messages should match (all capitalized or none capitalized).

> +	/* Enable root port */

Superfluous comment, since the function name says the same.

> +	plda_pcie_enable_root_port(plda);

> +	/* Ensure that PERST has been asserted for at least 100 ms */
> +	msleep(300);
> +	gpiod_set_value_cansleep(pcie->reset_gpio, 0);

At least 100 ms, but you sleep *300* ms?  This is probably related to
https://lore.kernel.org/r/20230718155515.GA483233@bhelgaas

Please include a comment with the source of the delay value.  I assume
it's T_PVPERL and T_PERST-CLK from the PCIe CEM spec.  This way we can
someday share those #defines across drivers.

> +#ifdef CONFIG_PM_SLEEP
> +static int __maybe_unused starfive_pcie_suspend_noirq(struct device *dev)

I think you can dispense with some of these #ifdefs and the
__maybe_unused as in
https://lore.kernel.org/all/20220720224829.GA1667002@bhelgaas/
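
A sketch of what that could look like with pm_sleep_ptr() and
NOIRQ_SYSTEM_SLEEP_PM_OPS (the resume handler and the platform_driver field
values are assumptions):

  static const struct dev_pm_ops starfive_pcie_pm_ops = {
          NOIRQ_SYSTEM_SLEEP_PM_OPS(starfive_pcie_suspend_noirq,
                                    starfive_pcie_resume_noirq)
  };

  static struct platform_driver starfive_pcie_driver = {
          .driver = {
                  .name = "pcie-starfive",
                  /* Compiles away when CONFIG_PM_SLEEP is not set */
                  .pm = pm_sleep_ptr(&starfive_pcie_pm_ops),
          },
          .probe = starfive_pcie_probe,
  };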

> +{
> +	struct starfive_jh7110_pcie *pcie = dev_get_drvdata(dev);
> +
> +	if (!pcie)
> +		return 0;

How can this happen?  If we're only detecting memory corruption, it's
not worth it.

Bjorn
Kevin Xie July 20, 2023, 10:11 a.m. UTC | #2
On 2023/7/20 0:48, Bjorn Helgaas wrote:
> On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
>> Add StarFive JH7110 SoC PCIe controller platform
>> driver codes.
> 
> Rewrap all the commit logs to fill 75 columns or so.
> 

OK.

>>  #define PCIE_PCI_IDS_DW1		0x9c
>> -
>> +#define  IDS_CLASS_CODE_SHIFT		16
>> +#define PCI_MISC			0xB4
> 
> Surrounding code uses lower-case hex.  Make it all match.
> 

OK, I will make it all match.

>> +#define STG_SYSCON_AXI4_SLVL_ARFUNC_MASK	GENMASK(22, 8)
>> +#define STG_SYSCON_AXI4_SLVL_ARFUNC_SHIFT	8
> 
> When practical, use FIELD_GET() and FIELD_PREP() to avoid the need for
> *_SHIFT macros.
> 

Got it.

>> +struct starfive_jh7110_pcie {
>> +	struct plda_pcie	plda;
>> +	struct reset_control *resets;
>> +	struct clk_bulk_data *clks;
>> +	struct regmap *reg_syscon;
>> +	struct gpio_desc *power_gpio;
>> +	struct gpio_desc *reset_gpio;
>> +
>> +	u32 stg_arfun;
>> +	u32 stg_awfun;
>> +	u32 stg_rp_nep;
>> +	u32 stg_lnksta;
>> +
>> +	int num_clks;
> 
> If you indent one member with tabs, e.g., "struct plda_pcie        plda",
> they should all be indented to match.
> 

OK, I will indent that member with white space.

>> + * The BAR0/1 of bridge should be hidden during enumeration to
>> + * avoid the sizing and resource allocation by PCIe core.
>> + */
>> +static bool starfive_pcie_hide_rc_bar(struct pci_bus *bus, unsigned int  devfn,
>> +				      int offset)
>> +{
>> +	if (pci_is_root_bus(bus) && !devfn &&
>> +	    (offset == PCI_BASE_ADDRESS_0 || offset == PCI_BASE_ADDRESS_1))
>> +		return true;
>> +
>> +	return false;
>> +}
>> +
>> +int starfive_pcie_config_write(struct pci_bus *bus, unsigned int devfn,
>> +			       int where, int size, u32 value)
>> +{
>> +	if (starfive_pcie_hide_rc_bar(bus, devfn, where))
>> +		return PCIBIOS_BAD_REGISTER_NUMBER;
> 
> I think you are trying present BARs 0 & 1 as unimplemented.  Such BARs
> are hardwired to zero, so you should make them behave that way (both
> read and write).  Many callers of config accessors don't check the
> return value, so I don't think it's reliable to just return
> PCIBIOS_BAD_REGISTER_NUMBER.
> 

This is a hardware defect: we did not hardwire those BARs to zero,
and they are still configurable by software.
We have to add this filter function as a workaround.

>> +static int starfive_pcie_is_link_up(struct starfive_jh7110_pcie *pcie)
>> +{
>> +	struct device *dev = pcie->plda.dev;
>> +	int ret;
>> +	u32 stg_reg_val;
>> +
>> +	/* 100ms timeout value should be enough for Gen1/2 training */
>> +	ret = regmap_read_poll_timeout(pcie->reg_syscon,
>> +				       pcie->stg_lnksta,
>> +				       stg_reg_val,
>> +				       stg_reg_val & DATA_LINK_ACTIVE,
>> +				       10 * 1000, 100 * 1000);
>> +
>> +	/* If the link is down (no device in slot), then exit. */
>> +	if (ret == -ETIMEDOUT) {
>> +		dev_info(dev, "Port link down, exit.\n");
>> +		return 0;
>> +	} else if (ret == 0) {
>> +		dev_info(dev, "Port link up.\n");
>> +		return 1;
>> +	}
> 
> Please copy the naming and style of the "*_pcie_link_up()" functions
> in other drivers.  These are boolean functions with no side effects,
> including no timeouts.
> 
> Some drivers have "*wait_for_link()" functions if polling is needed.
> 

OK, I will refer to other drivers in this part.

>> +		return dev_err_probe(dev, ret,
>> +			"failed to initialize pcie phy\n");
> 
> Driver messages should match (all capitalized or none capitalized).
> 

OK, I will make them all match.

>> +	/* Enable root port */
> 
> Superfluous comment, since the function name says the same.
> 

I will delete this comment.

>> +	plda_pcie_enable_root_port(plda);
> 
>> +	/* Ensure that PERST has been asserted for at least 100 ms */
>> +	msleep(300);
>> +	gpiod_set_value_cansleep(pcie->reset_gpio, 0);
> 
> At least 100 ms, but you sleep *300* ms?  This is probably related to
> https://lore.kernel.org/r/20230718155515.GA483233@bhelgaas
> 
> Please include a comment with the source of the delay value.  I assume
> it's T_PVPERL and T_PERST-CLK from the PCIe CEM spec.  This way we can
> someday share those #defines across drivers.
> 

Yes, the delay value here is T_PVPERL from the PCIe CEM spec r2.0 (Table 2-4).
At first we set a 100ms delay according to section 2.2 of the spec:
"After there has been time (TPVPERL) for the power and clock to become stable,
PERST# is deasserted high and the PCI Express functions can start up."

However, in compatibility testing with several NVMe SSDs, we found that the
Lenovo Thinklife ST8000 NVMe cannot get ready in 100ms;
it actually needs almost 200ms.
Thus, we increased the T_PVPERL value to 300ms for better device compatibility.

We will use a macro to define T_PVPERL and add a comment for its source.
If the compatibility delay of 300ms is not reasonable, we can revert it to 100ms.
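
A sketch of what that could look like (the macro name is an assumption until a
shared define lands in the PCI core):

  /*
   * T_PVPERL: minimum time the power rails must be stable before PERST#
   * is de-asserted, PCIe CEM spec r2.0, sec 2.2 / Table 2-4.
   */
  #define PCIE_T_PVPERL_MS        100

          gpiod_set_value_cansleep(pcie->power_gpio, 1);
          msleep(PCIE_T_PVPERL_MS);
          /* De-assert PERST# */
          gpiod_set_value_cansleep(pcie->reset_gpio, 0);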

>> +#ifdef CONFIG_PM_SLEEP
>> +static int __maybe_unused starfive_pcie_suspend_noirq(struct device *dev)
> 
> I think you can dispense with some of these #ifdefs and the
> __maybe_unused as in
> https://lore.kernel.org/all/20220720224829.GA1667002@bhelgaas/
> 

Thanks, I will refer to your patch.

>> +{
>> +	struct starfive_jh7110_pcie *pcie = dev_get_drvdata(dev);
>> +
>> +	if (!pcie)
>> +		return 0;
> 
> How can this happen?  If we're only detecting memory corruption, it's
> not worth it.
> 
> Bjorn

OK, I will delete this condition.
Conor Dooley July 20, 2023, 11:14 a.m. UTC | #3
On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
> Add StarFive JH7110 SoC PCIe controller platform
> driver codes.
> 
> Signed-off-by: Minda Chen <minda.chen@starfivetech.com>
> Reviewed-by: Mason Huo <mason.huo@starfivetech.com>
> ---
>  MAINTAINERS                                 |   7 +
>  drivers/pci/controller/plda/Kconfig         |   8 +
>  drivers/pci/controller/plda/Makefile        |   1 +
>  drivers/pci/controller/plda/pcie-plda.h     |  58 ++-
>  drivers/pci/controller/plda/pcie-starfive.c | 415 ++++++++++++++++++++
>  5 files changed, 487 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/pci/controller/plda/pcie-starfive.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f02618c2bdf5..b88a54a24ae5 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -20356,6 +20356,13 @@ S:	Supported
>  F:	Documentation/devicetree/bindings/watchdog/starfive*
>  F:	drivers/watchdog/starfive-wdt.c
>  
> +STARFIVE JH71x0 PCIE DRIVER
> +M:	Minda Chen <minda.chen@starfivetech.com>
> +L:	linux-pci@vger.kernel.org
> +S:	Supported
> +F:	Documentation/devicetree/bindings/pci/starfive*
> +F:	drivers/pci/controller/plda/pcie-starfive.c
> +
>  STATIC BRANCH/CALL
>  M:	Peter Zijlstra <peterz@infradead.org>
>  M:	Josh Poimboeuf <jpoimboe@kernel.org>
> diff --git a/drivers/pci/controller/plda/Kconfig b/drivers/pci/controller/plda/Kconfig
> index a3c790545843..eaf72954da9f 100644
> --- a/drivers/pci/controller/plda/Kconfig
> +++ b/drivers/pci/controller/plda/Kconfig
> @@ -24,4 +24,12 @@ config PCIE_MICROCHIP_HOST
>  	  Say Y here if you want kernel to support the Microchip AXI PCIe
>  	  Host Bridge driver.
>  
> +config PCIE_STARFIVE_HOST
> +	tristate "StarFive PCIe host controller"
> +	select PCIE_PLDA_HOST

Ditto here, I think this suffers from the same issue, although it's
probably only really randconfigs that'll trigger it.

> +	help
> +	  Say Y here if you want to support the StarFive PCIe controller
> +	  in host mode. StarFive PCIe controller uses PLDA PCIe
> +	  core.
> +
>  endmenu
Bjorn Helgaas July 20, 2023, 4:15 p.m. UTC | #4
On Thu, Jul 20, 2023 at 06:11:59PM +0800, Kevin Xie wrote:
> On 2023/7/20 0:48, Bjorn Helgaas wrote:
> > On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
> >> Add StarFive JH7110 SoC PCIe controller platform
> >> driver codes.

> >> + * The BAR0/1 of bridge should be hidden during enumeration to
> >> + * avoid the sizing and resource allocation by PCIe core.
> >> + */
> >> +static bool starfive_pcie_hide_rc_bar(struct pci_bus *bus, unsigned int  devfn,
> >> +				      int offset)
> >> +{
> >> +	if (pci_is_root_bus(bus) && !devfn &&
> >> +	    (offset == PCI_BASE_ADDRESS_0 || offset == PCI_BASE_ADDRESS_1))
> >> +		return true;
> >> +
> >> +	return false;
> >> +}
> >> +
> >> +int starfive_pcie_config_write(struct pci_bus *bus, unsigned int devfn,
> >> +			       int where, int size, u32 value)
> >> +{
> >> +	if (starfive_pcie_hide_rc_bar(bus, devfn, where))
> >> +		return PCIBIOS_BAD_REGISTER_NUMBER;
> > 
> > I think you are trying present BARs 0 & 1 as unimplemented.  Such BARs
> > are hardwired to zero, so you should make them behave that way (both
> > read and write).  Many callers of config accessors don't check the
> > return value, so I don't think it's reliable to just return
> > PCIBIOS_BAD_REGISTER_NUMBER.
> 
> This is a hardware defect that we did not hardwired those BARs to
> zero, and it is configurable for software now.  We have to add this
> filter function for workaround.

Yes.  My point is that this only affects the write path, and the read
probably does not read 0 as it should.  This means lspci will show the
wrong thing, and the PCI core will try to size the BAR when it doesn't
need to.  I haven't looked at the BAR sizing code; it might even come
up with a bogus size and address, when it *should* just conclude the
BAR doesn't exist at all.

> >> +	/* Ensure that PERST has been asserted for at least 100 ms */
> >> +	msleep(300);
> >> +	gpiod_set_value_cansleep(pcie->reset_gpio, 0);
> > 
> > At least 100 ms, but you sleep *300* ms?  This is probably related to
> > https://lore.kernel.org/r/20230718155515.GA483233@bhelgaas
> > 
> > Please include a comment with the source of the delay value.  I assume
> > it's T_PVPERL and T_PERST-CLK from the PCIe CEM spec.  This way we can
> > someday share those #defines across drivers.
> 
> Yes, the delay value here is T_PVPERL from PCIe CEM spec r2.0 (Table
> 2-4).  At the first time we set 100ms delay according to sector 2.2
> of the spec: "After there has been time (TPVPERL) for the power and
> clock to become stable, PERST# is deasserted high and the PCI
> Express functions can start up."
> 
> However, in the compatibility testing with several NVMe SSD, we
> found that Lenovo Thinklife ST8000 NVMe can not get ready in 100ms,
> and it actually needs almost 200ms.  Thus, we increased the T_PVPERL
> value to 300ms for the better device compatibility.
>
> We will use a macro to define T_PVPERL, and add comments for the
> source of it.  If the compatibility delay of 300ms is not
> reasonable, we can revert it to 100ms.

Thanks for this valuable information!  This NVMe issue potentially
affects many similar drivers, and we may need a more generic fix so
this device works well with all of them.

T_PVPERL is defined to start when power is stable.  Do you have a way
to accurately determine that point?  I'm guessing this:

  gpiod_set_value_cansleep(pcie->power_gpio, 1)

turns the power on?  But of course that doesn't mean it is instantly
stable.  Maybe your testing is telling you that your driver should
have a hardware-specific 200ms delay to wait for power to become
stable, followed by the standard 100ms for T_PVPERL?

Bjorn
Minda Chen July 21, 2023, 1:03 a.m. UTC | #5
On 2023/7/20 19:14, Conor Dooley wrote:
> On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
>> Add StarFive JH7110 SoC PCIe controller platform
>> driver codes.
>> 
>> Signed-off-by: Minda Chen <minda.chen@starfivetech.com>
>> Reviewed-by: Mason Huo <mason.huo@starfivetech.com>
>> ---
>>  MAINTAINERS                                 |   7 +
>>  drivers/pci/controller/plda/Kconfig         |   8 +
>>  drivers/pci/controller/plda/Makefile        |   1 +
>>  drivers/pci/controller/plda/pcie-plda.h     |  58 ++-
>>  drivers/pci/controller/plda/pcie-starfive.c | 415 ++++++++++++++++++++
>>  5 files changed, 487 insertions(+), 2 deletions(-)
>>  create mode 100644 drivers/pci/controller/plda/pcie-starfive.c
>> 
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index f02618c2bdf5..b88a54a24ae5 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -20356,6 +20356,13 @@ S:	Supported
>>  F:	Documentation/devicetree/bindings/watchdog/starfive*
>>  F:	drivers/watchdog/starfive-wdt.c
>>  
>> +STARFIVE JH71x0 PCIE DRIVER
>> +M:	Minda Chen <minda.chen@starfivetech.com>
>> +L:	linux-pci@vger.kernel.org
>> +S:	Supported
>> +F:	Documentation/devicetree/bindings/pci/starfive*
>> +F:	drivers/pci/controller/plda/pcie-starfive.c
>> +
>>  STATIC BRANCH/CALL
>>  M:	Peter Zijlstra <peterz@infradead.org>
>>  M:	Josh Poimboeuf <jpoimboe@kernel.org>
>> diff --git a/drivers/pci/controller/plda/Kconfig b/drivers/pci/controller/plda/Kconfig
>> index a3c790545843..eaf72954da9f 100644
>> --- a/drivers/pci/controller/plda/Kconfig
>> +++ b/drivers/pci/controller/plda/Kconfig
>> @@ -24,4 +24,12 @@ config PCIE_MICROCHIP_HOST
>>  	  Say Y here if you want kernel to support the Microchip AXI PCIe
>>  	  Host Bridge driver.
>>  
>> +config PCIE_STARFIVE_HOST
>> +	tristate "StarFive PCIe host controller"
>> +	select PCIE_PLDA_HOST
> 
> Ditto here, I think this suffers from the same issue, although its
> probably only really randconfigs that'll trigger it.
> 
OK, thanks, I will change it to match the Microchip entry.
>> +	help
>> +	  Say Y here if you want to support the StarFive PCIe controller
>> +	  in host mode. StarFive PCIe controller uses PLDA PCIe
>> +	  core.
>> +
>>  endmenu
Kevin Xie July 24, 2023, 10:48 a.m. UTC | #6
On 2023/7/21 0:15, Bjorn Helgaas wrote:
> On Thu, Jul 20, 2023 at 06:11:59PM +0800, Kevin Xie wrote:
>> On 2023/7/20 0:48, Bjorn Helgaas wrote:
>> > On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
>> >> Add StarFive JH7110 SoC PCIe controller platform
>> >> driver codes.
> 
>> >> + * The BAR0/1 of bridge should be hidden during enumeration to
>> >> + * avoid the sizing and resource allocation by PCIe core.
>> >> + */
>> >> +static bool starfive_pcie_hide_rc_bar(struct pci_bus *bus, unsigned int  devfn,
>> >> +				      int offset)
>> >> +{
>> >> +	if (pci_is_root_bus(bus) && !devfn &&
>> >> +	    (offset == PCI_BASE_ADDRESS_0 || offset == PCI_BASE_ADDRESS_1))
>> >> +		return true;
>> >> +
>> >> +	return false;
>> >> +}
>> >> +
>> >> +int starfive_pcie_config_write(struct pci_bus *bus, unsigned int devfn,
>> >> +			       int where, int size, u32 value)
>> >> +{
>> >> +	if (starfive_pcie_hide_rc_bar(bus, devfn, where))
>> >> +		return PCIBIOS_BAD_REGISTER_NUMBER;
>> > 
>> > I think you are trying present BARs 0 & 1 as unimplemented.  Such BARs
>> > are hardwired to zero, so you should make them behave that way (both
>> > read and write).  Many callers of config accessors don't check the
>> > return value, so I don't think it's reliable to just return
>> > PCIBIOS_BAD_REGISTER_NUMBER.
>> 
>> This is a hardware defect that we did not hardwired those BARs to
>> zero, and it is configurable for software now.  We have to add this
>> filter function for workaround.
> 
> Yes.  My point is that this only affects the write path, and the read
> probably does not read 0 as it should.  This means lspci will show the
> wrong thing, and the PCI core will try to size the BAR when it doesn't
> need to.  I haven't looked at the BAR sizing code; it might even come
> up with a bogus size and address, when it *should* just conclude the
> BAR doesn't exist at all.
> 

Got it, I will try to hide those BARs in both read and write operations.

>> >> +	/* Ensure that PERST has been asserted for at least 100 ms */
>> >> +	msleep(300);
>> >> +	gpiod_set_value_cansleep(pcie->reset_gpio, 0);
>> > 
>> > At least 100 ms, but you sleep *300* ms?  This is probably related to
>> > https://lore.kernel.org/r/20230718155515.GA483233@bhelgaas
>> > 
>> > Please include a comment with the source of the delay value.  I assume
>> > it's T_PVPERL and T_PERST-CLK from the PCIe CEM spec.  This way we can
>> > someday share those #defines across drivers.
>> 
>> Yes, the delay value here is T_PVPERL from PCIe CEM spec r2.0 (Table
>> 2-4).  At the first time we set 100ms delay according to sector 2.2
>> of the spec: "After there has been time (TPVPERL) for the power and
>> clock to become stable, PERST# is deasserted high and the PCI
>> Express functions can start up."
>> 
>> However, in the compatibility testing with several NVMe SSD, we
>> found that Lenovo Thinklife ST8000 NVMe can not get ready in 100ms,
>> and it actually needs almost 200ms.  Thus, we increased the T_PVPERL
>> value to 300ms for the better device compatibility.
>>
>> We will use a macro to define T_PVPERL, and add comments for the
>> source of it.  If the compatibility delay of 300ms is not
>> reasonable, we can revert it to 100ms.
> 
> Thanks for this valuable information!  This NVMe issue potentially
> affects many similar drivers, and we may need a more generic fix so
> this device works well with all of them.
> 
> T_PVPERL is defined to start when power is stable.  Do you have a way
> to accurately determine that point?  I'm guessing this:
> 
>   gpiod_set_value_cansleep(pcie->power_gpio, 1)
> 
> turns the power on?  But of course that doesn't mean it is instantly
> stable.  Maybe your testing is telling you that your driver should
> have a hardware-specific 200ms delay to wait for power to become
> stable, followed by the standard 100ms for T_PVPERL?
> 

You are right, we did not take the power stabilization time into account.
T_PVPERL is enough for the Lenovo Thinklife ST8000 NVMe SSD to get ready,
and the extra time comes from the power circuit of a PCIe to M.2 adapter,
which was used to verify M.2 SSDs with our EVB at an early stage.

As the Thinklife NVMe SSD may be a discontinued product,
and the onboard power circuit of the VisionFive 2 has no such problem,
we decided to revert the sleep time to 100ms.

We will add a comment for the source of T_PVPERL until your #define in pci.h is accepted.

> Bjorn
Bjorn Helgaas July 25, 2023, 8:46 p.m. UTC | #7
On Mon, Jul 24, 2023 at 06:48:47PM +0800, Kevin Xie wrote:
> On 2023/7/21 0:15, Bjorn Helgaas wrote:
> > On Thu, Jul 20, 2023 at 06:11:59PM +0800, Kevin Xie wrote:
> >> On 2023/7/20 0:48, Bjorn Helgaas wrote:
> >> > On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
> >> >> Add StarFive JH7110 SoC PCIe controller platform
> >> >> driver codes.

> >> However, in the compatibility testing with several NVMe SSD, we
> >> found that Lenovo Thinklife ST8000 NVMe can not get ready in 100ms,
> >> and it actually needs almost 200ms.  Thus, we increased the T_PVPERL
> >> value to 300ms for the better device compatibility.
> > ...
> > 
> > Thanks for this valuable information!  This NVMe issue potentially
> > affects many similar drivers, and we may need a more generic fix so
> > this device works well with all of them.
> > 
> > T_PVPERL is defined to start when power is stable.  Do you have a way
> > to accurately determine that point?  I'm guessing this:
> > 
> >   gpiod_set_value_cansleep(pcie->power_gpio, 1)
> > 
> > turns the power on?  But of course that doesn't mean it is instantly
> > stable.  Maybe your testing is telling you that your driver should
> > have a hardware-specific 200ms delay to wait for power to become
> > stable, followed by the standard 100ms for T_PVPERL?
> 
> You are right, we did not take the power stable cost into account.
> T_PVPERL is enough for Lenovo Thinklife ST8000 NVMe SSD to get ready,
> and the extra cost is from the power circuit of a PCIe to M.2 connector,
> which is used to verify M.2 SSD with our EVB at early stage.

Hmm.  That sounds potentially interesting.  I assume you're talking
about something like this: https://www.amazon.com/dp/B07JKH5VTL

I'm not familiar with the timing requirements for something like this.
There is a PCIe M.2 spec with some timing requirements, but I don't
know whether or how software is supposed to manage this.  There is a
T_PVPGL (power valid to PERST# inactive) parameter, but it's
implementation specific, so I don't know what the point of that is.
And I don't see a way for software to even detect the presence of such
an adapter.

But I assume some end users will use adapters like this and expect it
to "just work," so it would be nice if it did.

> As the Thinklife NVMe SSD may be a halted product, and the onboard
> power circuit of VisionFive V2 is no problem, we decided revert the
> sleep time to be 100ms.

Even though the product may be end-of-life, people will probably still
try to use it, and I would like it to work.  Otherwise we end up with
frustrated users and problem reports that are hard to resolve.  But I
don't know where to go here.

Bjorn
Bjorn Helgaas July 27, 2023, 9:40 p.m. UTC | #8
[+cc Mika, Maciej since they've worked on similar delays recently]

On Tue, Jul 25, 2023 at 03:46:35PM -0500, Bjorn Helgaas wrote:
> On Mon, Jul 24, 2023 at 06:48:47PM +0800, Kevin Xie wrote:
> > On 2023/7/21 0:15, Bjorn Helgaas wrote:
> > > On Thu, Jul 20, 2023 at 06:11:59PM +0800, Kevin Xie wrote:
> > >> On 2023/7/20 0:48, Bjorn Helgaas wrote:
> > >> > On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
> > >> >> Add StarFive JH7110 SoC PCIe controller platform
> > >> >> driver codes.
> 
> > >> However, in the compatibility testing with several NVMe SSD, we
> > >> found that Lenovo Thinklife ST8000 NVMe can not get ready in 100ms,
> > >> and it actually needs almost 200ms.  Thus, we increased the T_PVPERL
> > >> value to 300ms for the better device compatibility.
> > > ...
> > > 
> > > Thanks for this valuable information!  This NVMe issue potentially
> > > affects many similar drivers, and we may need a more generic fix so
> > > this device works well with all of them.
> > > 
> > > T_PVPERL is defined to start when power is stable.  Do you have a way
> > > to accurately determine that point?  I'm guessing this:
> > > 
> > >   gpiod_set_value_cansleep(pcie->power_gpio, 1)
> > > 
> > > turns the power on?  But of course that doesn't mean it is instantly
> > > stable.  Maybe your testing is telling you that your driver should
> > > have a hardware-specific 200ms delay to wait for power to become
> > > stable, followed by the standard 100ms for T_PVPERL?
> > 
> > You are right, we did not take the power stable cost into account.
> > T_PVPERL is enough for Lenovo Thinklife ST8000 NVMe SSD to get ready,
> > and the extra cost is from the power circuit of a PCIe to M.2 connector,
> > which is used to verify M.2 SSD with our EVB at early stage.
> 
> Hmm.  That sounds potentially interesting.  I assume you're talking
> about something like this: https://www.amazon.com/dp/B07JKH5VTL
> 
> I'm not familiar with the timing requirements for something like this.
> There is a PCIe M.2 spec with some timing requirements, but I don't
> know whether or how software is supposed to manage this.  There is a
> T_PVPGL (power valid to PERST# inactive) parameter, but it's
> implementation specific, so I don't know what the point of that is.
> And I don't see a way for software to even detect the presence of such
> an adapter.

I intended to ask about this on the PCI-SIG forum, but after reading
this thread [1], I don't think we would learn anything.  The question
was:

  The M.2 device has 5 voltage rails generated from the 3.3V input
  supply voltage
  -------------------------------------------
  This is re. Table 17 in PCI Express M.2 Specification Revision 1.1
  Power Valid* to PERST# input inactive : Implementation specific;
  recommended 50 ms

  What exactly does this mean ?

  The Note says

    *Power Valid when all the voltage supply rails have reached their
    respective Vmin.

  Does this mean that the 50ms to PERSTn is counted from the instant
  when all *5 voltage rails* on the M.2 device have become "good" ?

and the answer was:

  You wrote;
  Does this mean that the 50ms to PERSTn is counted from the instant
  when all 5 voltage rails on the M.2 device have become "good" ?

  Reply:
  This means that counting the recommended 50 ms begins from the time
  when the power rails coming to the device/module, from the host, are
  stable *at the device connector*.

  As for the time it takes voltages derived inside the device from any
  of the host power rails (e.g., 3.3V rail) to become stable, that is
  part of the 50ms the host should wait before de-asserting PERST#, in
  order ensure that most devices will be ready by then.

  Strictly speaking, nothing disastrous happens if a host violates the
  50ms. If it de-asserts too soon, the device may not be ready, but
  most hosts will try again. If the host de-asserts too late, the
  device has even more time to stabilize. This is why the WG felt that
  an exact minimum number for >>Tpvpgl, was not valid in practice, and
  we made it a recommendation.

Since T_PVPGL is implementation-specific, we can't really base
anything in software on the 50ms recommendation.  It sounds to me like
they are counting on software to retry config reads when enumerating.

I guess the delays we *can* observe are:

  100ms T_PVPERL "Power stable to PERST# inactive" (CEM 2.9.2)
  100ms software delay between reset and config request (Base 6.6.1)

The PCI core doesn't know how to assert PERST#, so the T_PVPERL delay
definitely has to be in the host controller driver.

The PCI core observes the second 100ms delay after a reset in
pci_bridge_wait_for_secondary_bus().  But this 100ms delay does not
happen during initial enumeration.  I think the assumption of the PCI
core is that when the host controller driver calls pci_host_probe(),
we can issue config requests immediately.

So I think that to be safe, we probably need to do both of those 100ms
delays in the host controller driver.  Maybe there's some hope of
supporting the latter one in the PCI core someday, but that's not
today.
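
For illustration, a sketch of the second delay placed in a host controller
driver's probe path (hypothetical placement; pci_host_probe() and the GPIO name
are only for context):

          /* De-assert PERST# once T_PVPERL has elapsed */
          gpiod_set_value_cansleep(pcie->reset_gpio, 0);

          /*
           * PCIe r6.0 sec 6.6.1: wait at least 100 ms after exit from
           * Conventional Reset before the first Configuration Request.
           * The PCI core does not insert this delay before the initial
           * enumeration, so do it before calling pci_host_probe().
           */
          msleep(100);

          ret = pci_host_probe(bridge);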

Bjorn

[1] https://forum.pcisig.com/viewtopic.php?f=74&t=1037
Kevin Xie July 31, 2023, 5:52 a.m. UTC | #9
On 2023/7/28 5:40, Bjorn Helgaas wrote:
> [+cc Mika, Maciej since they've worked on similar delays recently]
> 
> On Tue, Jul 25, 2023 at 03:46:35PM -0500, Bjorn Helgaas wrote:
>> On Mon, Jul 24, 2023 at 06:48:47PM +0800, Kevin Xie wrote:
>> > On 2023/7/21 0:15, Bjorn Helgaas wrote:
>> > > On Thu, Jul 20, 2023 at 06:11:59PM +0800, Kevin Xie wrote:
>> > >> On 2023/7/20 0:48, Bjorn Helgaas wrote:
>> > >> > On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
>> > >> >> Add StarFive JH7110 SoC PCIe controller platform
>> > >> >> driver codes.
>> 
>> > >> However, in the compatibility testing with several NVMe SSD, we
>> > >> found that Lenovo Thinklife ST8000 NVMe can not get ready in 100ms,
>> > >> and it actually needs almost 200ms.  Thus, we increased the T_PVPERL
>> > >> value to 300ms for the better device compatibility.
>> > > ...
>> > > 
>> > > Thanks for this valuable information!  This NVMe issue potentially
>> > > affects many similar drivers, and we may need a more generic fix so
>> > > this device works well with all of them.
>> > > 
>> > > T_PVPERL is defined to start when power is stable.  Do you have a way
>> > > to accurately determine that point?  I'm guessing this:
>> > > 
>> > >   gpiod_set_value_cansleep(pcie->power_gpio, 1)
>> > > 
>> > > turns the power on?  But of course that doesn't mean it is instantly
>> > > stable.  Maybe your testing is telling you that your driver should
>> > > have a hardware-specific 200ms delay to wait for power to become
>> > > stable, followed by the standard 100ms for T_PVPERL?
>> > 
>> > You are right, we did not take the power stable cost into account.
>> > T_PVPERL is enough for Lenovo Thinklife ST8000 NVMe SSD to get ready,
>> > and the extra cost is from the power circuit of a PCIe to M.2 connector,
>> > which is used to verify M.2 SSD with our EVB at early stage.
>> 
>> Hmm.  That sounds potentially interesting.  I assume you're talking
>> about something like this: https://www.amazon.com/dp/B07JKH5VTL
>> 
>> I'm not familiar with the timing requirements for something like this.
>> There is a PCIe M.2 spec with some timing requirements, but I don't
>> know whether or how software is supposed to manage this.  There is a
>> T_PVPGL (power valid to PERST# inactive) parameter, but it's
>> implementation specific, so I don't know what the point of that is.
>> And I don't see a way for software to even detect the presence of such
>> an adapter.
> 
> I intended to ask about this on the PCI-SIG forum, but after reading
> this thread [1], I don't think we would learn anything.  The question
> was:
> 
>   The M.2 device has 5 voltage rails generated from the 3.3V input
>   supply voltage
>   -------------------------------------------
>   This is re. Table 17 in PCI Express M.2 Specification Revision 1.1
>   Power Valid* to PERST# input inactive : Implementation specific;
>   recommended 50 ms
> 
>   What exactly does this mean ?
> 
>   The Note says
> 
>     *Power Valid when all the voltage supply rails have reached their
>     respective Vmin.
> 
>   Does this mean that the 50ms to PERSTn is counted from the instant
>   when all *5 voltage rails* on the M.2 device have become "good" ?
> 
> and the answer was:
> 
>   You wrote;
>   Does this mean that the 50ms to PERSTn is counted from the instant
>   when all 5 voltage rails on the M.2 device have become "good" ?
> 
>   Reply:
>   This means that counting the recommended 50 ms begins from the time
>   when the power rails coming to the device/module, from the host, are
>   stable *at the device connector*.
> 
>   As for the time it takes voltages derived inside the device from any
>   of the host power rails (e.g., 3.3V rail) to become stable, that is
>   part of the 50ms the host should wait before de-asserting PERST#, in
>   order ensure that most devices will be ready by then.
> 
>   Strictly speaking, nothing disastrous happens if a host violates the
>   50ms. If it de-asserts too soon, the device may not be ready, but
>   most hosts will try again. If the host de-asserts too late, the
>   device has even more time to stabilize. This is why the WG felt that
>   an exact minimum number for >>Tpvpgl, was not valid in practice, and
>   we made it a recommendation.
> 
> Since T_PVPGL is implementation-specific, we can't really base
> anything in software on the 50ms recommendation.  It sounds to me like
> they are counting on software to retry config reads when enumerating.
> 
> I guess the delays we *can* observe are:
> 
>   100ms T_PVPERL "Power stable to PERST# inactive" (CEM 2.9.2)
>   100ms software delay between reset and config request (Base 6.6.1)
> 

Referring to Figure 2-10 in the CEM spec r2.0, I guess these two delays are T2 & T4?
In PATCH v2 [4/4], T2 is the msleep(100) for T_PVPERL,
and T4 is handled by starfive_pcie_host_wait_for_link().

I am sorry for the late feedback; we have kept on testing since last week.
Several NVMe SSDs have been verified with this patch, and they work fine.

It is a pity that we lost the Thinklife NVMe SSD mentioned before,
because it belonged to a departing employee.
We bought two new SSDs of the same model for testing;
the issue cannot be reproduced, and all of them work fine with the v1 & v2 patches.

> The PCI core doesn't know how to assert PERST#, so the T_PVPERL delay
> definitely has to be in the host controller driver.
> 
> The PCI core observes the second 100ms delay after a reset in
> pci_bridge_wait_for_secondary_bus().  But this 100ms delay does not
> happen during initial enumeration.  I think the assumption of the PCI
> core is that when the host controller driver calls pci_host_probe(),
> we can issue config requests immediately.
> 
> So I think that to be safe, we probably need to do both of those 100ms
> delays in the host controller driver.  Maybe there's some hope of
> supporting the latter one in the PCI core someday, but that's not
> today.
> 
> Bjorn
> 
> [1] https://forum.pcisig.com/viewtopic.php?f=74&t=1037
Bjorn Helgaas July 31, 2023, 11:12 p.m. UTC | #10
[+cc Pali, Marek because I used f76b36d40bee ("PCI: aardvark: Fix link
training") as an example]

On Mon, Jul 31, 2023 at 01:52:35PM +0800, Kevin Xie wrote:
> On 2023/7/28 5:40, Bjorn Helgaas wrote:
> > On Tue, Jul 25, 2023 at 03:46:35PM -0500, Bjorn Helgaas wrote:
> >> On Mon, Jul 24, 2023 at 06:48:47PM +0800, Kevin Xie wrote:
> >> > On 2023/7/21 0:15, Bjorn Helgaas wrote:
> >> > > On Thu, Jul 20, 2023 at 06:11:59PM +0800, Kevin Xie wrote:
> >> > >> On 2023/7/20 0:48, Bjorn Helgaas wrote:
> >> > >> > On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
> >> > >> >> Add StarFive JH7110 SoC PCIe controller platform
> >> > >> >> driver codes.
> >> 
> >> > >> However, in the compatibility testing with several NVMe SSD, we
> >> > >> found that Lenovo Thinklife ST8000 NVMe can not get ready in 100ms,
> >> > >> and it actually needs almost 200ms.  Thus, we increased the T_PVPERL
> >> > >> value to 300ms for the better device compatibility.
> >> > > ...
> >> > > 
> >> > > Thanks for this valuable information!  This NVMe issue potentially
> >> > > affects many similar drivers, and we may need a more generic fix so
> >> > > this device works well with all of them.
> >> > > 
> >> > > T_PVPERL is defined to start when power is stable.  Do you have a way
> >> > > to accurately determine that point?  I'm guessing this:
> >> > > 
> >> > >   gpiod_set_value_cansleep(pcie->power_gpio, 1)
> >> > > 
> >> > > turns the power on?  But of course that doesn't mean it is instantly
> >> > > stable.  Maybe your testing is telling you that your driver should
> >> > > have a hardware-specific 200ms delay to wait for power to become
> >> > > stable, followed by the standard 100ms for T_PVPERL?
> >> > 
> >> > You are right, we did not take the power stable cost into account.
> >> > T_PVPERL is enough for Lenovo Thinklife ST8000 NVMe SSD to get ready,
> >> > and the extra cost is from the power circuit of a PCIe to M.2 connector,
> >> > which is used to verify M.2 SSD with our EVB at early stage.
> >> 
> >> Hmm.  That sounds potentially interesting.  I assume you're talking
> >> about something like this: https://www.amazon.com/dp/B07JKH5VTL
> >> 
> >> I'm not familiar with the timing requirements for something like this.
> >> There is a PCIe M.2 spec with some timing requirements, but I don't
> >> know whether or how software is supposed to manage this.  There is a
> >> T_PVPGL (power valid to PERST# inactive) parameter, but it's
> >> implementation specific, so I don't know what the point of that is.
> >> And I don't see a way for software to even detect the presence of such
> >> an adapter.
> > 
> > I intended to ask about this on the PCI-SIG forum, but after reading
> > this thread [1], I don't think we would learn anything.  The question
> > was:
> > 
> >   The M.2 device has 5 voltage rails generated from the 3.3V input
> >   supply voltage
> >   -------------------------------------------
> >   This is re. Table 17 in PCI Express M.2 Specification Revision 1.1
> >   Power Valid* to PERST# input inactive : Implementation specific;
> >   recommended 50 ms
> > 
> >   What exactly does this mean ?
> > 
> >   The Note says
> > 
> >     *Power Valid when all the voltage supply rails have reached their
> >     respective Vmin.
> > 
> >   Does this mean that the 50ms to PERSTn is counted from the instant
> >   when all *5 voltage rails* on the M.2 device have become "good" ?
> > 
> > and the answer was:
> > 
> >   You wrote;
> >   Does this mean that the 50ms to PERSTn is counted from the instant
> >   when all 5 voltage rails on the M.2 device have become "good" ?
> > 
> >   Reply:
> >   This means that counting the recommended 50 ms begins from the time
> >   when the power rails coming to the device/module, from the host, are
> >   stable *at the device connector*.
> > 
> >   As for the time it takes voltages derived inside the device from any
> >   of the host power rails (e.g., 3.3V rail) to become stable, that is
> >   part of the 50ms the host should wait before de-asserting PERST#, in
> >   order ensure that most devices will be ready by then.
> > 
> >   Strictly speaking, nothing disastrous happens if a host violates the
> >   50ms. If it de-asserts too soon, the device may not be ready, but
> >   most hosts will try again. If the host de-asserts too late, the
> >   device has even more time to stabilize. This is why the WG felt that
> >   an exact minimum number for >>Tpvpgl, was not valid in practice, and
> >   we made it a recommendation.
> > 
> > Since T_PVPGL is implementation-specific, we can't really base
> > anything in software on the 50ms recommendation.  It sounds to me like
> > they are counting on software to retry config reads when enumerating.
> > 
> > I guess the delays we *can* observe are:
> > 
> >   100ms T_PVPERL "Power stable to PERST# inactive" (CEM 2.9.2)
> >   100ms software delay between reset and config request (Base 6.6.1)
> 
> Refer to Figure2-10 in CEM Spec V2.0, I guess this two delays are T2 & T4?
> In the PATCH v2[4/4], T2 is the msleep(100) for T_PVPERL,
> and T4 is done by starfive_pcie_host_wait_for_link().

Yes, I think "T2" is T_PVPERL.  The CEM r2.0 Figure 2-10 note is
"2. Minimum time from power rails within specified tolerance to
PERST# inactive (T_PVPERL)."

As far as T4 ("Minimum PERST# inactive to PCI Express link out of
electrical idle"), I don't see a name or a value for that parameter,
and I don't think it is the delay required by PCIe r6.0, sec 6.6.1.

The delay required by sec 6.6.1 is a minimum of 100ms following exit
from reset or, for fast links, 100ms after link training completes.

The comment at the call of advk_pcie_wait_for_link() [2] says it is
the delay required by sec 6.6.1, but that doesn't seem right to me.

For one thing, I don't think 6.6.1 says anything about "link up" being
the end of a delay.  So if we want to do the delay required by 6.6.1,
"wait_for_link()" doesn't seem like quite the right name.

For another, all the *_wait_for_link() functions can return success
after 0ms, 90ms, 180ms, etc.  They're unlikely to return after 0ms,
but 90ms is quite possible.  If we avoided the 0ms return and
LINK_WAIT_USLEEP_MIN were 100ms instead of 90ms, that should be enough
for slow links, where we need 100ms following "exit from reset."

But it's still not enough for fast links where we need 100ms "after
link training completes" because we don't know when training
completed.  If training completed 89ms into *_wait_for_link(), we only
delay 1ms after that.
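
One conservative way to get that (a sketch, not what the current
*_wait_for_link() helpers do) would be to start the 100ms only after the link
is first observed up:

          /* Hypothetical: delay relative to observed link-up, not to PERST# */
          if (!starfive_pcie_host_wait_for_link(pcie))
                  msleep(100);    /* PCIe r6.0 sec 6.6.1, fast links */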

> > The PCI core doesn't know how to assert PERST#, so the T_PVPERL delay
> > definitely has to be in the host controller driver.
> > 
> > The PCI core observes the second 100ms delay after a reset in
> > pci_bridge_wait_for_secondary_bus().  But this 100ms delay does not
> > happen during initial enumeration.  I think the assumption of the PCI
> > core is that when the host controller driver calls pci_host_probe(),
> > we can issue config requests immediately.
> > 
> > So I think that to be safe, we probably need to do both of those 100ms
> > delays in the host controller driver.  Maybe there's some hope of
> > supporting the latter one in the PCI core someday, but that's not
> > today.
> > 
> > Bjorn
> > 
> > [1] https://forum.pcisig.com/viewtopic.php?f=74&t=1037

[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/pci/controller/pci-aardvark.c?id=v6.4#n433
Pali Rohár Aug. 1, 2023, 7:05 a.m. UTC | #11
On Monday 31 July 2023 18:12:23 Bjorn Helgaas wrote:
> [+cc Pali, Marek because I used f76b36d40bee ("PCI: aardvark: Fix link
> training") as an example]
> 
> On Mon, Jul 31, 2023 at 01:52:35PM +0800, Kevin Xie wrote:
> > On 2023/7/28 5:40, Bjorn Helgaas wrote:
> > > On Tue, Jul 25, 2023 at 03:46:35PM -0500, Bjorn Helgaas wrote:
> > >> On Mon, Jul 24, 2023 at 06:48:47PM +0800, Kevin Xie wrote:
> > >> > On 2023/7/21 0:15, Bjorn Helgaas wrote:
> > >> > > On Thu, Jul 20, 2023 at 06:11:59PM +0800, Kevin Xie wrote:
> > >> > >> On 2023/7/20 0:48, Bjorn Helgaas wrote:
> > >> > >> > On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
> > >> > >> >> Add StarFive JH7110 SoC PCIe controller platform
> > >> > >> >> driver codes.
> > >> 
> > >> > >> However, in the compatibility testing with several NVMe SSD, we
> > >> > >> found that Lenovo Thinklife ST8000 NVMe can not get ready in 100ms,
> > >> > >> and it actually needs almost 200ms.  Thus, we increased the T_PVPERL
> > >> > >> value to 300ms for the better device compatibility.
> > >> > > ...
> > >> > > 
> > >> > > Thanks for this valuable information!  This NVMe issue potentially
> > >> > > affects many similar drivers, and we may need a more generic fix so
> > >> > > this device works well with all of them.
> > >> > > 
> > >> > > T_PVPERL is defined to start when power is stable.  Do you have a way
> > >> > > to accurately determine that point?  I'm guessing this:
> > >> > > 
> > >> > >   gpiod_set_value_cansleep(pcie->power_gpio, 1)
> > >> > > 
> > >> > > turns the power on?  But of course that doesn't mean it is instantly
> > >> > > stable.  Maybe your testing is telling you that your driver should
> > >> > > have a hardware-specific 200ms delay to wait for power to become
> > >> > > stable, followed by the standard 100ms for T_PVPERL?
> > >> > 
> > >> > You are right, we did not take the power stable cost into account.
> > >> > T_PVPERL is enough for Lenovo Thinklife ST8000 NVMe SSD to get ready,
> > >> > and the extra cost is from the power circuit of a PCIe to M.2 connector,
> > >> > which is used to verify M.2 SSD with our EVB at early stage.
> > >> 
> > >> Hmm.  That sounds potentially interesting.  I assume you're talking
> > >> about something like this: https://www.amazon.com/dp/B07JKH5VTL
> > >> 
> > >> I'm not familiar with the timing requirements for something like this.
> > >> There is a PCIe M.2 spec with some timing requirements, but I don't
> > >> know whether or how software is supposed to manage this.  There is a
> > >> T_PVPGL (power valid to PERST# inactive) parameter, but it's
> > >> implementation specific, so I don't know what the point of that is.
> > >> And I don't see a way for software to even detect the presence of such
> > >> an adapter.
> > > 
> > > I intended to ask about this on the PCI-SIG forum, but after reading
> > > this thread [1], I don't think we would learn anything.  The question
> > > was:
> > > 
> > >   The M.2 device has 5 voltage rails generated from the 3.3V input
> > >   supply voltage
> > >   -------------------------------------------
> > >   This is re. Table 17 in PCI Express M.2 Specification Revision 1.1
> > >   Power Valid* to PERST# input inactive : Implementation specific;
> > >   recommended 50 ms
> > > 
> > >   What exactly does this mean ?
> > > 
> > >   The Note says
> > > 
> > >     *Power Valid when all the voltage supply rails have reached their
> > >     respective Vmin.
> > > 
> > >   Does this mean that the 50ms to PERSTn is counted from the instant
> > >   when all *5 voltage rails* on the M.2 device have become "good" ?
> > > 
> > > and the answer was:
> > > 
> > >   You wrote;
> > >   Does this mean that the 50ms to PERSTn is counted from the instant
> > >   when all 5 voltage rails on the M.2 device have become "good" ?
> > > 
> > >   Reply:
> > >   This means that counting the recommended 50 ms begins from the time
> > >   when the power rails coming to the device/module, from the host, are
> > >   stable *at the device connector*.
> > > 
> > >   As for the time it takes voltages derived inside the device from any
> > >   of the host power rails (e.g., 3.3V rail) to become stable, that is
> > >   part of the 50ms the host should wait before de-asserting PERST#, in
> > >   order ensure that most devices will be ready by then.
> > > 
> > >   Strictly speaking, nothing disastrous happens if a host violates the
> > >   50ms. If it de-asserts too soon, the device may not be ready, but
> > >   most hosts will try again. If the host de-asserts too late, the
> > >   device has even more time to stabilize. This is why the WG felt that
> > >   an exact minimum number for >>Tpvpgl, was not valid in practice, and
> > >   we made it a recommendation.
> > > 
> > > Since T_PVPGL is implementation-specific, we can't really base
> > > anything in software on the 50ms recommendation.  It sounds to me like
> > > they are counting on software to retry config reads when enumerating.
> > > 
> > > I guess the delays we *can* observe are:
> > > 
> > >   100ms T_PVPERL "Power stable to PERST# inactive" (CEM 2.9.2)
> > >   100ms software delay between reset and config request (Base 6.6.1)
> > 
> > Refer to Figure2-10 in CEM Spec V2.0, I guess this two delays are T2 & T4?
> > In the PATCH v2[4/4], T2 is the msleep(100) for T_PVPERL,
> > and T4 is done by starfive_pcie_host_wait_for_link().
> 
> Yes, I think "T2" is T_PVPERL.  The CEM r2.0 Figure 2-10 note is
> "2. Minimum time from power rails within specified tolerance to
> PERST# inactive (T_PVPERL)."
> 
> As far as T4 ("Minimum PERST# inactive to PCI Express link out of
> electrical idle"), I don't see a name or a value for that parameter,
> and I don't think it is the delay required by PCIe r6.0, sec 6.6.1.
> 
> The delay required by sec 6.6.1 is a minimum of 100ms following exit
> from reset or, for fast links, 100ms after link training completes.
> 
> The comment at the call of advk_pcie_wait_for_link() [2] says it is
> the delay required by sec 6.6.1, but that doesn't seem right to me.
> 
> For one thing, I don't think 6.6.1 says anything about "link up" being
> the end of a delay.  So if we want to do the delay required by 6.6.1,
> "wait_for_link()" doesn't seem like quite the right name.
> 
> For another, all the *_wait_for_link() functions can return success
> after 0ms, 90ms, 180ms, etc.  They're unlikely to return after 0ms,
> but 90ms is quite possible.  If we avoided the 0ms return and
> LINK_WAIT_USLEEP_MIN were 100ms instead of 90ms, that should be enough
> for slow links, where we need 100ms following "exit from reset."
> 
> But it's still not enough for fast links where we need 100ms "after
> link training completes" because we don't know when training
> completed.  If training completed 89ms into *_wait_for_link(), we only
> delay 1ms after that.

Please look into the discussion "How long should be PCIe card in Warm Reset
state?", including its external references, where there are more interesting details:
https://lore.kernel.org/linux-pci/20210310110535.zh4pnn4vpmvzwl5q@pali/

As for waiting for the link, this should be done asynchronously...

> > > The PCI core doesn't know how to assert PERST#, so the T_PVPERL delay
> > > definitely has to be in the host controller driver.
> > > 
> > > The PCI core observes the second 100ms delay after a reset in
> > > pci_bridge_wait_for_secondary_bus().  But this 100ms delay does not
> > > happen during initial enumeration.  I think the assumption of the PCI
> > > core is that when the host controller driver calls pci_host_probe(),
> > > we can issue config requests immediately.
> > > 
> > > So I think that to be safe, we probably need to do both of those 100ms
> > > delays in the host controller driver.  Maybe there's some hope of
> > > supporting the latter one in the PCI core someday, but that's not
> > > today.
> > > 
> > > Bjorn
> > > 
> > > [1] https://forum.pcisig.com/viewtopic.php?f=74&t=1037
> 
> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/pci/controller/pci-aardvark.c?id=v6.4#n433
Kevin Xie Aug. 1, 2023, 7:05 a.m. UTC | #12
On 2023/8/1 7:12, Bjorn Helgaas wrote:
> [+cc Pali, Marek because I used f76b36d40bee ("PCI: aardvark: Fix link
> training") as an example]
> 
> On Mon, Jul 31, 2023 at 01:52:35PM +0800, Kevin Xie wrote:
>> On 2023/7/28 5:40, Bjorn Helgaas wrote:
>> > On Tue, Jul 25, 2023 at 03:46:35PM -0500, Bjorn Helgaas wrote:
>> >> On Mon, Jul 24, 2023 at 06:48:47PM +0800, Kevin Xie wrote:
>> >> > On 2023/7/21 0:15, Bjorn Helgaas wrote:
>> >> > > On Thu, Jul 20, 2023 at 06:11:59PM +0800, Kevin Xie wrote:
>> >> > >> On 2023/7/20 0:48, Bjorn Helgaas wrote:
>> >> > >> > On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
>> >> > >> >> Add StarFive JH7110 SoC PCIe controller platform
>> >> > >> >> driver codes.
>> >> 
>> >> > >> However, in the compatibility testing with several NVMe SSD, we
>> >> > >> found that Lenovo Thinklife ST8000 NVMe can not get ready in 100ms,
>> >> > >> and it actually needs almost 200ms.  Thus, we increased the T_PVPERL
>> >> > >> value to 300ms for the better device compatibility.
>> >> > > ...
>> >> > > 
>> >> > > Thanks for this valuable information!  This NVMe issue potentially
>> >> > > affects many similar drivers, and we may need a more generic fix so
>> >> > > this device works well with all of them.
>> >> > > 
>> >> > > T_PVPERL is defined to start when power is stable.  Do you have a way
>> >> > > to accurately determine that point?  I'm guessing this:
>> >> > > 
>> >> > >   gpiod_set_value_cansleep(pcie->power_gpio, 1)
>> >> > > 
>> >> > > turns the power on?  But of course that doesn't mean it is instantly
>> >> > > stable.  Maybe your testing is telling you that your driver should
>> >> > > have a hardware-specific 200ms delay to wait for power to become
>> >> > > stable, followed by the standard 100ms for T_PVPERL?
>> >> > 
>> >> > You are right, we did not take the power stable cost into account.
>> >> > T_PVPERL is enough for Lenovo Thinklife ST8000 NVMe SSD to get ready,
>> >> > and the extra cost is from the power circuit of a PCIe to M.2 connector,
>> >> > which is used to verify M.2 SSD with our EVB at early stage.
>> >> 
>> >> Hmm.  That sounds potentially interesting.  I assume you're talking
>> >> about something like this: https://www.amazon.com/dp/B07JKH5VTL
>> >> 
>> >> I'm not familiar with the timing requirements for something like this.
>> >> There is a PCIe M.2 spec with some timing requirements, but I don't
>> >> know whether or how software is supposed to manage this.  There is a
>> >> T_PVPGL (power valid to PERST# inactive) parameter, but it's
>> >> implementation specific, so I don't know what the point of that is.
>> >> And I don't see a way for software to even detect the presence of such
>> >> an adapter.
>> > 
>> > I intended to ask about this on the PCI-SIG forum, but after reading
>> > this thread [1], I don't think we would learn anything.  The question
>> > was:
>> > 
>> >   The M.2 device has 5 voltage rails generated from the 3.3V input
>> >   supply voltage
>> >   -------------------------------------------
>> >   This is re. Table 17 in PCI Express M.2 Specification Revision 1.1
>> >   Power Valid* to PERST# input inactive : Implementation specific;
>> >   recommended 50 ms
>> > 
>> >   What exactly does this mean ?
>> > 
>> >   The Note says
>> > 
>> >     *Power Valid when all the voltage supply rails have reached their
>> >     respective Vmin.
>> > 
>> >   Does this mean that the 50ms to PERSTn is counted from the instant
>> >   when all *5 voltage rails* on the M.2 device have become "good" ?
>> > 
>> > and the answer was:
>> > 
>> >   You wrote;
>> >   Does this mean that the 50ms to PERSTn is counted from the instant
>> >   when all 5 voltage rails on the M.2 device have become "good" ?
>> > 
>> >   Reply:
>> >   This means that counting the recommended 50 ms begins from the time
>> >   when the power rails coming to the device/module, from the host, are
>> >   stable *at the device connector*.
>> > 
>> >   As for the time it takes voltages derived inside the device from any
>> >   of the host power rails (e.g., 3.3V rail) to become stable, that is
>> >   part of the 50ms the host should wait before de-asserting PERST#, in
>> >   order ensure that most devices will be ready by then.
>> > 
>> >   Strictly speaking, nothing disastrous happens if a host violates the
>> >   50ms. If it de-asserts too soon, the device may not be ready, but
>> >   most hosts will try again. If the host de-asserts too late, the
>> >   device has even more time to stabilize. This is why the WG felt that
>> >   an exact minimum number for >>Tpvpgl, was not valid in practice, and
>> >   we made it a recommendation.
>> > 
>> > Since T_PVPGL is implementation-specific, we can't really base
>> > anything in software on the 50ms recommendation.  It sounds to me like
>> > they are counting on software to retry config reads when enumerating.
>> > 
>> > I guess the delays we *can* observe are:
>> > 
>> >   100ms T_PVPERL "Power stable to PERST# inactive" (CEM 2.9.2)
>> >   100ms software delay between reset and config request (Base 6.6.1)
>> 
>> Refer to Figure2-10 in CEM Spec V2.0, I guess this two delays are T2 & T4?
>> In the PATCH v2[4/4], T2 is the msleep(100) for T_PVPERL,
>> and T4 is done by starfive_pcie_host_wait_for_link().
> 
> Yes, I think "T2" is T_PVPERL.  The CEM r2.0 Figure 2-10 note is
> "2. Minimum time from power rails within specified tolerance to
> PERST# inactive (T_PVPERL)."
> 
> As far as T4 ("Minimum PERST# inactive to PCI Express link out of
> electrical idle"), I don't see a name or a value for that parameter,
> and I don't think it is the delay required by PCIe r6.0, sec 6.6.1.
> 
> The delay required by sec 6.6.1 is a minimum of 100ms following exit
> from reset or, for fast links, 100ms after link training completes.
> 
> The comment at the call of advk_pcie_wait_for_link() [2] says it is
> the delay required by sec 6.6.1, but that doesn't seem right to me.
> 
> For one thing, I don't think 6.6.1 says anything about "link up" being
> the end of a delay.  So if we want to do the delay required by 6.6.1,
> "wait_for_link()" doesn't seem like quite the right name.
> 
> For another, all the *_wait_for_link() functions can return success
> after 0ms, 90ms, 180ms, etc.  They're unlikely to return after 0ms,
> but 90ms is quite possible.  If we avoided the 0ms return and
> LINK_WAIT_USLEEP_MIN were 100ms instead of 90ms, that should be enough
> for slow links, where we need 100ms following "exit from reset."
> 
> But it's still not enough for fast links where we need 100ms "after
> link training completes" because we don't know when training
> completed.  If training completed 89ms into *_wait_for_link(), we only
> delay 1ms after that.
> 

That's the point: we will add an extra 100ms after PERST# de-assert
in patch v3, according to Base Spec r6.0, sec 6.6.1:
        msleep(100);
        gpiod_set_value_cansleep(pcie->reset_gpio, 0);

+       /* As the requirement in PCIe base spec r6.0, system must wait a
+        * minimum of 100 ms following exit from a Conventional Reset
+        * before sending a Configuration Request to the device.*/
+       msleep(100);
+
        if (starfive_pcie_host_wait_for_link(pcie))
                return -EIO;

>> > The PCI core doesn't know how to assert PERST#, so the T_PVPERL delay
>> > definitely has to be in the host controller driver.
>> > 
>> > The PCI core observes the second 100ms delay after a reset in
>> > pci_bridge_wait_for_secondary_bus().  But this 100ms delay does not
>> > happen during initial enumeration.  I think the assumption of the PCI
>> > core is that when the host controller driver calls pci_host_probe(),
>> > we can issue config requests immediately.
>> > 
>> > So I think that to be safe, we probably need to do both of those 100ms
>> > delays in the host controller driver.  Maybe there's some hope of
>> > supporting the latter one in the PCI core someday, but that's not
>> > today.
>> > 
>> > Bjorn
>> > 
>> > [1] https://forum.pcisig.com/viewtopic.php?f=74&t=1037
> 
> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/pci/controller/pci-aardvark.c?id=v6.4#n433
Pali Rohár Aug. 1, 2023, 7:14 a.m. UTC | #13
On Tuesday 01 August 2023 15:05:46 Kevin Xie wrote:
> 
> 
> On 2023/8/1 7:12, Bjorn Helgaas wrote:
> > [+cc Pali, Marek because I used f76b36d40bee ("PCI: aardvark: Fix link
> > training") as an example]
> > 
> > On Mon, Jul 31, 2023 at 01:52:35PM +0800, Kevin Xie wrote:
> >> On 2023/7/28 5:40, Bjorn Helgaas wrote:
> >> > On Tue, Jul 25, 2023 at 03:46:35PM -0500, Bjorn Helgaas wrote:
> >> >> On Mon, Jul 24, 2023 at 06:48:47PM +0800, Kevin Xie wrote:
> >> >> > On 2023/7/21 0:15, Bjorn Helgaas wrote:
> >> >> > > On Thu, Jul 20, 2023 at 06:11:59PM +0800, Kevin Xie wrote:
> >> >> > >> On 2023/7/20 0:48, Bjorn Helgaas wrote:
> >> >> > >> > On Wed, Jul 19, 2023 at 06:20:56PM +0800, Minda Chen wrote:
> >> >> > >> >> Add StarFive JH7110 SoC PCIe controller platform
> >> >> > >> >> driver codes.
> >> >> 
> >> >> > >> However, in the compatibility testing with several NVMe SSD, we
> >> >> > >> found that Lenovo Thinklife ST8000 NVMe can not get ready in 100ms,
> >> >> > >> and it actually needs almost 200ms.  Thus, we increased the T_PVPERL
> >> >> > >> value to 300ms for the better device compatibility.
> >> >> > > ...
> >> >> > > 
> >> >> > > Thanks for this valuable information!  This NVMe issue potentially
> >> >> > > affects many similar drivers, and we may need a more generic fix so
> >> >> > > this device works well with all of them.
> >> >> > > 
> >> >> > > T_PVPERL is defined to start when power is stable.  Do you have a way
> >> >> > > to accurately determine that point?  I'm guessing this:
> >> >> > > 
> >> >> > >   gpiod_set_value_cansleep(pcie->power_gpio, 1)
> >> >> > > 
> >> >> > > turns the power on?  But of course that doesn't mean it is instantly
> >> >> > > stable.  Maybe your testing is telling you that your driver should
> >> >> > > have a hardware-specific 200ms delay to wait for power to become
> >> >> > > stable, followed by the standard 100ms for T_PVPERL?
> >> >> > 
> >> >> > You are right, we did not take the power stable cost into account.
> >> >> > T_PVPERL is enough for Lenovo Thinklife ST8000 NVMe SSD to get ready,
> >> >> > and the extra cost is from the power circuit of a PCIe to M.2 connector,
> >> >> > which is used to verify M.2 SSD with our EVB at early stage.
> >> >> 
> >> >> Hmm.  That sounds potentially interesting.  I assume you're talking
> >> >> about something like this: https://www.amazon.com/dp/B07JKH5VTL
> >> >> 
> >> >> I'm not familiar with the timing requirements for something like this.
> >> >> There is a PCIe M.2 spec with some timing requirements, but I don't
> >> >> know whether or how software is supposed to manage this.  There is a
> >> >> T_PVPGL (power valid to PERST# inactive) parameter, but it's
> >> >> implementation specific, so I don't know what the point of that is.
> >> >> And I don't see a way for software to even detect the presence of such
> >> >> an adapter.
> >> > 
> >> > I intended to ask about this on the PCI-SIG forum, but after reading
> >> > this thread [1], I don't think we would learn anything.  The question
> >> > was:
> >> > 
> >> >   The M.2 device has 5 voltage rails generated from the 3.3V input
> >> >   supply voltage
> >> >   -------------------------------------------
> >> >   This is re. Table 17 in PCI Express M.2 Specification Revision 1.1
> >> >   Power Valid* to PERST# input inactive : Implementation specific;
> >> >   recommended 50 ms
> >> > 
> >> >   What exactly does this mean ?
> >> > 
> >> >   The Note says
> >> > 
> >> >     *Power Valid when all the voltage supply rails have reached their
> >> >     respective Vmin.
> >> > 
> >> >   Does this mean that the 50ms to PERSTn is counted from the instant
> >> >   when all *5 voltage rails* on the M.2 device have become "good" ?
> >> > 
> >> > and the answer was:
> >> > 
> >> >   You wrote;
> >> >   Does this mean that the 50ms to PERSTn is counted from the instant
> >> >   when all 5 voltage rails on the M.2 device have become "good" ?
> >> > 
> >> >   Reply:
> >> >   This means that counting the recommended 50 ms begins from the time
> >> >   when the power rails coming to the device/module, from the host, are
> >> >   stable *at the device connector*.
> >> > 
> >> >   As for the time it takes voltages derived inside the device from any
> >> >   of the host power rails (e.g., 3.3V rail) to become stable, that is
> >> >   part of the 50ms the host should wait before de-asserting PERST#, in
> >> >   order ensure that most devices will be ready by then.
> >> > 
> >> >   Strictly speaking, nothing disastrous happens if a host violates the
> >> >   50ms. If it de-asserts too soon, the device may not be ready, but
> >> >   most hosts will try again. If the host de-asserts too late, the
> >> >   device has even more time to stabilize. This is why the WG felt that
> >> >   an exact minimum number for >>Tpvpgl, was not valid in practice, and
> >> >   we made it a recommendation.
> >> > 
> >> > Since T_PVPGL is implementation-specific, we can't really base
> >> > anything in software on the 50ms recommendation.  It sounds to me like
> >> > they are counting on software to retry config reads when enumerating.
> >> > 
> >> > I guess the delays we *can* observe are:
> >> > 
> >> >   100ms T_PVPERL "Power stable to PERST# inactive" (CEM 2.9.2)
> >> >   100ms software delay between reset and config request (Base 6.6.1)
> >> 
> >> Refer to Figure2-10 in CEM Spec V2.0, I guess this two delays are T2 & T4?
> >> In the PATCH v2[4/4], T2 is the msleep(100) for T_PVPERL,
> >> and T4 is done by starfive_pcie_host_wait_for_link().
> > 
> > Yes, I think "T2" is T_PVPERL.  The CEM r2.0 Figure 2-10 note is
> > "2. Minimum time from power rails within specified tolerance to
> > PERST# inactive (T_PVPERL)."
> > 
> > As far as T4 ("Minimum PERST# inactive to PCI Express link out of
> > electrical idle"), I don't see a name or a value for that parameter,
> > and I don't think it is the delay required by PCIe r6.0, sec 6.6.1.
> > 
> > The delay required by sec 6.6.1 is a minimum of 100ms following exit
> > from reset or, for fast links, 100ms after link training completes.
> > 
> > The comment at the call of advk_pcie_wait_for_link() [2] says it is
> > the delay required by sec 6.6.1, but that doesn't seem right to me.
> > 
> > For one thing, I don't think 6.6.1 says anything about "link up" being
> > the end of a delay.  So if we want to do the delay required by 6.6.1,
> > "wait_for_link()" doesn't seem like quite the right name.
> > 
> > For another, all the *_wait_for_link() functions can return success
> > after 0ms, 90ms, 180ms, etc.  They're unlikely to return after 0ms,
> > but 90ms is quite possible.  If we avoided the 0ms return and
> > LINK_WAIT_USLEEP_MIN were 100ms instead of 90ms, that should be enough
> > for slow links, where we need 100ms following "exit from reset."
> > 
> > But it's still not enough for fast links where we need 100ms "after
> > link training completes" because we don't know when training
> > completed.  If training completed 89ms into *_wait_for_link(), we only
> > delay 1ms after that.
> > 
> 
> That's the point, we will add a extra 100ms after PERST# de-assert
> in the patch-v3 according to Base Spec r6.0 - 6.6.1:
>         msleep(100);
>         gpiod_set_value_cansleep(pcie->reset_gpio, 0);
> 
> +       /* As the requirement in PCIe base spec r6.0, system must wait a
> +        * minimum of 100 ms following exit from a Conventional Reset
> +        * before sending a Configuration Request to the device.*/
> +       msleep(100);
> +
>         if (starfive_pcie_host_wait_for_link(pcie))
>                 return -EIO;
> 

Maybe this information can be useful here:
https://lore.kernel.org/linux-pci/20211022183808.jdeo7vntnagqkg7g@pali/

> >> > The PCI core doesn't know how to assert PERST#, so the T_PVPERL delay
> >> > definitely has to be in the host controller driver.
> >> > 
> >> > The PCI core observes the second 100ms delay after a reset in
> >> > pci_bridge_wait_for_secondary_bus().  But this 100ms delay does not
> >> > happen during initial enumeration.  I think the assumption of the PCI
> >> > core is that when the host controller driver calls pci_host_probe(),
> >> > we can issue config requests immediately.
> >> > 
> >> > So I think that to be safe, we probably need to do both of those 100ms
> >> > delays in the host controller driver.  Maybe there's some hope of
> >> > supporting the latter one in the PCI core someday, but that's not
> >> > today.
> >> > 
> >> > Bjorn
> >> > 
> >> > [1] https://forum.pcisig.com/viewtopic.php?f=74&t=1037
> > 
> > [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/pci/controller/pci-aardvark.c?id=v6.4#n433
Bjorn Helgaas Aug. 2, 2023, 5:14 p.m. UTC | #14
On Tue, Aug 01, 2023 at 09:14:53AM +0200, Pali Rohár wrote:
> On Tuesday 01 August 2023 15:05:46 Kevin Xie wrote:
> > On 2023/8/1 7:12, Bjorn Helgaas wrote:
> > ...

> > That's the point, we will add a extra 100ms after PERST# de-assert
> > in the patch-v3 according to Base Spec r6.0 - 6.6.1:
> >         msleep(100);
> >         gpiod_set_value_cansleep(pcie->reset_gpio, 0);
> > 
> > +       /* As the requirement in PCIe base spec r6.0, system must wait a
> > +        * minimum of 100 ms following exit from a Conventional Reset
> > +        * before sending a Configuration Request to the device.*/
> > +       msleep(100);
> > +
> >         if (starfive_pcie_host_wait_for_link(pcie))
> >                 return -EIO;
> 
> Maybe this information can be useful here:
> https://lore.kernel.org/linux-pci/20211022183808.jdeo7vntnagqkg7g@pali/

Yes, thank you!  That is a great summary!
Bjorn Helgaas Aug. 2, 2023, 5:18 p.m. UTC | #15
On Tue, Aug 01, 2023 at 03:05:46PM +0800, Kevin Xie wrote:
> On 2023/8/1 7:12, Bjorn Helgaas wrote:
> ...

> > The delay required by sec 6.6.1 is a minimum of 100ms following exit
> > from reset or, for fast links, 100ms after link training completes.
> > 
> > The comment at the call of advk_pcie_wait_for_link() [2] says it is
> > the delay required by sec 6.6.1, but that doesn't seem right to me.
> > 
> > For one thing, I don't think 6.6.1 says anything about "link up" being
> > the end of a delay.  So if we want to do the delay required by 6.6.1,
> > "wait_for_link()" doesn't seem like quite the right name.
> > 
> > For another, all the *_wait_for_link() functions can return success
> > after 0ms, 90ms, 180ms, etc.  They're unlikely to return after 0ms,
> > but 90ms is quite possible.  If we avoided the 0ms return and
> > LINK_WAIT_USLEEP_MIN were 100ms instead of 90ms, that should be enough
> > for slow links, where we need 100ms following "exit from reset."
> > 
> > But it's still not enough for fast links where we need 100ms "after
> > link training completes" because we don't know when training
> > completed.  If training completed 89ms into *_wait_for_link(), we only
> > delay 1ms after that.
> 
> That's the point, we will add a extra 100ms after PERST# de-assert
> in the patch-v3 according to Base Spec r6.0 - 6.6.1:
>         msleep(100);
>         gpiod_set_value_cansleep(pcie->reset_gpio, 0);
> 
> +       /* As the requirement in PCIe base spec r6.0, system must wait a
> +        * minimum of 100 ms following exit from a Conventional Reset
> +        * before sending a Configuration Request to the device.*/
> +       msleep(100);
> +
>         if (starfive_pcie_host_wait_for_link(pcie))
>                 return -EIO;

For fast links (links that support > 5.0 GT/s), the 100ms starts
*after* link training completes.  The above looks OK if starfive only
supports slow links, but then I'm not sure why we would need
starfive_pcie_host_wait_for_link().
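
For illustration, one minimal way to satisfy both rules is to sleep the full
100 ms only after a link-up poll succeeds, since link training completes no
later than that point.  This is only a sketch, assuming the
starfive_pcie_is_link_up() helper from the patch below:

	/*
	 * PCIe r6.0, sec 6.6.1 delay before the first Configuration
	 * Request: polling first and sleeping 100 ms afterwards covers
	 * both the slow-link rule (100 ms after exit from reset) and
	 * the fast-link rule (100 ms after link training completes).
	 */
	if (starfive_pcie_is_link_up(pcie))
		msleep(100);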

Bjorn
Kevin Xie Aug. 3, 2023, 2:23 a.m. UTC | #16
On 2023/8/3 1:18, Bjorn Helgaas wrote:
> On Tue, Aug 01, 2023 at 03:05:46PM +0800, Kevin Xie wrote:
>> On 2023/8/1 7:12, Bjorn Helgaas wrote:
>> ...
> 
>> > The delay required by sec 6.6.1 is a minimum of 100ms following exit
>> > from reset or, for fast links, 100ms after link training completes.
>> > 
>> > The comment at the call of advk_pcie_wait_for_link() [2] says it is
>> > the delay required by sec 6.6.1, but that doesn't seem right to me.
>> > 
>> > For one thing, I don't think 6.6.1 says anything about "link up" being
>> > the end of a delay.  So if we want to do the delay required by 6.6.1,
>> > "wait_for_link()" doesn't seem like quite the right name.
>> > 
>> > For another, all the *_wait_for_link() functions can return success
>> > after 0ms, 90ms, 180ms, etc.  They're unlikely to return after 0ms,
>> > but 90ms is quite possible.  If we avoided the 0ms return and
>> > LINK_WAIT_USLEEP_MIN were 100ms instead of 90ms, that should be enough
>> > for slow links, where we need 100ms following "exit from reset."
>> > 
>> > But it's still not enough for fast links where we need 100ms "after
>> > link training completes" because we don't know when training
>> > completed.  If training completed 89ms into *_wait_for_link(), we only
>> > delay 1ms after that.
>> 
>> That's the point, we will add a extra 100ms after PERST# de-assert
>> in the patch-v3 according to Base Spec r6.0 - 6.6.1:
>>         msleep(100);
>>         gpiod_set_value_cansleep(pcie->reset_gpio, 0);
>> 
>> +       /* As the requirement in PCIe base spec r6.0, system must wait a
>> +        * minimum of 100 ms following exit from a Conventional Reset
>> +        * before sending a Configuration Request to the device.*/
>> +       msleep(100);
>> +
>>         if (starfive_pcie_host_wait_for_link(pcie))
>>                 return -EIO;
> 
> For fast links (links that support > 5.0 GT/s), the 100ms starts
> *after* link training completes.  The above looks OK if starfive only
> supports slow links, but then I'm not sure why we would need
> starfive_pcie_host_wait_for_link().
> 
Yes, the maximum speed of JH7110 PCIe is 5.0 GT/s (Gen2x1).

About starfive_pcie_host_wait_for_link():
The JH7110 SoC only has one root port per PCIe controller (2 in total),
and they do not support hot-plug yet.
Thus, we added starfive_pcie_host_wait_for_link() to poll whether the slot is empty.
If nothing is there, we exit the probe() of this controller, and it never
gets to pci_host_probe() either.
This may not be very standard logic; should we remove it or rewrite it in a better way?

> Bjorn
Pali Rohár Aug. 3, 2023, 6:58 a.m. UTC | #17
On Thursday 03 August 2023 10:23:47 Kevin Xie wrote:
> On 2023/8/3 1:18, Bjorn Helgaas wrote:
> > On Tue, Aug 01, 2023 at 03:05:46PM +0800, Kevin Xie wrote:
> >> On 2023/8/1 7:12, Bjorn Helgaas wrote:
> >> ...
> > 
> >> > The delay required by sec 6.6.1 is a minimum of 100ms following exit
> >> > from reset or, for fast links, 100ms after link training completes.
> >> > 
> >> > The comment at the call of advk_pcie_wait_for_link() [2] says it is
> >> > the delay required by sec 6.6.1, but that doesn't seem right to me.
> >> > 
> >> > For one thing, I don't think 6.6.1 says anything about "link up" being
> >> > the end of a delay.  So if we want to do the delay required by 6.6.1,
> >> > "wait_for_link()" doesn't seem like quite the right name.
> >> > 
> >> > For another, all the *_wait_for_link() functions can return success
> >> > after 0ms, 90ms, 180ms, etc.  They're unlikely to return after 0ms,
> >> > but 90ms is quite possible.  If we avoided the 0ms return and
> >> > LINK_WAIT_USLEEP_MIN were 100ms instead of 90ms, that should be enough
> >> > for slow links, where we need 100ms following "exit from reset."
> >> > 
> >> > But it's still not enough for fast links where we need 100ms "after
> >> > link training completes" because we don't know when training
> >> > completed.  If training completed 89ms into *_wait_for_link(), we only
> >> > delay 1ms after that.
> >> 
> >> That's the point, we will add a extra 100ms after PERST# de-assert
> >> in the patch-v3 according to Base Spec r6.0 - 6.6.1:
> >>         msleep(100);
> >>         gpiod_set_value_cansleep(pcie->reset_gpio, 0);
> >> 
> >> +       /* As the requirement in PCIe base spec r6.0, system must wait a
> >> +        * minimum of 100 ms following exit from a Conventional Reset
> >> +        * before sending a Configuration Request to the device.*/
> >> +       msleep(100);
> >> +
> >>         if (starfive_pcie_host_wait_for_link(pcie))
> >>                 return -EIO;
> > 
> > For fast links (links that support > 5.0 GT/s), the 100ms starts
> > *after* link training completes.  The above looks OK if starfive only
> > supports slow links, but then I'm not sure why we would need
> > starfive_pcie_host_wait_for_link().
> > 
> Yes, the maximum speed of JH7110 PCIe is 5.0 GT/s (Gen2x1).
> 
> About starfive_pcie_host_wait_for_link():
> JH7110 SoC only has one root port in each PCIe controller (2 in total)
> and they do not support hot-plug yet.

Beware that even if the HW does not support hotplug, an endpoint PCIe card
may still drop the link and later bring it up again (for example if the FW in
the card crashes or when the card wants to do an internal reset, etc.; this is
very common for wifi cards). So drivers for non-hotplug controllers
still have to handle hotplug events generated by link up/down changes.

So if an endpoint PCIe card is not detected at probe time, it may be
detected later. A check that completely stops registering the controller
is therefore not a good idea. Note that userspace can tell the kernel
(via sysfs, /sys/bus/pci/rescan) to rescan all PCIe buses and try to
discover new PCIe devices.

> Thus, We add starfive_pcie_host_wait_for_link() to poll if it is a empty slot.
> If nothing here, we will exit the probe() of this controller, and it will not
> go into pci_host_probe() too.
> This may not be a very standard logic, should we remove it or rewrite in a better way?
> 
> > Bjorn

I would rather remove this starfive_pcie_host_wait_for_link() logic.

A better option would be to teach the PCI core code to wait for the link
before trying to read the vendor/device IDs, as I described in my old
proposal.
Kevin Xie Aug. 3, 2023, 7:43 a.m. UTC | #18
On 2023/8/3 14:58, Pali Rohár wrote:
> On Thursday 03 August 2023 10:23:47 Kevin Xie wrote:
>> On 2023/8/3 1:18, Bjorn Helgaas wrote:
>> > On Tue, Aug 01, 2023 at 03:05:46PM +0800, Kevin Xie wrote:
>> >> On 2023/8/1 7:12, Bjorn Helgaas wrote:
>> >> ...
>> > 
>> >> > The delay required by sec 6.6.1 is a minimum of 100ms following exit
>> >> > from reset or, for fast links, 100ms after link training completes.
>> >> > 
>> >> > The comment at the call of advk_pcie_wait_for_link() [2] says it is
>> >> > the delay required by sec 6.6.1, but that doesn't seem right to me.
>> >> > 
>> >> > For one thing, I don't think 6.6.1 says anything about "link up" being
>> >> > the end of a delay.  So if we want to do the delay required by 6.6.1,
>> >> > "wait_for_link()" doesn't seem like quite the right name.
>> >> > 
>> >> > For another, all the *_wait_for_link() functions can return success
>> >> > after 0ms, 90ms, 180ms, etc.  They're unlikely to return after 0ms,
>> >> > but 90ms is quite possible.  If we avoided the 0ms return and
>> >> > LINK_WAIT_USLEEP_MIN were 100ms instead of 90ms, that should be enough
>> >> > for slow links, where we need 100ms following "exit from reset."
>> >> > 
>> >> > But it's still not enough for fast links where we need 100ms "after
>> >> > link training completes" because we don't know when training
>> >> > completed.  If training completed 89ms into *_wait_for_link(), we only
>> >> > delay 1ms after that.
>> >> 
>> >> That's the point, we will add a extra 100ms after PERST# de-assert
>> >> in the patch-v3 according to Base Spec r6.0 - 6.6.1:
>> >>         msleep(100);
>> >>         gpiod_set_value_cansleep(pcie->reset_gpio, 0);
>> >> 
>> >> +       /* As the requirement in PCIe base spec r6.0, system must wait a
>> >> +        * minimum of 100 ms following exit from a Conventional Reset
>> >> +        * before sending a Configuration Request to the device.*/
>> >> +       msleep(100);
>> >> +
>> >>         if (starfive_pcie_host_wait_for_link(pcie))
>> >>                 return -EIO;
>> > 
>> > For fast links (links that support > 5.0 GT/s), the 100ms starts
>> > *after* link training completes.  The above looks OK if starfive only
>> > supports slow links, but then I'm not sure why we would need
>> > starfive_pcie_host_wait_for_link().
>> > 
>> Yes, the maximum speed of JH7110 PCIe is 5.0 GT/s (Gen2x1).
>> 
>> About starfive_pcie_host_wait_for_link():
>> JH7110 SoC only has one root port in each PCIe controller (2 in total)
>> and they do not support hot-plug yet.
> 
> Beware that even if HW does not support hotplug, endpoint PCIe card
> still may drop the link down and later put it up (for example if FW in
> the card crashes or when card want to do internal reset, etc...; this is
> very common for wifi cards). So drivers for non-hotplug controllers
> still have to handle hotplug events generated by link up/down state.
> 
> So it means that, if endpoint PCIe card is not detected during probe
> time, it may be detected later. So this check to completely stop
> registering controller is not a good idea. Note that userspace can
> tell kernel (via sysfs) to rescan all PCIe buses and try to discover new
> PCIea devices.
> 

Yes, we should not ignore this situation.

>> Thus, We add starfive_pcie_host_wait_for_link() to poll if it is a empty slot.
>> If nothing here, we will exit the probe() of this controller, and it will not
>> go into pci_host_probe() too.
>> This may not be a very standard logic, should we remove it or rewrite in a better way?
>> 
>> > Bjorn
> 
> Rather to remove this starfive_pcie_host_wait_for_link logic.
> 
> Better option would be to teach PCI core code to wait for the link
> before trying to read vendor/device ids, like I described in my old
> proposal.

Yes, the proposal would keep us from getting the timing wrong.
However, as things stand, we have to do the waiting in the host controller driver for now.
We will keep the wait for the link, but not return an error when the link is down,
for example:
    if (starfive_pcie_host_wait_for_link(pcie))
	dev_info(dev, "port link down\n");
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index f02618c2bdf5..b88a54a24ae5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20356,6 +20356,13 @@  S:	Supported
 F:	Documentation/devicetree/bindings/watchdog/starfive*
 F:	drivers/watchdog/starfive-wdt.c
 
+STARFIVE JH71x0 PCIE DRIVER
+M:	Minda Chen <minda.chen@starfivetech.com>
+L:	linux-pci@vger.kernel.org
+S:	Supported
+F:	Documentation/devicetree/bindings/pci/starfive*
+F:	drivers/pci/controller/plda/pcie-starfive.c
+
 STATIC BRANCH/CALL
 M:	Peter Zijlstra <peterz@infradead.org>
 M:	Josh Poimboeuf <jpoimboe@kernel.org>
diff --git a/drivers/pci/controller/plda/Kconfig b/drivers/pci/controller/plda/Kconfig
index a3c790545843..eaf72954da9f 100644
--- a/drivers/pci/controller/plda/Kconfig
+++ b/drivers/pci/controller/plda/Kconfig
@@ -24,4 +24,12 @@  config PCIE_MICROCHIP_HOST
 	  Say Y here if you want kernel to support the Microchip AXI PCIe
 	  Host Bridge driver.
 
+config PCIE_STARFIVE_HOST
+	tristate "StarFive PCIe host controller"
+	select PCIE_PLDA_HOST
+	help
+	  Say Y here if you want to support the StarFive PCIe controller
+	  in host mode. The StarFive PCIe controller uses the PLDA
+	  PCIe core.
+
 endmenu
diff --git a/drivers/pci/controller/plda/Makefile b/drivers/pci/controller/plda/Makefile
index 2f16d9126535..a6089a873d18 100644
--- a/drivers/pci/controller/plda/Makefile
+++ b/drivers/pci/controller/plda/Makefile
@@ -2,3 +2,4 @@ 
 obj-$(CONFIG_PCIE_PLDA_HOST) += pcie-plda-host.o
 obj-$(CONFIG_PCIE_PLDA_PLAT_HOST) += pcie-plda-plat.o
 obj-$(CONFIG_PCIE_MICROCHIP_HOST) += pcie-microchip-host.o
+obj-$(CONFIG_PCIE_STARFIVE_HOST) += pcie-starfive.o
diff --git a/drivers/pci/controller/plda/pcie-plda.h b/drivers/pci/controller/plda/pcie-plda.h
index 8785f885ddb1..ef0d2aa7ccea 100644
--- a/drivers/pci/controller/plda/pcie-plda.h
+++ b/drivers/pci/controller/plda/pcie-plda.h
@@ -15,13 +15,19 @@ 
 #define PLDA_NUM_MSI_IRQS		32
 #define NUM_MSI_IRQS_CODED		5
 
-/* PCIe Bridge Phy Regs */
+/* PCIe Bridge Regs */
+#define GEN_SETTINGS			0x80
+#define  RP_ENABLE			1
 #define PCIE_PCI_IDS_DW1		0x9c
-
+#define  IDS_CLASS_CODE_SHIFT		16
+#define PCI_MISC			0xB4
+#define  PHY_FUNCTION_DIS		BIT(15)
 /* PCIe Config space MSI capability structure */
 #define MSI_CAP_CTRL_OFFSET		0xe0
 #define  MSI_MAX_Q_AVAIL		(NUM_MSI_IRQS_CODED << 1)
 #define  MSI_Q_SIZE			(NUM_MSI_IRQS_CODED << 4)
+#define PCIE_WINROM			0xfc
+#define  PREF_MEM_WIN_64_SUPPORT		BIT(3)
 
 #define IMASK_LOCAL				0x180
 #define  DMA_END_ENGINE_0_MASK			0x00000000u
@@ -75,6 +81,8 @@ 
 #define ISTATUS_HOST				0x18c
 #define IMSI_ADDR				0x190
 #define ISTATUS_MSI				0x194
+#define PMSG_SUPPORT_RX				0x3F0
+#define  PMSG_LTR_SUPPORT			BIT(2)
 
 /* PCIe Master table init defines */
 #define ATR0_PCIE_WIN0_SRCADDR_PARAM		0x600u
@@ -173,4 +181,50 @@  static inline void plda_set_default_msi(struct plda_msi *msi)
 	msi->vector_phy = IMSI_ADDR;
 	msi->num_vectors = PLDA_NUM_MSI_IRQS;
 }
+
+static inline void plda_pcie_enable_root_port(struct plda_pcie *plda)
+{
+	u32 value;
+
+	value = readl_relaxed(plda->bridge_addr + GEN_SETTINGS);
+	value |= RP_ENABLE;
+	writel_relaxed(value, plda->bridge_addr + GEN_SETTINGS);
+}
+
+static inline void plda_pcie_set_standard_class(struct plda_pcie *plda)
+{
+	u32 value;
+
+	value = readl_relaxed(plda->bridge_addr + PCIE_PCI_IDS_DW1);
+	value &= 0xff;
+	value |= (PCI_CLASS_BRIDGE_PCI << IDS_CLASS_CODE_SHIFT);
+	writel_relaxed(value, plda->bridge_addr + PCIE_PCI_IDS_DW1);
+}
+
+static inline void plda_pcie_set_pref_win_64bit(struct plda_pcie *plda)
+{
+	u32 value;
+
+	value = readl_relaxed(plda->bridge_addr + PCIE_WINROM);
+	value |= PREF_MEM_WIN_64_SUPPORT;
+	writel_relaxed(value, plda->bridge_addr + PCIE_WINROM);
+}
+
+static inline void plda_pcie_disable_ltr(struct plda_pcie *plda)
+{
+	u32 value;
+
+	value = readl_relaxed(plda->bridge_addr + PMSG_SUPPORT_RX);
+	value &= ~PMSG_LTR_SUPPORT;
+	writel_relaxed(value, plda->bridge_addr + PMSG_SUPPORT_RX);
+}
+
+static inline void plda_pcie_disable_func(struct plda_pcie *plda)
+{
+	u32 value;
+
+	value = readl_relaxed(plda->bridge_addr + PCI_MISC);
+	value |= PHY_FUNCTION_DIS;
+	writel_relaxed(value, plda->bridge_addr + PCI_MISC);
+}
 #endif /* _PCIE_PLDA_H */
diff --git a/drivers/pci/controller/plda/pcie-starfive.c b/drivers/pci/controller/plda/pcie-starfive.c
new file mode 100644
index 000000000000..816aa77a311d
--- /dev/null
+++ b/drivers/pci/controller/plda/pcie-starfive.c
@@ -0,0 +1,415 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PCIe host controller driver for the StarFive JH7110 SoC.
+ *
+ * Copyright (C) 2023 StarFive Technology Co., Ltd.
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/of_pci.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/regmap.h>
+#include <linux/reset.h>
+#include "../../pci.h"
+
+#include "pcie-plda.h"
+
+#define DATA_LINK_ACTIVE		BIT(5)
+#define PREF_MEM_WIN_64_SUPPORT		BIT(3)
+#define PMSG_LTR_SUPPORT		BIT(2)
+#define LINK_SPEED_GEN2			BIT(12)
+#define PHY_FUNCTION_DIS		BIT(15)
+#define PCIE_FUNC_NUM			4
+#define PHY_FUNC_SHIFT			9
+
+/* system control */
+#define STG_SYSCON_K_RP_NEP			BIT(8)
+#define STG_SYSCON_AXI4_SLVL_ARFUNC_MASK	GENMASK(22, 8)
+#define STG_SYSCON_AXI4_SLVL_ARFUNC_SHIFT	8
+#define STG_SYSCON_AXI4_SLVL_AWFUNC_MASK	GENMASK(14, 0)
+#define STG_SYSCON_CLKREQ			BIT(22)
+#define STG_SYSCON_CKREF_SRC_SHIFT		18
+#define STG_SYSCON_CKREF_SRC_MASK		GENMASK(19, 18)
+
+struct starfive_jh7110_pcie {
+	struct plda_pcie	plda;
+	struct reset_control *resets;
+	struct clk_bulk_data *clks;
+	struct regmap *reg_syscon;
+	struct gpio_desc *power_gpio;
+	struct gpio_desc *reset_gpio;
+
+	u32 stg_arfun;
+	u32 stg_awfun;
+	u32 stg_rp_nep;
+	u32 stg_lnksta;
+
+	int num_clks;
+};
+
+/*
+ * BAR0/1 of the bridge should be hidden during enumeration to
+ * avoid sizing and resource allocation by the PCI core.
+ */
+static bool starfive_pcie_hide_rc_bar(struct pci_bus *bus, unsigned int  devfn,
+				      int offset)
+{
+	if (pci_is_root_bus(bus) && !devfn &&
+	    (offset == PCI_BASE_ADDRESS_0 || offset == PCI_BASE_ADDRESS_1))
+		return true;
+
+	return false;
+}
+
+int starfive_pcie_config_write(struct pci_bus *bus, unsigned int devfn,
+			       int where, int size, u32 value)
+{
+	if (starfive_pcie_hide_rc_bar(bus, devfn, where))
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+
+	return pci_generic_config_write(bus, devfn, where, size, value);
+}
+
+static int starfive_pcie_parse_dt(struct starfive_jh7110_pcie *pcie, struct device *dev)
+{
+	unsigned int args[4];
+
+	pcie->num_clks = devm_clk_bulk_get_all(dev, &pcie->clks);
+	if (pcie->num_clks < 0)
+		return dev_err_probe(dev, -ENODEV,
+			"Failed to get pcie clocks\n");
+
+	pcie->resets = devm_reset_control_array_get_exclusive(dev);
+	if (IS_ERR(pcie->resets))
+		return dev_err_probe(dev, PTR_ERR(pcie->resets),
+			"Failed to get pcie resets");
+
+	pcie->reg_syscon =
+		syscon_regmap_lookup_by_phandle_args(dev->of_node,
+						     "starfive,stg-syscon", 4, args);
+
+	if (IS_ERR(pcie->reg_syscon))
+		return dev_err_probe(dev, PTR_ERR(pcie->reg_syscon),
+			"Failed to parse starfive,stg-syscon\n");
+
+	pcie->stg_arfun = args[0];
+	pcie->stg_awfun = args[1];
+	pcie->stg_rp_nep = args[2];
+	pcie->stg_lnksta = args[3];
+
+	pcie->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
+	if (IS_ERR_OR_NULL(pcie->reset_gpio)) {
+		dev_warn(dev, "Failed to get reset-gpio.\n");
+		return -EINVAL;
+	}
+
+	pcie->power_gpio = devm_gpiod_get_optional(dev, "enable", GPIOD_OUT_LOW);
+	if (IS_ERR(pcie->power_gpio)) {
+		dev_warn(dev, "Failed to get power-gpio.\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct pci_ops starfive_pcie_ops = {
+	.map_bus	= plda_pcie_map_bus,
+	.read           = pci_generic_config_read,
+	.write          = starfive_pcie_config_write,
+};
+
+static int starfive_pcie_clk_rst_init(struct starfive_jh7110_pcie *pcie)
+{
+	int ret;
+	struct device *dev = pcie->plda.dev;
+
+	ret = clk_bulk_prepare_enable(pcie->num_clks, pcie->clks);
+	if (ret) {
+		dev_err(dev, "Failed to enable clocks\n");
+		return ret;
+	}
+
+	ret = reset_control_deassert(pcie->resets);
+	if (ret) {
+		clk_bulk_disable_unprepare(pcie->num_clks, pcie->clks);
+		dev_err(dev, "Failed to deassert resets\n");
+	}
+
+	return ret;
+}
+
+static void starfive_pcie_clk_rst_deinit(struct starfive_jh7110_pcie *pcie)
+{
+	reset_control_assert(pcie->resets);
+	clk_bulk_disable_unprepare(pcie->num_clks, pcie->clks);
+}
+
+static int starfive_pcie_is_link_up(struct starfive_jh7110_pcie *pcie)
+{
+	struct device *dev = pcie->plda.dev;
+	int ret;
+	u32 stg_reg_val;
+
+	/* 100ms timeout value should be enough for Gen1/2 training */
+	ret = regmap_read_poll_timeout(pcie->reg_syscon,
+				       pcie->stg_lnksta,
+				       stg_reg_val,
+				       stg_reg_val & DATA_LINK_ACTIVE,
+				       10 * 1000, 100 * 1000);
+
+	/* If the link is down (no device in slot), then exit. */
+	if (ret == -ETIMEDOUT) {
+		dev_info(dev, "Port link down, exit.\n");
+		return 0;
+	} else if (ret == 0) {
+		dev_info(dev, "Port link up.\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+int starfive_pcie_enable_phy(struct device *dev, struct plda_pcie *pcie)
+{
+	int ret;
+
+	if (!pcie->phy)
+		return 0;
+
+	ret = phy_init(pcie->phy);
+	if (ret)
+		return dev_err_probe(dev, ret,
+			"failed to initialize pcie phy\n");
+
+	ret = phy_set_mode(pcie->phy, PHY_MODE_PCIE);
+	if (ret) {
+		dev_err(dev, "failed to set pcie mode\n");
+		goto err_phy_on;
+	}
+
+	ret = phy_power_on(pcie->phy);
+	if (ret) {
+		dev_err(dev, "failed to power on pcie phy\n");
+		goto err_phy_on;
+	}
+
+	return 0;
+
+err_phy_on:
+	phy_exit(pcie->phy);
+	return ret;
+}
+
+void starfive_pcie_disable_phy(struct plda_pcie *pcie)
+{
+	phy_power_off(pcie->phy);
+	phy_exit(pcie->phy);
+}
+
+static void starfive_pcie_host_deinit(struct plda_pcie *plda)
+{
+	struct starfive_jh7110_pcie *pcie =
+		container_of(plda, struct starfive_jh7110_pcie, plda);
+
+	starfive_pcie_clk_rst_deinit(pcie);
+	if (pcie->power_gpio)
+		gpiod_set_value_cansleep(pcie->power_gpio, 0);
+	starfive_pcie_disable_phy(plda);
+}
+
+static int starfive_pcie_host_init(struct plda_pcie *plda)
+{
+	int i;
+	struct starfive_jh7110_pcie *pcie =
+		container_of(plda, struct starfive_jh7110_pcie, plda);
+	struct device *dev = plda->dev;
+	int ret;
+
+	ret = starfive_pcie_enable_phy(dev, plda);
+	if (ret)
+		return ret;
+
+	regmap_update_bits(pcie->reg_syscon, pcie->stg_rp_nep,
+			   STG_SYSCON_K_RP_NEP, STG_SYSCON_K_RP_NEP);
+
+	regmap_update_bits(pcie->reg_syscon, pcie->stg_awfun,
+			   STG_SYSCON_CKREF_SRC_MASK,
+			   2 << STG_SYSCON_CKREF_SRC_SHIFT);
+
+	regmap_update_bits(pcie->reg_syscon, pcie->stg_awfun,
+			   STG_SYSCON_CLKREQ, STG_SYSCON_CLKREQ);
+
+	ret = starfive_pcie_clk_rst_init(pcie);
+	if (ret)
+		return ret;
+
+	if (pcie->power_gpio)
+		gpiod_set_value_cansleep(pcie->power_gpio, 1);
+
+	gpiod_set_value_cansleep(pcie->reset_gpio, 1);
+
+	/* Disable physical functions except #0 */
+	for (i = 1; i < PCIE_FUNC_NUM; i++) {
+		regmap_update_bits(pcie->reg_syscon,
+				   pcie->stg_arfun,
+				   STG_SYSCON_AXI4_SLVL_ARFUNC_MASK,
+				   (i << PHY_FUNC_SHIFT) <<
+				   STG_SYSCON_AXI4_SLVL_ARFUNC_SHIFT);
+		regmap_update_bits(pcie->reg_syscon,
+				   pcie->stg_awfun,
+				   STG_SYSCON_AXI4_SLVL_AWFUNC_MASK,
+				   i << PHY_FUNC_SHIFT);
+
+		plda_pcie_disable_func(plda);
+	}
+
+	regmap_update_bits(pcie->reg_syscon, pcie->stg_arfun,
+			   STG_SYSCON_AXI4_SLVL_ARFUNC_MASK, 0);
+	regmap_update_bits(pcie->reg_syscon, pcie->stg_awfun,
+			   STG_SYSCON_AXI4_SLVL_AWFUNC_MASK, 0);
+
+	/* Enable root port */
+	plda_pcie_enable_root_port(plda);
+
+	/* PCIe PCI Standard Configuration Identification Settings. */
+	plda_pcie_set_standard_class(plda);
+
+	/*
+	 * LTR message forwarding in PCIe Message Reception is enabled by the
+	 * core by default, but the forwarding ID & address also need to be
+	 * reset.  If we neither disable LTR message forwarding here nor set
+	 * a legal forwarding address, the kernel gets stuck after this
+	 * driver probes.  As a workaround, disable the LTR message
+	 * forwarding support in PCIe Message Reception.
+	 */
+	plda_pcie_disable_ltr(plda);
+
+	/* Prefetchable memory window 64-bit addressing support */
+	plda_pcie_set_pref_win_64bit(plda);
+
+	/* Keep PERST# asserted for at least T_PVPERL (100 ms) after power is stable */
+	msleep(300);
+	gpiod_set_value_cansleep(pcie->reset_gpio, 0);
+
+	if (!starfive_pcie_is_link_up(pcie))
+		return -EIO;
+
+	return ret;
+}
+
+static const struct plda_pcie_ops pcie_ops = {
+	.host_init = starfive_pcie_host_init,
+	.host_deinit = starfive_pcie_host_deinit,
+};
+
+static int starfive_pcie_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct starfive_jh7110_pcie *pcie;
+	struct plda_pcie *plda;
+	int ret;
+
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
+	if (!pcie)
+		return -ENOMEM;
+
+	plda = &pcie->plda;
+	plda->dev = dev;
+
+	ret = starfive_pcie_parse_dt(pcie, dev);
+	if (ret)
+		return ret;
+
+	plda->ops = &pcie_ops;
+	plda->num_events = NUM_PLDA_EVENTS;
+	ret = plda_pcie_host_init(&pcie->plda, &starfive_pcie_ops);
+	if (ret)
+		return ret;
+
+	pm_runtime_enable(&pdev->dev);
+	pm_runtime_get_sync(&pdev->dev);
+	platform_set_drvdata(pdev, pcie);
+
+	return 0;
+}
+
+static void starfive_pcie_remove(struct platform_device *pdev)
+{
+	struct starfive_jh7110_pcie *pcie = platform_get_drvdata(pdev);
+
+	plda_pcie_host_deinit(&pcie->plda);
+	platform_set_drvdata(pdev, NULL);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int __maybe_unused starfive_pcie_suspend_noirq(struct device *dev)
+{
+	struct starfive_jh7110_pcie *pcie = dev_get_drvdata(dev);
+
+	if (!pcie)
+		return 0;
+
+	clk_bulk_disable_unprepare(pcie->num_clks, pcie->clks);
+	starfive_pcie_disable_phy(&pcie->plda);
+
+	return 0;
+}
+
+static int __maybe_unused starfive_pcie_resume_noirq(struct device *dev)
+{
+	struct starfive_jh7110_pcie *pcie = dev_get_drvdata(dev);
+	int ret;
+
+	if (!pcie)
+		return 0;
+
+	ret = starfive_pcie_enable_phy(dev, &pcie->plda);
+	if (ret)
+		return ret;
+
+	ret = clk_bulk_prepare_enable(pcie->num_clks, pcie->clks);
+	if (ret) {
+		dev_err(dev, "Failed to enable clocks\n");
+		starfive_pcie_disable_phy(&pcie->plda);
+		return ret;
+	}
+
+	return ret;
+}
+
+static const struct dev_pm_ops starfive_pcie_pm_ops = {
+	SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(starfive_pcie_suspend_noirq,
+				      starfive_pcie_resume_noirq)
+};
+#endif
+
+static const struct of_device_id starfive_pcie_of_match[] = {
+	{ .compatible = "starfive,jh7110-pcie"},
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, starfive_pcie_of_match);
+
+static struct platform_driver starfive_pcie_driver = {
+	.driver = {
+		.name = "pcie-starfive",
+		.of_match_table = of_match_ptr(starfive_pcie_of_match),
+#ifdef CONFIG_PM_SLEEP
+		.pm = &starfive_pcie_pm_ops,
+#endif
+	},
+	.probe = starfive_pcie_probe,
+	.remove_new = starfive_pcie_remove,
+};
+module_platform_driver(starfive_pcie_driver);
+
+MODULE_DESCRIPTION("StarFive JH7110 PCIe host driver");
+MODULE_LICENSE("GPL v2");
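
As a side note on the config accessors in this patch:
starfive_pcie_config_write() drops writes to the hidden root port BARs,
while reads still go through pci_generic_config_read() unchanged.  A
read-side counterpart that makes those BARs read back as zero could look
like the sketch below (illustrative only, not part of the submitted patch):

	static int starfive_pcie_config_read(struct pci_bus *bus, unsigned int devfn,
					     int where, int size, u32 *value)
	{
		int ret = pci_generic_config_read(bus, devfn, where, size, value);

		/* Make the hidden root port BARs read back as zero */
		if (ret == PCIBIOS_SUCCESSFUL &&
		    starfive_pcie_hide_rc_bar(bus, devfn, where))
			*value = 0;

		return ret;
	}

It would then be wired up via .read = starfive_pcie_config_read in
starfive_pcie_ops.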