diff mbox

[RFC] PCIe: Add PCIe runtime D3cold support

Message ID 4F8790F6.5080408@intel.com
State Changes Requested, archived
Headers show

Commit Message

Yan, Zheng April 13, 2012, 2:35 a.m. UTC
Hi all,

This patch adds PCIe runtime D3cold support, namely cut power supply for functions
beneath a PCIe port when they all have entered D3. A device in D3cold can only
generate wake event through the WAKE# pin. Because we can not access to a device's
configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.

Any comment will be appreciated.

Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
---
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Alex He April 13, 2012, 6:06 a.m. UTC | #1
On Fri, Apr 13, 2012 at 10:35:34AM +0800, Yan, Zheng wrote:
> Hi all,
> 
> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> beneath a PCIe port when they all have entered D3. A device in D3cold can only
> generate wake event through the WAKE# pin. Because we can not access to a device's
> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> 
> Any comment will be appreciated.
> 
> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> ---
> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> index 0f150f2..e210e8cb 100644
> --- a/drivers/pci/pci-acpi.c
> +++ b/drivers/pci/pci-acpi.c
> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>  		[PCI_D1] = ACPI_STATE_D1,
>  		[PCI_D2] = ACPI_STATE_D2,
>  		[PCI_D3hot] = ACPI_STATE_D3,
> -		[PCI_D3cold] = ACPI_STATE_D3
> +		[PCI_D3cold] = ACPI_STATE_D3_COLD
>  	};
>  	int error = -EINVAL;
>  
> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>  
>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>  {
> -	if (dev->pme_interrupt)
> +	/* PME interrupt isn't available in the D3cold case */
> +	if (dev->pme_interrupt && !dev->runtime_d3cold)
>  		return 0;
PCI BUS PM Interface spec says that PMEs can be support for D3COLD.
Please check section 3.2.4 , 5.4.2 and 5.4.3.


>  
>  	if (!acpi_pm_device_run_wake(&dev->dev, enable))
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 8156744..bc16869 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>  	int error;
>  
>  	/* bound the state we're entering */
> -	if (state > PCI_D3hot)
> -		state = PCI_D3hot;
> +	if (state > PCI_D3cold)
> +		state = PCI_D3cold;
>  	else if (state < PCI_D0)
>  		state = PCI_D0;
>  	else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>  	if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
>  		return 0;
>  
> -	error = pci_raw_set_power_state(dev, state);
> +	error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
> +					PCI_D3hot : state);
Do you want to set the pci device to D3hot if the state = D3Code through
PMCSR register? I am agree if you realy want that but what is the meaninig
to distingulish D3cold with D3hot in this patch. We can't put the PCI device
through PMCSR register.

>  
>  	if (!__pci_complete_power_transition(dev, state))
>  		error = 0;
> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
>  	return !!(dev->pme_support & (1 << state));
>  }
>  
> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
> +{
> +	struct pci_dev *bridge = dev->bus->self;
> +
> +	/* don't poll the pme bit if parent is in low power state */
> +	if (bridge && bridge->current_state != PCI_D0)
> +		return;
> +
> +	pci_pme_wakeup(dev, NULL);
> +}
> +
>  static void pci_pme_list_scan(struct work_struct *work)
>  {
>  	struct pci_pme_device *pme_dev, *n;
> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
>  	if (!list_empty(&pci_pme_list)) {
>  		list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
>  			if (pme_dev->dev->pme_poll) {
> -				pci_pme_wakeup(pme_dev->dev, NULL);
> +				pci_pme_poll_wakeup(pme_dev->dev);
>  			} else {
>  				list_del(&pme_dev->list);
>  				kfree(pme_dev);
> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
>  	if (enable) {
>  		int error;
>  
> +		if (runtime && state >= PCI_D3cold)
> +			dev->runtime_d3cold = true;
> +		else
> +			dev->runtime_d3cold = false;
>  		if (pci_pme_capable(dev, state))
>  			pci_pme_active(dev, true);
>  		else
> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
> index e0610bd..d66b7e9 100644
> --- a/drivers/pci/pcie/portdrv_pci.c
> +++ b/drivers/pci/pcie/portdrv_pci.c
> @@ -11,11 +11,13 @@
>  #include <linux/kernel.h>
>  #include <linux/errno.h>
>  #include <linux/pm.h>
> +#include <linux/pm_runtime.h>
>  #include <linux/init.h>
>  #include <linux/pcieport_if.h>
>  #include <linux/aer.h>
>  #include <linux/dmi.h>
>  #include <linux/pci-aspm.h>
> +#include <linux/delay.h>
>  
>  #include "portdrv.h"
>  #include "aer/aerdrv.h"
> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
>  	return 0;
>  }
>  
> +static int pcie_port_runtime_suspend(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +
> +	pci_save_state(pdev);
> +	return 0;
> +}
> +
> +static int pcie_port_runtime_resume(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +
> +	pci_restore_state(pdev);
> +	if (pdev->runtime_d3cold)
> +		msleep(100);
> +
> +	return 0;
> +}
> +
>  static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>  	.suspend	= pcie_port_device_suspend,
>  	.resume		= pcie_port_device_resume,
> @@ -107,6 +128,8 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>  	.poweroff	= pcie_port_device_suspend,
>  	.restore	= pcie_port_device_resume,
>  	.resume_noirq	= pcie_port_resume_noirq,
> +	.runtime_suspend = pcie_port_runtime_suspend,
> +	.runtime_resume = pcie_port_runtime_resume,
>  };
>  
>  #define PCIE_PORTDRV_PM_OPS	(&pcie_portdrv_pm_ops)
> @@ -144,12 +167,14 @@ static int __devinit pcie_portdrv_probe(struct pci_dev *dev,
>  		return status;
>  
>  	pci_save_state(dev);
> +	pm_runtime_put_noidle(&dev->dev);
>  
>  	return 0;
>  }
>  
>  static void pcie_portdrv_remove(struct pci_dev *dev)
>  {
> +	pm_runtime_get_noresume(&dev->dev);
>  	pcie_port_device_remove(dev);
>  	pci_disable_device(dev);
>  }
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index e444f5b..b41d9a1 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -281,6 +281,7 @@ struct pci_dev {
>  	unsigned int	no_d1d2:1;	/* Only allow D0 and D3 */
>  	unsigned int	mmio_always_on:1;	/* disallow turning off io/mem
>  						   decoding during bar sizing */
> +	unsigned int	runtime_d3cold:1;
>  	unsigned int	wakeup_prepared:1;
>  	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
>  
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yan, Zheng April 13, 2012, 6:28 a.m. UTC | #2
On 04/13/2012 02:06 PM, Alex He wrote:
> On Fri, Apr 13, 2012 at 10:35:34AM +0800, Yan, Zheng wrote:
>> Hi all,
>>
>> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
>> beneath a PCIe port when they all have entered D3. A device in D3cold can only
>> generate wake event through the WAKE# pin. Because we can not access to a device's
>> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
>>
>> Any comment will be appreciated.
>>
>> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
>> ---
>> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
>> index 0f150f2..e210e8cb 100644
>> --- a/drivers/pci/pci-acpi.c
>> +++ b/drivers/pci/pci-acpi.c
>> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>  		[PCI_D1] = ACPI_STATE_D1,
>>  		[PCI_D2] = ACPI_STATE_D2,
>>  		[PCI_D3hot] = ACPI_STATE_D3,
>> -		[PCI_D3cold] = ACPI_STATE_D3
>> +		[PCI_D3cold] = ACPI_STATE_D3_COLD
>>  	};
>>  	int error = -EINVAL;
>>  
>> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>>  
>>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>>  {
>> -	if (dev->pme_interrupt)
>> +	/* PME interrupt isn't available in the D3cold case */
>> +	if (dev->pme_interrupt && !dev->runtime_d3cold)
>>  		return 0;
> PCI BUS PM Interface spec says that PMEs can be support for D3COLD.
> Please check section 3.2.4 , 5.4.2 and 5.4.3.
> 
I can't make PME interrupt work in mint spring platform. Our internal RTD3 document say:
"A device in RTD3 is prohibited from generating any activity other than wake event, through the WAKE# pin"

>>  
>>  	if (!acpi_pm_device_run_wake(&dev->dev, enable))
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index 8156744..bc16869 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>  	int error;
>>  
>>  	/* bound the state we're entering */
>> -	if (state > PCI_D3hot)
>> -		state = PCI_D3hot;
>> +	if (state > PCI_D3cold)
>> +		state = PCI_D3cold;
>>  	else if (state < PCI_D0)
>>  		state = PCI_D0;
>>  	else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
>> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>  	if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
>>  		return 0;
>>  
>> -	error = pci_raw_set_power_state(dev, state);
>> +	error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
>> +					PCI_D3hot : state);
> Do you want to set the pci device to D3hot if the state = D3Code through
> PMCSR register? I am agree if you realy want that but what is the meaninig
> to distingulish D3cold with D3hot in this patch. We can't put the PCI device
> through PMCSR register.

To set a device to D3cold. We first set device to D3hot through PMCSR register,
then cut the power supply through ACPI.

Regards
Yan, Zheng

> 
>>  
>>  	if (!__pci_complete_power_transition(dev, state))
>>  		error = 0;
>> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
>>  	return !!(dev->pme_support & (1 << state));
>>  }
>>  
>> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
>> +{
>> +	struct pci_dev *bridge = dev->bus->self;
>> +
>> +	/* don't poll the pme bit if parent is in low power state */
>> +	if (bridge && bridge->current_state != PCI_D0)
>> +		return;
>> +
>> +	pci_pme_wakeup(dev, NULL);
>> +}
>> +
>>  static void pci_pme_list_scan(struct work_struct *work)
>>  {
>>  	struct pci_pme_device *pme_dev, *n;
>> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
>>  	if (!list_empty(&pci_pme_list)) {
>>  		list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
>>  			if (pme_dev->dev->pme_poll) {
>> -				pci_pme_wakeup(pme_dev->dev, NULL);
>> +				pci_pme_poll_wakeup(pme_dev->dev);
>>  			} else {
>>  				list_del(&pme_dev->list);
>>  				kfree(pme_dev);
>> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
>>  	if (enable) {
>>  		int error;
>>  
>> +		if (runtime && state >= PCI_D3cold)
>> +			dev->runtime_d3cold = true;
>> +		else
>> +			dev->runtime_d3cold = false;
>>  		if (pci_pme_capable(dev, state))
>>  			pci_pme_active(dev, true);
>>  		else
>> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
>> index e0610bd..d66b7e9 100644
>> --- a/drivers/pci/pcie/portdrv_pci.c
>> +++ b/drivers/pci/pcie/portdrv_pci.c
>> @@ -11,11 +11,13 @@
>>  #include <linux/kernel.h>
>>  #include <linux/errno.h>
>>  #include <linux/pm.h>
>> +#include <linux/pm_runtime.h>
>>  #include <linux/init.h>
>>  #include <linux/pcieport_if.h>
>>  #include <linux/aer.h>
>>  #include <linux/dmi.h>
>>  #include <linux/pci-aspm.h>
>> +#include <linux/delay.h>
>>  
>>  #include "portdrv.h"
>>  #include "aer/aerdrv.h"
>> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
>>  	return 0;
>>  }
>>  
>> +static int pcie_port_runtime_suspend(struct device *dev)
>> +{
>> +	struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +	pci_save_state(pdev);
>> +	return 0;
>> +}
>> +
>> +static int pcie_port_runtime_resume(struct device *dev)
>> +{
>> +	struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +	pci_restore_state(pdev);
>> +	if (pdev->runtime_d3cold)
>> +		msleep(100);
>> +
>> +	return 0;
>> +}
>> +
>>  static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>>  	.suspend	= pcie_port_device_suspend,
>>  	.resume		= pcie_port_device_resume,
>> @@ -107,6 +128,8 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>>  	.poweroff	= pcie_port_device_suspend,
>>  	.restore	= pcie_port_device_resume,
>>  	.resume_noirq	= pcie_port_resume_noirq,
>> +	.runtime_suspend = pcie_port_runtime_suspend,
>> +	.runtime_resume = pcie_port_runtime_resume,
>>  };
>>  
>>  #define PCIE_PORTDRV_PM_OPS	(&pcie_portdrv_pm_ops)
>> @@ -144,12 +167,14 @@ static int __devinit pcie_portdrv_probe(struct pci_dev *dev,
>>  		return status;
>>  
>>  	pci_save_state(dev);
>> +	pm_runtime_put_noidle(&dev->dev);
>>  
>>  	return 0;
>>  }
>>  
>>  static void pcie_portdrv_remove(struct pci_dev *dev)
>>  {
>> +	pm_runtime_get_noresume(&dev->dev);
>>  	pcie_port_device_remove(dev);
>>  	pci_disable_device(dev);
>>  }
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index e444f5b..b41d9a1 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -281,6 +281,7 @@ struct pci_dev {
>>  	unsigned int	no_d1d2:1;	/* Only allow D0 and D3 */
>>  	unsigned int	mmio_always_on:1;	/* disallow turning off io/mem
>>  						   decoding during bar sizing */
>> +	unsigned int	runtime_d3cold:1;
>>  	unsigned int	wakeup_prepared:1;
>>  	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
>>  
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-pm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 13, 2012, 7:41 p.m. UTC | #3
Hi,

On Friday, April 13, 2012, Yan, Zheng wrote:
> Hi all,
> 
> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> beneath a PCIe port when they all have entered D3. A device in D3cold can only
> generate wake event through the WAKE# pin. Because we can not access to a device's
> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> 
> Any comment will be appreciated.
> 
> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> ---
> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> index 0f150f2..e210e8cb 100644
> --- a/drivers/pci/pci-acpi.c
> +++ b/drivers/pci/pci-acpi.c
> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>  		[PCI_D1] = ACPI_STATE_D1,
>  		[PCI_D2] = ACPI_STATE_D2,
>  		[PCI_D3hot] = ACPI_STATE_D3,
> -		[PCI_D3cold] = ACPI_STATE_D3
> +		[PCI_D3cold] = ACPI_STATE_D3_COLD
>  	};
>  	int error = -EINVAL;
>  

Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.

We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
instead.  I'll prepare a patch for that over the weekend if no one has done
that already.

> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>  
>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>  {
> -	if (dev->pme_interrupt)
> +	/* PME interrupt isn't available in the D3cold case */
> +	if (dev->pme_interrupt && !dev->runtime_d3cold)

This whole thing is wrong.  First off, I don't think that the runtime_d3cold
flag makes any sense.  We already cover that in dev->pme_support.

Second, pme_interrupt means that the _root_ _port_, not the device itself will
trigger an interrupt whenever the device sends the PME message to it (which
very well may happen for a device in D3_cold woken up by an external signal).

>  		return 0;
>  
>  	if (!acpi_pm_device_run_wake(&dev->dev, enable))
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 8156744..bc16869 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>  	int error;
>  

Guys, please.  Never, _ever_, touch pci_set_power_state() without discussing
your ideas with someone who knows how it works and _why_ it works this way.

The problem here is that you can't program a PCI device into D3_cold, so it
doesn't even make sense to have a helper for that.

>  	/* bound the state we're entering */
> -	if (state > PCI_D3hot)
> -		state = PCI_D3hot;
> +	if (state > PCI_D3cold)
> +		state = PCI_D3cold;
>  	else if (state < PCI_D0)
>  		state = PCI_D0;
>  	else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>  	if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
>  		return 0;
>  
> -	error = pci_raw_set_power_state(dev, state);
> +	error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
> +					PCI_D3hot : state);
>  
>  	if (!__pci_complete_power_transition(dev, state))
>  		error = 0;
> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
>  	return !!(dev->pme_support & (1 << state));
>  }
>  
> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
> +{
> +	struct pci_dev *bridge = dev->bus->self;
> +
> +	/* don't poll the pme bit if parent is in low power state */
> +	if (bridge && bridge->current_state != PCI_D0)
> +		return;
> +
> +	pci_pme_wakeup(dev, NULL);
> +}

This one actually makes some sense, although it might be better to put the
test into pci_pme_wakeup() itself.

> +
>  static void pci_pme_list_scan(struct work_struct *work)
>  {
>  	struct pci_pme_device *pme_dev, *n;
> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
>  	if (!list_empty(&pci_pme_list)) {
>  		list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
>  			if (pme_dev->dev->pme_poll) {
> -				pci_pme_wakeup(pme_dev->dev, NULL);
> +				pci_pme_poll_wakeup(pme_dev->dev);
>  			} else {
>  				list_del(&pme_dev->list);
>  				kfree(pme_dev);
> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
>  	if (enable) {
>  		int error;
>  
> +		if (runtime && state >= PCI_D3cold)
> +			dev->runtime_d3cold = true;
> +		else
> +			dev->runtime_d3cold = false;
>  		if (pci_pme_capable(dev, state))
>  			pci_pme_active(dev, true);
>  		else
> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
> index e0610bd..d66b7e9 100644
> --- a/drivers/pci/pcie/portdrv_pci.c
> +++ b/drivers/pci/pcie/portdrv_pci.c
> @@ -11,11 +11,13 @@
>  #include <linux/kernel.h>
>  #include <linux/errno.h>
>  #include <linux/pm.h>
> +#include <linux/pm_runtime.h>
>  #include <linux/init.h>
>  #include <linux/pcieport_if.h>
>  #include <linux/aer.h>
>  #include <linux/dmi.h>
>  #include <linux/pci-aspm.h>
> +#include <linux/delay.h>
>  
>  #include "portdrv.h"
>  #include "aer/aerdrv.h"
> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
>  	return 0;
>  }
>  
> +static int pcie_port_runtime_suspend(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +
> +	pci_save_state(pdev);

Are you sure this is sufficient?

> +	return 0;
> +}
> +
> +static int pcie_port_runtime_resume(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +
> +	pci_restore_state(pdev);
> +	if (pdev->runtime_d3cold)
> +		msleep(100);

What's _that_ supposed to do?

> +
> +	return 0;
> +}
> +
>  static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>  	.suspend	= pcie_port_device_suspend,
>  	.resume		= pcie_port_device_resume,
> @@ -107,6 +128,8 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>  	.poweroff	= pcie_port_device_suspend,
>  	.restore	= pcie_port_device_resume,
>  	.resume_noirq	= pcie_port_resume_noirq,
> +	.runtime_suspend = pcie_port_runtime_suspend,
> +	.runtime_resume = pcie_port_runtime_resume,
>  };
>  
>  #define PCIE_PORTDRV_PM_OPS	(&pcie_portdrv_pm_ops)
> @@ -144,12 +167,14 @@ static int __devinit pcie_portdrv_probe(struct pci_dev *dev,
>  		return status;
>  
>  	pci_save_state(dev);
> +	pm_runtime_put_noidle(&dev->dev);

What's the purpose of this?

>  	return 0;
>  }
>  
>  static void pcie_portdrv_remove(struct pci_dev *dev)
>  {
> +	pm_runtime_get_noresume(&dev->dev);
>  	pcie_port_device_remove(dev);
>  	pci_disable_device(dev);
>  }
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index e444f5b..b41d9a1 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -281,6 +281,7 @@ struct pci_dev {
>  	unsigned int	no_d1d2:1;	/* Only allow D0 and D3 */
>  	unsigned int	mmio_always_on:1;	/* disallow turning off io/mem
>  						   decoding during bar sizing */
> +	unsigned int	runtime_d3cold:1;
>  	unsigned int	wakeup_prepared:1;
>  	unsigned int	d3_delay;	/* D3->D0 transition time in ms */

OK

So now please tell me what exactly you want to achieve and why you want to do
that in the first place.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lin Ming April 16, 2012, 12:48 a.m. UTC | #4
On Fri, 2012-04-13 at 21:41 +0200, Rafael J. Wysocki wrote:
> Hi,
> 
> On Friday, April 13, 2012, Yan, Zheng wrote:
> > Hi all,
> > 
> > This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> > beneath a PCIe port when they all have entered D3. A device in D3cold can only
> > generate wake event through the WAKE# pin. Because we can not access to a device's
> > configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> > 
> > Any comment will be appreciated.
> > 
> > Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> > ---
> > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> > index 0f150f2..e210e8cb 100644
> > --- a/drivers/pci/pci-acpi.c
> > +++ b/drivers/pci/pci-acpi.c
> > @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >  		[PCI_D1] = ACPI_STATE_D1,
> >  		[PCI_D2] = ACPI_STATE_D2,
> >  		[PCI_D3hot] = ACPI_STATE_D3,
> > -		[PCI_D3cold] = ACPI_STATE_D3
> > +		[PCI_D3cold] = ACPI_STATE_D3_COLD
> >  	};
> >  	int error = -EINVAL;
> >  
> 
> Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
> 
> We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> instead.  I'll prepare a patch for that over the weekend if no one has done
> that already.

Hi Rafael,

Have you started to write the patch?

If not, I can do it.

Thanks,
Lin Ming

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yan, Zheng April 16, 2012, 2:23 a.m. UTC | #5
On 04/14/2012 03:41 AM, Rafael J. Wysocki wrote:
> Hi,
> 
> On Friday, April 13, 2012, Yan, Zheng wrote:
>> Hi all,
>>
>> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
>> beneath a PCIe port when they all have entered D3. A device in D3cold can only
>> generate wake event through the WAKE# pin. Because we can not access to a device's
>> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
>>
>> Any comment will be appreciated.
>>
>> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
>> ---
>> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
>> index 0f150f2..e210e8cb 100644
>> --- a/drivers/pci/pci-acpi.c
>> +++ b/drivers/pci/pci-acpi.c
>> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>  		[PCI_D1] = ACPI_STATE_D1,
>>  		[PCI_D2] = ACPI_STATE_D2,
>>  		[PCI_D3hot] = ACPI_STATE_D3,
>> -		[PCI_D3cold] = ACPI_STATE_D3
>> +		[PCI_D3cold] = ACPI_STATE_D3_COLD
>>  	};
>>  	int error = -EINVAL;
>>  
> 
> Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
> 
> We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> instead.  I'll prepare a patch for that over the weekend if no one has done
> that already.
> 
>> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>>  
>>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>>  {
>> -	if (dev->pme_interrupt)
>> +	/* PME interrupt isn't available in the D3cold case */
>> +	if (dev->pme_interrupt && !dev->runtime_d3cold)
> 
> This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> flag makes any sense.  We already cover that in dev->pme_support.
> 
> Second, pme_interrupt means that the _root_ _port_, not the device itself will
> trigger an interrupt whenever the device sends the PME message to it (which
> very well may happen for a device in D3_cold woken up by an external signal).
> 
The reason I introduced the runtime_d3cold flag is I can't make the PME interrupt
work in the D3cold case in our test platform. Document for our test platform says
A device in D3cold can only generate wake event through the WAKE# pin.

>>  		return 0;
>>  
>>  	if (!acpi_pm_device_run_wake(&dev->dev, enable))
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index 8156744..bc16869 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>  	int error;
>>  
> 
> Guys, please.  Never, _ever_, touch pci_set_power_state() without discussing
> your ideas with someone who knows how it works and _why_ it works this way.
> 
> The problem here is that you can't program a PCI device into D3_cold, so it
> doesn't even make sense to have a helper for that.
> 
>>  	/* bound the state we're entering */
>> -	if (state > PCI_D3hot)
>> -		state = PCI_D3hot;
>> +	if (state > PCI_D3cold)
>> +		state = PCI_D3cold;
>>  	else if (state < PCI_D0)
>>  		state = PCI_D0;
>>  	else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
>> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>  	if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
>>  		return 0;
>>  
>> -	error = pci_raw_set_power_state(dev, state);
>> +	error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
>> +					PCI_D3hot : state);
>>  
>>  	if (!__pci_complete_power_transition(dev, state))
>>  		error = 0;
>> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
>>  	return !!(dev->pme_support & (1 << state));
>>  }
>>  
>> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
>> +{
>> +	struct pci_dev *bridge = dev->bus->self;
>> +
>> +	/* don't poll the pme bit if parent is in low power state */
>> +	if (bridge && bridge->current_state != PCI_D0)
>> +		return;
>> +
>> +	pci_pme_wakeup(dev, NULL);
>> +}
> 
> This one actually makes some sense, although it might be better to put the
> test into pci_pme_wakeup() itself.

put it into pci_pme_wakeup will break ACPI wakeup, because ACPI wakeup also
uses pci_pme_wakeup. When a device exits from D3cold by asserting the WAKE#
signal, device power is restored automatically by ACPI, then pci_pme_wakeup
is called to check device's PME bit. pci_dev->current_state is PCI_D3hot
during the checking. 

> 
>> +
>>  static void pci_pme_list_scan(struct work_struct *work)
>>  {
>>  	struct pci_pme_device *pme_dev, *n;
>> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
>>  	if (!list_empty(&pci_pme_list)) {
>>  		list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
>>  			if (pme_dev->dev->pme_poll) {
>> -				pci_pme_wakeup(pme_dev->dev, NULL);
>> +				pci_pme_poll_wakeup(pme_dev->dev);
>>  			} else {
>>  				list_del(&pme_dev->list);
>>  				kfree(pme_dev);
>> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
>>  	if (enable) {
>>  		int error;
>>  
>> +		if (runtime && state >= PCI_D3cold)
>> +			dev->runtime_d3cold = true;
>> +		else
>> +			dev->runtime_d3cold = false;
>>  		if (pci_pme_capable(dev, state))
>>  			pci_pme_active(dev, true);
>>  		else
>> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
>> index e0610bd..d66b7e9 100644
>> --- a/drivers/pci/pcie/portdrv_pci.c
>> +++ b/drivers/pci/pcie/portdrv_pci.c
>> @@ -11,11 +11,13 @@
>>  #include <linux/kernel.h>
>>  #include <linux/errno.h>
>>  #include <linux/pm.h>
>> +#include <linux/pm_runtime.h>
>>  #include <linux/init.h>
>>  #include <linux/pcieport_if.h>
>>  #include <linux/aer.h>
>>  #include <linux/dmi.h>
>>  #include <linux/pci-aspm.h>
>> +#include <linux/delay.h>
>>  
>>  #include "portdrv.h"
>>  #include "aer/aerdrv.h"
>> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
>>  	return 0;
>>  }
>>  
>> +static int pcie_port_runtime_suspend(struct device *dev)
>> +{
>> +	struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +	pci_save_state(pdev);
> 
> Are you sure this is sufficient?

What else should I do?

> 
>> +	return 0;
>> +}
>> +
>> +static int pcie_port_runtime_resume(struct device *dev)
>> +{
>> +	struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +	pci_restore_state(pdev);
>> +	if (pdev->runtime_d3cold)
>> +		msleep(100);
> 
> What's _that_ supposed to do?

Bad thing happens if device is accessed immediately after restoring power.
Document for our test platform states 100ms PERST delay is required.

> 
>> +
>> +	return 0;
>> +}
>> +
>>  static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>>  	.suspend	= pcie_port_device_suspend,
>>  	.resume		= pcie_port_device_resume,
>> @@ -107,6 +128,8 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>>  	.poweroff	= pcie_port_device_suspend,
>>  	.restore	= pcie_port_device_resume,
>>  	.resume_noirq	= pcie_port_resume_noirq,
>> +	.runtime_suspend = pcie_port_runtime_suspend,
>> +	.runtime_resume = pcie_port_runtime_resume,
>>  };
>>  
>>  #define PCIE_PORTDRV_PM_OPS	(&pcie_portdrv_pm_ops)
>> @@ -144,12 +167,14 @@ static int __devinit pcie_portdrv_probe(struct pci_dev *dev,
>>  		return status;
>>  
>>  	pci_save_state(dev);
>> +	pm_runtime_put_noidle(&dev->dev);
> 
> What's the purpose of this?
> 
>>  	return 0;
>>  }
>>  
>>  static void pcie_portdrv_remove(struct pci_dev *dev)
>>  {
>> +	pm_runtime_get_noresume(&dev->dev);
>>  	pcie_port_device_remove(dev);
>>  	pci_disable_device(dev);
>>  }
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index e444f5b..b41d9a1 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -281,6 +281,7 @@ struct pci_dev {
>>  	unsigned int	no_d1d2:1;	/* Only allow D0 and D3 */
>>  	unsigned int	mmio_always_on:1;	/* disallow turning off io/mem
>>  						   decoding during bar sizing */
>> +	unsigned int	runtime_d3cold:1;
>>  	unsigned int	wakeup_prepared:1;
>>  	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
> 
> OK
> 
> So now please tell me what exactly you want to achieve and why you want to do
> that in the first place.
> 
> Thanks,
> Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yan, Zheng April 16, 2012, 7:49 a.m. UTC | #6
On 04/14/2012 03:41 AM, Rafael J. Wysocki wrote:
>> > @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>> >  
>> >  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>> >  {
>> > -	if (dev->pme_interrupt)
>> > +	/* PME interrupt isn't available in the D3cold case */
>> > +	if (dev->pme_interrupt && !dev->runtime_d3cold)
>
> This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> flag makes any sense.  We already cover that in dev->pme_support.
> 
> Second, pme_interrupt means that the _root_ _port_, not the device itself will
> trigger an interrupt whenever the device sends the PME message to it (which
> very well may happen for a device in D3_cold woken up by an external signal).
> 

I rechecked this. The port does trigger PME interrupt, but after the WAKE# signal
restores device power. So my comments "PME interrupt isn't available in the D3cold
case" is wrong, but setup ACPI wakeup is still required.

Regards
Yan, Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 16, 2012, 8:15 a.m. UTC | #7
On Fri, Apr 13, 2012 at 2:06 PM, Alex He <alex.he@amd.com> wrote:
> On Fri, Apr 13, 2012 at 10:35:34AM +0800, Yan, Zheng wrote:
[snip]
>> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>       if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
>>               return 0;
>>
>> -     error = pci_raw_set_power_state(dev, state);
>> +     error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
>> +                                     PCI_D3hot : state);
> Do you want to set the pci device to D3hot if the state = D3Code through
> PMCSR register?

Yes.

> I am agree if you realy want that but what is the meaninig
> to distingulish D3cold with D3hot in this patch. We can't put the PCI device
> through PMCSR register.

Yes.  We put PCI device into D3hot via PMCSR firstly, then we use ACPI
_PS3 and _PR3 to put device into D3cold.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 16, 2012, 8:58 a.m. UTC | #8
Hi,

On Sat, Apr 14, 2012 at 3:41 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> Hi,
>
> On Friday, April 13, 2012, Yan, Zheng wrote:
>> Hi all,
>>
>> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
>> beneath a PCIe port when they all have entered D3. A device in D3cold can only
>> generate wake event through the WAKE# pin. Because we can not access to a device's
>> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
>>
>> Any comment will be appreciated.
>>
>> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
>> ---
>> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
>> index 0f150f2..e210e8cb 100644
>> --- a/drivers/pci/pci-acpi.c
>> +++ b/drivers/pci/pci-acpi.c
>> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>               [PCI_D1] = ACPI_STATE_D1,
>>               [PCI_D2] = ACPI_STATE_D2,
>>               [PCI_D3hot] = ACPI_STATE_D3,
>> -             [PCI_D3cold] = ACPI_STATE_D3
>> +             [PCI_D3cold] = ACPI_STATE_D3_COLD
>>       };
>>       int error = -EINVAL;
>>
>
> Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
>
> We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> instead.  I'll prepare a patch for that over the weekend if no one has done
> that already.
>
>> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>>
>>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>>  {
>> -     if (dev->pme_interrupt)
>> +     /* PME interrupt isn't available in the D3cold case */
>> +     if (dev->pme_interrupt && !dev->runtime_d3cold)
>
> This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> flag makes any sense.  We already cover that in dev->pme_support.

PCIe devices and PCIe root port may have proper PME interrupt support
(that is, dev->pme_interrupt = true), but the process of remote wakeup
from D3cold is as follow:

1) In D3cold, the power of the main link is turned off, aux power is
provided (PCIe L2 link state)
2) Device detect condition to resume, then assert #WAKE pin
2) ACPI circuit linked with #WAKE pin, and will generate a GPE for that
3) GPE handler will resume device with ACPI (via _PS3 and _PR0), the
power of the main link is turned on, after a while, link goes into L0
state
4) The PME message is sent to root port, pme interrupt generated

So, for deivce, dev->pme_interrupt = true and dev->pme_support
advocate it support PME in D3cold.  But we still need ACPI to setup
run wake for the device.

So we need a way to inform acpi_pci_run_wake(), one way is using a
flag, there is other ways such as adding another parameter to
acpi_pci_run_wake() to tell it which power state to go.

> Second, pme_interrupt means that the _root_ _port_, not the device itself will
> trigger an interrupt whenever the device sends the PME message to it (which
> very well may happen for a device in D3_cold woken up by an external signal).

As above, that may happend after ACPI GPE.

>>               return 0;
>>
>>       if (!acpi_pm_device_run_wake(&dev->dev, enable))
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index 8156744..bc16869 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>       int error;
>>
>
> Guys, please.  Never, _ever_, touch pci_set_power_state() without discussing
> your ideas with someone who knows how it works and _why_ it works this way.

This patch is sent as RFC, I think this is to call for discussing
instead of merging, isn't it?

> The problem here is that you can't program a PCI device into D3_cold, so it
> doesn't even make sense to have a helper for that.

We need to program ACPI to make the PCI device go into D3_cold.  That
is called in this function.

pci_set_power_sate
  -> __pci_complete_power_transition
    -> pci_platform_power_transition

>>       /* bound the state we're entering */
>> -     if (state > PCI_D3hot)
>> -             state = PCI_D3hot;
>> +     if (state > PCI_D3cold)
>> +             state = PCI_D3cold;
>>       else if (state < PCI_D0)
>>               state = PCI_D0;
>>       else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
>> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>       if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
>>               return 0;
>>
>> -     error = pci_raw_set_power_state(dev, state);
>> +     error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
>> +                                     PCI_D3hot : state);
>>
>>       if (!__pci_complete_power_transition(dev, state))
>>               error = 0;
>> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
>>       return !!(dev->pme_support & (1 << state));
>>  }
>>
>> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
>> +{
>> +     struct pci_dev *bridge = dev->bus->self;
>> +
>> +     /* don't poll the pme bit if parent is in low power state */
>> +     if (bridge && bridge->current_state != PCI_D0)
>> +             return;
>> +
>> +     pci_pme_wakeup(dev, NULL);
>> +}
>
> This one actually makes some sense, although it might be better to put the
> test into pci_pme_wakeup() itself.
>
>> +
>>  static void pci_pme_list_scan(struct work_struct *work)
>>  {
>>       struct pci_pme_device *pme_dev, *n;
>> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
>>       if (!list_empty(&pci_pme_list)) {
>>               list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
>>                       if (pme_dev->dev->pme_poll) {
>> -                             pci_pme_wakeup(pme_dev->dev, NULL);
>> +                             pci_pme_poll_wakeup(pme_dev->dev);
>>                       } else {
>>                               list_del(&pme_dev->list);
>>                               kfree(pme_dev);
>> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
>>       if (enable) {
>>               int error;
>>
>> +             if (runtime && state >= PCI_D3cold)
>> +                     dev->runtime_d3cold = true;
>> +             else
>> +                     dev->runtime_d3cold = false;
>>               if (pci_pme_capable(dev, state))
>>                       pci_pme_active(dev, true);
>>               else
>> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
>> index e0610bd..d66b7e9 100644
>> --- a/drivers/pci/pcie/portdrv_pci.c
>> +++ b/drivers/pci/pcie/portdrv_pci.c
>> @@ -11,11 +11,13 @@
>>  #include <linux/kernel.h>
>>  #include <linux/errno.h>
>>  #include <linux/pm.h>
>> +#include <linux/pm_runtime.h>
>>  #include <linux/init.h>
>>  #include <linux/pcieport_if.h>
>>  #include <linux/aer.h>
>>  #include <linux/dmi.h>
>>  #include <linux/pci-aspm.h>
>> +#include <linux/delay.h>
>>
>>  #include "portdrv.h"
>>  #include "aer/aerdrv.h"
>> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
>>       return 0;
>>  }
>>
>> +static int pcie_port_runtime_suspend(struct device *dev)
>> +{
>> +     struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +     pci_save_state(pdev);
>
> Are you sure this is sufficient?

Maybe we need to pay attention to the pcie port drivers?  Run ->runtime_

>> +     return 0;
>> +}
>> +
>> +static int pcie_port_runtime_resume(struct device *dev)
>> +{
>> +     struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +     pci_restore_state(pdev);
>> +     if (pdev->runtime_d3cold)
>> +             msleep(100);
>
> What's _that_ supposed to do?

When resume from d3cold, PCIe main link will be powered on again, it
will take quite some time before the main link go into L0 state.
Otherwise, accessing devices under the port may return wrong result.

>> +
>> +     return 0;
>> +}
>> +
>>  static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>>       .suspend        = pcie_port_device_suspend,
>>       .resume         = pcie_port_device_resume,
>> @@ -107,6 +128,8 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>>       .poweroff       = pcie_port_device_suspend,
>>       .restore        = pcie_port_device_resume,
>>       .resume_noirq   = pcie_port_resume_noirq,
>> +     .runtime_suspend = pcie_port_runtime_suspend,
>> +     .runtime_resume = pcie_port_runtime_resume,
>>  };
>>
>>  #define PCIE_PORTDRV_PM_OPS  (&pcie_portdrv_pm_ops)
>> @@ -144,12 +167,14 @@ static int __devinit pcie_portdrv_probe(struct pci_dev *dev,
>>               return status;
>>
>>       pci_save_state(dev);
>> +     pm_runtime_put_noidle(&dev->dev);
>
> What's the purpose of this?

I think this is needed by PCI runtime PM support.  Before ->probe,
pm_runtime_get_xxx() is executed on device.

>>       return 0;
>>  }
>>
>>  static void pcie_portdrv_remove(struct pci_dev *dev)
>>  {
>> +     pm_runtime_get_noresume(&dev->dev);
>>       pcie_port_device_remove(dev);
>>       pci_disable_device(dev);
>>  }
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index e444f5b..b41d9a1 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -281,6 +281,7 @@ struct pci_dev {
>>       unsigned int    no_d1d2:1;      /* Only allow D0 and D3 */
>>       unsigned int    mmio_always_on:1;       /* disallow turning off io/mem
>>                                                  decoding during bar sizing */
>> +     unsigned int    runtime_d3cold:1;
>>       unsigned int    wakeup_prepared:1;
>>       unsigned int    d3_delay;       /* D3->D0 transition time in ms */
>
> OK
>
> So now please tell me what exactly you want to achieve and why you want to do
> that in the first place.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 16, 2012, 4:26 p.m. UTC | #9
On Monday, April 16, 2012, Lin Ming wrote:
> On Fri, 2012-04-13 at 21:41 +0200, Rafael J. Wysocki wrote:
> > Hi,
> > 
> > On Friday, April 13, 2012, Yan, Zheng wrote:
> > > Hi all,
> > > 
> > > This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> > > beneath a PCIe port when they all have entered D3. A device in D3cold can only
> > > generate wake event through the WAKE# pin. Because we can not access to a device's
> > > configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> > > 
> > > Any comment will be appreciated.
> > > 
> > > Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> > > ---
> > > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> > > index 0f150f2..e210e8cb 100644
> > > --- a/drivers/pci/pci-acpi.c
> > > +++ b/drivers/pci/pci-acpi.c
> > > @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> > >  		[PCI_D1] = ACPI_STATE_D1,
> > >  		[PCI_D2] = ACPI_STATE_D2,
> > >  		[PCI_D3hot] = ACPI_STATE_D3,
> > > -		[PCI_D3cold] = ACPI_STATE_D3
> > > +		[PCI_D3cold] = ACPI_STATE_D3_COLD
> > >  	};
> > >  	int error = -EINVAL;
> > >  
> > 
> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
> > 
> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> > instead.  I'll prepare a patch for that over the weekend if no one has done
> > that already.
> 
> Hi Rafael,
> 
> Have you started to write the patch?
> 
> If not, I can do it.

Please do if you have the time.  I'd like this one to be cleaned up ASAP,
to reduce the amount of confusion taking place.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 16, 2012, 5:07 p.m. UTC | #10
Hi,

On Monday, April 16, 2012, Yan, Zheng wrote:
> On 04/14/2012 03:41 AM, Rafael J. Wysocki wrote:
> > Hi,
> > 
> > On Friday, April 13, 2012, Yan, Zheng wrote:
> >> Hi all,
> >>
> >> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> >> beneath a PCIe port when they all have entered D3. A device in D3cold can only
> >> generate wake event through the WAKE# pin. Because we can not access to a device's
> >> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> >>
> >> Any comment will be appreciated.
> >>
> >> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> >> ---
> >> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> >> index 0f150f2..e210e8cb 100644
> >> --- a/drivers/pci/pci-acpi.c
> >> +++ b/drivers/pci/pci-acpi.c
> >> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >>  		[PCI_D1] = ACPI_STATE_D1,
> >>  		[PCI_D2] = ACPI_STATE_D2,
> >>  		[PCI_D3hot] = ACPI_STATE_D3,
> >> -		[PCI_D3cold] = ACPI_STATE_D3
> >> +		[PCI_D3cold] = ACPI_STATE_D3_COLD
> >>  	};
> >>  	int error = -EINVAL;
> >>  
> > 
> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
> > 
> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> > instead.  I'll prepare a patch for that over the weekend if no one has done
> > that already.
> > 
> >> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
> >>  
> >>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
> >>  {
> >> -	if (dev->pme_interrupt)
> >> +	/* PME interrupt isn't available in the D3cold case */
> >> +	if (dev->pme_interrupt && !dev->runtime_d3cold)
> > 
> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> > flag makes any sense.  We already cover that in dev->pme_support.
> > 
> > Second, pme_interrupt means that the _root_ _port_, not the device itself will
> > trigger an interrupt whenever the device sends the PME message to it (which
> > very well may happen for a device in D3_cold woken up by an external signal).
> > 
> The reason I introduced the runtime_d3cold flag is I can't make the PME interrupt
> work in the D3cold case in our test platform. Document for our test platform says
> A device in D3cold can only generate wake event through the WAKE# pin.

OK, so that clearly is not a standard PCI device, so please don't try to
make our PCI bus type handle non-standard devices.

> >>  		return 0;
> >>  
> >>  	if (!acpi_pm_device_run_wake(&dev->dev, enable))
> >> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> >> index 8156744..bc16869 100644
> >> --- a/drivers/pci/pci.c
> >> +++ b/drivers/pci/pci.c
> >> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >>  	int error;
> >>  
> > 
> > Guys, please.  Never, _ever_, touch pci_set_power_state() without discussing
> > your ideas with someone who knows how it works and _why_ it works this way.
> > 
> > The problem here is that you can't program a PCI device into D3_cold, so it
> > doesn't even make sense to have a helper for that.
> > 
> >>  	/* bound the state we're entering */
> >> -	if (state > PCI_D3hot)
> >> -		state = PCI_D3hot;
> >> +	if (state > PCI_D3cold)
> >> +		state = PCI_D3cold;
> >>  	else if (state < PCI_D0)
> >>  		state = PCI_D0;
> >>  	else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
> >> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >>  	if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
> >>  		return 0;
> >>  
> >> -	error = pci_raw_set_power_state(dev, state);
> >> +	error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
> >> +					PCI_D3hot : state);
> >>  
> >>  	if (!__pci_complete_power_transition(dev, state))
> >>  		error = 0;
> >> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
> >>  	return !!(dev->pme_support & (1 << state));
> >>  }
> >>  
> >> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
> >> +{
> >> +	struct pci_dev *bridge = dev->bus->self;
> >> +
> >> +	/* don't poll the pme bit if parent is in low power state */
> >> +	if (bridge && bridge->current_state != PCI_D0)
> >> +		return;
> >> +
> >> +	pci_pme_wakeup(dev, NULL);
> >> +}
> > 
> > This one actually makes some sense, although it might be better to put the
> > test into pci_pme_wakeup() itself.
> 
> put it into pci_pme_wakeup will break ACPI wakeup, because ACPI wakeup also
> uses pci_pme_wakeup. When a device exits from D3cold by asserting the WAKE#
> signal, device power is restored automatically by ACPI, then pci_pme_wakeup
> is called to check device's PME bit. pci_dev->current_state is PCI_D3hot
> during the checking. 

You seem to be talking about two different device here.  One device is
dev, which is the one being checked for the PME bit and the second one
is the bridge that should be in D0 so that dev is accessible, right?

So if you put this code into pci_pme_wakeup():

	struct pci_dev *bridge = pci_dev->bus->self;

	...

	if (bridge && bridge->current_state != PCI_D0)
		return 0;

then it should work just like your code above.

BTW, can you please explain to me what the #WAKE signal is and how it is
different from PME#?

> >> +
> >>  static void pci_pme_list_scan(struct work_struct *work)
> >>  {
> >>  	struct pci_pme_device *pme_dev, *n;
> >> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
> >>  	if (!list_empty(&pci_pme_list)) {
> >>  		list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
> >>  			if (pme_dev->dev->pme_poll) {
> >> -				pci_pme_wakeup(pme_dev->dev, NULL);
> >> +				pci_pme_poll_wakeup(pme_dev->dev);
> >>  			} else {
> >>  				list_del(&pme_dev->list);
> >>  				kfree(pme_dev);
> >> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
> >>  	if (enable) {
> >>  		int error;
> >>  
> >> +		if (runtime && state >= PCI_D3cold)
> >> +			dev->runtime_d3cold = true;
> >> +		else
> >> +			dev->runtime_d3cold = false;
> >>  		if (pci_pme_capable(dev, state))
> >>  			pci_pme_active(dev, true);
> >>  		else
> >> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
> >> index e0610bd..d66b7e9 100644
> >> --- a/drivers/pci/pcie/portdrv_pci.c
> >> +++ b/drivers/pci/pcie/portdrv_pci.c
> >> @@ -11,11 +11,13 @@
> >>  #include <linux/kernel.h>
> >>  #include <linux/errno.h>
> >>  #include <linux/pm.h>
> >> +#include <linux/pm_runtime.h>
> >>  #include <linux/init.h>
> >>  #include <linux/pcieport_if.h>
> >>  #include <linux/aer.h>
> >>  #include <linux/dmi.h>
> >>  #include <linux/pci-aspm.h>
> >> +#include <linux/delay.h>
> >>  
> >>  #include "portdrv.h"
> >>  #include "aer/aerdrv.h"
> >> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
> >>  	return 0;
> >>  }
> >>  
> >> +static int pcie_port_runtime_suspend(struct device *dev)
> >> +{
> >> +	struct pci_dev *pdev = to_pci_dev(dev);
> >> +
> >> +	pci_save_state(pdev);
> > 
> > Are you sure this is sufficient?
> 
> What else should I do?

First off, you need not do pci_save_state() here, most likely, because
pci_pm_runtime_suspend() will do it for you later.

Second, you're suspending a device that has children here.  Namely,
the PCIe service devices are children of the port they are associated with.
I wonder if they are suspended and how exactly, if so?

> >> +	return 0;
> >> +}
> >> +
> >> +static int pcie_port_runtime_resume(struct device *dev)
> >> +{
> >> +	struct pci_dev *pdev = to_pci_dev(dev);
> >> +
> >> +	pci_restore_state(pdev);
> >> +	if (pdev->runtime_d3cold)
> >> +		msleep(100);
> > 
> > What's _that_ supposed to do?
> 
> Bad thing happens if device is accessed immediately after restoring power.
> Document for our test platform states 100ms PERST delay is required.

This is a non-standard thing again, so please don't slap it on the code
supposed to handle _standard_ PCI Express.

Moreover, I really don't think it's a good idea to put PCI Express ports into
low-power states in general.  It might work on your platform, whatever it is,
but that doesn't mean it's going to work on every PCI Express system out
there.  I actually know of a number of such systems where it surely won't
work at all.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 16, 2012, 9:11 p.m. UTC | #11
On Monday, April 16, 2012, Yan, Zheng wrote:
> On 04/14/2012 03:41 AM, Rafael J. Wysocki wrote:
> >> > @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
> >> >  
> >> >  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
> >> >  {
> >> > -	if (dev->pme_interrupt)
> >> > +	/* PME interrupt isn't available in the D3cold case */
> >> > +	if (dev->pme_interrupt && !dev->runtime_d3cold)
> >
> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> > flag makes any sense.  We already cover that in dev->pme_support.
> > 
> > Second, pme_interrupt means that the _root_ _port_, not the device itself will
> > trigger an interrupt whenever the device sends the PME message to it (which
> > very well may happen for a device in D3_cold woken up by an external signal).
> > 
> 
> I rechecked this. The port does trigger PME interrupt, but after the WAKE# signal
> restores device power. So my comments "PME interrupt isn't available in the D3cold
> case" is wrong, but setup ACPI wakeup is still required.

What exactly do you mean by "setup ACPI wakeup"?

Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 16, 2012, 9:30 p.m. UTC | #12
On Monday, April 16, 2012, huang ying wrote:
> Hi,
> 
> On Sat, Apr 14, 2012 at 3:41 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > Hi,
> >
> > On Friday, April 13, 2012, Yan, Zheng wrote:
> >> Hi all,
> >>
> >> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> >> beneath a PCIe port when they all have entered D3. A device in D3cold can only
> >> generate wake event through the WAKE# pin. Because we can not access to a device's
> >> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> >>
> >> Any comment will be appreciated.
> >>
> >> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> >> ---
> >> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> >> index 0f150f2..e210e8cb 100644
> >> --- a/drivers/pci/pci-acpi.c
> >> +++ b/drivers/pci/pci-acpi.c
> >> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >>               [PCI_D1] = ACPI_STATE_D1,
> >>               [PCI_D2] = ACPI_STATE_D2,
> >>               [PCI_D3hot] = ACPI_STATE_D3,
> >> -             [PCI_D3cold] = ACPI_STATE_D3
> >> +             [PCI_D3cold] = ACPI_STATE_D3_COLD
> >>       };
> >>       int error = -EINVAL;
> >>
> >
> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
> >
> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> > instead.  I'll prepare a patch for that over the weekend if no one has done
> > that already.
> >
> >> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
> >>
> >>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
> >>  {
> >> -     if (dev->pme_interrupt)
> >> +     /* PME interrupt isn't available in the D3cold case */
> >> +     if (dev->pme_interrupt && !dev->runtime_d3cold)
> >
> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> > flag makes any sense.  We already cover that in dev->pme_support.
> 
> PCIe devices and PCIe root port may have proper PME interrupt support
> (that is, dev->pme_interrupt = true), but the process of remote wakeup
> from D3cold is as follow:
> 
> 1) In D3cold, the power of the main link is turned off, aux power is
> provided (PCIe L2 link state)
> 2) Device detect condition to resume, then assert #WAKE pin
> 2) ACPI circuit linked with #WAKE pin, and will generate a GPE for that
> 3) GPE handler will resume device with ACPI (via _PS3 and _PR0), the
> power of the main link is turned on, after a while, link goes into L0
> state
> 4) The PME message is sent to root port, pme interrupt generated

This isn't how it's supposed to work in theory.  If the device can signal PME
from D3cold, it should be able to reestablish the link and send a PME
message from there.  dev->pme_interrupt set means exactly that.

ACPI is only supposed to be needed for things that don't send PME
messages (in your case the PME interrupt generated by the port is essentially
useless, because the wakeup event has already been signaled through ACPI).

> So, for deivce, dev->pme_interrupt = true and dev->pme_support
> advocate it support PME in D3cold.  But we still need ACPI to setup
> run wake for the device.

OK, so this is nonstandard.  

> So we need a way to inform acpi_pci_run_wake(), one way is using a
> flag, there is other ways such as adding another parameter to
> acpi_pci_run_wake() to tell it which power state to go.

I'm not sure what to do with that.  It looks like we should use ACPI for
signaling remote wakeup on this platform and additionally provide a dummy
interrupt handler for root ports that will only clear PME Status.

> > Second, pme_interrupt means that the _root_ _port_, not the device itself will
> > trigger an interrupt whenever the device sends the PME message to it (which
> > very well may happen for a device in D3_cold woken up by an external signal).
> 
> As above, that may happend after ACPI GPE.

Because it's not a standard PCIe system.

> >>               return 0;
> >>
> >>       if (!acpi_pm_device_run_wake(&dev->dev, enable))
> >> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> >> index 8156744..bc16869 100644
> >> --- a/drivers/pci/pci.c
> >> +++ b/drivers/pci/pci.c
> >> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >>       int error;
> >>
> >
> > Guys, please.  Never, _ever_, touch pci_set_power_state() without discussing
> > your ideas with someone who knows how it works and _why_ it works this way.
> 
> This patch is sent as RFC, I think this is to call for discussing
> instead of merging, isn't it?
> 
> > The problem here is that you can't program a PCI device into D3_cold, so it
> > doesn't even make sense to have a helper for that.
> 
> We need to program ACPI to make the PCI device go into D3_cold.  That
> is called in this function.
> 
> pci_set_power_sate
>   -> __pci_complete_power_transition
>     -> pci_platform_power_transition

So don't use pci_set_power_state() for that, because it's essentially
a different operation.  You need a pci_platform_remove_power() helper or
similar for that.

What ACPI method exactly is used to remove power from the device?

> >>       /* bound the state we're entering */
> >> -     if (state > PCI_D3hot)
> >> -             state = PCI_D3hot;
> >> +     if (state > PCI_D3cold)
> >> +             state = PCI_D3cold;
> >>       else if (state < PCI_D0)
> >>               state = PCI_D0;
> >>       else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
> >> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >>       if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
> >>               return 0;
> >>
> >> -     error = pci_raw_set_power_state(dev, state);
> >> +     error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
> >> +                                     PCI_D3hot : state);
> >>
> >>       if (!__pci_complete_power_transition(dev, state))
> >>               error = 0;
> >> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
> >>       return !!(dev->pme_support & (1 << state));
> >>  }
> >>
> >> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
> >> +{
> >> +     struct pci_dev *bridge = dev->bus->self;
> >> +
> >> +     /* don't poll the pme bit if parent is in low power state */
> >> +     if (bridge && bridge->current_state != PCI_D0)
> >> +             return;
> >> +
> >> +     pci_pme_wakeup(dev, NULL);
> >> +}
> >
> > This one actually makes some sense, although it might be better to put the
> > test into pci_pme_wakeup() itself.
> >
> >> +
> >>  static void pci_pme_list_scan(struct work_struct *work)
> >>  {
> >>       struct pci_pme_device *pme_dev, *n;
> >> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
> >>       if (!list_empty(&pci_pme_list)) {
> >>               list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
> >>                       if (pme_dev->dev->pme_poll) {
> >> -                             pci_pme_wakeup(pme_dev->dev, NULL);
> >> +                             pci_pme_poll_wakeup(pme_dev->dev);
> >>                       } else {
> >>                               list_del(&pme_dev->list);
> >>                               kfree(pme_dev);
> >> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
> >>       if (enable) {
> >>               int error;
> >>
> >> +             if (runtime && state >= PCI_D3cold)
> >> +                     dev->runtime_d3cold = true;
> >> +             else
> >> +                     dev->runtime_d3cold = false;
> >>               if (pci_pme_capable(dev, state))
> >>                       pci_pme_active(dev, true);
> >>               else
> >> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
> >> index e0610bd..d66b7e9 100644
> >> --- a/drivers/pci/pcie/portdrv_pci.c
> >> +++ b/drivers/pci/pcie/portdrv_pci.c
> >> @@ -11,11 +11,13 @@
> >>  #include <linux/kernel.h>
> >>  #include <linux/errno.h>
> >>  #include <linux/pm.h>
> >> +#include <linux/pm_runtime.h>
> >>  #include <linux/init.h>
> >>  #include <linux/pcieport_if.h>
> >>  #include <linux/aer.h>
> >>  #include <linux/dmi.h>
> >>  #include <linux/pci-aspm.h>
> >> +#include <linux/delay.h>
> >>
> >>  #include "portdrv.h"
> >>  #include "aer/aerdrv.h"
> >> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
> >>       return 0;
> >>  }
> >>
> >> +static int pcie_port_runtime_suspend(struct device *dev)
> >> +{
> >> +     struct pci_dev *pdev = to_pci_dev(dev);
> >> +
> >> +     pci_save_state(pdev);
> >
> > Are you sure this is sufficient?
> 
> Maybe we need to pay attention to the pcie port drivers?  Run ->runtime_

Yes.

> >> +     return 0;
> >> +}
> >> +
> >> +static int pcie_port_runtime_resume(struct device *dev)
> >> +{
> >> +     struct pci_dev *pdev = to_pci_dev(dev);
> >> +
> >> +     pci_restore_state(pdev);
> >> +     if (pdev->runtime_d3cold)
> >> +             msleep(100);
> >
> > What's _that_ supposed to do?
> 
> When resume from d3cold, PCIe main link will be powered on again, it
> will take quite some time before the main link go into L0 state.
> Otherwise, accessing devices under the port may return wrong result.

OK, but this is generic code and as per the standard the link should have been
reestablished at this point already.

Please don't put some nonstandard-platform-specific quirks like this into
code that's supposed to handle _every_ PCIe system.

> >> +
> >> +     return 0;
> >> +}
> >> +
> >>  static const struct dev_pm_ops pcie_portdrv_pm_ops = {
> >>       .suspend        = pcie_port_device_suspend,
> >>       .resume         = pcie_port_device_resume,
> >> @@ -107,6 +128,8 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = {
> >>       .poweroff       = pcie_port_device_suspend,
> >>       .restore        = pcie_port_device_resume,
> >>       .resume_noirq   = pcie_port_resume_noirq,
> >> +     .runtime_suspend = pcie_port_runtime_suspend,
> >> +     .runtime_resume = pcie_port_runtime_resume,
> >>  };
> >>
> >>  #define PCIE_PORTDRV_PM_OPS  (&pcie_portdrv_pm_ops)
> >> @@ -144,12 +167,14 @@ static int __devinit pcie_portdrv_probe(struct pci_dev *dev,
> >>               return status;
> >>
> >>       pci_save_state(dev);
> >> +     pm_runtime_put_noidle(&dev->dev);
> >
> > What's the purpose of this?
> 
> I think this is needed by PCI runtime PM support.  Before ->probe,
> pm_runtime_get_xxx() is executed on device.

ok

> >>       return 0;
> >>  }
> >>
> >>  static void pcie_portdrv_remove(struct pci_dev *dev)
> >>  {
> >> +     pm_runtime_get_noresume(&dev->dev);
> >>       pcie_port_device_remove(dev);
> >>       pci_disable_device(dev);
> >>  }
> >> diff --git a/include/linux/pci.h b/include/linux/pci.h
> >> index e444f5b..b41d9a1 100644
> >> --- a/include/linux/pci.h
> >> +++ b/include/linux/pci.h
> >> @@ -281,6 +281,7 @@ struct pci_dev {
> >>       unsigned int    no_d1d2:1;      /* Only allow D0 and D3 */
> >>       unsigned int    mmio_always_on:1;       /* disallow turning off io/mem
> >>                                                  decoding during bar sizing */
> >> +     unsigned int    runtime_d3cold:1;
> >>       unsigned int    wakeup_prepared:1;
> >>       unsigned int    d3_delay;       /* D3->D0 transition time in ms */
> >
> > OK
> >
> > So now please tell me what exactly you want to achieve and why you want to do
> > that in the first place.

Well, is there any chance to get that information?

Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 17, 2012, 2:02 a.m. UTC | #13
On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Monday, April 16, 2012, huang ying wrote:
>> Hi,
>>
>> On Sat, Apr 14, 2012 at 3:41 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> > Hi,
>> >
>> > On Friday, April 13, 2012, Yan, Zheng wrote:
>> >> Hi all,
>> >>
>> >> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
>> >> beneath a PCIe port when they all have entered D3. A device in D3cold can only
>> >> generate wake event through the WAKE# pin. Because we can not access to a device's
>> >> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
>> >>
>> >> Any comment will be appreciated.
>> >>
>> >> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
>> >> ---
>> >> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
>> >> index 0f150f2..e210e8cb 100644
>> >> --- a/drivers/pci/pci-acpi.c
>> >> +++ b/drivers/pci/pci-acpi.c
>> >> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>> >>               [PCI_D1] = ACPI_STATE_D1,
>> >>               [PCI_D2] = ACPI_STATE_D2,
>> >>               [PCI_D3hot] = ACPI_STATE_D3,
>> >> -             [PCI_D3cold] = ACPI_STATE_D3
>> >> +             [PCI_D3cold] = ACPI_STATE_D3_COLD
>> >>       };
>> >>       int error = -EINVAL;
>> >>
>> >
>> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
>> >
>> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
>> > instead.  I'll prepare a patch for that over the weekend if no one has done
>> > that already.
>> >
>> >> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>> >>
>> >>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>> >>  {
>> >> -     if (dev->pme_interrupt)
>> >> +     /* PME interrupt isn't available in the D3cold case */
>> >> +     if (dev->pme_interrupt && !dev->runtime_d3cold)
>> >
>> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
>> > flag makes any sense.  We already cover that in dev->pme_support.
>>
>> PCIe devices and PCIe root port may have proper PME interrupt support
>> (that is, dev->pme_interrupt = true), but the process of remote wakeup
>> from D3cold is as follow:
>>
>> 1) In D3cold, the power of the main link is turned off, aux power is
>> provided (PCIe L2 link state)
>> 2) Device detect condition to resume, then assert #WAKE pin
>> 2) ACPI circuit linked with #WAKE pin, and will generate a GPE for that
>> 3) GPE handler will resume device with ACPI (via _PS3 and _PR0), the
>> power of the main link is turned on, after a while, link goes into L0
>> state
>> 4) The PME message is sent to root port, pme interrupt generated
>
> This isn't how it's supposed to work in theory.  If the device can signal PME
> from D3cold, it should be able to reestablish the link and send a PME
> message from there.  dev->pme_interrupt set means exactly that.
>
> ACPI is only supposed to be needed for things that don't send PME
> messages (in your case the PME interrupt generated by the port is essentially
> useless, because the wakeup event has already been signaled through ACPI).
>
>> So, for deivce, dev->pme_interrupt = true and dev->pme_support
>> advocate it support PME in D3cold.  But we still need ACPI to setup
>> run wake for the device.
>
> OK, so this is nonstandard.

This is the standard behavior.  Please refer to PCI Express Base
Sepcification Revision 2.0, section 5.3.3.2 Link Wakeup.  In D1, D2
and D3hot state, PCIe device can transit the link from L1 to L0 state,
and send the PME message.  In D3cold, the main link is powered off,
PCIe device will use a STANDARD sideband signal WAKE# to signal wakeup
firstly, then platform (power controller in spec) will power on the
main link for the device, after main link is back to L0, the PME
message is send to root port, pme interrupt is generated.  So in
theory, the wake up process can be divided into platform part (which
power on the main link) and PCIe part (which send PME).

>> So we need a way to inform acpi_pci_run_wake(), one way is using a
>> flag, there is other ways such as adding another parameter to
>> acpi_pci_run_wake() to tell it which power state to go.
>
> I'm not sure what to do with that.  It looks like we should use ACPI for
> signaling remote wakeup on this platform and additionally provide a dummy
> interrupt handler for root ports that will only clear PME Status.
>
>> > Second, pme_interrupt means that the _root_ _port_, not the device itself will
>> > trigger an interrupt whenever the device sends the PME message to it (which
>> > very well may happen for a device in D3_cold woken up by an external signal).
>>
>> As above, that may happend after ACPI GPE.
>
> Because it's not a standard PCIe system.
>
>> >>               return 0;
>> >>
>> >>       if (!acpi_pm_device_run_wake(&dev->dev, enable))
>> >> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> >> index 8156744..bc16869 100644
>> >> --- a/drivers/pci/pci.c
>> >> +++ b/drivers/pci/pci.c
>> >> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>> >>       int error;
>> >>
>> >
>> > Guys, please.  Never, _ever_, touch pci_set_power_state() without discussing
>> > your ideas with someone who knows how it works and _why_ it works this way.
>>
>> This patch is sent as RFC, I think this is to call for discussing
>> instead of merging, isn't it?
>>
>> > The problem here is that you can't program a PCI device into D3_cold, so it
>> > doesn't even make sense to have a helper for that.
>>
>> We need to program ACPI to make the PCI device go into D3_cold.  That
>> is called in this function.
>>
>> pci_set_power_sate
>>   -> __pci_complete_power_transition
>>     -> pci_platform_power_transition
>
> So don't use pci_set_power_state() for that, because it's essentially
> a different operation.  You need a pci_platform_remove_power() helper or
> similar for that.
>
> What ACPI method exactly is used to remove power from the device?

The ACPI method executed is as follow:

- _PS3 (if exist)
- Power resources in _PR3 is turned on
- Power resources in _PR0 is turned off
- Power resources in _PR3 is turned off

I think the process can fit pci_set_power_state() quite well, so why
invent another helper for that?

>> >>       /* bound the state we're entering */
>> >> -     if (state > PCI_D3hot)
>> >> -             state = PCI_D3hot;
>> >> +     if (state > PCI_D3cold)
>> >> +             state = PCI_D3cold;
>> >>       else if (state < PCI_D0)
>> >>               state = PCI_D0;
>> >>       else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
>> >> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>> >>       if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
>> >>               return 0;
>> >>
>> >> -     error = pci_raw_set_power_state(dev, state);
>> >> +     error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
>> >> +                                     PCI_D3hot : state);
>> >>
>> >>       if (!__pci_complete_power_transition(dev, state))
>> >>               error = 0;
>> >> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
>> >>       return !!(dev->pme_support & (1 << state));
>> >>  }
>> >>
>> >> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
>> >> +{
>> >> +     struct pci_dev *bridge = dev->bus->self;
>> >> +
>> >> +     /* don't poll the pme bit if parent is in low power state */
>> >> +     if (bridge && bridge->current_state != PCI_D0)
>> >> +             return;
>> >> +
>> >> +     pci_pme_wakeup(dev, NULL);
>> >> +}
>> >
>> > This one actually makes some sense, although it might be better to put the
>> > test into pci_pme_wakeup() itself.
>> >
>> >> +
>> >>  static void pci_pme_list_scan(struct work_struct *work)
>> >>  {
>> >>       struct pci_pme_device *pme_dev, *n;
>> >> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
>> >>       if (!list_empty(&pci_pme_list)) {
>> >>               list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
>> >>                       if (pme_dev->dev->pme_poll) {
>> >> -                             pci_pme_wakeup(pme_dev->dev, NULL);
>> >> +                             pci_pme_poll_wakeup(pme_dev->dev);
>> >>                       } else {
>> >>                               list_del(&pme_dev->list);
>> >>                               kfree(pme_dev);
>> >> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
>> >>       if (enable) {
>> >>               int error;
>> >>
>> >> +             if (runtime && state >= PCI_D3cold)
>> >> +                     dev->runtime_d3cold = true;
>> >> +             else
>> >> +                     dev->runtime_d3cold = false;
>> >>               if (pci_pme_capable(dev, state))
>> >>                       pci_pme_active(dev, true);
>> >>               else
>> >> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
>> >> index e0610bd..d66b7e9 100644
>> >> --- a/drivers/pci/pcie/portdrv_pci.c
>> >> +++ b/drivers/pci/pcie/portdrv_pci.c
>> >> @@ -11,11 +11,13 @@
>> >>  #include <linux/kernel.h>
>> >>  #include <linux/errno.h>
>> >>  #include <linux/pm.h>
>> >> +#include <linux/pm_runtime.h>
>> >>  #include <linux/init.h>
>> >>  #include <linux/pcieport_if.h>
>> >>  #include <linux/aer.h>
>> >>  #include <linux/dmi.h>
>> >>  #include <linux/pci-aspm.h>
>> >> +#include <linux/delay.h>
>> >>
>> >>  #include "portdrv.h"
>> >>  #include "aer/aerdrv.h"
>> >> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
>> >>       return 0;
>> >>  }
>> >>
>> >> +static int pcie_port_runtime_suspend(struct device *dev)
>> >> +{
>> >> +     struct pci_dev *pdev = to_pci_dev(dev);
>> >> +
>> >> +     pci_save_state(pdev);
>> >
>> > Are you sure this is sufficient?
>>
>> Maybe we need to pay attention to the pcie port drivers?  Run ->runtime_
>
> Yes.
>
>> >> +     return 0;
>> >> +}
>> >> +
>> >> +static int pcie_port_runtime_resume(struct device *dev)
>> >> +{
>> >> +     struct pci_dev *pdev = to_pci_dev(dev);
>> >> +
>> >> +     pci_restore_state(pdev);
>> >> +     if (pdev->runtime_d3cold)
>> >> +             msleep(100);
>> >
>> > What's _that_ supposed to do?
>>
>> When resume from d3cold, PCIe main link will be powered on again, it
>> will take quite some time before the main link go into L0 state.
>> Otherwise, accessing devices under the port may return wrong result.
>
> OK, but this is generic code and as per the standard the link should have been
> reestablished at this point already.

Maybe we can have some other way to check whether the link is
reestablished.  I will check the this.

> Please don't put some nonstandard-platform-specific quirks like this into
> code that's supposed to handle _every_ PCIe system.
>
>> >> +
>> >> +     return 0;
>> >> +}
>> >> +
>> >>  static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>> >>       .suspend        = pcie_port_device_suspend,
>> >>       .resume         = pcie_port_device_resume,
>> >> @@ -107,6 +128,8 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = {
>> >>       .poweroff       = pcie_port_device_suspend,
>> >>       .restore        = pcie_port_device_resume,
>> >>       .resume_noirq   = pcie_port_resume_noirq,
>> >> +     .runtime_suspend = pcie_port_runtime_suspend,
>> >> +     .runtime_resume = pcie_port_runtime_resume,
>> >>  };
>> >>
>> >>  #define PCIE_PORTDRV_PM_OPS  (&pcie_portdrv_pm_ops)
>> >> @@ -144,12 +167,14 @@ static int __devinit pcie_portdrv_probe(struct pci_dev *dev,
>> >>               return status;
>> >>
>> >>       pci_save_state(dev);
>> >> +     pm_runtime_put_noidle(&dev->dev);
>> >
>> > What's the purpose of this?
>>
>> I think this is needed by PCI runtime PM support.  Before ->probe,
>> pm_runtime_get_xxx() is executed on device.
>
> ok
>
>> >>       return 0;
>> >>  }
>> >>
>> >>  static void pcie_portdrv_remove(struct pci_dev *dev)
>> >>  {
>> >> +     pm_runtime_get_noresume(&dev->dev);
>> >>       pcie_port_device_remove(dev);
>> >>       pci_disable_device(dev);
>> >>  }
>> >> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> >> index e444f5b..b41d9a1 100644
>> >> --- a/include/linux/pci.h
>> >> +++ b/include/linux/pci.h
>> >> @@ -281,6 +281,7 @@ struct pci_dev {
>> >>       unsigned int    no_d1d2:1;      /* Only allow D0 and D3 */
>> >>       unsigned int    mmio_always_on:1;       /* disallow turning off io/mem
>> >>                                                  decoding during bar sizing */
>> >> +     unsigned int    runtime_d3cold:1;
>> >>       unsigned int    wakeup_prepared:1;
>> >>       unsigned int    d3_delay;       /* D3->D0 transition time in ms */
>> >
>> > OK
>> >
>> > So now please tell me what exactly you want to achieve and why you want to do
>> > that in the first place.
>
> Well, is there any chance to get that information?

You mean the runtime_d3cold flag? That flag is used to tell
acpi_pci_run_wake() that we need ACPI wakeup setup for the device
because that is needed by D3cold.  The ACPI wakeup setup here means
turn on power resources needed by wake up (_PRW) and execute _DSW.

If you mean the whole patch, we want to implement runtime D3cold
support, which can save more power than D3hot.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 17, 2012, 2:07 a.m. UTC | #14
On Tue, Apr 17, 2012 at 1:07 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> Moreover, I really don't think it's a good idea to put PCI Express ports into
> low-power states in general.  It might work on your platform, whatever it is,
> but that doesn't mean it's going to work on every PCI Express system out
> there.  I actually know of a number of such systems where it surely won't
> work at all.

The runtime PM support for PCIe port is not turned on by default, we need

# echo auto > /sys/devices/pcixxxx:xx/<pci id>/power/control

to turn on it.  And we will turn on it on system it works.  Is this sufficient?

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yan, Zheng April 17, 2012, 2:12 a.m. UTC | #15
On 04/17/2012 01:07 AM, Rafael J. Wysocki wrote:
> Hi,
> 
> On Monday, April 16, 2012, Yan, Zheng wrote:
>> On 04/14/2012 03:41 AM, Rafael J. Wysocki wrote:
>>> Hi,
>>>
>>> On Friday, April 13, 2012, Yan, Zheng wrote:
>>>> Hi all,
>>>>
>>>> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
>>>> beneath a PCIe port when they all have entered D3. A device in D3cold can only
>>>> generate wake event through the WAKE# pin. Because we can not access to a device's
>>>> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
>>>>
>>>> Any comment will be appreciated.
>>>>
>>>> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
>>>> ---
>>>> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
>>>> index 0f150f2..e210e8cb 100644
>>>> --- a/drivers/pci/pci-acpi.c
>>>> +++ b/drivers/pci/pci-acpi.c
>>>> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>>>  		[PCI_D1] = ACPI_STATE_D1,
>>>>  		[PCI_D2] = ACPI_STATE_D2,
>>>>  		[PCI_D3hot] = ACPI_STATE_D3,
>>>> -		[PCI_D3cold] = ACPI_STATE_D3
>>>> +		[PCI_D3cold] = ACPI_STATE_D3_COLD
>>>>  	};
>>>>  	int error = -EINVAL;
>>>>  
>>>
>>> Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
>>>
>>> We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
>>> instead.  I'll prepare a patch for that over the weekend if no one has done
>>> that already.
>>>
>>>> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>>>>  
>>>>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>>>>  {
>>>> -	if (dev->pme_interrupt)
>>>> +	/* PME interrupt isn't available in the D3cold case */
>>>> +	if (dev->pme_interrupt && !dev->runtime_d3cold)
>>>
>>> This whole thing is wrong.  First off, I don't think that the runtime_d3cold
>>> flag makes any sense.  We already cover that in dev->pme_support.
>>>
>>> Second, pme_interrupt means that the _root_ _port_, not the device itself will
>>> trigger an interrupt whenever the device sends the PME message to it (which
>>> very well may happen for a device in D3_cold woken up by an external signal).
>>>
>> The reason I introduced the runtime_d3cold flag is I can't make the PME interrupt
>> work in the D3cold case in our test platform. Document for our test platform says
>> A device in D3cold can only generate wake event through the WAKE# pin.
> 
> OK, so that clearly is not a standard PCI device, so please don't try to
> make our PCI bus type handle non-standard devices.
> 
>>>>  		return 0;
>>>>  
>>>>  	if (!acpi_pm_device_run_wake(&dev->dev, enable))
>>>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>>>> index 8156744..bc16869 100644
>>>> --- a/drivers/pci/pci.c
>>>> +++ b/drivers/pci/pci.c
>>>> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>>>  	int error;
>>>>  
>>>
>>> Guys, please.  Never, _ever_, touch pci_set_power_state() without discussing
>>> your ideas with someone who knows how it works and _why_ it works this way.
>>>
>>> The problem here is that you can't program a PCI device into D3_cold, so it
>>> doesn't even make sense to have a helper for that.
>>>
>>>>  	/* bound the state we're entering */
>>>> -	if (state > PCI_D3hot)
>>>> -		state = PCI_D3hot;
>>>> +	if (state > PCI_D3cold)
>>>> +		state = PCI_D3cold;
>>>>  	else if (state < PCI_D0)
>>>>  		state = PCI_D0;
>>>>  	else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
>>>> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>>>>  	if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
>>>>  		return 0;
>>>>  
>>>> -	error = pci_raw_set_power_state(dev, state);
>>>> +	error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
>>>> +					PCI_D3hot : state);
>>>>  
>>>>  	if (!__pci_complete_power_transition(dev, state))
>>>>  		error = 0;
>>>> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
>>>>  	return !!(dev->pme_support & (1 << state));
>>>>  }
>>>>  
>>>> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
>>>> +{
>>>> +	struct pci_dev *bridge = dev->bus->self;
>>>> +
>>>> +	/* don't poll the pme bit if parent is in low power state */
>>>> +	if (bridge && bridge->current_state != PCI_D0)
>>>> +		return;
>>>> +
>>>> +	pci_pme_wakeup(dev, NULL);
>>>> +}
>>>
>>> This one actually makes some sense, although it might be better to put the
>>> test into pci_pme_wakeup() itself.
>>
>> put it into pci_pme_wakeup will break ACPI wakeup, because ACPI wakeup also
>> uses pci_pme_wakeup. When a device exits from D3cold by asserting the WAKE#
>> signal, device power is restored automatically by ACPI, then pci_pme_wakeup
>> is called to check device's PME bit. pci_dev->current_state is PCI_D3hot
>> during the checking. 
> 
> You seem to be talking about two different device here.  One device is
> dev, which is the one being checked for the PME bit and the second one
> is the bridge that should be in D0 so that dev is accessible, right?

Yes
> 
> So if you put this code into pci_pme_wakeup():
> 
> 	struct pci_dev *bridge = pci_dev->bus->self;
> 
> 	...
> 
> 	if (bridge && bridge->current_state != PCI_D0)
> 		return 0;
> 
> then it should work just like your code above.
No. In the wakeup case, bridge's power is restored automatically by ACPI,
but bridge->current_state does not reflect the power state, it's still PCI_D3hot.

> 
> BTW, can you please explain to me what the #WAKE signal is and how it is
> different from PME#?

#WAKE signal is triggered by a pin connected to the root complex or other
motherboard logic. PME# is triggered by PME message sent to the port.

> 
>>>> +
>>>>  static void pci_pme_list_scan(struct work_struct *work)
>>>>  {
>>>>  	struct pci_pme_device *pme_dev, *n;
>>>> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
>>>>  	if (!list_empty(&pci_pme_list)) {
>>>>  		list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
>>>>  			if (pme_dev->dev->pme_poll) {
>>>> -				pci_pme_wakeup(pme_dev->dev, NULL);
>>>> +				pci_pme_poll_wakeup(pme_dev->dev);
>>>>  			} else {
>>>>  				list_del(&pme_dev->list);
>>>>  				kfree(pme_dev);
>>>> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
>>>>  	if (enable) {
>>>>  		int error;
>>>>  
>>>> +		if (runtime && state >= PCI_D3cold)
>>>> +			dev->runtime_d3cold = true;
>>>> +		else
>>>> +			dev->runtime_d3cold = false;
>>>>  		if (pci_pme_capable(dev, state))
>>>>  			pci_pme_active(dev, true);
>>>>  		else
>>>> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
>>>> index e0610bd..d66b7e9 100644
>>>> --- a/drivers/pci/pcie/portdrv_pci.c
>>>> +++ b/drivers/pci/pcie/portdrv_pci.c
>>>> @@ -11,11 +11,13 @@
>>>>  #include <linux/kernel.h>
>>>>  #include <linux/errno.h>
>>>>  #include <linux/pm.h>
>>>> +#include <linux/pm_runtime.h>
>>>>  #include <linux/init.h>
>>>>  #include <linux/pcieport_if.h>
>>>>  #include <linux/aer.h>
>>>>  #include <linux/dmi.h>
>>>>  #include <linux/pci-aspm.h>
>>>> +#include <linux/delay.h>
>>>>  
>>>>  #include "portdrv.h"
>>>>  #include "aer/aerdrv.h"
>>>> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
>>>>  	return 0;
>>>>  }
>>>>  
>>>> +static int pcie_port_runtime_suspend(struct device *dev)
>>>> +{
>>>> +	struct pci_dev *pdev = to_pci_dev(dev);
>>>> +
>>>> +	pci_save_state(pdev);
>>>
>>> Are you sure this is sufficient?
>>
>> What else should I do?
> 
> First off, you need not do pci_save_state() here, most likely, because
> pci_pm_runtime_suspend() will do it for you later.

Our test platform behaves strangely if this pci_save_state is removed.
I guess it's because pci_fixup_device is called before pci_save_state
in pci_pm_runtime_suspend

> 
> Second, you're suspending a device that has children here.  Namely,
> the PCIe service devices are children of the port they are associated with.
> I wonder if they are suspended and how exactly, if so?
> 
Only the PME service has the PM callbacks. I don't think we should stop the PME
service in the runtime suspend case. For example, when both device and port are
in D3hot, No PME interrupt can be received if we stop the PME service.

Regards
Yan, Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 17, 2012, 5:13 a.m. UTC | #16
On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> +     return 0;
>> >> +}
>> >> +
>> >> +static int pcie_port_runtime_resume(struct device *dev)
>> >> +{
>> >> +     struct pci_dev *pdev = to_pci_dev(dev);
>> >> +
>> >> +     pci_restore_state(pdev);
>> >> +     if (pdev->runtime_d3cold)
>> >> +             msleep(100);
>> >
>> > What's _that_ supposed to do?
>>
>> When resume from d3cold, PCIe main link will be powered on again, it
>> will take quite some time before the main link go into L0 state.
>> Otherwise, accessing devices under the port may return wrong result.
>
> OK, but this is generic code and as per the standard the link should have been
> reestablished at this point already.
>
> Please don't put some nonstandard-platform-specific quirks like this into
> code that's supposed to handle _every_ PCIe system.

After checking PCIe spec, I found that the 100ms here has its standard origin :)

In PCI Express Base Specification Revision 2.0:

Section 6.6.1 Conventional Reset

"
To allow components to perform internal initialization, system
software must wait for at least
100 ms from the end of a Conventional Reset of one or more devices
before it is permitted to
issue Configuration Requests to those devices
"

But I think we should move the 100ms delay here to PCIe bus code or
PCIe/ACPI code, because that is needed by all PCIe devices for D3cold
support.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 17, 2012, 5:32 a.m. UTC | #17
On Tue, Apr 17, 2012 at 10:12 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 04/17/2012 01:07 AM, Rafael J. Wysocki wrote:
>
>> BTW, can you please explain to me what the #WAKE signal is and how it is
>> different from PME#?
>
> #WAKE signal is triggered by a pin connected to the root complex or other
> motherboard logic. PME# is triggered by PME message sent to the port.

PME# is a PCI pin, while WAKE# is a PCI Express pin.  In PCI Express,
there is no PME#, PME is delivered between end point device and root
port or root complex event collector via PME message, and the PME
message will trigger IRQ on root port or root complex event collector.
 WAKE# is not used for PCI Express D1, D2 and D3hot, it is just used
by D3cold.  When remote wakeup detected by end point device, it will
assert WAKE# to notify power controller (implemented via ACPI on some
platform), then power controller will turn on power for main link,
after link goes back to L0, PME message will be sent to root port or
root complex event collector by end point device.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 17, 2012, 8:20 p.m. UTC | #18
On Tuesday, April 17, 2012, huang ying wrote:
> On Tue, Apr 17, 2012 at 1:07 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > Moreover, I really don't think it's a good idea to put PCI Express ports into
> > low-power states in general.  It might work on your platform, whatever it is,
> > but that doesn't mean it's going to work on every PCI Express system out
> > there.  I actually know of a number of such systems where it surely won't
> > work at all.
> 
> The runtime PM support for PCIe port is not turned on by default, we need
> 
> # echo auto > /sys/devices/pcixxxx:xx/<pci id>/power/control
> 
> to turn on it.  And we will turn on it on system it works.  Is this sufficient?

No, it is not.  In some cases it shouldn't be enabled at all for PCI Express
ports, as far as I can say, so to be on the safe side we should only enable it
on platforms where PCI Express ports are known to work correctly with runtime PM.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 17, 2012, 8:35 p.m. UTC | #19
On Tuesday, April 17, 2012, Yan, Zheng wrote:
> On 04/17/2012 01:07 AM, Rafael J. Wysocki wrote:
> > Hi,
> > 
> > On Monday, April 16, 2012, Yan, Zheng wrote:
> >> On 04/14/2012 03:41 AM, Rafael J. Wysocki wrote:
> >>> Hi,
> >>>
> >>> On Friday, April 13, 2012, Yan, Zheng wrote:
> >>>> Hi all,
> >>>>
> >>>> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> >>>> beneath a PCIe port when they all have entered D3. A device in D3cold can only
> >>>> generate wake event through the WAKE# pin. Because we can not access to a device's
> >>>> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> >>>>
> >>>> Any comment will be appreciated.
> >>>>
> >>>> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> >>>> ---
> >>>> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> >>>> index 0f150f2..e210e8cb 100644
> >>>> --- a/drivers/pci/pci-acpi.c
> >>>> +++ b/drivers/pci/pci-acpi.c
> >>>> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >>>>  		[PCI_D1] = ACPI_STATE_D1,
> >>>>  		[PCI_D2] = ACPI_STATE_D2,
> >>>>  		[PCI_D3hot] = ACPI_STATE_D3,
> >>>> -		[PCI_D3cold] = ACPI_STATE_D3
> >>>> +		[PCI_D3cold] = ACPI_STATE_D3_COLD
> >>>>  	};
> >>>>  	int error = -EINVAL;
> >>>>  
> >>>
> >>> Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
> >>>
> >>> We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> >>> instead.  I'll prepare a patch for that over the weekend if no one has done
> >>> that already.
> >>>
> >>>> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
> >>>>  
> >>>>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
> >>>>  {
> >>>> -	if (dev->pme_interrupt)
> >>>> +	/* PME interrupt isn't available in the D3cold case */
> >>>> +	if (dev->pme_interrupt && !dev->runtime_d3cold)
> >>>
> >>> This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> >>> flag makes any sense.  We already cover that in dev->pme_support.
> >>>
> >>> Second, pme_interrupt means that the _root_ _port_, not the device itself will
> >>> trigger an interrupt whenever the device sends the PME message to it (which
> >>> very well may happen for a device in D3_cold woken up by an external signal).
> >>>
> >> The reason I introduced the runtime_d3cold flag is I can't make the PME interrupt
> >> work in the D3cold case in our test platform. Document for our test platform says
> >> A device in D3cold can only generate wake event through the WAKE# pin.
> > 
> > OK, so that clearly is not a standard PCI device, so please don't try to
> > make our PCI bus type handle non-standard devices.
> > 
> >>>>  		return 0;
> >>>>  
> >>>>  	if (!acpi_pm_device_run_wake(&dev->dev, enable))
> >>>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> >>>> index 8156744..bc16869 100644
> >>>> --- a/drivers/pci/pci.c
> >>>> +++ b/drivers/pci/pci.c
> >>>> @@ -731,8 +731,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >>>>  	int error;
> >>>>  
> >>>
> >>> Guys, please.  Never, _ever_, touch pci_set_power_state() without discussing
> >>> your ideas with someone who knows how it works and _why_ it works this way.
> >>>
> >>> The problem here is that you can't program a PCI device into D3_cold, so it
> >>> doesn't even make sense to have a helper for that.
> >>>
> >>>>  	/* bound the state we're entering */
> >>>> -	if (state > PCI_D3hot)
> >>>> -		state = PCI_D3hot;
> >>>> +	if (state > PCI_D3cold)
> >>>> +		state = PCI_D3cold;
> >>>>  	else if (state < PCI_D0)
> >>>>  		state = PCI_D0;
> >>>>  	else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
> >>>> @@ -750,7 +750,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >>>>  	if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
> >>>>  		return 0;
> >>>>  
> >>>> -	error = pci_raw_set_power_state(dev, state);
> >>>> +	error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
> >>>> +					PCI_D3hot : state);
> >>>>  
> >>>>  	if (!__pci_complete_power_transition(dev, state))
> >>>>  		error = 0;
> >>>> @@ -1482,6 +1483,17 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
> >>>>  	return !!(dev->pme_support & (1 << state));
> >>>>  }
> >>>>  
> >>>> +static void pci_pme_poll_wakeup(struct pci_dev *dev)
> >>>> +{
> >>>> +	struct pci_dev *bridge = dev->bus->self;
> >>>> +
> >>>> +	/* don't poll the pme bit if parent is in low power state */
> >>>> +	if (bridge && bridge->current_state != PCI_D0)
> >>>> +		return;
> >>>> +
> >>>> +	pci_pme_wakeup(dev, NULL);
> >>>> +}
> >>>
> >>> This one actually makes some sense, although it might be better to put the
> >>> test into pci_pme_wakeup() itself.
> >>
> >> put it into pci_pme_wakeup will break ACPI wakeup, because ACPI wakeup also
> >> uses pci_pme_wakeup. When a device exits from D3cold by asserting the WAKE#
> >> signal, device power is restored automatically by ACPI, then pci_pme_wakeup
> >> is called to check device's PME bit. pci_dev->current_state is PCI_D3hot
> >> during the checking. 
> > 
> > You seem to be talking about two different device here.  One device is
> > dev, which is the one being checked for the PME bit and the second one
> > is the bridge that should be in D0 so that dev is accessible, right?
> 
> Yes
> > 
> > So if you put this code into pci_pme_wakeup():
> > 
> > 	struct pci_dev *bridge = pci_dev->bus->self;
> > 
> > 	...
> > 
> > 	if (bridge && bridge->current_state != PCI_D0)
> > 		return 0;
> > 
> > then it should work just like your code above.
> No. In the wakeup case, bridge's power is restored automatically by ACPI,
> but bridge->current_state does not reflect the power state, it's still PCI_D3hot.

So there's a bug somewhere, because the port (bridge) is the parent of the
device being resumed, so it should be resumed automatically by runtime PM
before that device.  Do you have any idea why that doesn't happen?

> > BTW, can you please explain to me what the #WAKE signal is and how it is
> > different from PME#?
> 
> #WAKE signal is triggered by a pin connected to the root complex or other
> motherboard logic. PME# is triggered by PME message sent to the port.

And #WAKE triggers a GPE in turn?

> >>>> +
> >>>>  static void pci_pme_list_scan(struct work_struct *work)
> >>>>  {
> >>>>  	struct pci_pme_device *pme_dev, *n;
> >>>> @@ -1490,7 +1502,7 @@ static void pci_pme_list_scan(struct work_struct *work)
> >>>>  	if (!list_empty(&pci_pme_list)) {
> >>>>  		list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
> >>>>  			if (pme_dev->dev->pme_poll) {
> >>>> -				pci_pme_wakeup(pme_dev->dev, NULL);
> >>>> +				pci_pme_poll_wakeup(pme_dev->dev);
> >>>>  			} else {
> >>>>  				list_del(&pme_dev->list);
> >>>>  				kfree(pme_dev);
> >>>> @@ -1608,6 +1620,10 @@ int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
> >>>>  	if (enable) {
> >>>>  		int error;
> >>>>  
> >>>> +		if (runtime && state >= PCI_D3cold)
> >>>> +			dev->runtime_d3cold = true;
> >>>> +		else
> >>>> +			dev->runtime_d3cold = false;
> >>>>  		if (pci_pme_capable(dev, state))
> >>>>  			pci_pme_active(dev, true);
> >>>>  		else
> >>>> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
> >>>> index e0610bd..d66b7e9 100644
> >>>> --- a/drivers/pci/pcie/portdrv_pci.c
> >>>> +++ b/drivers/pci/pcie/portdrv_pci.c
> >>>> @@ -11,11 +11,13 @@
> >>>>  #include <linux/kernel.h>
> >>>>  #include <linux/errno.h>
> >>>>  #include <linux/pm.h>
> >>>> +#include <linux/pm_runtime.h>
> >>>>  #include <linux/init.h>
> >>>>  #include <linux/pcieport_if.h>
> >>>>  #include <linux/aer.h>
> >>>>  #include <linux/dmi.h>
> >>>>  #include <linux/pci-aspm.h>
> >>>> +#include <linux/delay.h>
> >>>>  
> >>>>  #include "portdrv.h"
> >>>>  #include "aer/aerdrv.h"
> >>>> @@ -99,6 +101,25 @@ static int pcie_port_resume_noirq(struct device *dev)
> >>>>  	return 0;
> >>>>  }
> >>>>  
> >>>> +static int pcie_port_runtime_suspend(struct device *dev)
> >>>> +{
> >>>> +	struct pci_dev *pdev = to_pci_dev(dev);
> >>>> +
> >>>> +	pci_save_state(pdev);
> >>>
> >>> Are you sure this is sufficient?
> >>
> >> What else should I do?
> > 
> > First off, you need not do pci_save_state() here, most likely, because
> > pci_pm_runtime_suspend() will do it for you later.
> 
> Our test platform behaves strangely if this pci_save_state is removed.
> I guess it's because pci_fixup_device is called before pci_save_state
> in pci_pm_runtime_suspend

It would be good to verify that theory, so that we have a clear picture.

> > Second, you're suspending a device that has children here.  Namely,
> > the PCIe service devices are children of the port they are associated with.
> > I wonder if they are suspended and how exactly, if so?
> > 
> Only the PME service has the PM callbacks.

For now.  That may change in the future, though.

> I don't think we should stop the PME service in the runtime suspend case.

You're right.

> For example, when both device and port are
> in D3hot, No PME interrupt can be received if we stop the PME service.

Well, I don't think that PME interrupts are necessary at all on this platform,
because it uses ACPI for wakeup signaling anyway.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 17, 2012, 8:43 p.m. UTC | #20
On Tuesday, April 17, 2012, huang ying wrote:
> On Tue, Apr 17, 2012 at 10:12 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> > On 04/17/2012 01:07 AM, Rafael J. Wysocki wrote:
> >
> >> BTW, can you please explain to me what the #WAKE signal is and how it is
> >> different from PME#?
> >
> > #WAKE signal is triggered by a pin connected to the root complex or other
> > motherboard logic. PME# is triggered by PME message sent to the port.
> 
> PME# is a PCI pin, while WAKE# is a PCI Express pin.  In PCI Express,
> there is no PME#, PME is delivered between end point device and root
> port or root complex event collector via PME message, and the PME
> message will trigger IRQ on root port or root complex event collector.
>  WAKE# is not used for PCI Express D1, D2 and D3hot, it is just used
> by D3cold.  When remote wakeup detected by end point device, it will
> assert WAKE# to notify power controller (implemented via ACPI on some
> platform), then power controller will turn on power for main link,
> after link goes back to L0, PME message will be sent to root port or
> root complex event collector by end point device.

OK

So do I understand correctly that the WAKE# signal here is the one described
in Section 5.3.3.2 Link Wakeup of PCI Express Base spec. 2.0?

So what happens is that it triggers a GPE and that GPE has a _Lxx method
associated with it, I suppose.  Is that correct?

Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 17, 2012, 9:03 p.m. UTC | #21
On Tuesday, April 17, 2012, huang ying wrote:
> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Monday, April 16, 2012, huang ying wrote:
> >> Hi,
> >>
> >> On Sat, Apr 14, 2012 at 3:41 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> > Hi,
> >> >
> >> > On Friday, April 13, 2012, Yan, Zheng wrote:
> >> >> Hi all,
> >> >>
> >> >> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> >> >> beneath a PCIe port when they all have entered D3. A device in D3cold can only
> >> >> generate wake event through the WAKE# pin. Because we can not access to a device's
> >> >> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> >> >>
> >> >> Any comment will be appreciated.
> >> >>
> >> >> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> >> >> ---
> >> >> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> >> >> index 0f150f2..e210e8cb 100644
> >> >> --- a/drivers/pci/pci-acpi.c
> >> >> +++ b/drivers/pci/pci-acpi.c
> >> >> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >> >>               [PCI_D1] = ACPI_STATE_D1,
> >> >>               [PCI_D2] = ACPI_STATE_D2,
> >> >>               [PCI_D3hot] = ACPI_STATE_D3,
> >> >> -             [PCI_D3cold] = ACPI_STATE_D3
> >> >> +             [PCI_D3cold] = ACPI_STATE_D3_COLD
> >> >>       };
> >> >>       int error = -EINVAL;
> >> >>
> >> >
> >> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
> >> >
> >> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> >> > instead.  I'll prepare a patch for that over the weekend if no one has done
> >> > that already.
> >> >
> >> >> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
> >> >>
> >> >>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
> >> >>  {
> >> >> -     if (dev->pme_interrupt)
> >> >> +     /* PME interrupt isn't available in the D3cold case */
> >> >> +     if (dev->pme_interrupt && !dev->runtime_d3cold)
> >> >
> >> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> >> > flag makes any sense.  We already cover that in dev->pme_support.
> >>
> >> PCIe devices and PCIe root port may have proper PME interrupt support
> >> (that is, dev->pme_interrupt = true), but the process of remote wakeup
> >> from D3cold is as follow:
> >>
> >> 1) In D3cold, the power of the main link is turned off, aux power is
> >> provided (PCIe L2 link state)
> >> 2) Device detect condition to resume, then assert #WAKE pin
> >> 2) ACPI circuit linked with #WAKE pin, and will generate a GPE for that
> >> 3) GPE handler will resume device with ACPI (via _PS3 and _PR0), the
> >> power of the main link is turned on, after a while, link goes into L0
> >> state
> >> 4) The PME message is sent to root port, pme interrupt generated
> >
> > This isn't how it's supposed to work in theory.  If the device can signal PME
> > from D3cold, it should be able to reestablish the link and send a PME
> > message from there.  dev->pme_interrupt set means exactly that.
> >
> > ACPI is only supposed to be needed for things that don't send PME
> > messages (in your case the PME interrupt generated by the port is essentially
> > useless, because the wakeup event has already been signaled through ACPI).
> >
> >> So, for deivce, dev->pme_interrupt = true and dev->pme_support
> >> advocate it support PME in D3cold.  But we still need ACPI to setup
> >> run wake for the device.
> >
> > OK, so this is nonstandard.
> 
> This is the standard behavior.  Please refer to PCI Express Base
> Sepcification Revision 2.0, section 5.3.3.2 Link Wakeup.  In D1, D2
> and D3hot state, PCIe device can transit the link from L1 to L0 state,
> and send the PME message.  In D3cold, the main link is powered off,
> PCIe device will use a STANDARD sideband signal WAKE# to signal wakeup
> firstly, then platform (power controller in spec) will power on the
> main link for the device, after main link is back to L0, the PME
> message is send to root port, pme interrupt is generated.  So in
> theory, the wake up process can be divided into platform part (which
> power on the main link) and PCIe part (which send PME).

That's fine.  However, the platform part should be completely transparent
to the PCI bus type, then.  It just should power up the link to allow a
PME message to be transmitted through it.

[...]

> > So don't use pci_set_power_state() for that, because it's essentially
> > a different operation.  You need a pci_platform_remove_power() helper or
> > similar for that.
> >
> > What ACPI method exactly is used to remove power from the device?
> 
> The ACPI method executed is as follow:
> 
> - _PS3 (if exist)
> - Power resources in _PR3 is turned on
> - Power resources in _PR0 is turned off
> - Power resources in _PR3 is turned off

I'd rather think

- make sure that _PR3 resources are referenced
- drop references (from this device) for all other power resources
- execute _PS3 (if any)
- drop references for _PR3 resources

if Section 7.2.11 of ACPI 5.0 is to be followed.

> I think the process can fit pci_set_power_state() quite well, so why
> invent another helper for that?

OK, we can special case it, perhaps.

Suppose we have a "this device may be put into D3_cold" flag.

Who's going to decide whether to put it into D3_hot or D3_cold?


[...]

> >> > So now please tell me what exactly you want to achieve and why you want to do
> >> > that in the first place.
> >
> > Well, is there any chance to get that information?
> 
> You mean the runtime_d3cold flag? That flag is used to tell
> acpi_pci_run_wake() that we need ACPI wakeup setup for the device
> because that is needed by D3cold.  The ACPI wakeup setup here means
> turn on power resources needed by wake up (_PRW) and execute _DSW.
> 
> If you mean the whole patch, we want to implement runtime D3cold
> support, which can save more power than D3hot.

So, do I think correctly that you'd like to put devices into D3_cold
if that's possible via ACPI and to be able to wake it up from that state
using remote wakeup?

Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 17, 2012, 9:10 p.m. UTC | #22
On Tuesday, April 17, 2012, huang ying wrote:
> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> >> +     return 0;
> >> >> +}
> >> >> +
> >> >> +static int pcie_port_runtime_resume(struct device *dev)
> >> >> +{
> >> >> +     struct pci_dev *pdev = to_pci_dev(dev);
> >> >> +
> >> >> +     pci_restore_state(pdev);
> >> >> +     if (pdev->runtime_d3cold)
> >> >> +             msleep(100);
> >> >
> >> > What's _that_ supposed to do?
> >>
> >> When resume from d3cold, PCIe main link will be powered on again, it
> >> will take quite some time before the main link go into L0 state.
> >> Otherwise, accessing devices under the port may return wrong result.
> >
> > OK, but this is generic code and as per the standard the link should have been
> > reestablished at this point already.
> >
> > Please don't put some nonstandard-platform-specific quirks like this into
> > code that's supposed to handle _every_ PCIe system.
> 
> After checking PCIe spec, I found that the 100ms here has its standard origin :)
> 
> In PCI Express Base Specification Revision 2.0:
> 
> Section 6.6.1 Conventional Reset
> 
> "
> To allow components to perform internal initialization, system
> software must wait for at least
> 100 ms from the end of a Conventional Reset of one or more devices
> before it is permitted to
> issue Configuration Requests to those devices
> "
> 
> But I think we should move the 100ms delay here to PCIe bus code or
> PCIe/ACPI code, because that is needed by all PCIe devices for D3cold
> support.

I think it should be sufficient to wait for the PME message to arrive at
the root port (which will cause the PME interrupt to appear), at which
point the device that sent it should be able to receive configuration
requests.

At this point, I need to konw what exactly happens when the GPE is triggered
by WAKE#.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 18, 2012, 1:19 a.m. UTC | #23
On Wed, Apr 18, 2012 at 4:20 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Tuesday, April 17, 2012, huang ying wrote:
>> On Tue, Apr 17, 2012 at 1:07 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> > Moreover, I really don't think it's a good idea to put PCI Express ports into
>> > low-power states in general.  It might work on your platform, whatever it is,
>> > but that doesn't mean it's going to work on every PCI Express system out
>> > there.  I actually know of a number of such systems where it surely won't
>> > work at all.
>>
>> The runtime PM support for PCIe port is not turned on by default, we need
>>
>> # echo auto > /sys/devices/pcixxxx:xx/<pci id>/power/control
>>
>> to turn on it.  And we will turn on it on system it works.  Is this sufficient?
>
> No, it is not.  In some cases it shouldn't be enabled at all for PCI Express
> ports, as far as I can say, so to be on the safe side we should only enable it
> on platforms where PCI Express ports are known to work correctly with runtime PM.

Sorry, I did not catch your idea exactly.  I think disable it by
default and enable it by sysfs interface is what you suggested, isn't
it?  Or you suggest something like "white list"?

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 18, 2012, 1:22 a.m. UTC | #24
On Wed, Apr 18, 2012 at 4:43 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Tuesday, April 17, 2012, huang ying wrote:
>> On Tue, Apr 17, 2012 at 10:12 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> > On 04/17/2012 01:07 AM, Rafael J. Wysocki wrote:
>> >
>> >> BTW, can you please explain to me what the #WAKE signal is and how it is
>> >> different from PME#?
>> >
>> > #WAKE signal is triggered by a pin connected to the root complex or other
>> > motherboard logic. PME# is triggered by PME message sent to the port.
>>
>> PME# is a PCI pin, while WAKE# is a PCI Express pin.  In PCI Express,
>> there is no PME#, PME is delivered between end point device and root
>> port or root complex event collector via PME message, and the PME
>> message will trigger IRQ on root port or root complex event collector.
>>  WAKE# is not used for PCI Express D1, D2 and D3hot, it is just used
>> by D3cold.  When remote wakeup detected by end point device, it will
>> assert WAKE# to notify power controller (implemented via ACPI on some
>> platform), then power controller will turn on power for main link,
>> after link goes back to L0, PME message will be sent to root port or
>> root complex event collector by end point device.
>
> OK
>
> So do I understand correctly that the WAKE# signal here is the one described
> in Section 5.3.3.2 Link Wakeup of PCI Express Base spec. 2.0?
>
> So what happens is that it triggers a GPE and that GPE has a _Lxx method
> associated with it, I suppose.  Is that correct?

Yes.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 18, 2012, 1:45 a.m. UTC | #25
On Wed, Apr 18, 2012 at 5:03 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Tuesday, April 17, 2012, huang ying wrote:
>> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> > On Monday, April 16, 2012, huang ying wrote:
>> >> Hi,
>> >>
>> >> On Sat, Apr 14, 2012 at 3:41 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> > Hi,
>> >> >
>> >> > On Friday, April 13, 2012, Yan, Zheng wrote:
>> >> >> Hi all,
>> >> >>
>> >> >> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
>> >> >> beneath a PCIe port when they all have entered D3. A device in D3cold can only
>> >> >> generate wake event through the WAKE# pin. Because we can not access to a device's
>> >> >> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
>> >> >>
>> >> >> Any comment will be appreciated.
>> >> >>
>> >> >> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
>> >> >> ---
>> >> >> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
>> >> >> index 0f150f2..e210e8cb 100644
>> >> >> --- a/drivers/pci/pci-acpi.c
>> >> >> +++ b/drivers/pci/pci-acpi.c
>> >> >> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>> >> >>               [PCI_D1] = ACPI_STATE_D1,
>> >> >>               [PCI_D2] = ACPI_STATE_D2,
>> >> >>               [PCI_D3hot] = ACPI_STATE_D3,
>> >> >> -             [PCI_D3cold] = ACPI_STATE_D3
>> >> >> +             [PCI_D3cold] = ACPI_STATE_D3_COLD
>> >> >>       };
>> >> >>       int error = -EINVAL;
>> >> >>
>> >> >
>> >> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
>> >> >
>> >> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
>> >> > instead.  I'll prepare a patch for that over the weekend if no one has done
>> >> > that already.
>> >> >
>> >> >> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>> >> >>
>> >> >>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>> >> >>  {
>> >> >> -     if (dev->pme_interrupt)
>> >> >> +     /* PME interrupt isn't available in the D3cold case */
>> >> >> +     if (dev->pme_interrupt && !dev->runtime_d3cold)
>> >> >
>> >> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
>> >> > flag makes any sense.  We already cover that in dev->pme_support.
>> >>
>> >> PCIe devices and PCIe root port may have proper PME interrupt support
>> >> (that is, dev->pme_interrupt = true), but the process of remote wakeup
>> >> from D3cold is as follow:
>> >>
>> >> 1) In D3cold, the power of the main link is turned off, aux power is
>> >> provided (PCIe L2 link state)
>> >> 2) Device detect condition to resume, then assert #WAKE pin
>> >> 2) ACPI circuit linked with #WAKE pin, and will generate a GPE for that
>> >> 3) GPE handler will resume device with ACPI (via _PS3 and _PR0), the
>> >> power of the main link is turned on, after a while, link goes into L0
>> >> state
>> >> 4) The PME message is sent to root port, pme interrupt generated
>> >
>> > This isn't how it's supposed to work in theory.  If the device can signal PME
>> > from D3cold, it should be able to reestablish the link and send a PME
>> > message from there.  dev->pme_interrupt set means exactly that.
>> >
>> > ACPI is only supposed to be needed for things that don't send PME
>> > messages (in your case the PME interrupt generated by the port is essentially
>> > useless, because the wakeup event has already been signaled through ACPI).
>> >
>> >> So, for deivce, dev->pme_interrupt = true and dev->pme_support
>> >> advocate it support PME in D3cold.  But we still need ACPI to setup
>> >> run wake for the device.
>> >
>> > OK, so this is nonstandard.
>>
>> This is the standard behavior.  Please refer to PCI Express Base
>> Sepcification Revision 2.0, section 5.3.3.2 Link Wakeup.  In D1, D2
>> and D3hot state, PCIe device can transit the link from L1 to L0 state,
>> and send the PME message.  In D3cold, the main link is powered off,
>> PCIe device will use a STANDARD sideband signal WAKE# to signal wakeup
>> firstly, then platform (power controller in spec) will power on the
>> main link for the device, after main link is back to L0, the PME
>> message is send to root port, pme interrupt is generated.  So in
>> theory, the wake up process can be divided into platform part (which
>> power on the main link) and PCIe part (which send PME).
>
> That's fine.  However, the platform part should be completely transparent
> to the PCI bus type, then.  It just should power up the link to allow a
> PME message to be transmitted through it.

Yes.

>[...]
>
>> > So don't use pci_set_power_state() for that, because it's essentially
>> > a different operation.  You need a pci_platform_remove_power() helper or
>> > similar for that.
>> >
>> > What ACPI method exactly is used to remove power from the device?
>>
>> The ACPI method executed is as follow:
>>
>> - _PS3 (if exist)
>> - Power resources in _PR3 is turned on
>> - Power resources in _PR0 is turned off
>> - Power resources in _PR3 is turned off
>
> I'd rather think
>
> - make sure that _PR3 resources are referenced
> - drop references (from this device) for all other power resources
> - execute _PS3 (if any)
> - drop references for _PR3 resources
>
> if Section 7.2.11 of ACPI 5.0 is to be followed.

Yes.  You are right.

>> I think the process can fit pci_set_power_state() quite well, so why
>> invent another helper for that?
>
> OK, we can special case it, perhaps.
>
> Suppose we have a "this device may be put into D3_cold" flag.
>
> Who's going to decide whether to put it into D3_hot or D3_cold?

In most cases, I think it is OK to put device into D3_cold if that is
supported.  But there should be some special case where D3_cold is not
desirable, for example, we can put SSD into D3_cold safely, but it is
not quite safe to put HDD into D3_cold.  So we want to introduce a
flag: "may_power_off" like in the following patch

https://lkml.org/lkml/2012/3/29/41

It gives device driver a chance to prevent the device to be put into D3_cold.

> [...]
>
>> >> > So now please tell me what exactly you want to achieve and why you want to do
>> >> > that in the first place.
>> >
>> > Well, is there any chance to get that information?
>>
>> You mean the runtime_d3cold flag? That flag is used to tell
>> acpi_pci_run_wake() that we need ACPI wakeup setup for the device
>> because that is needed by D3cold.  The ACPI wakeup setup here means
>> turn on power resources needed by wake up (_PRW) and execute _DSW.
>>
>> If you mean the whole patch, we want to implement runtime D3cold
>> support, which can save more power than D3hot.
>
> So, do I think correctly that you'd like to put devices into D3_cold
> if that's possible via ACPI and to be able to wake it up from that state
> using remote wakeup?

Yes.  Support both remote wakeup and host wakeup.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 18, 2012, 2:01 a.m. UTC | #26
On Wed, Apr 18, 2012 at 5:10 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Tuesday, April 17, 2012, huang ying wrote:
>> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> >> +     return 0;
>> >> >> +}
>> >> >> +
>> >> >> +static int pcie_port_runtime_resume(struct device *dev)
>> >> >> +{
>> >> >> +     struct pci_dev *pdev = to_pci_dev(dev);
>> >> >> +
>> >> >> +     pci_restore_state(pdev);
>> >> >> +     if (pdev->runtime_d3cold)
>> >> >> +             msleep(100);
>> >> >
>> >> > What's _that_ supposed to do?
>> >>
>> >> When resume from d3cold, PCIe main link will be powered on again, it
>> >> will take quite some time before the main link go into L0 state.
>> >> Otherwise, accessing devices under the port may return wrong result.
>> >
>> > OK, but this is generic code and as per the standard the link should have been
>> > reestablished at this point already.
>> >
>> > Please don't put some nonstandard-platform-specific quirks like this into
>> > code that's supposed to handle _every_ PCIe system.
>>
>> After checking PCIe spec, I found that the 100ms here has its standard origin :)
>>
>> In PCI Express Base Specification Revision 2.0:
>>
>> Section 6.6.1 Conventional Reset
>>
>> "
>> To allow components to perform internal initialization, system
>> software must wait for at least
>> 100 ms from the end of a Conventional Reset of one or more devices
>> before it is permitted to
>> issue Configuration Requests to those devices
>> "
>>
>> But I think we should move the 100ms delay here to PCIe bus code or
>> PCIe/ACPI code, because that is needed by all PCIe devices for D3cold
>> support.
>
> I think it should be sufficient to wait for the PME message to arrive at
> the root port (which will cause the PME interrupt to appear), at which
> point the device that sent it should be able to receive configuration
> requests.

For remote wake up, it is sufficient.  But for host wake up, we still
need to wait 100ms.

> At this point, I need to konw what exactly happens when the GPE is triggered
> by WAKE#.

- Lxx handler will be executed
- in Lxx handler, Notify the ACPI handle PCIe port
- Linux has registered a handler for the ACPI handle of PCIe port, in
the handler, turn on _PR0 and execute _PS0, which will power on the
link.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 18, 2012, 7:51 p.m. UTC | #27
On Wednesday, April 18, 2012, huang ying wrote:
> On Wed, Apr 18, 2012 at 4:20 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Tuesday, April 17, 2012, huang ying wrote:
> >> On Tue, Apr 17, 2012 at 1:07 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> > Moreover, I really don't think it's a good idea to put PCI Express ports into
> >> > low-power states in general.  It might work on your platform, whatever it is,
> >> > but that doesn't mean it's going to work on every PCI Express system out
> >> > there.  I actually know of a number of such systems where it surely won't
> >> > work at all.
> >>
> >> The runtime PM support for PCIe port is not turned on by default, we need
> >>
> >> # echo auto > /sys/devices/pcixxxx:xx/<pci id>/power/control
> >>
> >> to turn on it.  And we will turn on it on system it works.  Is this sufficient?
> >
> > No, it is not.  In some cases it shouldn't be enabled at all for PCI Express
> > ports, as far as I can say, so to be on the safe side we should only enable it
> > on platforms where PCI Express ports are known to work correctly with runtime PM.
> 
> Sorry, I did not catch your idea exactly.  I think disable it by
> default and enable it by sysfs interface is what you suggested, isn't
> it?

No, it isn't.  User space shouldn't be able to enable functionality that's
going to break on a subset of systems.

> Or you suggest something like "white list"?

Yes, something like this.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 18, 2012, 7:52 p.m. UTC | #28
On Wednesday, April 18, 2012, huang ying wrote:
> On Wed, Apr 18, 2012 at 4:43 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Tuesday, April 17, 2012, huang ying wrote:
> >> On Tue, Apr 17, 2012 at 10:12 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> >> > On 04/17/2012 01:07 AM, Rafael J. Wysocki wrote:
> >> >
> >> >> BTW, can you please explain to me what the #WAKE signal is and how it is
> >> >> different from PME#?
> >> >
> >> > #WAKE signal is triggered by a pin connected to the root complex or other
> >> > motherboard logic. PME# is triggered by PME message sent to the port.
> >>
> >> PME# is a PCI pin, while WAKE# is a PCI Express pin.  In PCI Express,
> >> there is no PME#, PME is delivered between end point device and root
> >> port or root complex event collector via PME message, and the PME
> >> message will trigger IRQ on root port or root complex event collector.
> >>  WAKE# is not used for PCI Express D1, D2 and D3hot, it is just used
> >> by D3cold.  When remote wakeup detected by end point device, it will
> >> assert WAKE# to notify power controller (implemented via ACPI on some
> >> platform), then power controller will turn on power for main link,
> >> after link goes back to L0, PME message will be sent to root port or
> >> root complex event collector by end point device.
> >
> > OK
> >
> > So do I understand correctly that the WAKE# signal here is the one described
> > in Section 5.3.3.2 Link Wakeup of PCI Express Base spec. 2.0?
> >
> > So what happens is that it triggers a GPE and that GPE has a _Lxx method
> > associated with it, I suppose.  Is that correct?
> 
> Yes.

So I wonder what that _Lxx method looks like.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 18, 2012, 8:51 p.m. UTC | #29
On Wednesday, April 18, 2012, huang ying wrote:
> On Wed, Apr 18, 2012 at 5:10 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Tuesday, April 17, 2012, huang ying wrote:
> >> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> >> >> +     return 0;
> >> >> >> +}
> >> >> >> +
> >> >> >> +static int pcie_port_runtime_resume(struct device *dev)
> >> >> >> +{
> >> >> >> +     struct pci_dev *pdev = to_pci_dev(dev);
> >> >> >> +
> >> >> >> +     pci_restore_state(pdev);
> >> >> >> +     if (pdev->runtime_d3cold)
> >> >> >> +             msleep(100);
> >> >> >
> >> >> > What's _that_ supposed to do?
> >> >>
> >> >> When resume from d3cold, PCIe main link will be powered on again, it
> >> >> will take quite some time before the main link go into L0 state.
> >> >> Otherwise, accessing devices under the port may return wrong result.
> >> >
> >> > OK, but this is generic code and as per the standard the link should have been
> >> > reestablished at this point already.
> >> >
> >> > Please don't put some nonstandard-platform-specific quirks like this into
> >> > code that's supposed to handle _every_ PCIe system.
> >>
> >> After checking PCIe spec, I found that the 100ms here has its standard origin :)
> >>
> >> In PCI Express Base Specification Revision 2.0:
> >>
> >> Section 6.6.1 Conventional Reset
> >>
> >> "
> >> To allow components to perform internal initialization, system
> >> software must wait for at least
> >> 100 ms from the end of a Conventional Reset of one or more devices
> >> before it is permitted to
> >> issue Configuration Requests to those devices
> >> "
> >>
> >> But I think we should move the 100ms delay here to PCIe bus code or
> >> PCIe/ACPI code, because that is needed by all PCIe devices for D3cold
> >> support.
> >
> > I think it should be sufficient to wait for the PME message to arrive at
> > the root port (which will cause the PME interrupt to appear), at which
> > point the device that sent it should be able to receive configuration
> > requests.
> 
> For remote wake up, it is sufficient.  But for host wake up, we still
> need to wait 100ms.

Yes, we do.

> > At this point, I need to konw what exactly happens when the GPE is triggered
> > by WAKE#.
> 
> - Lxx handler will be executed
> - in Lxx handler, Notify the ACPI handle PCIe port
> - Linux has registered a handler for the ACPI handle of PCIe port, in
> the handler, turn on _PR0 and execute _PS0, which will power on the
> link.

But the handler we have is not the handler we want here.

In fact, there are two handlers, pci_acpi_wake_bus() and pci_acpi_wake_dev()
and they only do useful things for ACPI_NOTIFY_DEVICE_WAKE.  Is that the
event type we receive from that _Lxx?

Even if so, these routines don't seem to be suitable to handle the case at hand.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 18, 2012, 9 p.m. UTC | #30
On Wednesday, April 18, 2012, huang ying wrote:
> On Wed, Apr 18, 2012 at 5:03 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Tuesday, April 17, 2012, huang ying wrote:
> >> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> > On Monday, April 16, 2012, huang ying wrote:
> >> >> Hi,
> >> >>
> >> >> On Sat, Apr 14, 2012 at 3:41 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> >> > Hi,
> >> >> >
> >> >> > On Friday, April 13, 2012, Yan, Zheng wrote:
> >> >> >> Hi all,
> >> >> >>
> >> >> >> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> >> >> >> beneath a PCIe port when they all have entered D3. A device in D3cold can only
> >> >> >> generate wake event through the WAKE# pin. Because we can not access to a device's
> >> >> >> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> >> >> >>
> >> >> >> Any comment will be appreciated.
> >> >> >>
> >> >> >> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> >> >> >> ---
> >> >> >> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> >> >> >> index 0f150f2..e210e8cb 100644
> >> >> >> --- a/drivers/pci/pci-acpi.c
> >> >> >> +++ b/drivers/pci/pci-acpi.c
> >> >> >> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >> >> >>               [PCI_D1] = ACPI_STATE_D1,
> >> >> >>               [PCI_D2] = ACPI_STATE_D2,
> >> >> >>               [PCI_D3hot] = ACPI_STATE_D3,
> >> >> >> -             [PCI_D3cold] = ACPI_STATE_D3
> >> >> >> +             [PCI_D3cold] = ACPI_STATE_D3_COLD
> >> >> >>       };
> >> >> >>       int error = -EINVAL;
> >> >> >>
> >> >> >
> >> >> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
> >> >> >
> >> >> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> >> >> > instead.  I'll prepare a patch for that over the weekend if no one has done
> >> >> > that already.
> >> >> >
> >> >> >> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
> >> >> >>
> >> >> >>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
> >> >> >>  {
> >> >> >> -     if (dev->pme_interrupt)
> >> >> >> +     /* PME interrupt isn't available in the D3cold case */
> >> >> >> +     if (dev->pme_interrupt && !dev->runtime_d3cold)
> >> >> >
> >> >> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> >> >> > flag makes any sense.  We already cover that in dev->pme_support.
> >> >>
> >> >> PCIe devices and PCIe root port may have proper PME interrupt support
> >> >> (that is, dev->pme_interrupt = true), but the process of remote wakeup
> >> >> from D3cold is as follow:
> >> >>
> >> >> 1) In D3cold, the power of the main link is turned off, aux power is
> >> >> provided (PCIe L2 link state)
> >> >> 2) Device detect condition to resume, then assert #WAKE pin
> >> >> 2) ACPI circuit linked with #WAKE pin, and will generate a GPE for that
> >> >> 3) GPE handler will resume device with ACPI (via _PS3 and _PR0), the
> >> >> power of the main link is turned on, after a while, link goes into L0
> >> >> state
> >> >> 4) The PME message is sent to root port, pme interrupt generated
> >> >
> >> > This isn't how it's supposed to work in theory.  If the device can signal PME
> >> > from D3cold, it should be able to reestablish the link and send a PME
> >> > message from there.  dev->pme_interrupt set means exactly that.
> >> >
> >> > ACPI is only supposed to be needed for things that don't send PME
> >> > messages (in your case the PME interrupt generated by the port is essentially
> >> > useless, because the wakeup event has already been signaled through ACPI).
> >> >
> >> >> So, for deivce, dev->pme_interrupt = true and dev->pme_support
> >> >> advocate it support PME in D3cold.  But we still need ACPI to setup
> >> >> run wake for the device.
> >> >
> >> > OK, so this is nonstandard.
> >>
> >> This is the standard behavior.  Please refer to PCI Express Base
> >> Sepcification Revision 2.0, section 5.3.3.2 Link Wakeup.  In D1, D2
> >> and D3hot state, PCIe device can transit the link from L1 to L0 state,
> >> and send the PME message.  In D3cold, the main link is powered off,
> >> PCIe device will use a STANDARD sideband signal WAKE# to signal wakeup
> >> firstly, then platform (power controller in spec) will power on the
> >> main link for the device, after main link is back to L0, the PME
> >> message is send to root port, pme interrupt is generated.  So in
> >> theory, the wake up process can be divided into platform part (which
> >> power on the main link) and PCIe part (which send PME).
> >
> > That's fine.  However, the platform part should be completely transparent
> > to the PCI bus type, then.  It just should power up the link to allow a
> > PME message to be transmitted through it.
> 
> Yes.
> 
> >[...]
> >
> >> > So don't use pci_set_power_state() for that, because it's essentially
> >> > a different operation.  You need a pci_platform_remove_power() helper or
> >> > similar for that.
> >> >
> >> > What ACPI method exactly is used to remove power from the device?
> >>
> >> The ACPI method executed is as follow:
> >>
> >> - _PS3 (if exist)
> >> - Power resources in _PR3 is turned on
> >> - Power resources in _PR0 is turned off
> >> - Power resources in _PR3 is turned off
> >
> > I'd rather think
> >
> > - make sure that _PR3 resources are referenced
> > - drop references (from this device) for all other power resources
> > - execute _PS3 (if any)
> > - drop references for _PR3 resources
> >
> > if Section 7.2.11 of ACPI 5.0 is to be followed.
> 
> Yes.  You are right.
> 
> >> I think the process can fit pci_set_power_state() quite well, so why
> >> invent another helper for that?
> >
> > OK, we can special case it, perhaps.
> >
> > Suppose we have a "this device may be put into D3_cold" flag.
> >
> > Who's going to decide whether to put it into D3_hot or D3_cold?
> 
> In most cases, I think it is OK to put device into D3_cold if that is
> supported.

Well, there may be PM QoS latency requirements preventing us from doing so.

> But there should be some special case where D3_cold is not
> desirable, for example, we can put SSD into D3_cold safely, but it is
> not quite safe to put HDD into D3_cold.  So we want to introduce a
> flag: "may_power_off" like in the following patch
> 
> https://lkml.org/lkml/2012/3/29/41
> 
> It gives device driver a chance to prevent the device to be put into D3_cold.

I see.  So your proposal is that the flag might be used to indicate to
whoever carries out power transitions of devices that power must not be
removed from this particular device, right?

In that case we can put that flag into struct dev_pm_info after all, but
perhaps the name should indicate more precisely what it is about.  Something
like "power_must_be_on" maybe?

> > [...]
> >
> >> >> > So now please tell me what exactly you want to achieve and why you want to do
> >> >> > that in the first place.
> >> >
> >> > Well, is there any chance to get that information?
> >>
> >> You mean the runtime_d3cold flag? That flag is used to tell
> >> acpi_pci_run_wake() that we need ACPI wakeup setup for the device
> >> because that is needed by D3cold.  The ACPI wakeup setup here means
> >> turn on power resources needed by wake up (_PRW) and execute _DSW.
> >>
> >> If you mean the whole patch, we want to implement runtime D3cold
> >> support, which can save more power than D3hot.
> >
> > So, do I think correctly that you'd like to put devices into D3_cold
> > if that's possible via ACPI and to be able to wake it up from that state
> > using remote wakeup?
> 
> Yes.  Support both remote wakeup and host wakeup.

OK, so we need to clean up the ACPI D3_HOT/D3_COLD mess first.  Then, we need
to change PCI so that devices are not put into D3cold (by ACPI) if only D3hot
is requested by the caller of pci_set_power_state().

Having done that, we can modify pci_set_power_state() to handle D3cold as
a special case (essentially, it should check that case before doing anything
else).  Finally, we need to teach the ACPI notify handler about the WAKE#
event and we need to add the 100 ms wait to the device resume code path
somewhere (I guess in pci_set_power_state() for the D3cold->D0 transition).

Now, there's one more thing to consider.  Namely, if a PCIe endpoint is put
into D3hot (via native PM) and then the port it is connected to is put into
D3_hot (via native PM), does that transfer the endpoint into D3cold?

Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 19, 2012, 2:08 a.m. UTC | #31
On Thu, Apr 19, 2012 at 4:51 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Wednesday, April 18, 2012, huang ying wrote:
>> On Wed, Apr 18, 2012 at 5:10 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> > On Tuesday, April 17, 2012, huang ying wrote:
>> >> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> >> >> +     return 0;
>> >> >> >> +}
>> >> >> >> +
>> >> >> >> +static int pcie_port_runtime_resume(struct device *dev)
>> >> >> >> +{
>> >> >> >> +     struct pci_dev *pdev = to_pci_dev(dev);
>> >> >> >> +
>> >> >> >> +     pci_restore_state(pdev);
>> >> >> >> +     if (pdev->runtime_d3cold)
>> >> >> >> +             msleep(100);
>> >> >> >
>> >> >> > What's _that_ supposed to do?
>> >> >>
>> >> >> When resume from d3cold, PCIe main link will be powered on again, it
>> >> >> will take quite some time before the main link go into L0 state.
>> >> >> Otherwise, accessing devices under the port may return wrong result.
>> >> >
>> >> > OK, but this is generic code and as per the standard the link should have been
>> >> > reestablished at this point already.
>> >> >
>> >> > Please don't put some nonstandard-platform-specific quirks like this into
>> >> > code that's supposed to handle _every_ PCIe system.
>> >>
>> >> After checking PCIe spec, I found that the 100ms here has its standard origin :)
>> >>
>> >> In PCI Express Base Specification Revision 2.0:
>> >>
>> >> Section 6.6.1 Conventional Reset
>> >>
>> >> "
>> >> To allow components to perform internal initialization, system
>> >> software must wait for at least
>> >> 100 ms from the end of a Conventional Reset of one or more devices
>> >> before it is permitted to
>> >> issue Configuration Requests to those devices
>> >> "
>> >>
>> >> But I think we should move the 100ms delay here to PCIe bus code or
>> >> PCIe/ACPI code, because that is needed by all PCIe devices for D3cold
>> >> support.
>> >
>> > I think it should be sufficient to wait for the PME message to arrive at
>> > the root port (which will cause the PME interrupt to appear), at which
>> > point the device that sent it should be able to receive configuration
>> > requests.
>>
>> For remote wake up, it is sufficient.  But for host wake up, we still
>> need to wait 100ms.
>
> Yes, we do.
>
>> > At this point, I need to konw what exactly happens when the GPE is triggered
>> > by WAKE#.
>>
>> - Lxx handler will be executed
>> - in Lxx handler, Notify the ACPI handle PCIe port
>> - Linux has registered a handler for the ACPI handle of PCIe port, in
>> the handler, turn on _PR0 and execute _PS0, which will power on the
>> link.
>
> But the handler we have is not the handler we want here.
>
> In fact, there are two handlers, pci_acpi_wake_bus() and pci_acpi_wake_dev()
> and they only do useful things for ACPI_NOTIFY_DEVICE_WAKE.  Is that the
> event type we receive from that _Lxx?

I check the DSDT, in _Lxx, there is

Notify (\_SB.PCI0.RP03, 0x02)

That is, the event type is ACPI_NOTIFY_DEVICE_WAKE.

> Even if so, these routines don't seem to be suitable to handle the case at hand.

Yes.  Maybe add a flag named like "come_from_d3cold", and if
come_from_d3cold == true, resume the dev itself without checking pme
bits, because the PCIe main link is not available now.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 19, 2012, 2:47 a.m. UTC | #32
On Thu, Apr 19, 2012 at 5:00 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Wednesday, April 18, 2012, huang ying wrote:
>> On Wed, Apr 18, 2012 at 5:03 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> > On Tuesday, April 17, 2012, huang ying wrote:
>> >> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> > On Monday, April 16, 2012, huang ying wrote:
>> >> >> Hi,
>> >> >>
>> >> >> On Sat, Apr 14, 2012 at 3:41 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> >> > Hi,
>> >> >> >
>> >> >> > On Friday, April 13, 2012, Yan, Zheng wrote:
>> >> >> >> Hi all,
>> >> >> >>
>> >> >> >> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
>> >> >> >> beneath a PCIe port when they all have entered D3. A device in D3cold can only
>> >> >> >> generate wake event through the WAKE# pin. Because we can not access to a device's
>> >> >> >> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
>> >> >> >>
>> >> >> >> Any comment will be appreciated.
>> >> >> >>
>> >> >> >> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
>> >> >> >> ---
>> >> >> >> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
>> >> >> >> index 0f150f2..e210e8cb 100644
>> >> >> >> --- a/drivers/pci/pci-acpi.c
>> >> >> >> +++ b/drivers/pci/pci-acpi.c
>> >> >> >> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>> >> >> >>               [PCI_D1] = ACPI_STATE_D1,
>> >> >> >>               [PCI_D2] = ACPI_STATE_D2,
>> >> >> >>               [PCI_D3hot] = ACPI_STATE_D3,
>> >> >> >> -             [PCI_D3cold] = ACPI_STATE_D3
>> >> >> >> +             [PCI_D3cold] = ACPI_STATE_D3_COLD
>> >> >> >>       };
>> >> >> >>       int error = -EINVAL;
>> >> >> >>
>> >> >> >
>> >> >> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
>> >> >> >
>> >> >> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
>> >> >> > instead.  I'll prepare a patch for that over the weekend if no one has done
>> >> >> > that already.
>> >> >> >
>> >> >> >> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>> >> >> >>
>> >> >> >>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>> >> >> >>  {
>> >> >> >> -     if (dev->pme_interrupt)
>> >> >> >> +     /* PME interrupt isn't available in the D3cold case */
>> >> >> >> +     if (dev->pme_interrupt && !dev->runtime_d3cold)
>> >> >> >
>> >> >> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
>> >> >> > flag makes any sense.  We already cover that in dev->pme_support.
>> >> >>
>> >> >> PCIe devices and PCIe root port may have proper PME interrupt support
>> >> >> (that is, dev->pme_interrupt = true), but the process of remote wakeup
>> >> >> from D3cold is as follow:
>> >> >>
>> >> >> 1) In D3cold, the power of the main link is turned off, aux power is
>> >> >> provided (PCIe L2 link state)
>> >> >> 2) Device detect condition to resume, then assert #WAKE pin
>> >> >> 2) ACPI circuit linked with #WAKE pin, and will generate a GPE for that
>> >> >> 3) GPE handler will resume device with ACPI (via _PS3 and _PR0), the
>> >> >> power of the main link is turned on, after a while, link goes into L0
>> >> >> state
>> >> >> 4) The PME message is sent to root port, pme interrupt generated
>> >> >
>> >> > This isn't how it's supposed to work in theory.  If the device can signal PME
>> >> > from D3cold, it should be able to reestablish the link and send a PME
>> >> > message from there.  dev->pme_interrupt set means exactly that.
>> >> >
>> >> > ACPI is only supposed to be needed for things that don't send PME
>> >> > messages (in your case the PME interrupt generated by the port is essentially
>> >> > useless, because the wakeup event has already been signaled through ACPI).
>> >> >
>> >> >> So, for deivce, dev->pme_interrupt = true and dev->pme_support
>> >> >> advocate it support PME in D3cold.  But we still need ACPI to setup
>> >> >> run wake for the device.
>> >> >
>> >> > OK, so this is nonstandard.
>> >>
>> >> This is the standard behavior.  Please refer to PCI Express Base
>> >> Sepcification Revision 2.0, section 5.3.3.2 Link Wakeup.  In D1, D2
>> >> and D3hot state, PCIe device can transit the link from L1 to L0 state,
>> >> and send the PME message.  In D3cold, the main link is powered off,
>> >> PCIe device will use a STANDARD sideband signal WAKE# to signal wakeup
>> >> firstly, then platform (power controller in spec) will power on the
>> >> main link for the device, after main link is back to L0, the PME
>> >> message is send to root port, pme interrupt is generated.  So in
>> >> theory, the wake up process can be divided into platform part (which
>> >> power on the main link) and PCIe part (which send PME).
>> >
>> > That's fine.  However, the platform part should be completely transparent
>> > to the PCI bus type, then.  It just should power up the link to allow a
>> > PME message to be transmitted through it.
>>
>> Yes.
>>
>> >[...]
>> >
>> >> > So don't use pci_set_power_state() for that, because it's essentially
>> >> > a different operation.  You need a pci_platform_remove_power() helper or
>> >> > similar for that.
>> >> >
>> >> > What ACPI method exactly is used to remove power from the device?
>> >>
>> >> The ACPI method executed is as follow:
>> >>
>> >> - _PS3 (if exist)
>> >> - Power resources in _PR3 is turned on
>> >> - Power resources in _PR0 is turned off
>> >> - Power resources in _PR3 is turned off
>> >
>> > I'd rather think
>> >
>> > - make sure that _PR3 resources are referenced
>> > - drop references (from this device) for all other power resources
>> > - execute _PS3 (if any)
>> > - drop references for _PR3 resources
>> >
>> > if Section 7.2.11 of ACPI 5.0 is to be followed.
>>
>> Yes.  You are right.
>>
>> >> I think the process can fit pci_set_power_state() quite well, so why
>> >> invent another helper for that?
>> >
>> > OK, we can special case it, perhaps.
>> >
>> > Suppose we have a "this device may be put into D3_cold" flag.
>> >
>> > Who's going to decide whether to put it into D3_hot or D3_cold?
>>
>> In most cases, I think it is OK to put device into D3_cold if that is
>> supported.
>
> Well, there may be PM QoS latency requirements preventing us from doing so.

Yes.

>> But there should be some special case where D3_cold is not
>> desirable, for example, we can put SSD into D3_cold safely, but it is
>> not quite safe to put HDD into D3_cold.  So we want to introduce a
>> flag: "may_power_off" like in the following patch
>>
>> https://lkml.org/lkml/2012/3/29/41
>>
>> It gives device driver a chance to prevent the device to be put into D3_cold.
>
> I see.  So your proposal is that the flag might be used to indicate to
> whoever carries out power transitions of devices that power must not be
> removed from this particular device, right?

Yes.

> In that case we can put that flag into struct dev_pm_info after all, but
> perhaps the name should indicate more precisely what it is about.  Something
> like "power_must_be_on" maybe?

I am not good at naming in English :)
I will accept your proposal.

>> > [...]
>> >
>> >> >> > So now please tell me what exactly you want to achieve and why you want to do
>> >> >> > that in the first place.
>> >> >
>> >> > Well, is there any chance to get that information?
>> >>
>> >> You mean the runtime_d3cold flag? That flag is used to tell
>> >> acpi_pci_run_wake() that we need ACPI wakeup setup for the device
>> >> because that is needed by D3cold.  The ACPI wakeup setup here means
>> >> turn on power resources needed by wake up (_PRW) and execute _DSW.
>> >>
>> >> If you mean the whole patch, we want to implement runtime D3cold
>> >> support, which can save more power than D3hot.
>> >
>> > So, do I think correctly that you'd like to put devices into D3_cold
>> > if that's possible via ACPI and to be able to wake it up from that state
>> > using remote wakeup?
>>
>> Yes.  Support both remote wakeup and host wakeup.
>
> OK, so we need to clean up the ACPI D3_HOT/D3_COLD mess first.  Then, we need
> to change PCI so that devices are not put into D3cold (by ACPI) if only D3hot
> is requested by the caller of pci_set_power_state().
>
> Having done that, we can modify pci_set_power_state() to handle D3cold as
> a special case (essentially, it should check that case before doing anything
> else).  Finally, we need to teach the ACPI notify handler about the WAKE#
> event and we need to add the 100 ms wait to the device resume code path
> somewhere (I guess in pci_set_power_state() for the D3cold->D0 transition).

Yes.  Sound good to me.

> Now, there's one more thing to consider.  Namely, if a PCIe endpoint is put
> into D3hot (via native PM) and then the port it is connected to is put into
> D3_hot (via native PM), does that transfer the endpoint into D3cold?

No.

But if a PCIe endpoint is put into D3hot and then the port it is
connected is put into D3_cold (via ACPI), this will transfer the
endpoint into D3_cold, and if the port is put into D0 afterwards, all
subordinate endpoint devices will be put into D0 (because of power on
reset).  I think what we need to do here is:

- when choose power state, if any subordinate device has
power_must_be_on set, will not choose D3_cold

- when put PCIe port from D3_cold to D0, resume all subordinate
devices too.  We design a method to do that in following patch:

https://lkml.org/lkml/2012/3/29/38

Where we will register all subordinate devices via
acpi_power_resource_register_device(endpoint_device,
bridget_acpi_handle).

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 19, 2012, 12:31 p.m. UTC | #33
On Thursday, April 19, 2012, huang ying wrote:
> On Thu, Apr 19, 2012 at 5:00 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Wednesday, April 18, 2012, huang ying wrote:
> >> On Wed, Apr 18, 2012 at 5:03 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> > On Tuesday, April 17, 2012, huang ying wrote:
> >> >> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> >> > On Monday, April 16, 2012, huang ying wrote:
> >> >> >> Hi,
> >> >> >>
> >> >> >> On Sat, Apr 14, 2012 at 3:41 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> >> >> > Hi,
> >> >> >> >
> >> >> >> > On Friday, April 13, 2012, Yan, Zheng wrote:
> >> >> >> >> Hi all,
> >> >> >> >>
> >> >> >> >> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
> >> >> >> >> beneath a PCIe port when they all have entered D3. A device in D3cold can only
> >> >> >> >> generate wake event through the WAKE# pin. Because we can not access to a device's
> >> >> >> >> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
> >> >> >> >>
> >> >> >> >> Any comment will be appreciated.
> >> >> >> >>
> >> >> >> >> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
> >> >> >> >> ---
> >> >> >> >> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> >> >> >> >> index 0f150f2..e210e8cb 100644
> >> >> >> >> --- a/drivers/pci/pci-acpi.c
> >> >> >> >> +++ b/drivers/pci/pci-acpi.c
> >> >> >> >> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >> >> >> >>               [PCI_D1] = ACPI_STATE_D1,
> >> >> >> >>               [PCI_D2] = ACPI_STATE_D2,
> >> >> >> >>               [PCI_D3hot] = ACPI_STATE_D3,
> >> >> >> >> -             [PCI_D3cold] = ACPI_STATE_D3
> >> >> >> >> +             [PCI_D3cold] = ACPI_STATE_D3_COLD
> >> >> >> >>       };
> >> >> >> >>       int error = -EINVAL;
> >> >> >> >>
> >> >> >> >
> >> >> >> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
> >> >> >> >
> >> >> >> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
> >> >> >> > instead.  I'll prepare a patch for that over the weekend if no one has done
> >> >> >> > that already.
> >> >> >> >
> >> >> >> >> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
> >> >> >> >>
> >> >> >> >>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
> >> >> >> >>  {
> >> >> >> >> -     if (dev->pme_interrupt)
> >> >> >> >> +     /* PME interrupt isn't available in the D3cold case */
> >> >> >> >> +     if (dev->pme_interrupt && !dev->runtime_d3cold)
> >> >> >> >
> >> >> >> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
> >> >> >> > flag makes any sense.  We already cover that in dev->pme_support.
> >> >> >>
> >> >> >> PCIe devices and PCIe root port may have proper PME interrupt support
> >> >> >> (that is, dev->pme_interrupt = true), but the process of remote wakeup
> >> >> >> from D3cold is as follow:
> >> >> >>
> >> >> >> 1) In D3cold, the power of the main link is turned off, aux power is
> >> >> >> provided (PCIe L2 link state)
> >> >> >> 2) Device detect condition to resume, then assert #WAKE pin
> >> >> >> 2) ACPI circuit linked with #WAKE pin, and will generate a GPE for that
> >> >> >> 3) GPE handler will resume device with ACPI (via _PS3 and _PR0), the
> >> >> >> power of the main link is turned on, after a while, link goes into L0
> >> >> >> state
> >> >> >> 4) The PME message is sent to root port, pme interrupt generated
> >> >> >
> >> >> > This isn't how it's supposed to work in theory.  If the device can signal PME
> >> >> > from D3cold, it should be able to reestablish the link and send a PME
> >> >> > message from there.  dev->pme_interrupt set means exactly that.
> >> >> >
> >> >> > ACPI is only supposed to be needed for things that don't send PME
> >> >> > messages (in your case the PME interrupt generated by the port is essentially
> >> >> > useless, because the wakeup event has already been signaled through ACPI).
> >> >> >
> >> >> >> So, for deivce, dev->pme_interrupt = true and dev->pme_support
> >> >> >> advocate it support PME in D3cold.  But we still need ACPI to setup
> >> >> >> run wake for the device.
> >> >> >
> >> >> > OK, so this is nonstandard.
> >> >>
> >> >> This is the standard behavior.  Please refer to PCI Express Base
> >> >> Sepcification Revision 2.0, section 5.3.3.2 Link Wakeup.  In D1, D2
> >> >> and D3hot state, PCIe device can transit the link from L1 to L0 state,
> >> >> and send the PME message.  In D3cold, the main link is powered off,
> >> >> PCIe device will use a STANDARD sideband signal WAKE# to signal wakeup
> >> >> firstly, then platform (power controller in spec) will power on the
> >> >> main link for the device, after main link is back to L0, the PME
> >> >> message is send to root port, pme interrupt is generated.  So in
> >> >> theory, the wake up process can be divided into platform part (which
> >> >> power on the main link) and PCIe part (which send PME).
> >> >
> >> > That's fine.  However, the platform part should be completely transparent
> >> > to the PCI bus type, then.  It just should power up the link to allow a
> >> > PME message to be transmitted through it.
> >>
> >> Yes.
> >>
> >> >[...]
> >> >
> >> >> > So don't use pci_set_power_state() for that, because it's essentially
> >> >> > a different operation.  You need a pci_platform_remove_power() helper or
> >> >> > similar for that.
> >> >> >
> >> >> > What ACPI method exactly is used to remove power from the device?
> >> >>
> >> >> The ACPI method executed is as follow:
> >> >>
> >> >> - _PS3 (if exist)
> >> >> - Power resources in _PR3 is turned on
> >> >> - Power resources in _PR0 is turned off
> >> >> - Power resources in _PR3 is turned off
> >> >
> >> > I'd rather think
> >> >
> >> > - make sure that _PR3 resources are referenced
> >> > - drop references (from this device) for all other power resources
> >> > - execute _PS3 (if any)
> >> > - drop references for _PR3 resources
> >> >
> >> > if Section 7.2.11 of ACPI 5.0 is to be followed.
> >>
> >> Yes.  You are right.
> >>
> >> >> I think the process can fit pci_set_power_state() quite well, so why
> >> >> invent another helper for that?
> >> >
> >> > OK, we can special case it, perhaps.
> >> >
> >> > Suppose we have a "this device may be put into D3_cold" flag.
> >> >
> >> > Who's going to decide whether to put it into D3_hot or D3_cold?
> >>
> >> In most cases, I think it is OK to put device into D3_cold if that is
> >> supported.
> >
> > Well, there may be PM QoS latency requirements preventing us from doing so.
> 
> Yes.
> 
> >> But there should be some special case where D3_cold is not
> >> desirable, for example, we can put SSD into D3_cold safely, but it is
> >> not quite safe to put HDD into D3_cold.  So we want to introduce a
> >> flag: "may_power_off" like in the following patch
> >>
> >> https://lkml.org/lkml/2012/3/29/41
> >>
> >> It gives device driver a chance to prevent the device to be put into D3_cold.
> >
> > I see.  So your proposal is that the flag might be used to indicate to
> > whoever carries out power transitions of devices that power must not be
> > removed from this particular device, right?
> 
> Yes.
> 
> > In that case we can put that flag into struct dev_pm_info after all, but
> > perhaps the name should indicate more precisely what it is about.  Something
> > like "power_must_be_on" maybe?
> 
> I am not good at naming in English :)
> I will accept your proposal.
> 
> >> > [...]
> >> >
> >> >> >> > So now please tell me what exactly you want to achieve and why you want to do
> >> >> >> > that in the first place.
> >> >> >
> >> >> > Well, is there any chance to get that information?
> >> >>
> >> >> You mean the runtime_d3cold flag? That flag is used to tell
> >> >> acpi_pci_run_wake() that we need ACPI wakeup setup for the device
> >> >> because that is needed by D3cold.  The ACPI wakeup setup here means
> >> >> turn on power resources needed by wake up (_PRW) and execute _DSW.
> >> >>
> >> >> If you mean the whole patch, we want to implement runtime D3cold
> >> >> support, which can save more power than D3hot.
> >> >
> >> > So, do I think correctly that you'd like to put devices into D3_cold
> >> > if that's possible via ACPI and to be able to wake it up from that state
> >> > using remote wakeup?
> >>
> >> Yes.  Support both remote wakeup and host wakeup.
> >
> > OK, so we need to clean up the ACPI D3_HOT/D3_COLD mess first.  Then, we need
> > to change PCI so that devices are not put into D3cold (by ACPI) if only D3hot
> > is requested by the caller of pci_set_power_state().
> >
> > Having done that, we can modify pci_set_power_state() to handle D3cold as
> > a special case (essentially, it should check that case before doing anything
> > else).  Finally, we need to teach the ACPI notify handler about the WAKE#
> > event and we need to add the 100 ms wait to the device resume code path
> > somewhere (I guess in pci_set_power_state() for the D3cold->D0 transition).
> 
> Yes.  Sound good to me.
> 
> > Now, there's one more thing to consider.  Namely, if a PCIe endpoint is put
> > into D3hot (via native PM) and then the port it is connected to is put into
> > D3_hot (via native PM), does that transfer the endpoint into D3cold?
> 
> No.
> 
> But if a PCIe endpoint is put into D3hot and then the port it is
> connected is put into D3_cold (via ACPI), this will transfer the
> endpoint into D3_cold, and if the port is put into D0 afterwards, all
> subordinate endpoint devices will be put into D0 (because of power on
> reset).  I think what we need to do here is:
> 
> - when choose power state, if any subordinate device has
> power_must_be_on set, will not choose D3_cold

Well, that's quite obvious. :-)

> - when put PCIe port from D3_cold to D0, resume all subordinate
> devices too.

Alternatively, we can just call pci_restore_state() on them and put
them into PCI_D3hot again without actually resuming.

> We design a method to do that in following patch:
> 
> https://lkml.org/lkml/2012/3/29/38
> 
> Where we will register all subordinate devices via
> acpi_power_resource_register_device(endpoint_device,
> bridget_acpi_handle).

I think that's overkill in the case of PCI devices under a PCIe port.
We only need to walk the bus below the port and do something for each device
in there then.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki April 19, 2012, 12:36 p.m. UTC | #34
On Thursday, April 19, 2012, huang ying wrote:
> On Thu, Apr 19, 2012 at 4:51 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Wednesday, April 18, 2012, huang ying wrote:
> >> On Wed, Apr 18, 2012 at 5:10 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> > On Tuesday, April 17, 2012, huang ying wrote:
> >> >> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >> >> >> >> +     return 0;
> >> >> >> >> +}
> >> >> >> >> +
> >> >> >> >> +static int pcie_port_runtime_resume(struct device *dev)
> >> >> >> >> +{
> >> >> >> >> +     struct pci_dev *pdev = to_pci_dev(dev);
> >> >> >> >> +
> >> >> >> >> +     pci_restore_state(pdev);
> >> >> >> >> +     if (pdev->runtime_d3cold)
> >> >> >> >> +             msleep(100);
> >> >> >> >
> >> >> >> > What's _that_ supposed to do?
> >> >> >>
> >> >> >> When resume from d3cold, PCIe main link will be powered on again, it
> >> >> >> will take quite some time before the main link go into L0 state.
> >> >> >> Otherwise, accessing devices under the port may return wrong result.
> >> >> >
> >> >> > OK, but this is generic code and as per the standard the link should have been
> >> >> > reestablished at this point already.
> >> >> >
> >> >> > Please don't put some nonstandard-platform-specific quirks like this into
> >> >> > code that's supposed to handle _every_ PCIe system.
> >> >>
> >> >> After checking PCIe spec, I found that the 100ms here has its standard origin :)
> >> >>
> >> >> In PCI Express Base Specification Revision 2.0:
> >> >>
> >> >> Section 6.6.1 Conventional Reset
> >> >>
> >> >> "
> >> >> To allow components to perform internal initialization, system
> >> >> software must wait for at least
> >> >> 100 ms from the end of a Conventional Reset of one or more devices
> >> >> before it is permitted to
> >> >> issue Configuration Requests to those devices
> >> >> "
> >> >>
> >> >> But I think we should move the 100ms delay here to PCIe bus code or
> >> >> PCIe/ACPI code, because that is needed by all PCIe devices for D3cold
> >> >> support.
> >> >
> >> > I think it should be sufficient to wait for the PME message to arrive at
> >> > the root port (which will cause the PME interrupt to appear), at which
> >> > point the device that sent it should be able to receive configuration
> >> > requests.
> >>
> >> For remote wake up, it is sufficient.  But for host wake up, we still
> >> need to wait 100ms.
> >
> > Yes, we do.
> >
> >> > At this point, I need to konw what exactly happens when the GPE is triggered
> >> > by WAKE#.
> >>
> >> - Lxx handler will be executed
> >> - in Lxx handler, Notify the ACPI handle PCIe port
> >> - Linux has registered a handler for the ACPI handle of PCIe port, in
> >> the handler, turn on _PR0 and execute _PS0, which will power on the
> >> link.
> >
> > But the handler we have is not the handler we want here.
> >
> > In fact, there are two handlers, pci_acpi_wake_bus() and pci_acpi_wake_dev()
> > and they only do useful things for ACPI_NOTIFY_DEVICE_WAKE.  Is that the
> > event type we receive from that _Lxx?
> 
> I check the DSDT, in _Lxx, there is
> 
> Notify (\_SB.PCI0.RP03, 0x02)
> 
> That is, the event type is ACPI_NOTIFY_DEVICE_WAKE.
> 
> > Even if so, these routines don't seem to be suitable to handle the case at hand.
> 
> Yes.  Maybe add a flag named like "come_from_d3cold", and if
> come_from_d3cold == true, resume the dev itself without checking pme
> bits, because the PCIe main link is not available now.

Actually, I think we can do the full resume (however with the 100 ms wait)
in that case, because we're going to resume the device shortly anyway.

The PME would only be useful as a kind of handshake with the device, so we
know we can access its registers, but as you pointed that out, we need the
100 ms delay in the host wakeup case anyway, so perhaps it's better to make
remote wakeup and host wakeup behave identically in that respect.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 20, 2012, 12:48 a.m. UTC | #35
On Thu, Apr 19, 2012 at 8:31 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Thursday, April 19, 2012, huang ying wrote:
>> On Thu, Apr 19, 2012 at 5:00 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> > On Wednesday, April 18, 2012, huang ying wrote:
>> >> On Wed, Apr 18, 2012 at 5:03 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> > On Tuesday, April 17, 2012, huang ying wrote:
>> >> >> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> >> > On Monday, April 16, 2012, huang ying wrote:
>> >> >> >> Hi,
>> >> >> >>
>> >> >> >> On Sat, Apr 14, 2012 at 3:41 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> >> >> > Hi,
>> >> >> >> >
>> >> >> >> > On Friday, April 13, 2012, Yan, Zheng wrote:
>> >> >> >> >> Hi all,
>> >> >> >> >>
>> >> >> >> >> This patch adds PCIe runtime D3cold support, namely cut power supply for functions
>> >> >> >> >> beneath a PCIe port when they all have entered D3. A device in D3cold can only
>> >> >> >> >> generate wake event through the WAKE# pin. Because we can not access to a device's
>> >> >> >> >> configure space while it's in D3cold, pme_poll is disabled for devices in D3cold.
>> >> >> >> >>
>> >> >> >> >> Any comment will be appreciated.
>> >> >> >> >>
>> >> >> >> >> Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
>> >> >> >> >> ---
>> >> >> >> >> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
>> >> >> >> >> index 0f150f2..e210e8cb 100644
>> >> >> >> >> --- a/drivers/pci/pci-acpi.c
>> >> >> >> >> +++ b/drivers/pci/pci-acpi.c
>> >> >> >> >> @@ -224,7 +224,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>> >> >> >> >>               [PCI_D1] = ACPI_STATE_D1,
>> >> >> >> >>               [PCI_D2] = ACPI_STATE_D2,
>> >> >> >> >>               [PCI_D3hot] = ACPI_STATE_D3,
>> >> >> >> >> -             [PCI_D3cold] = ACPI_STATE_D3
>> >> >> >> >> +             [PCI_D3cold] = ACPI_STATE_D3_COLD
>> >> >> >> >>       };
>> >> >> >> >>       int error = -EINVAL;
>> >> >> >> >>
>> >> >> >> >
>> >> >> >> > Please don't use that ACPI_STATE_D3_COLD thing, it's not defined correctly.
>> >> >> >> >
>> >> >> >> > We should define ACPI_STATE_D3_COLD == ACPI_STATE_D3 and add ACPI_STATE_D3_HOT
>> >> >> >> > instead.  I'll prepare a patch for that over the weekend if no one has done
>> >> >> >> > that already.
>> >> >> >> >
>> >> >> >> >> @@ -296,7 +296,8 @@ static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
>> >> >> >> >>
>> >> >> >> >>  static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
>> >> >> >> >>  {
>> >> >> >> >> -     if (dev->pme_interrupt)
>> >> >> >> >> +     /* PME interrupt isn't available in the D3cold case */
>> >> >> >> >> +     if (dev->pme_interrupt && !dev->runtime_d3cold)
>> >> >> >> >
>> >> >> >> > This whole thing is wrong.  First off, I don't think that the runtime_d3cold
>> >> >> >> > flag makes any sense.  We already cover that in dev->pme_support.
>> >> >> >>
>> >> >> >> PCIe devices and PCIe root port may have proper PME interrupt support
>> >> >> >> (that is, dev->pme_interrupt = true), but the process of remote wakeup
>> >> >> >> from D3cold is as follow:
>> >> >> >>
>> >> >> >> 1) In D3cold, the power of the main link is turned off, aux power is
>> >> >> >> provided (PCIe L2 link state)
>> >> >> >> 2) Device detect condition to resume, then assert #WAKE pin
>> >> >> >> 2) ACPI circuit linked with #WAKE pin, and will generate a GPE for that
>> >> >> >> 3) GPE handler will resume device with ACPI (via _PS3 and _PR0), the
>> >> >> >> power of the main link is turned on, after a while, link goes into L0
>> >> >> >> state
>> >> >> >> 4) The PME message is sent to root port, pme interrupt generated
>> >> >> >
>> >> >> > This isn't how it's supposed to work in theory.  If the device can signal PME
>> >> >> > from D3cold, it should be able to reestablish the link and send a PME
>> >> >> > message from there.  dev->pme_interrupt set means exactly that.
>> >> >> >
>> >> >> > ACPI is only supposed to be needed for things that don't send PME
>> >> >> > messages (in your case the PME interrupt generated by the port is essentially
>> >> >> > useless, because the wakeup event has already been signaled through ACPI).
>> >> >> >
>> >> >> >> So, for deivce, dev->pme_interrupt = true and dev->pme_support
>> >> >> >> advocate it support PME in D3cold.  But we still need ACPI to setup
>> >> >> >> run wake for the device.
>> >> >> >
>> >> >> > OK, so this is nonstandard.
>> >> >>
>> >> >> This is the standard behavior.  Please refer to PCI Express Base
>> >> >> Sepcification Revision 2.0, section 5.3.3.2 Link Wakeup.  In D1, D2
>> >> >> and D3hot state, PCIe device can transit the link from L1 to L0 state,
>> >> >> and send the PME message.  In D3cold, the main link is powered off,
>> >> >> PCIe device will use a STANDARD sideband signal WAKE# to signal wakeup
>> >> >> firstly, then platform (power controller in spec) will power on the
>> >> >> main link for the device, after main link is back to L0, the PME
>> >> >> message is send to root port, pme interrupt is generated.  So in
>> >> >> theory, the wake up process can be divided into platform part (which
>> >> >> power on the main link) and PCIe part (which send PME).
>> >> >
>> >> > That's fine.  However, the platform part should be completely transparent
>> >> > to the PCI bus type, then.  It just should power up the link to allow a
>> >> > PME message to be transmitted through it.
>> >>
>> >> Yes.
>> >>
>> >> >[...]
>> >> >
>> >> >> > So don't use pci_set_power_state() for that, because it's essentially
>> >> >> > a different operation.  You need a pci_platform_remove_power() helper or
>> >> >> > similar for that.
>> >> >> >
>> >> >> > What ACPI method exactly is used to remove power from the device?
>> >> >>
>> >> >> The ACPI method executed is as follow:
>> >> >>
>> >> >> - _PS3 (if exist)
>> >> >> - Power resources in _PR3 is turned on
>> >> >> - Power resources in _PR0 is turned off
>> >> >> - Power resources in _PR3 is turned off
>> >> >
>> >> > I'd rather think
>> >> >
>> >> > - make sure that _PR3 resources are referenced
>> >> > - drop references (from this device) for all other power resources
>> >> > - execute _PS3 (if any)
>> >> > - drop references for _PR3 resources
>> >> >
>> >> > if Section 7.2.11 of ACPI 5.0 is to be followed.
>> >>
>> >> Yes.  You are right.
>> >>
>> >> >> I think the process can fit pci_set_power_state() quite well, so why
>> >> >> invent another helper for that?
>> >> >
>> >> > OK, we can special case it, perhaps.
>> >> >
>> >> > Suppose we have a "this device may be put into D3_cold" flag.
>> >> >
>> >> > Who's going to decide whether to put it into D3_hot or D3_cold?
>> >>
>> >> In most cases, I think it is OK to put device into D3_cold if that is
>> >> supported.
>> >
>> > Well, there may be PM QoS latency requirements preventing us from doing so.
>>
>> Yes.
>>
>> >> But there should be some special case where D3_cold is not
>> >> desirable, for example, we can put SSD into D3_cold safely, but it is
>> >> not quite safe to put HDD into D3_cold.  So we want to introduce a
>> >> flag: "may_power_off" like in the following patch
>> >>
>> >> https://lkml.org/lkml/2012/3/29/41
>> >>
>> >> It gives device driver a chance to prevent the device to be put into D3_cold.
>> >
>> > I see.  So your proposal is that the flag might be used to indicate to
>> > whoever carries out power transitions of devices that power must not be
>> > removed from this particular device, right?
>>
>> Yes.
>>
>> > In that case we can put that flag into struct dev_pm_info after all, but
>> > perhaps the name should indicate more precisely what it is about.  Something
>> > like "power_must_be_on" maybe?
>>
>> I am not good at naming in English :)
>> I will accept your proposal.
>>
>> >> > [...]
>> >> >
>> >> >> >> > So now please tell me what exactly you want to achieve and why you want to do
>> >> >> >> > that in the first place.
>> >> >> >
>> >> >> > Well, is there any chance to get that information?
>> >> >>
>> >> >> You mean the runtime_d3cold flag? That flag is used to tell
>> >> >> acpi_pci_run_wake() that we need ACPI wakeup setup for the device
>> >> >> because that is needed by D3cold.  The ACPI wakeup setup here means
>> >> >> turn on power resources needed by wake up (_PRW) and execute _DSW.
>> >> >>
>> >> >> If you mean the whole patch, we want to implement runtime D3cold
>> >> >> support, which can save more power than D3hot.
>> >> >
>> >> > So, do I think correctly that you'd like to put devices into D3_cold
>> >> > if that's possible via ACPI and to be able to wake it up from that state
>> >> > using remote wakeup?
>> >>
>> >> Yes.  Support both remote wakeup and host wakeup.
>> >
>> > OK, so we need to clean up the ACPI D3_HOT/D3_COLD mess first.  Then, we need
>> > to change PCI so that devices are not put into D3cold (by ACPI) if only D3hot
>> > is requested by the caller of pci_set_power_state().
>> >
>> > Having done that, we can modify pci_set_power_state() to handle D3cold as
>> > a special case (essentially, it should check that case before doing anything
>> > else).  Finally, we need to teach the ACPI notify handler about the WAKE#
>> > event and we need to add the 100 ms wait to the device resume code path
>> > somewhere (I guess in pci_set_power_state() for the D3cold->D0 transition).
>>
>> Yes.  Sound good to me.
>>
>> > Now, there's one more thing to consider.  Namely, if a PCIe endpoint is put
>> > into D3hot (via native PM) and then the port it is connected to is put into
>> > D3_hot (via native PM), does that transfer the endpoint into D3cold?
>>
>> No.
>>
>> But if a PCIe endpoint is put into D3hot and then the port it is
>> connected is put into D3_cold (via ACPI), this will transfer the
>> endpoint into D3_cold, and if the port is put into D0 afterwards, all
>> subordinate endpoint devices will be put into D0 (because of power on
>> reset).  I think what we need to do here is:
>>
>> - when choose power state, if any subordinate device has
>> power_must_be_on set, will not choose D3_cold
>
> Well, that's quite obvious. :-)
>
>> - when put PCIe port from D3_cold to D0, resume all subordinate
>> devices too.
>
> Alternatively, we can just call pci_restore_state() on them and put
> them into PCI_D3hot again without actually resuming.

That is better.  But as the first step, I prefer the simpler way to
just resume the device.  In this way, the synchronization between
remote resume, host resume and system suspend can be easier.

>> We design a method to do that in following patch:
>>
>> https://lkml.org/lkml/2012/3/29/38
>>
>> Where we will register all subordinate devices via
>> acpi_power_resource_register_device(endpoint_device,
>> bridget_acpi_handle).
>
> I think that's overkill in the case of PCI devices under a PCIe port.
> We only need to walk the bus below the port and do something for each device
> in there then.

Yes.  That is something like power domain, if the power resources are
shared by multiple devices.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
huang ying April 20, 2012, 12:53 a.m. UTC | #36
On Thu, Apr 19, 2012 at 8:36 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Thursday, April 19, 2012, huang ying wrote:
>> On Thu, Apr 19, 2012 at 4:51 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> > On Wednesday, April 18, 2012, huang ying wrote:
>> >> On Wed, Apr 18, 2012 at 5:10 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> > On Tuesday, April 17, 2012, huang ying wrote:
>> >> >> On Tue, Apr 17, 2012 at 5:30 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> >> >> >> >> +     return 0;
>> >> >> >> >> +}
>> >> >> >> >> +
>> >> >> >> >> +static int pcie_port_runtime_resume(struct device *dev)
>> >> >> >> >> +{
>> >> >> >> >> +     struct pci_dev *pdev = to_pci_dev(dev);
>> >> >> >> >> +
>> >> >> >> >> +     pci_restore_state(pdev);
>> >> >> >> >> +     if (pdev->runtime_d3cold)
>> >> >> >> >> +             msleep(100);
>> >> >> >> >
>> >> >> >> > What's _that_ supposed to do?
>> >> >> >>
>> >> >> >> When resume from d3cold, PCIe main link will be powered on again, it
>> >> >> >> will take quite some time before the main link go into L0 state.
>> >> >> >> Otherwise, accessing devices under the port may return wrong result.
>> >> >> >
>> >> >> > OK, but this is generic code and as per the standard the link should have been
>> >> >> > reestablished at this point already.
>> >> >> >
>> >> >> > Please don't put some nonstandard-platform-specific quirks like this into
>> >> >> > code that's supposed to handle _every_ PCIe system.
>> >> >>
>> >> >> After checking PCIe spec, I found that the 100ms here has its standard origin :)
>> >> >>
>> >> >> In PCI Express Base Specification Revision 2.0:
>> >> >>
>> >> >> Section 6.6.1 Conventional Reset
>> >> >>
>> >> >> "
>> >> >> To allow components to perform internal initialization, system
>> >> >> software must wait for at least
>> >> >> 100 ms from the end of a Conventional Reset of one or more devices
>> >> >> before it is permitted to
>> >> >> issue Configuration Requests to those devices
>> >> >> "
>> >> >>
>> >> >> But I think we should move the 100ms delay here to PCIe bus code or
>> >> >> PCIe/ACPI code, because that is needed by all PCIe devices for D3cold
>> >> >> support.
>> >> >
>> >> > I think it should be sufficient to wait for the PME message to arrive at
>> >> > the root port (which will cause the PME interrupt to appear), at which
>> >> > point the device that sent it should be able to receive configuration
>> >> > requests.
>> >>
>> >> For remote wake up, it is sufficient.  But for host wake up, we still
>> >> need to wait 100ms.
>> >
>> > Yes, we do.
>> >
>> >> > At this point, I need to konw what exactly happens when the GPE is triggered
>> >> > by WAKE#.
>> >>
>> >> - Lxx handler will be executed
>> >> - in Lxx handler, Notify the ACPI handle PCIe port
>> >> - Linux has registered a handler for the ACPI handle of PCIe port, in
>> >> the handler, turn on _PR0 and execute _PS0, which will power on the
>> >> link.
>> >
>> > But the handler we have is not the handler we want here.
>> >
>> > In fact, there are two handlers, pci_acpi_wake_bus() and pci_acpi_wake_dev()
>> > and they only do useful things for ACPI_NOTIFY_DEVICE_WAKE.  Is that the
>> > event type we receive from that _Lxx?
>>
>> I check the DSDT, in _Lxx, there is
>>
>> Notify (\_SB.PCI0.RP03, 0x02)
>>
>> That is, the event type is ACPI_NOTIFY_DEVICE_WAKE.
>>
>> > Even if so, these routines don't seem to be suitable to handle the case at hand.
>>
>> Yes.  Maybe add a flag named like "come_from_d3cold", and if
>> come_from_d3cold == true, resume the dev itself without checking pme
>> bits, because the PCIe main link is not available now.
>
> Actually, I think we can do the full resume (however with the 100 ms wait)
> in that case, because we're going to resume the device shortly anyway.
>
> The PME would only be useful as a kind of handshake with the device, so we
> know we can access its registers, but as you pointed that out, we need the
> 100 ms delay in the host wakeup case anyway, so perhaps it's better to make
> remote wakeup and host wakeup behave identically in that respect.

Yes.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 0f150f2..e210e8cb 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -224,7 +224,7 @@  static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 		[PCI_D1] = ACPI_STATE_D1,
 		[PCI_D2] = ACPI_STATE_D2,
 		[PCI_D3hot] = ACPI_STATE_D3,
-		[PCI_D3cold] = ACPI_STATE_D3
+		[PCI_D3cold] = ACPI_STATE_D3_COLD
 	};
 	int error = -EINVAL;
 
@@ -296,7 +296,8 @@  static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable)
 
 static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
 {
-	if (dev->pme_interrupt)
+	/* PME interrupt isn't available in the D3cold case */
+	if (dev->pme_interrupt && !dev->runtime_d3cold)
 		return 0;
 
 	if (!acpi_pm_device_run_wake(&dev->dev, enable))
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8156744..bc16869 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -731,8 +731,8 @@  int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 	int error;
 
 	/* bound the state we're entering */
-	if (state > PCI_D3hot)
-		state = PCI_D3hot;
+	if (state > PCI_D3cold)
+		state = PCI_D3cold;
 	else if (state < PCI_D0)
 		state = PCI_D0;
 	else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
@@ -750,7 +750,8 @@  int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 	if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
 		return 0;
 
-	error = pci_raw_set_power_state(dev, state);
+	error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
+					PCI_D3hot : state);
 
 	if (!__pci_complete_power_transition(dev, state))
 		error = 0;
@@ -1482,6 +1483,17 @@  bool pci_pme_capable(struct pci_dev *dev, pci_power_t state)
 	return !!(dev->pme_support & (1 << state));
 }
 
+static void pci_pme_poll_wakeup(struct pci_dev *dev)
+{
+	struct pci_dev *bridge = dev->bus->self;
+
+	/* don't poll the pme bit if parent is in low power state */
+	if (bridge && bridge->current_state != PCI_D0)
+		return;
+
+	pci_pme_wakeup(dev, NULL);
+}
+
 static void pci_pme_list_scan(struct work_struct *work)
 {
 	struct pci_pme_device *pme_dev, *n;
@@ -1490,7 +1502,7 @@  static void pci_pme_list_scan(struct work_struct *work)
 	if (!list_empty(&pci_pme_list)) {
 		list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) {
 			if (pme_dev->dev->pme_poll) {
-				pci_pme_wakeup(pme_dev->dev, NULL);
+				pci_pme_poll_wakeup(pme_dev->dev);
 			} else {
 				list_del(&pme_dev->list);
 				kfree(pme_dev);
@@ -1608,6 +1620,10 @@  int __pci_enable_wake(struct pci_dev *dev, pci_power_t state,
 	if (enable) {
 		int error;
 
+		if (runtime && state >= PCI_D3cold)
+			dev->runtime_d3cold = true;
+		else
+			dev->runtime_d3cold = false;
 		if (pci_pme_capable(dev, state))
 			pci_pme_active(dev, true);
 		else
diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index e0610bd..d66b7e9 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -11,11 +11,13 @@ 
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/pm.h>
+#include <linux/pm_runtime.h>
 #include <linux/init.h>
 #include <linux/pcieport_if.h>
 #include <linux/aer.h>
 #include <linux/dmi.h>
 #include <linux/pci-aspm.h>
+#include <linux/delay.h>
 
 #include "portdrv.h"
 #include "aer/aerdrv.h"
@@ -99,6 +101,25 @@  static int pcie_port_resume_noirq(struct device *dev)
 	return 0;
 }
 
+static int pcie_port_runtime_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	pci_save_state(pdev);
+	return 0;
+}
+
+static int pcie_port_runtime_resume(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	pci_restore_state(pdev);
+	if (pdev->runtime_d3cold)
+		msleep(100);
+
+	return 0;
+}
+
 static const struct dev_pm_ops pcie_portdrv_pm_ops = {
 	.suspend	= pcie_port_device_suspend,
 	.resume		= pcie_port_device_resume,
@@ -107,6 +128,8 @@  static const struct dev_pm_ops pcie_portdrv_pm_ops = {
 	.poweroff	= pcie_port_device_suspend,
 	.restore	= pcie_port_device_resume,
 	.resume_noirq	= pcie_port_resume_noirq,
+	.runtime_suspend = pcie_port_runtime_suspend,
+	.runtime_resume = pcie_port_runtime_resume,
 };
 
 #define PCIE_PORTDRV_PM_OPS	(&pcie_portdrv_pm_ops)
@@ -144,12 +167,14 @@  static int __devinit pcie_portdrv_probe(struct pci_dev *dev,
 		return status;
 
 	pci_save_state(dev);
+	pm_runtime_put_noidle(&dev->dev);
 
 	return 0;
 }
 
 static void pcie_portdrv_remove(struct pci_dev *dev)
 {
+	pm_runtime_get_noresume(&dev->dev);
 	pcie_port_device_remove(dev);
 	pci_disable_device(dev);
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e444f5b..b41d9a1 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -281,6 +281,7 @@  struct pci_dev {
 	unsigned int	no_d1d2:1;	/* Only allow D0 and D3 */
 	unsigned int	mmio_always_on:1;	/* disallow turning off io/mem
 						   decoding during bar sizing */
+	unsigned int	runtime_d3cold:1;
 	unsigned int	wakeup_prepared:1;
 	unsigned int	d3_delay;	/* D3->D0 transition time in ms */