diff mbox

[V7,06/10] powerpc/eeh: Create PE for VFs

Message ID 1432032612-21701-7-git-send-email-weiyang@linux.vnet.ibm.com (mailing list archive)
State Superseded
Headers show

Commit Message

Wei Yang May 19, 2015, 10:50 a.m. UTC
Current EEH recovery code works with the assumption: the PE has primary
bus. Unfortunately, that's not true to VF PEs, which generally contains
one or multiple VFs (for VF group case). The patch creates PEs for VFs
at PCI final fixup time. Those PEs for VFs are indentified with newly
introduced flag EEH_PE_VF so that we handle them differently during
EEH recovery.

[gwshan: changelog and code refactoring]
Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h               |    1 +
 arch/powerpc/kernel/eeh_pe.c                 |   10 ++++++++--
 arch/powerpc/platforms/powernv/eeh-powernv.c |   17 +++++++++++++++++
 3 files changed, 26 insertions(+), 2 deletions(-)

Comments

Bjorn Helgaas June 1, 2015, 11:46 p.m. UTC | #1
On Tue, May 19, 2015 at 06:50:08PM +0800, Wei Yang wrote:
> Current EEH recovery code works with the assumption: the PE has primary
> bus. Unfortunately, that's not true to VF PEs, which generally contains
> one or multiple VFs (for VF group case). The patch creates PEs for VFs
> at PCI final fixup time. Those PEs for VFs are indentified with newly
> introduced flag EEH_PE_VF so that we handle them differently during
> EEH recovery.
> 
> [gwshan: changelog and code refactoring]
> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
> Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/eeh.h               |    1 +
>  arch/powerpc/kernel/eeh_pe.c                 |   10 ++++++++--
>  arch/powerpc/platforms/powernv/eeh-powernv.c |   17 +++++++++++++++++
>  3 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 1b3614d..c1fde48 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -70,6 +70,7 @@ struct pci_dn;
>  #define EEH_PE_PHB	(1 << 1)	/* PHB PE    */
>  #define EEH_PE_DEVICE 	(1 << 2)	/* Device PE */
>  #define EEH_PE_BUS	(1 << 3)	/* Bus PE    */
> +#define EEH_PE_VF	(1 << 4)	/* VF PE     */
>  
>  #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
>  #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
> index 35f0b62..260a701 100644
> --- a/arch/powerpc/kernel/eeh_pe.c
> +++ b/arch/powerpc/kernel/eeh_pe.c
> @@ -299,7 +299,10 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
>  	 * EEH device already having associated PE, but
>  	 * the direct parent EEH device doesn't have yet.
>  	 */
> -	pdn = pdn ? pdn->parent : NULL;
> +	if (edev->physfn)
> +		pdn = pci_get_pdn(edev->physfn);
> +	else
> +		pdn = pdn ? pdn->parent : NULL;
>  	while (pdn) {
>  		/* We're poking out of PCI territory */
>  		parent = pdn_to_eeh_dev(pdn);
> @@ -382,7 +385,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
>  	}
>  
>  	/* Create a new EEH PE */
> -	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
> +	if (edev->physfn)
> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
> +	else
> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>  	if (!pe) {
>  		pr_err("%s: out of memory!\n", __func__);
>  		return -ENOMEM;
> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
> index ce738ab..c505036 100644
> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
> @@ -1520,6 +1520,23 @@ static struct eeh_ops pnv_eeh_ops = {
>  	.restore_config		= pnv_eeh_restore_config
>  };
>  
> +static void pnv_eeh_vf_final_fixup(struct pci_dev *pdev)
> +{
> +	struct pci_dn *pdn = pci_get_pdn(pdev);
> +
> +	if (!pdev->is_virtfn)
> +		return;
> +
> +	/*
> +	 * The following operations will fail if VF's sysfs files
> +	 * aren't created or its resources aren't finalized.
> +	 */

I don't understand this comment.  "The following operations" seems to refer
to eeh_add_device_early() and eeh_add_device_late(), and
"VF's sysfs files being created" seems to refer to eeh_sysfs_add_device().

So the comment suggests that eeh_add_device_early() and
eeh_add_device_late() will fail because they're called before
eeh_sysfs_add_device().  So I think you must be talking about some other
"following operations," not eeh_add_device_early() and
eeh_add_device_late().

> +	eeh_add_device_early(pdn);
> +	eeh_add_device_late(pdev);
> +	eeh_sysfs_add_device(pdev);
> +}
> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);

Ugh.  This is powerpc code, but I don't like using fixups as a hook like
this.  There is a pcibios_add_device() -- could this be done there?

If not, what happens after pcibios_add_device() that is required for this
code?  Maybe we need a pcibios_bus_add_device() hook?

> +
>  /**
>   * eeh_powernv_init - Register platform dependent EEH operations
>   *
> -- 
> 1.7.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas June 1, 2015, 11:49 p.m. UTC | #2
On Tue, May 19, 2015 at 06:50:08PM +0800, Wei Yang wrote:
> Current EEH recovery code works with the assumption: the PE has primary
> bus. Unfortunately, that's not true to VF PEs, which generally contains

"Primary bus" normally means the bus on the upstream side of a PCI bridge.
But a PE is not a bridge, so I don't know what it means here.

s/not true to VF PEs/not true for VF PEs/

> one or multiple VFs (for VF group case). The patch creates PEs for VFs
> at PCI final fixup time. Those PEs for VFs are indentified with newly

s/indentified/identified/

> introduced flag EEH_PE_VF so that we handle them differently during
> EEH recovery.
Wei Yang June 3, 2015, 3:31 a.m. UTC | #3
On Mon, Jun 01, 2015 at 06:46:45PM -0500, Bjorn Helgaas wrote:
>On Tue, May 19, 2015 at 06:50:08PM +0800, Wei Yang wrote:
>> Current EEH recovery code works with the assumption: the PE has primary
>> bus. Unfortunately, that's not true to VF PEs, which generally contains
>> one or multiple VFs (for VF group case). The patch creates PEs for VFs
>> at PCI final fixup time. Those PEs for VFs are indentified with newly
>> introduced flag EEH_PE_VF so that we handle them differently during
>> EEH recovery.
>> 
>> [gwshan: changelog and code refactoring]
>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>> Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/include/asm/eeh.h               |    1 +
>>  arch/powerpc/kernel/eeh_pe.c                 |   10 ++++++++--
>>  arch/powerpc/platforms/powernv/eeh-powernv.c |   17 +++++++++++++++++
>>  3 files changed, 26 insertions(+), 2 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>> index 1b3614d..c1fde48 100644
>> --- a/arch/powerpc/include/asm/eeh.h
>> +++ b/arch/powerpc/include/asm/eeh.h
>> @@ -70,6 +70,7 @@ struct pci_dn;
>>  #define EEH_PE_PHB	(1 << 1)	/* PHB PE    */
>>  #define EEH_PE_DEVICE 	(1 << 2)	/* Device PE */
>>  #define EEH_PE_BUS	(1 << 3)	/* Bus PE    */
>> +#define EEH_PE_VF	(1 << 4)	/* VF PE     */
>>  
>>  #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
>>  #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
>> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
>> index 35f0b62..260a701 100644
>> --- a/arch/powerpc/kernel/eeh_pe.c
>> +++ b/arch/powerpc/kernel/eeh_pe.c
>> @@ -299,7 +299,10 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
>>  	 * EEH device already having associated PE, but
>>  	 * the direct parent EEH device doesn't have yet.
>>  	 */
>> -	pdn = pdn ? pdn->parent : NULL;
>> +	if (edev->physfn)
>> +		pdn = pci_get_pdn(edev->physfn);
>> +	else
>> +		pdn = pdn ? pdn->parent : NULL;
>>  	while (pdn) {
>>  		/* We're poking out of PCI territory */
>>  		parent = pdn_to_eeh_dev(pdn);
>> @@ -382,7 +385,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
>>  	}
>>  
>>  	/* Create a new EEH PE */
>> -	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>> +	if (edev->physfn)
>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
>> +	else
>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>>  	if (!pe) {
>>  		pr_err("%s: out of memory!\n", __func__);
>>  		return -ENOMEM;
>> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
>> index ce738ab..c505036 100644
>> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
>> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
>> @@ -1520,6 +1520,23 @@ static struct eeh_ops pnv_eeh_ops = {
>>  	.restore_config		= pnv_eeh_restore_config
>>  };
>>  
>> +static void pnv_eeh_vf_final_fixup(struct pci_dev *pdev)
>> +{
>> +	struct pci_dn *pdn = pci_get_pdn(pdev);
>> +
>> +	if (!pdev->is_virtfn)
>> +		return;
>> +
>> +	/*
>> +	 * The following operations will fail if VF's sysfs files
>> +	 * aren't created or its resources aren't finalized.
>> +	 */
>
>I don't understand this comment.  "The following operations" seems to refer
>to eeh_add_device_early() and eeh_add_device_late(), and
>"VF's sysfs files being created" seems to refer to eeh_sysfs_add_device().
>
>So the comment suggests that eeh_add_device_early() and
>eeh_add_device_late() will fail because they're called before
>eeh_sysfs_add_device().  So I think you must be talking about some other
>"following operations," not eeh_add_device_early() and
>eeh_add_device_late().

Sorry for this confusion.

The comment is meant to say that eeh_sysfs_add_device() will fail if the VF's
sysfs entries have not been created yet, and that eeh_add_device_late() will
fail if the VF's resources are not set properly, since it caches the VF's BARs.

Gavin,

If my understanding is not correct please let me know.

>
>> +	eeh_add_device_early(pdn);
>> +	eeh_add_device_late(pdev);
>> +	eeh_sysfs_add_device(pdev);
>> +}
>> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);
>
>Ugh.  This is powerpc code, but I don't like using fixups as a hook like
>this.  There is a pcibios_add_device() -- could this be done there?
>

I don't like it either :-) But it looks like we can't put it in
pcibios_add_device().

>If not, what happens after pcibios_add_device() that is required for this
>code?  Maybe we need a pcibios_bus_add_device() hook?

pnv_eeh_vf_final_fixup() will try to create sysfs entries for VFs. This requires
that the VF's sysfs kobject (kobj) is initialized properly. If we put this code
into pcibios_add_device(), eeh_sysfs_add_device() would fail.

Below is the call flow for your reference:

pci_device_add()
    pcibios_add_device()
    device_add()                <--- kobj initialized here

>
>> +
>>  /**
>>   * eeh_powernv_init - Register platform dependent EEH operations
>>   *
>> -- 
>> 1.7.9.5
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wei Yang June 3, 2015, 3:39 a.m. UTC | #4
On Mon, Jun 01, 2015 at 06:49:58PM -0500, Bjorn Helgaas wrote:
>On Tue, May 19, 2015 at 06:50:08PM +0800, Wei Yang wrote:
>> Current EEH recovery code works with the assumption: the PE has primary
>> bus. Unfortunately, that's not true to VF PEs, which generally contains
>
>"Primary bus" normally means the bus on the upstream side of a PCI bridge.
>But a PE is not a bridge, so I don't know what it means here.
>

Before VF PEs were introduced, a PE was a "Bus PE", which contains a PCI bus.
Yes, "primary bus" may be a little confusing, since it may refer to the upstream
side of a PCI bridge. I think the changelog here tries to emphasize that a Bus PE
contains a whole PCI bus.

>s/not true to VF PEs/not true for VF PEs/

Thanks, changed

>
>> one or multiple VFs (for VF group case). The patch creates PEs for VFs
>> at PCI final fixup time. Those PEs for VFs are indentified with newly
>
>s/indentified/identified/

Thanks, changed

>
>> introduced flag EEH_PE_VF so that we handle them differently during
>> EEH recovery.
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@lists.ozlabs.org
>https://lists.ozlabs.org/listinfo/linuxppc-dev
Gavin Shan June 3, 2015, 5:10 a.m. UTC | #5
On Wed, Jun 03, 2015 at 11:31:42AM +0800, Wei Yang wrote:
>On Mon, Jun 01, 2015 at 06:46:45PM -0500, Bjorn Helgaas wrote:
>>On Tue, May 19, 2015 at 06:50:08PM +0800, Wei Yang wrote:
>>> Current EEH recovery code works with the assumption: the PE has primary
>>> bus. Unfortunately, that's not true to VF PEs, which generally contains
>>> one or multiple VFs (for VF group case). The patch creates PEs for VFs
>>> at PCI final fixup time. Those PEs for VFs are indentified with newly
>>> introduced flag EEH_PE_VF so that we handle them differently during
>>> EEH recovery.
>>> 
>>> [gwshan: changelog and code refactoring]
>>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>> Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>> ---
>>>  arch/powerpc/include/asm/eeh.h               |    1 +
>>>  arch/powerpc/kernel/eeh_pe.c                 |   10 ++++++++--
>>>  arch/powerpc/platforms/powernv/eeh-powernv.c |   17 +++++++++++++++++
>>>  3 files changed, 26 insertions(+), 2 deletions(-)
>>> 
>>> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>>> index 1b3614d..c1fde48 100644
>>> --- a/arch/powerpc/include/asm/eeh.h
>>> +++ b/arch/powerpc/include/asm/eeh.h
>>> @@ -70,6 +70,7 @@ struct pci_dn;
>>>  #define EEH_PE_PHB	(1 << 1)	/* PHB PE    */
>>>  #define EEH_PE_DEVICE 	(1 << 2)	/* Device PE */
>>>  #define EEH_PE_BUS	(1 << 3)	/* Bus PE    */
>>> +#define EEH_PE_VF	(1 << 4)	/* VF PE     */
>>>  
>>>  #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
>>>  #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
>>> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
>>> index 35f0b62..260a701 100644
>>> --- a/arch/powerpc/kernel/eeh_pe.c
>>> +++ b/arch/powerpc/kernel/eeh_pe.c
>>> @@ -299,7 +299,10 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
>>>  	 * EEH device already having associated PE, but
>>>  	 * the direct parent EEH device doesn't have yet.
>>>  	 */
>>> -	pdn = pdn ? pdn->parent : NULL;
>>> +	if (edev->physfn)
>>> +		pdn = pci_get_pdn(edev->physfn);
>>> +	else
>>> +		pdn = pdn ? pdn->parent : NULL;
>>>  	while (pdn) {
>>>  		/* We're poking out of PCI territory */
>>>  		parent = pdn_to_eeh_dev(pdn);
>>> @@ -382,7 +385,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
>>>  	}
>>>  
>>>  	/* Create a new EEH PE */
>>> -	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>>> +	if (edev->physfn)
>>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
>>> +	else
>>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>>>  	if (!pe) {
>>>  		pr_err("%s: out of memory!\n", __func__);
>>>  		return -ENOMEM;
>>> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
>>> index ce738ab..c505036 100644
>>> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
>>> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
>>> @@ -1520,6 +1520,23 @@ static struct eeh_ops pnv_eeh_ops = {
>>>  	.restore_config		= pnv_eeh_restore_config
>>>  };
>>>  
>>> +static void pnv_eeh_vf_final_fixup(struct pci_dev *pdev)
>>> +{
>>> +	struct pci_dn *pdn = pci_get_pdn(pdev);
>>> +
>>> +	if (!pdev->is_virtfn)
>>> +		return;
>>> +
>>> +	/*
>>> +	 * The following operations will fail if VF's sysfs files
>>> +	 * aren't created or its resources aren't finalized.
>>> +	 */
>>
>>I don't understand this comment.  "The following operations" seems to refer
>>to eeh_add_device_early() and eeh_add_device_late(), and
>>"VF's sysfs files being created" seems to refer to eeh_sysfs_add_device().
>>
>>So the comment suggests that eeh_add_device_early() and
>>eeh_add_device_late() will fail because they're called before
>>eeh_sysfs_add_device().  So I think you must be talking about some other
>>"following operations," not eeh_add_device_early() and
>>eeh_add_device_late().
>
>Sorry for this confusion.
>
>The comment here wants to say the eeh_sysfs_add_device() will fail if the VF's
>sysfs is not created well. Or it will fail if the VF's resources are not set
>properly, since we would cache the VF's BAR in eeh_add_device_late().
>
>Gavin,
>
>If my understanding is not correct please let me know.
>

That's correct. "The following operations" refers to eeh_add_device_late()
and eeh_sysfs_add_device(). The former requires that the resources for
the particular PCI device (the VF here) are finalized (assigned).
eeh_sysfs_add_device() will fail if the sysfs entry for the PCI device isn't
populated yet.

>>
>>> +	eeh_add_device_early(pdn);
>>> +	eeh_add_device_late(pdev);
>>> +	eeh_sysfs_add_device(pdev);
>>> +}
>>> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);
>>
>>Ugh.  This is powerpc code, but I don't like using fixups as a hook like
>>this.  There is a pcibios_add_device() -- could this be done there?
>>
>
>I don't like it neither :-) But looks we can't put it in the
>pcibios_add_device().
>
>>If not, what happens after pcibios_add_device() that is required for this
>>code?  Maybe we need a pcibios_bus_add_device() hook?
>
>The pnv_eeh_vf_final_fixup() will try to create sysfs for VFs. This requires
>the VF sysfs(kobj) is initialized properly. If we put these into
>pcibios_add_device(), the eeh_sysfs_add_device() would fail.
>
>Below is the call flow for your reference:
>
>pci_device_add()
>    pcibios_add_device()
>    device_add()                <--- kobj initialized here
>

We can put it into pcibios_bus_add_device(), but we don't have that hook
currently. If Bjorn agrees to add pcibios_bus_add_device(), I'm fine with moving
the code block there.

Thanks,
Gavin

>>
>>> +
>>>  /**
>>>   * eeh_powernv_init - Register platform dependent EEH operations
>>>   *
>>> -- 
>>> 1.7.9.5
>>> 
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>-- 
>Richard Yang
>Help you, Help me
Bjorn Helgaas June 3, 2015, 3:46 p.m. UTC | #6
On Wed, Jun 03, 2015 at 03:10:23PM +1000, Gavin Shan wrote:
> On Wed, Jun 03, 2015 at 11:31:42AM +0800, Wei Yang wrote:
> >On Mon, Jun 01, 2015 at 06:46:45PM -0500, Bjorn Helgaas wrote:
> >>On Tue, May 19, 2015 at 06:50:08PM +0800, Wei Yang wrote:
> >>> Current EEH recovery code works with the assumption: the PE has primary
> >>> bus. Unfortunately, that's not true to VF PEs, which generally contains
> >>> one or multiple VFs (for VF group case). The patch creates PEs for VFs
> >>> at PCI final fixup time. Those PEs for VFs are indentified with newly
> >>> introduced flag EEH_PE_VF so that we handle them differently during
> >>> EEH recovery.
> >>> 
> >>> [gwshan: changelog and code refactoring]
> >>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
> >>> Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> >>> ---
> >>>  arch/powerpc/include/asm/eeh.h               |    1 +
> >>>  arch/powerpc/kernel/eeh_pe.c                 |   10 ++++++++--
> >>>  arch/powerpc/platforms/powernv/eeh-powernv.c |   17 +++++++++++++++++
> >>>  3 files changed, 26 insertions(+), 2 deletions(-)
> >>> 
> >>> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> >>> index 1b3614d..c1fde48 100644
> >>> --- a/arch/powerpc/include/asm/eeh.h
> >>> +++ b/arch/powerpc/include/asm/eeh.h
> >>> @@ -70,6 +70,7 @@ struct pci_dn;
> >>>  #define EEH_PE_PHB	(1 << 1)	/* PHB PE    */
> >>>  #define EEH_PE_DEVICE 	(1 << 2)	/* Device PE */
> >>>  #define EEH_PE_BUS	(1 << 3)	/* Bus PE    */
> >>> +#define EEH_PE_VF	(1 << 4)	/* VF PE     */
> >>>  
> >>>  #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
> >>>  #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
> >>> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
> >>> index 35f0b62..260a701 100644
> >>> --- a/arch/powerpc/kernel/eeh_pe.c
> >>> +++ b/arch/powerpc/kernel/eeh_pe.c
> >>> @@ -299,7 +299,10 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
> >>>  	 * EEH device already having associated PE, but
> >>>  	 * the direct parent EEH device doesn't have yet.
> >>>  	 */
> >>> -	pdn = pdn ? pdn->parent : NULL;
> >>> +	if (edev->physfn)
> >>> +		pdn = pci_get_pdn(edev->physfn);
> >>> +	else
> >>> +		pdn = pdn ? pdn->parent : NULL;
> >>>  	while (pdn) {
> >>>  		/* We're poking out of PCI territory */
> >>>  		parent = pdn_to_eeh_dev(pdn);
> >>> @@ -382,7 +385,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
> >>>  	}
> >>>  
> >>>  	/* Create a new EEH PE */
> >>> -	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
> >>> +	if (edev->physfn)
> >>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
> >>> +	else
> >>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
> >>>  	if (!pe) {
> >>>  		pr_err("%s: out of memory!\n", __func__);
> >>>  		return -ENOMEM;
> >>> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
> >>> index ce738ab..c505036 100644
> >>> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
> >>> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
> >>> @@ -1520,6 +1520,23 @@ static struct eeh_ops pnv_eeh_ops = {
> >>>  	.restore_config		= pnv_eeh_restore_config
> >>>  };
> >>>  
> >>> +static void pnv_eeh_vf_final_fixup(struct pci_dev *pdev)
> >>> +{
> >>> +	struct pci_dn *pdn = pci_get_pdn(pdev);
> >>> +
> >>> +	if (!pdev->is_virtfn)
> >>> +		return;
> >>> +
> >>> +	/*
> >>> +	 * The following operations will fail if VF's sysfs files
> >>> +	 * aren't created or its resources aren't finalized.
> >>> +	 */
> >>
> >>I don't understand this comment.  "The following operations" seems to refer
> >>to eeh_add_device_early() and eeh_add_device_late(), and
> >>"VF's sysfs files being created" seems to refer to eeh_sysfs_add_device().
> >>
> >>So the comment suggests that eeh_add_device_early() and
> >>eeh_add_device_late() will fail because they're called before
> >>eeh_sysfs_add_device().  So I think you must be talking about some other
> >>"following operations," not eeh_add_device_early() and
> >>eeh_add_device_late().
> >
> >Sorry for this confusion.
> >
> >The comment here wants to say the eeh_sysfs_add_device() will fail if the VF's
> >sysfs is not created well. Or it will fail if the VF's resources are not set
> >properly, since we would cache the VF's BAR in eeh_add_device_late().
> >
> >Gavin,
> >
> >If my understanding is not correct please let me know.
> >
> 
> It's correct. "The following operations" refers to eeh_add_device_late()
> and eeh_sysfs_add_device(). The former one requires the resources for
> one particular PCI device (VF here) are finalized (assigned). eeh_sysfs_add_device()
> will fail if the sysfs entry for the PCI device isn't populated yet.

eeh_add_device_late() contains several things that read config space:
eeh_save_bars() caches the entire config header, and
eeh_addr_cache_insert_dev() looks at the device resources (which are
determined by BARs in config space).  I think this is an error-prone
approach.  I think it would be simpler and safer for you to capture what
you need in your PCI config accessors.

eeh_add_device_late() also contains code to deal with an EEH cache that
"might not be removed correctly because of unbalanced kref to the device
during unplug time."  That's unrelated to this patch series, but it sounds
... like a hacky workaround for some bug in the unplug path.

> >>> +	eeh_add_device_early(pdn);
> >>> +	eeh_add_device_late(pdev);
> >>> +	eeh_sysfs_add_device(pdev);
> >>> +}
> >>> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);
> >>
> >>Ugh.  This is powerpc code, but I don't like using fixups as a hook like
> >>this.  There is a pcibios_add_device() -- could this be done there?
> >>
> >
> >I don't like it neither :-) But looks we can't put it in the
> >pcibios_add_device().
> >
> >>If not, what happens after pcibios_add_device() that is required for this
> >>code?  Maybe we need a pcibios_bus_add_device() hook?
> >
> >The pnv_eeh_vf_final_fixup() will try to create sysfs for VFs. This requires
> >the VF sysfs(kobj) is initialized properly. If we put these into
> >pcibios_add_device(), the eeh_sysfs_add_device() would fail.
> >
> >Below is the call flow for your reference:
> >
> >pci_device_add()
> >    pcibios_add_device()
> >    device_add()                <--- kobj initialized here
> >
> 
> We can put it into pcibios_bus_add_device(), but we don't it currently. If
> Bjorn agree to add pcibios_bus_add_device(), I'm fine to move the block code
> there.

I think I'm OK with adding a pcibios_bus_add_device().  I think that would
be better than using the fixup mechanism for this.

Bjorn
Gavin Shan June 4, 2015, 1:25 a.m. UTC | #7
On Wed, Jun 03, 2015 at 10:46:38AM -0500, Bjorn Helgaas wrote:
>On Wed, Jun 03, 2015 at 03:10:23PM +1000, Gavin Shan wrote:
>> On Wed, Jun 03, 2015 at 11:31:42AM +0800, Wei Yang wrote:
>> >On Mon, Jun 01, 2015 at 06:46:45PM -0500, Bjorn Helgaas wrote:
>> >>On Tue, May 19, 2015 at 06:50:08PM +0800, Wei Yang wrote:
>> >>> Current EEH recovery code works with the assumption: the PE has primary
>> >>> bus. Unfortunately, that's not true to VF PEs, which generally contains
>> >>> one or multiple VFs (for VF group case). The patch creates PEs for VFs
>> >>> at PCI final fixup time. Those PEs for VFs are indentified with newly
>> >>> introduced flag EEH_PE_VF so that we handle them differently during
>> >>> EEH recovery.
>> >>> 
>> >>> [gwshan: changelog and code refactoring]
>> >>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>> >>> Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> >>> ---
>> >>>  arch/powerpc/include/asm/eeh.h               |    1 +
>> >>>  arch/powerpc/kernel/eeh_pe.c                 |   10 ++++++++--
>> >>>  arch/powerpc/platforms/powernv/eeh-powernv.c |   17 +++++++++++++++++
>> >>>  3 files changed, 26 insertions(+), 2 deletions(-)
>> >>> 
>> >>> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>> >>> index 1b3614d..c1fde48 100644
>> >>> --- a/arch/powerpc/include/asm/eeh.h
>> >>> +++ b/arch/powerpc/include/asm/eeh.h
>> >>> @@ -70,6 +70,7 @@ struct pci_dn;
>> >>>  #define EEH_PE_PHB	(1 << 1)	/* PHB PE    */
>> >>>  #define EEH_PE_DEVICE 	(1 << 2)	/* Device PE */
>> >>>  #define EEH_PE_BUS	(1 << 3)	/* Bus PE    */
>> >>> +#define EEH_PE_VF	(1 << 4)	/* VF PE     */
>> >>>  
>> >>>  #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
>> >>>  #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
>> >>> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
>> >>> index 35f0b62..260a701 100644
>> >>> --- a/arch/powerpc/kernel/eeh_pe.c
>> >>> +++ b/arch/powerpc/kernel/eeh_pe.c
>> >>> @@ -299,7 +299,10 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
>> >>>  	 * EEH device already having associated PE, but
>> >>>  	 * the direct parent EEH device doesn't have yet.
>> >>>  	 */
>> >>> -	pdn = pdn ? pdn->parent : NULL;
>> >>> +	if (edev->physfn)
>> >>> +		pdn = pci_get_pdn(edev->physfn);
>> >>> +	else
>> >>> +		pdn = pdn ? pdn->parent : NULL;
>> >>>  	while (pdn) {
>> >>>  		/* We're poking out of PCI territory */
>> >>>  		parent = pdn_to_eeh_dev(pdn);
>> >>> @@ -382,7 +385,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
>> >>>  	}
>> >>>  
>> >>>  	/* Create a new EEH PE */
>> >>> -	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>> >>> +	if (edev->physfn)
>> >>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
>> >>> +	else
>> >>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>> >>>  	if (!pe) {
>> >>>  		pr_err("%s: out of memory!\n", __func__);
>> >>>  		return -ENOMEM;
>> >>> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
>> >>> index ce738ab..c505036 100644
>> >>> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
>> >>> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
>> >>> @@ -1520,6 +1520,23 @@ static struct eeh_ops pnv_eeh_ops = {
>> >>>  	.restore_config		= pnv_eeh_restore_config
>> >>>  };
>> >>>  
>> >>> +static void pnv_eeh_vf_final_fixup(struct pci_dev *pdev)
>> >>> +{
>> >>> +	struct pci_dn *pdn = pci_get_pdn(pdev);
>> >>> +
>> >>> +	if (!pdev->is_virtfn)
>> >>> +		return;
>> >>> +
>> >>> +	/*
>> >>> +	 * The following operations will fail if VF's sysfs files
>> >>> +	 * aren't created or its resources aren't finalized.
>> >>> +	 */
>> >>
>> >>I don't understand this comment.  "The following operations" seems to refer
>> >>to eeh_add_device_early() and eeh_add_device_late(), and
>> >>"VF's sysfs files being created" seems to refer to eeh_sysfs_add_device().
>> >>
>> >>So the comment suggests that eeh_add_device_early() and
>> >>eeh_add_device_late() will fail because they're called before
>> >>eeh_sysfs_add_device().  So I think you must be talking about some other
>> >>"following operations," not eeh_add_device_early() and
>> >>eeh_add_device_late().
>> >
>> >Sorry for this confusion.
>> >
>> >The comment here wants to say the eeh_sysfs_add_device() will fail if the VF's
>> >sysfs is not created well. Or it will fail if the VF's resources are not set
>> >properly, since we would cache the VF's BAR in eeh_add_device_late().
>> >
>> >Gavin,
>> >
>> >If my understanding is not correct please let me know.
>> >
>> 
>> It's correct. "The following operations" refers to eeh_add_device_late()
>> and eeh_sysfs_add_device(). The former one requires the resources for
>> one particular PCI device (VF here) are finalized (assigned). eeh_sysfs_add_device()
>> will fail if the sysfs entry for the PCI device isn't populated yet.
>
>eeh_add_device_late() contains several things that read config space:
>eeh_save_bars() caches the entire config header, and
>eeh_addr_cache_insert_dev() looks at the device resources (which are
>determined by BARs in config space).  I think this is an error-prone
>approach.  I think it would be simpler and safer for you to capture what
>you need in your PCI config accessors.
>

I don't follow you very well. I think you're saying the source of all
information should be config space exclusively. The code is shared by
multiple platforms, one of which is pSeries running on top of the PowerVM
hypervisor or KVM/QEMU. The device resources are figured out from the device
tree, not from config space.

>eeh_add_device_late() also contains code to deal with an EEH cache that
>"might not be removed correctly because of unbalanced kref to the device
>during unplug time."  That's unrelated to this patch series, but it sounds
>... like a hacky workaround for some bug in the unplug path.
>

Yes. We depend on pcibios_release_device() to disconnect the EEH device
and the PCI device. pcibios_release_device() might not be called because
of an unbalanced refcount. The workaround here is to disconnect the EEH device
and the PCI device lazily, and then the EEH device is connected to the
right PCI device and the EEH address cache is updated accordingly.

>> >>> +	eeh_add_device_early(pdn);
>> >>> +	eeh_add_device_late(pdev);
>> >>> +	eeh_sysfs_add_device(pdev);
>> >>> +}
>> >>> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);
>> >>
>> >>Ugh.  This is powerpc code, but I don't like using fixups as a hook like
>> >>this.  There is a pcibios_add_device() -- could this be done there?
>> >>
>> >
>> >I don't like it neither :-) But looks we can't put it in the
>> >pcibios_add_device().
>> >
>> >>If not, what happens after pcibios_add_device() that is required for this
>> >>code?  Maybe we need a pcibios_bus_add_device() hook?
>> >
>> >The pnv_eeh_vf_final_fixup() will try to create sysfs for VFs. This requires
>> >the VF sysfs(kobj) is initialized properly. If we put these into
>> >pcibios_add_device(), the eeh_sysfs_add_device() would fail.
>> >
>> >Below is the call flow for your reference:
>> >
>> >pci_device_add()
>> >    pcibios_add_device()
>> >    device_add()                <--- kobj initialized here
>> >
>> 
>> We can put it into pcibios_bus_add_device(), but we don't it currently. If
>> Bjorn agree to add pcibios_bus_add_device(), I'm fine to move the block code
>> there.
>
>I think I'm OK with adding a pcibios_bus_add_device().  I think that would
>be better than using the fixup mechanism for this.
>

OK. Thanks for confirming.

Thanks,
Gavin
Wei Yang June 4, 2015, 5:46 a.m. UTC | #8
On Wed, Jun 03, 2015 at 10:46:38AM -0500, Bjorn Helgaas wrote:
>On Wed, Jun 03, 2015 at 03:10:23PM +1000, Gavin Shan wrote:
>> On Wed, Jun 03, 2015 at 11:31:42AM +0800, Wei Yang wrote:
>> >On Mon, Jun 01, 2015 at 06:46:45PM -0500, Bjorn Helgaas wrote:
>> >>On Tue, May 19, 2015 at 06:50:08PM +0800, Wei Yang wrote:
>> >>> Current EEH recovery code works with the assumption: the PE has primary
>> >>> bus. Unfortunately, that's not true to VF PEs, which generally contains
>> >>> one or multiple VFs (for VF group case). The patch creates PEs for VFs
>> >>> at PCI final fixup time. Those PEs for VFs are indentified with newly
>> >>> introduced flag EEH_PE_VF so that we handle them differently during
>> >>> EEH recovery.
>> >>> 
>> >>> [gwshan: changelog and code refactoring]
>> >>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>> >>> Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> >>> ---
>> >>>  arch/powerpc/include/asm/eeh.h               |    1 +
>> >>>  arch/powerpc/kernel/eeh_pe.c                 |   10 ++++++++--
>> >>>  arch/powerpc/platforms/powernv/eeh-powernv.c |   17 +++++++++++++++++
>> >>>  3 files changed, 26 insertions(+), 2 deletions(-)
>> >>> 
>> >>> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>> >>> index 1b3614d..c1fde48 100644
>> >>> --- a/arch/powerpc/include/asm/eeh.h
>> >>> +++ b/arch/powerpc/include/asm/eeh.h
>> >>> @@ -70,6 +70,7 @@ struct pci_dn;
>> >>>  #define EEH_PE_PHB	(1 << 1)	/* PHB PE    */
>> >>>  #define EEH_PE_DEVICE 	(1 << 2)	/* Device PE */
>> >>>  #define EEH_PE_BUS	(1 << 3)	/* Bus PE    */
>> >>> +#define EEH_PE_VF	(1 << 4)	/* VF PE     */
>> >>>  
>> >>>  #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
>> >>>  #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
>> >>> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
>> >>> index 35f0b62..260a701 100644
>> >>> --- a/arch/powerpc/kernel/eeh_pe.c
>> >>> +++ b/arch/powerpc/kernel/eeh_pe.c
>> >>> @@ -299,7 +299,10 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
>> >>>  	 * EEH device already having associated PE, but
>> >>>  	 * the direct parent EEH device doesn't have yet.
>> >>>  	 */
>> >>> -	pdn = pdn ? pdn->parent : NULL;
>> >>> +	if (edev->physfn)
>> >>> +		pdn = pci_get_pdn(edev->physfn);
>> >>> +	else
>> >>> +		pdn = pdn ? pdn->parent : NULL;
>> >>>  	while (pdn) {
>> >>>  		/* We're poking out of PCI territory */
>> >>>  		parent = pdn_to_eeh_dev(pdn);
>> >>> @@ -382,7 +385,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
>> >>>  	}
>> >>>  
>> >>>  	/* Create a new EEH PE */
>> >>> -	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>> >>> +	if (edev->physfn)
>> >>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
>> >>> +	else
>> >>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>> >>>  	if (!pe) {
>> >>>  		pr_err("%s: out of memory!\n", __func__);
>> >>>  		return -ENOMEM;
>> >>> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
>> >>> index ce738ab..c505036 100644
>> >>> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
>> >>> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
>> >>> @@ -1520,6 +1520,23 @@ static struct eeh_ops pnv_eeh_ops = {
>> >>>  	.restore_config		= pnv_eeh_restore_config
>> >>>  };
>> >>>  
>> >>> +static void pnv_eeh_vf_final_fixup(struct pci_dev *pdev)
>> >>> +{
>> >>> +	struct pci_dn *pdn = pci_get_pdn(pdev);
>> >>> +
>> >>> +	if (!pdev->is_virtfn)
>> >>> +		return;
>> >>> +
>> >>> +	/*
>> >>> +	 * The following operations will fail if VF's sysfs files
>> >>> +	 * aren't created or its resources aren't finalized.
>> >>> +	 */
>> >>
>> >>I don't understand this comment.  "The following operations" seems to refer
>> >>to eeh_add_device_early() and eeh_add_device_late(), and
>> >>"VF's sysfs files being created" seems to refer to eeh_sysfs_add_device().
>> >>
>> >>So the comment suggests that eeh_add_device_early() and
>> >>eeh_add_device_late() will fail because they're called before
>> >>eeh_sysfs_add_device().  So I think you must be talking about some other
>> >>"following operations," not eeh_add_device_early() and
>> >>eeh_add_device_late().
>> >
>> >Sorry for this confusion.
>> >
>> >The comment here wants to say the eeh_sysfs_add_device() will fail if the VF's
>> >sysfs is not created well. Or it will fail if the VF's resources are not set
>> >properly, since we would cache the VF's BAR in eeh_add_device_late().
>> >
>> >Gavin,
>> >
>> >If my understanding is not correct please let me know.
>> >
>> 
>> It's correct. "The following operations" refers to eeh_add_device_late()
>> and eeh_sysfs_add_device(). The former one requires the resources for
>> one particular PCI device (VF here) are finalized (assigned). eeh_sysfs_add_device()
>> will fail if the sysfs entry for the PCI device isn't populated yet.
>
>eeh_add_device_late() contains several things that read config space:
>eeh_save_bars() caches the entire config header, and
>eeh_addr_cache_insert_dev() looks at the device resources (which are
>determined by BARs in config space).  I think this is an error-prone
>approach.  I think it would be simpler and safer for you to capture what
>you need in your PCI config accessors.
>
>eeh_add_device_late() also contains code to deal with an EEH cache that
>"might not be removed correctly because of unbalanced kref to the device
>during unplug time."  That's unrelated to this patch series, but it sounds
>... like a hacky workaround for some bug in the unplug path.
>
>> >>> +	eeh_add_device_early(pdn);
>> >>> +	eeh_add_device_late(pdev);
>> >>> +	eeh_sysfs_add_device(pdev);
>> >>> +}
>> >>> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);
>> >>
>> >>Ugh.  This is powerpc code, but I don't like using fixups as a hook like
>> >>this.  There is a pcibios_add_device() -- could this be done there?
>> >>
>> >
>> >I don't like it neither :-) But looks we can't put it in the
>> >pcibios_add_device().
>> >
>> >>If not, what happens after pcibios_add_device() that is required for this
>> >>code?  Maybe we need a pcibios_bus_add_device() hook?
>> >
>> >The pnv_eeh_vf_final_fixup() will try to create sysfs for VFs. This requires
>> >the VF sysfs(kobj) is initialized properly. If we put these into
>> >pcibios_add_device(), the eeh_sysfs_add_device() would fail.
>> >
>> >Below is the call flow for your reference:
>> >
>> >pci_device_add()
>> >    pcibios_add_device()
>> >    device_add()                <--- kobj initialized here
>> >
>> 
>> We can put it into pcibios_bus_add_device(), but we don't it currently. If
>> Bjorn agree to add pcibios_bus_add_device(), I'm fine to move the block code
>> there.
>
>I think I'm OK with adding a pcibios_bus_add_device().  I think that would
>be better than using the fixup mechanism for this.
>

Thanks for your confirmation.

I am a little lost, though: where should I put the
pcibios_bus_add_device()?

Gavin,

After we have this, we should move all of the EEH probe related code to this
place, right? Then both PF and VF have the same place to initialize EEH,
right?

If my understanding is not correct, please let me know:)

>Bjorn
Gavin Shan June 4, 2015, 7:10 a.m. UTC | #9
On Thu, Jun 04, 2015 at 01:46:15PM +0800, Wei Yang wrote:
>On Wed, Jun 03, 2015 at 10:46:38AM -0500, Bjorn Helgaas wrote:
>>On Wed, Jun 03, 2015 at 03:10:23PM +1000, Gavin Shan wrote:
>>> On Wed, Jun 03, 2015 at 11:31:42AM +0800, Wei Yang wrote:
>>> >On Mon, Jun 01, 2015 at 06:46:45PM -0500, Bjorn Helgaas wrote:
>>> >>On Tue, May 19, 2015 at 06:50:08PM +0800, Wei Yang wrote:
>>> >>> Current EEH recovery code works with the assumption: the PE has primary
>>> >>> bus. Unfortunately, that's not true to VF PEs, which generally contains
>>> >>> one or multiple VFs (for VF group case). The patch creates PEs for VFs
>>> >>> at PCI final fixup time. Those PEs for VFs are indentified with newly
>>> >>> introduced flag EEH_PE_VF so that we handle them differently during
>>> >>> EEH recovery.
>>> >>> 
>>> >>> [gwshan: changelog and code refactoring]
>>> >>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>> >>> Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>> >>> ---
>>> >>>  arch/powerpc/include/asm/eeh.h               |    1 +
>>> >>>  arch/powerpc/kernel/eeh_pe.c                 |   10 ++++++++--
>>> >>>  arch/powerpc/platforms/powernv/eeh-powernv.c |   17 +++++++++++++++++
>>> >>>  3 files changed, 26 insertions(+), 2 deletions(-)
>>> >>> 
>>> >>> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>>> >>> index 1b3614d..c1fde48 100644
>>> >>> --- a/arch/powerpc/include/asm/eeh.h
>>> >>> +++ b/arch/powerpc/include/asm/eeh.h
>>> >>> @@ -70,6 +70,7 @@ struct pci_dn;
>>> >>>  #define EEH_PE_PHB	(1 << 1)	/* PHB PE    */
>>> >>>  #define EEH_PE_DEVICE 	(1 << 2)	/* Device PE */
>>> >>>  #define EEH_PE_BUS	(1 << 3)	/* Bus PE    */
>>> >>> +#define EEH_PE_VF	(1 << 4)	/* VF PE     */
>>> >>>  
>>> >>>  #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
>>> >>>  #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
>>> >>> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
>>> >>> index 35f0b62..260a701 100644
>>> >>> --- a/arch/powerpc/kernel/eeh_pe.c
>>> >>> +++ b/arch/powerpc/kernel/eeh_pe.c
>>> >>> @@ -299,7 +299,10 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
>>> >>>  	 * EEH device already having associated PE, but
>>> >>>  	 * the direct parent EEH device doesn't have yet.
>>> >>>  	 */
>>> >>> -	pdn = pdn ? pdn->parent : NULL;
>>> >>> +	if (edev->physfn)
>>> >>> +		pdn = pci_get_pdn(edev->physfn);
>>> >>> +	else
>>> >>> +		pdn = pdn ? pdn->parent : NULL;
>>> >>>  	while (pdn) {
>>> >>>  		/* We're poking out of PCI territory */
>>> >>>  		parent = pdn_to_eeh_dev(pdn);
>>> >>> @@ -382,7 +385,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
>>> >>>  	}
>>> >>>  
>>> >>>  	/* Create a new EEH PE */
>>> >>> -	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>>> >>> +	if (edev->physfn)
>>> >>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
>>> >>> +	else
>>> >>> +		pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
>>> >>>  	if (!pe) {
>>> >>>  		pr_err("%s: out of memory!\n", __func__);
>>> >>>  		return -ENOMEM;
>>> >>> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
>>> >>> index ce738ab..c505036 100644
>>> >>> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
>>> >>> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
>>> >>> @@ -1520,6 +1520,23 @@ static struct eeh_ops pnv_eeh_ops = {
>>> >>>  	.restore_config		= pnv_eeh_restore_config
>>> >>>  };
>>> >>>  
>>> >>> +static void pnv_eeh_vf_final_fixup(struct pci_dev *pdev)
>>> >>> +{
>>> >>> +	struct pci_dn *pdn = pci_get_pdn(pdev);
>>> >>> +
>>> >>> +	if (!pdev->is_virtfn)
>>> >>> +		return;
>>> >>> +
>>> >>> +	/*
>>> >>> +	 * The following operations will fail if VF's sysfs files
>>> >>> +	 * aren't created or its resources aren't finalized.
>>> >>> +	 */
>>> >>
>>> >>I don't understand this comment.  "The following operations" seems to refer
>>> >>to eeh_add_device_early() and eeh_add_device_late(), and
>>> >>"VF's sysfs files being created" seems to refer to eeh_sysfs_add_device().
>>> >>
>>> >>So the comment suggests that eeh_add_device_early() and
>>> >>eeh_add_device_late() will fail because they're called before
>>> >>eeh_sysfs_add_device().  So I think you must be talking about some other
>>> >>"following operations," not eeh_add_device_early() and
>>> >>eeh_add_device_late().
>>> >
>>> >Sorry for this confusion.
>>> >
>>> >The comment here wants to say the eeh_sysfs_add_device() will fail if the VF's
>>> >sysfs is not created well. Or it will fail if the VF's resources are not set
>>> >properly, since we would cache the VF's BAR in eeh_add_device_late().
>>> >
>>> >Gavin,
>>> >
>>> >If my understanding is not correct please let me know.
>>> >
>>> 
>>> It's correct. "The following operations" refers to eeh_add_device_late()
>>> and eeh_sysfs_add_device(). The former one requires the resources for
>>> one particular PCI device (VF here) are finalized (assigned). eeh_sysfs_add_device()
>>> will fail if the sysfs entry for the PCI device isn't populated yet.
>>
>>eeh_add_device_late() contains several things that read config space:
>>eeh_save_bars() caches the entire config header, and
>>eeh_addr_cache_insert_dev() looks at the device resources (which are
>>determined by BARs in config space).  I think this is an error-prone
>>approach.  I think it would be simpler and safer for you to capture what
>>you need in your PCI config accessors.
>>
>>eeh_add_device_late() also contains code to deal with an EEH cache that
>>"might not be removed correctly because of unbalanced kref to the device
>>during unplug time."  That's unrelated to this patch series, but it sounds
>>... like a hacky workaround for some bug in the unplug path.
>>
>>> >>> +	eeh_add_device_early(pdn);
>>> >>> +	eeh_add_device_late(pdev);
>>> >>> +	eeh_sysfs_add_device(pdev);
>>> >>> +}
>>> >>> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);
>>> >>
>>> >>Ugh.  This is powerpc code, but I don't like using fixups as a hook like
>>> >>this.  There is a pcibios_add_device() -- could this be done there?
>>> >>
>>> >
>>> >I don't like it neither :-) But looks we can't put it in the
>>> >pcibios_add_device().
>>> >
>>> >>If not, what happens after pcibios_add_device() that is required for this
>>> >>code?  Maybe we need a pcibios_bus_add_device() hook?
>>> >
>>> >The pnv_eeh_vf_final_fixup() will try to create sysfs for VFs. This requires
>>> >the VF sysfs(kobj) is initialized properly. If we put these into
>>> >pcibios_add_device(), the eeh_sysfs_add_device() would fail.
>>> >
>>> >Below is the call flow for your reference:
>>> >
>>> >pci_device_add()
>>> >    pcibios_add_device()
>>> >    device_add()                <--- kobj initialized here
>>> >
>>> 
>>> We can put it into pcibios_bus_add_device(), but we don't it currently. If
>>> Bjorn agree to add pcibios_bus_add_device(), I'm fine to move the block code
>>> there.
>>
>>I think I'm OK with adding a pcibios_bus_add_device().  I think that would
>>be better than using the fixup mechanism for this.
>>
>
>Thanks for your confirmation.
>
>While I am a little out of the page, where should I put the
>pcibios_bus_add_device()?
>
>Gavin,
>
>After we have this, we should move the EEH probe related code all to this
>place, right? Then both PF and VF has the same place to initialized the EEH,
>right?
>
>If my understanding is not correct, please let me know:)
>

No, we can't do things as you suggested for various reasons: The EEH device
is probed based on device-tree (or pdn on pSeries) or PCI device (on PowerNV)
though eeh_ops->probe() takes "pdn" as argument. That means the time for
probing EEH device is different for PowerNV/pSeries and we can't unify them
by simply putting the logic into pcibios_bus_add_device().

So I'm expecting something like below:

- (A) Introduce weak pcibios_bus_add_device() as Bjorn suggested.
- (B) Introduce pci_controller_ops::bus_add_device(), whose PowerNV backend
  does those things here (EEH device probing, building EEH addr cache ...)
  for VF only.

(B) potentially conflicts with current code if PF is involved in hot plug.
I'm not sure if you tested this case or not.

- PF is removed during hot unplug time;
- PF is added during hot plug time. PF's driver is loaded and VFs are enabled.
- pcibios_bus_add_device() is called for VFs and initializes EEH stuff.
- At later point, pcibios_finish_adding_to_bus() is called to initialize EEH
  stuff for VFs again.

The current code should already avoid the conflict, but it is worth testing to
see if there are any problems:

- In probing time, EEH device is skipped if EEH device already had parent PE
  connected.
- sysfs won't be populated again when seeing flag EEH_DEV_SYSFS.
- The PCI device shouldn't be added to address cache if it has been there.


Thanks,
Gavin
Wei Yang June 16, 2015, 8:50 a.m. UTC | #10
On Wed, Jun 03, 2015 at 10:46:38AM -0500, Bjorn Helgaas wrote:
>On Wed, Jun 03, 2015 at 03:10:23PM +1000, Gavin Shan wrote:
>> It's correct. "The following operations" refers to eeh_add_device_late()
>> and eeh_sysfs_add_device(). The former one requires the resources for
>> one particular PCI device (VF here) are finalized (assigned). eeh_sysfs_add_device()
>> will fail if the sysfs entry for the PCI device isn't populated yet.
>
>eeh_add_device_late() contains several things that read config space:
>eeh_save_bars() caches the entire config header, and
>eeh_addr_cache_insert_dev() looks at the device resources (which are
>determined by BARs in config space).  I think this is an error-prone
>approach.  I think it would be simpler and safer for you to capture what
>you need in your PCI config accessors.
>
>eeh_add_device_late() also contains code to deal with an EEH cache that
>"might not be removed correctly because of unbalanced kref to the device
>during unplug time."  That's unrelated to this patch series, but it sounds
>... like a hacky workaround for some bug in the unplug path.
>
>> >>> +	eeh_add_device_early(pdn);
>> >>> +	eeh_add_device_late(pdev);
>> >>> +	eeh_sysfs_add_device(pdev);
>> >>> +}
>> >>> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);
>> >>
>> >>Ugh.  This is powerpc code, but I don't like using fixups as a hook like
>> >>this.  There is a pcibios_add_device() -- could this be done there?
>> >>
>> >
>> >I don't like it neither :-) But looks we can't put it in the
>> >pcibios_add_device().
>> >
>> >>If not, what happens after pcibios_add_device() that is required for this
>> >>code?  Maybe we need a pcibios_bus_add_device() hook?
>> >
>> >The pnv_eeh_vf_final_fixup() will try to create sysfs for VFs. This requires
>> >the VF sysfs(kobj) is initialized properly. If we put these into
>> >pcibios_add_device(), the eeh_sysfs_add_device() would fail.
>> >
>> >Below is the call flow for your reference:
>> >
>> >pci_device_add()
>> >    pcibios_add_device()
>> >    device_add()                <--- kobj initialized here
>> >
>> 
>> We can put it into pcibios_bus_add_device(), but we don't it currently. If
>> Bjorn agree to add pcibios_bus_add_device(), I'm fine to move the block code
>> there.
>
>I think I'm OK with adding a pcibios_bus_add_device().  I think that would
>be better than using the fixup mechanism for this.
>

Hi, Bjorn, Gavin,

I've been working on some bugs recently and just got a chance to get to this one.

Would you mind giving me some hint, where you suggest to put the
pcibios_bus_add_device()?

>Bjorn
Bjorn Helgaas June 16, 2015, 1:22 p.m. UTC | #11
On Tue, Jun 16, 2015 at 3:50 AM, Wei Yang <weiyang@linux.vnet.ibm.com> wrote:
> On Wed, Jun 03, 2015 at 10:46:38AM -0500, Bjorn Helgaas wrote:
>>On Wed, Jun 03, 2015 at 03:10:23PM +1000, Gavin Shan wrote:
>>> It's correct. "The following operations" refers to eeh_add_device_late()
>>> and eeh_sysfs_add_device(). The former one requires the resources for
>>> one particular PCI device (VF here) are finalized (assigned). eeh_sysfs_add_device()
>>> will fail if the sysfs entry for the PCI device isn't populated yet.
>>
>>eeh_add_device_late() contains several things that read config space:
>>eeh_save_bars() caches the entire config header, and
>>eeh_addr_cache_insert_dev() looks at the device resources (which are
>>determined by BARs in config space).  I think this is an error-prone
>>approach.  I think it would be simpler and safer for you to capture what
>>you need in your PCI config accessors.
>>
>>eeh_add_device_late() also contains code to deal with an EEH cache that
>>"might not be removed correctly because of unbalanced kref to the device
>>during unplug time."  That's unrelated to this patch series, but it sounds
>>... like a hacky workaround for some bug in the unplug path.
>>
>>> >>> +        eeh_add_device_early(pdn);
>>> >>> +        eeh_add_device_late(pdev);
>>> >>> +        eeh_sysfs_add_device(pdev);
>>> >>> +}
>>> >>> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);
>>> >>
>>> >>Ugh.  This is powerpc code, but I don't like using fixups as a hook like
>>> >>this.  There is a pcibios_add_device() -- could this be done there?
>>> >>
>>> >
>>> >I don't like it neither :-) But looks we can't put it in the
>>> >pcibios_add_device().
>>> >
>>> >>If not, what happens after pcibios_add_device() that is required for this
>>> >>code?  Maybe we need a pcibios_bus_add_device() hook?
>>> >
>>> >The pnv_eeh_vf_final_fixup() will try to create sysfs for VFs. This requires
>>> >the VF sysfs(kobj) is initialized properly. If we put these into
>>> >pcibios_add_device(), the eeh_sysfs_add_device() would fail.
>>> >
>>> >Below is the call flow for your reference:
>>> >
>>> >pci_device_add()
>>> >    pcibios_add_device()
>>> >    device_add()                <--- kobj initialized here
>>> >
>>>
>>> We can put it into pcibios_bus_add_device(), but we don't it currently. If
>>> Bjorn agree to add pcibios_bus_add_device(), I'm fine to move the block code
>>> there.
>>
>>I think I'm OK with adding a pcibios_bus_add_device().  I think that would
>>be better than using the fixup mechanism for this.
>>
>
> Hi, Bjorn, Gavin,
>
> Been working on some bug recently, just got a chance to this one.
>
> Would you mind giving me some hint, where you suggest to put the
> pcibios_bus_add_device()?

I would expect it to be called from pci_bus_add_device().

Bjorn
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 1b3614d..c1fde48 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -70,6 +70,7 @@  struct pci_dn;
 #define EEH_PE_PHB	(1 << 1)	/* PHB PE    */
 #define EEH_PE_DEVICE 	(1 << 2)	/* Device PE */
 #define EEH_PE_BUS	(1 << 3)	/* Bus PE    */
+#define EEH_PE_VF	(1 << 4)	/* VF PE     */
 
 #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
 #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 35f0b62..260a701 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -299,7 +299,10 @@  static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
 	 * EEH device already having associated PE, but
 	 * the direct parent EEH device doesn't have yet.
 	 */
-	pdn = pdn ? pdn->parent : NULL;
+	if (edev->physfn)
+		pdn = pci_get_pdn(edev->physfn);
+	else
+		pdn = pdn ? pdn->parent : NULL;
 	while (pdn) {
 		/* We're poking out of PCI territory */
 		parent = pdn_to_eeh_dev(pdn);
@@ -382,7 +385,10 @@  int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	}
 
 	/* Create a new EEH PE */
-	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
+	if (edev->physfn)
+		pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
+	else
+		pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
 	if (!pe) {
 		pr_err("%s: out of memory!\n", __func__);
 		return -ENOMEM;
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index ce738ab..c505036 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1520,6 +1520,23 @@  static struct eeh_ops pnv_eeh_ops = {
 	.restore_config		= pnv_eeh_restore_config
 };
 
+static void pnv_eeh_vf_final_fixup(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+
+	if (!pdev->is_virtfn)
+		return;
+
+	/*
+	 * The following operations will fail if VF's sysfs files
+	 * aren't created or its resources aren't finalized.
+	 */
+	eeh_add_device_early(pdn);
+	eeh_add_device_late(pdev);
+	eeh_sysfs_add_device(pdev);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_eeh_vf_final_fixup);
+
 /**
  * eeh_powernv_init - Register platform dependent EEH operations
  *