diff mbox

[v12,15/21] powerpc/powernv: Reserve additional space for IOV BAR according to the number of total_pe

Message ID 20150224083442.32124.96716.stgit@bhelgaas-glaptop2.roam.corp.google.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Bjorn Helgaas Feb. 24, 2015, 8:34 a.m. UTC
From: Wei Yang <weiyang@linux.vnet.ibm.com>

On PHB3, PF IOV BAR will be covered by M64 window to have better PE
isolation.  The total_pe number is usually different from total_VFs, which
can lead to a conflict between MMIO space and the PE number.

For example, if total_VFs is 128 and total_pe is 256, the second half of
M64 window will be part of other PCI device, which may already belong
to other PEs.

Prevent the conflict by reserving additional space for the PF IOV BAR,
which is total_pe number of VF's BAR size.

[bhelgaas: make dev_printk() output more consistent, index resource[]
conventionally]
Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/powerpc/include/asm/machdep.h        |    4 ++
 arch/powerpc/include/asm/pci-bridge.h     |    3 ++
 arch/powerpc/kernel/pci-common.c          |    5 +++
 arch/powerpc/platforms/powernv/pci-ioda.c |   58 +++++++++++++++++++++++++++++
 4 files changed, 70 insertions(+)

Comments

Bjorn Helgaas Feb. 24, 2015, 8:52 a.m. UTC | #1
On Tue, Feb 24, 2015 at 02:34:42AM -0600, Bjorn Helgaas wrote:
> From: Wei Yang <weiyang@linux.vnet.ibm.com>
> 
> On PHB3, PF IOV BAR will be covered by M64 window to have better PE
> isolation.  The total_pe number is usually different from total_VFs, which
> can lead to a conflict between MMIO space and the PE number.
> 
> For example, if total_VFs is 128 and total_pe is 256, the second half of
> M64 window will be part of other PCI device, which may already belong
> to other PEs.

I'm still trying to wrap my mind around the explanation here.

I *think* what's going on is that the M64 window must be a power-of-two
size.  If the VF(n) BAR space doesn't completely fill it, we might allocate
the leftover space to another device.  Then the M64 window for *this*
device may cause the other device to be associated with a PE it didn't
expect.

But I don't understand this well enough to describe it clearly.

More serious code question below...

> Prevent the conflict by reserving additional space for the PF IOV BAR,
> which is total_pe number of VF's BAR size.
> 
> [bhelgaas: make dev_printk() output more consistent, index resource[]
> conventionally]
> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> ---
>  arch/powerpc/include/asm/machdep.h        |    4 ++
>  arch/powerpc/include/asm/pci-bridge.h     |    3 ++
>  arch/powerpc/kernel/pci-common.c          |    5 +++
>  arch/powerpc/platforms/powernv/pci-ioda.c |   58 +++++++++++++++++++++++++++++
>  4 files changed, 70 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
> index c8175a3fe560..965547c58497 100644
> --- a/arch/powerpc/include/asm/machdep.h
> +++ b/arch/powerpc/include/asm/machdep.h
> @@ -250,6 +250,10 @@ struct machdep_calls {
>  	/* Reset the secondary bus of bridge */
>  	void  (*pcibios_reset_secondary_bus)(struct pci_dev *dev);
>  
> +#ifdef CONFIG_PCI_IOV
> +	void (*pcibios_fixup_sriov)(struct pci_bus *bus);
> +#endif /* CONFIG_PCI_IOV */
> +
>  	/* Called to shutdown machine specific hardware not already controlled
>  	 * by other drivers.
>  	 */
> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index 513f8f27060d..de11de7d4547 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -175,6 +175,9 @@ struct pci_dn {
>  #define IODA_INVALID_PE		(-1)
>  #ifdef CONFIG_PPC_POWERNV
>  	int	pe_number;
> +#ifdef CONFIG_PCI_IOV
> +	u16     max_vfs;		/* number of VFs IOV BAR expended */
> +#endif /* CONFIG_PCI_IOV */
>  #endif
>  	struct list_head child_list;
>  	struct list_head list;
> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> index 82031011522f..022e9feeb1f2 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -1646,6 +1646,11 @@ void pcibios_scan_phb(struct pci_controller *hose)
>  	if (ppc_md.pcibios_fixup_phb)
>  		ppc_md.pcibios_fixup_phb(hose);
>  
> +#ifdef CONFIG_PCI_IOV
> +	if (ppc_md.pcibios_fixup_sriov)
> +		ppc_md.pcibios_fixup_sriov(bus);
> +#endif /* CONFIG_PCI_IOV */
> +
>  	/* Configure PCI Express settings */
>  	if (bus && !pci_has_flag(PCI_PROBE_ONLY)) {
>  		struct pci_bus *child;
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index cd1a56160ded..36c533da5ccb 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1749,6 +1749,61 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
>  static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
>  #endif /* CONFIG_PCI_MSI */
>  
> +#ifdef CONFIG_PCI_IOV
> +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> +{
> +	struct pci_controller *hose;
> +	struct pnv_phb *phb;
> +	struct resource *res;
> +	int i;
> +	resource_size_t size;
> +	struct pci_dn *pdn;
> +
> +	if (!pdev->is_physfn || pdev->is_added)
> +		return;
> +
> +	hose = pci_bus_to_host(pdev->bus);
> +	phb = hose->private_data;
> +
> +	pdn = pci_get_pdn(pdev);
> +	pdn->max_vfs = 0;
> +
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> +		if (!res->flags || res->parent)
> +			continue;
> +		if (!pnv_pci_is_mem_pref_64(res->flags)) {
> +			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
> +				 i, res);
> +			continue;
> +		}
> +
> +		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
> +		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
> +		res->end = res->start + size * phb->ioda.total_pe - 1;
> +		dev_dbg(&pdev->dev, "                       %pR\n", res);
> +		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
> +				i, res, phb->ioda.total_pe);
> +	}
> +	pdn->max_vfs = phb->ioda.total_pe;
> +}
> +
> +static void pnv_pci_ioda_fixup_sriov(struct pci_bus *bus)
> +{
> +	struct pci_dev *pdev;
> +	struct pci_bus *b;
> +
> +	list_for_each_entry(pdev, &bus->devices, bus_list) {
> +		b = pdev->subordinate;
> +
> +		if (b)
> +			pnv_pci_ioda_fixup_sriov(b);
> +
> +		pnv_pci_ioda_fixup_iov_resources(pdev);

I'm not sure this happens at the right time.  We have this call chain:

  pcibios_scan_phb
    pci_create_root_bus
    pci_scan_child_bus
    pnv_pci_ioda_fixup_sriov
      pnv_pci_ioda_fixup_iov_resources
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
	  increase res->size to accomodate 256 PEs (or roundup(totalVFs)

so we only do the fixup_iov_resources() when we scan the PHB, and we
wouldn't do it at all for hot-added devices.

> +	}
> +}
> +#endif /* CONFIG_PCI_IOV */
> +
>  /*
>   * This function is supposed to be called on basis of PE from top
>   * to bottom style. So the the I/O or MMIO segment assigned to
> @@ -2125,6 +2180,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>  	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
>  	ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
>  	ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
> +#ifdef CONFIG_PCI_IOV
> +	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_sriov;
> +#endif /* CONFIG_PCI_IOV */
>  	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
>  
>  	/* Reset IODA tables to a clean state */
>
Wei Yang March 2, 2015, 7:41 a.m. UTC | #2
On Tue, Feb 24, 2015 at 02:52:34AM -0600, Bjorn Helgaas wrote:
>On Tue, Feb 24, 2015 at 02:34:42AM -0600, Bjorn Helgaas wrote:
>> From: Wei Yang <weiyang@linux.vnet.ibm.com>
>> 
>> On PHB3, PF IOV BAR will be covered by M64 window to have better PE
>> isolation.  The total_pe number is usually different from total_VFs, which
>> can lead to a conflict between MMIO space and the PE number.
>> 
>> For example, if total_VFs is 128 and total_pe is 256, the second half of
>> M64 window will be part of other PCI device, which may already belong
>> to other PEs.
>
>I'm still trying to wrap my mind around the explanation here.
>
>I *think* what's going on is that the M64 window must be a power-of-two
>size.  If the VF(n) BAR space doesn't completely fill it, we might allocate
>the leftover space to another device.  Then the M64 window for *this*
>device may cause the other device to be associated with a PE it didn't
>expect.

Yes, this is the exact reason.

>
>But I don't understand this well enough to describe it clearly.
>
>More serious code question below...
>
>> Prevent the conflict by reserving additional space for the PF IOV BAR,
>> which is total_pe number of VF's BAR size.
>> 
>> [bhelgaas: make dev_printk() output more consistent, index resource[]
>> conventionally]
>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>> ---
>>  arch/powerpc/include/asm/machdep.h        |    4 ++
>>  arch/powerpc/include/asm/pci-bridge.h     |    3 ++
>>  arch/powerpc/kernel/pci-common.c          |    5 +++
>>  arch/powerpc/platforms/powernv/pci-ioda.c |   58 +++++++++++++++++++++++++++++
>>  4 files changed, 70 insertions(+)
>> 
>> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
>> index c8175a3fe560..965547c58497 100644
>> --- a/arch/powerpc/include/asm/machdep.h
>> +++ b/arch/powerpc/include/asm/machdep.h
>> @@ -250,6 +250,10 @@ struct machdep_calls {
>>  	/* Reset the secondary bus of bridge */
>>  	void  (*pcibios_reset_secondary_bus)(struct pci_dev *dev);
>>  
>> +#ifdef CONFIG_PCI_IOV
>> +	void (*pcibios_fixup_sriov)(struct pci_bus *bus);
>> +#endif /* CONFIG_PCI_IOV */
>> +
>>  	/* Called to shutdown machine specific hardware not already controlled
>>  	 * by other drivers.
>>  	 */
>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>> index 513f8f27060d..de11de7d4547 100644
>> --- a/arch/powerpc/include/asm/pci-bridge.h
>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>> @@ -175,6 +175,9 @@ struct pci_dn {
>>  #define IODA_INVALID_PE		(-1)
>>  #ifdef CONFIG_PPC_POWERNV
>>  	int	pe_number;
>> +#ifdef CONFIG_PCI_IOV
>> +	u16     max_vfs;		/* number of VFs IOV BAR expended */
>> +#endif /* CONFIG_PCI_IOV */
>>  #endif
>>  	struct list_head child_list;
>>  	struct list_head list;
>> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
>> index 82031011522f..022e9feeb1f2 100644
>> --- a/arch/powerpc/kernel/pci-common.c
>> +++ b/arch/powerpc/kernel/pci-common.c
>> @@ -1646,6 +1646,11 @@ void pcibios_scan_phb(struct pci_controller *hose)
>>  	if (ppc_md.pcibios_fixup_phb)
>>  		ppc_md.pcibios_fixup_phb(hose);
>>  
>> +#ifdef CONFIG_PCI_IOV
>> +	if (ppc_md.pcibios_fixup_sriov)
>> +		ppc_md.pcibios_fixup_sriov(bus);
>> +#endif /* CONFIG_PCI_IOV */
>> +
>>  	/* Configure PCI Express settings */
>>  	if (bus && !pci_has_flag(PCI_PROBE_ONLY)) {
>>  		struct pci_bus *child;
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index cd1a56160ded..36c533da5ccb 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1749,6 +1749,61 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
>>  static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
>>  #endif /* CONFIG_PCI_MSI */
>>  
>> +#ifdef CONFIG_PCI_IOV
>> +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> +{
>> +	struct pci_controller *hose;
>> +	struct pnv_phb *phb;
>> +	struct resource *res;
>> +	int i;
>> +	resource_size_t size;
>> +	struct pci_dn *pdn;
>> +
>> +	if (!pdev->is_physfn || pdev->is_added)
>> +		return;
>> +
>> +	hose = pci_bus_to_host(pdev->bus);
>> +	phb = hose->private_data;
>> +
>> +	pdn = pci_get_pdn(pdev);
>> +	pdn->max_vfs = 0;
>> +
>> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> +		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>> +		if (!res->flags || res->parent)
>> +			continue;
>> +		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>> +			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>> +				 i, res);
>> +			continue;
>> +		}
>> +
>> +		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>> +		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>> +		res->end = res->start + size * phb->ioda.total_pe - 1;
>> +		dev_dbg(&pdev->dev, "                       %pR\n", res);
>> +		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
>> +				i, res, phb->ioda.total_pe);
>> +	}
>> +	pdn->max_vfs = phb->ioda.total_pe;
>> +}
>> +
>> +static void pnv_pci_ioda_fixup_sriov(struct pci_bus *bus)
>> +{
>> +	struct pci_dev *pdev;
>> +	struct pci_bus *b;
>> +
>> +	list_for_each_entry(pdev, &bus->devices, bus_list) {
>> +		b = pdev->subordinate;
>> +
>> +		if (b)
>> +			pnv_pci_ioda_fixup_sriov(b);
>> +
>> +		pnv_pci_ioda_fixup_iov_resources(pdev);
>
>I'm not sure this happens at the right time.  We have this call chain:
>
>  pcibios_scan_phb
>    pci_create_root_bus
>    pci_scan_child_bus
>    pnv_pci_ioda_fixup_sriov
>      pnv_pci_ioda_fixup_iov_resources
>	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>	  increase res->size to accomodate 256 PEs (or roundup(totalVFs)
>
>so we only do the fixup_iov_resources() when we scan the PHB, and we
>wouldn't do it at all for hot-added devices.

Yep, you are right :-)

I had a separate patch to do this in pcibios_add_pci_devices(). Looks we could
merge them.

>
>> +	}
>> +}
>> +#endif /* CONFIG_PCI_IOV */
>> +
>>  /*
>>   * This function is supposed to be called on basis of PE from top
>>   * to bottom style. So the the I/O or MMIO segment assigned to
>> @@ -2125,6 +2180,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>>  	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
>>  	ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
>>  	ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
>> +#ifdef CONFIG_PCI_IOV
>> +	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_sriov;
>> +#endif /* CONFIG_PCI_IOV */
>>  	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
>>  
>>  	/* Reset IODA tables to a clean state */
>>
Bjorn Helgaas March 11, 2015, 2:51 a.m. UTC | #3
On Mon, Mar 02, 2015 at 03:41:32PM +0800, Wei Yang wrote:
> On Tue, Feb 24, 2015 at 02:52:34AM -0600, Bjorn Helgaas wrote:
> >On Tue, Feb 24, 2015 at 02:34:42AM -0600, Bjorn Helgaas wrote:
> >> From: Wei Yang <weiyang@linux.vnet.ibm.com>
> >> 
> >> On PHB3, PF IOV BAR will be covered by M64 window to have better PE
> >> isolation.  The total_pe number is usually different from total_VFs, which
> >> can lead to a conflict between MMIO space and the PE number.
> >> 
> >> For example, if total_VFs is 128 and total_pe is 256, the second half of
> >> M64 window will be part of other PCI device, which may already belong
> >> to other PEs.
> >
> >I'm still trying to wrap my mind around the explanation here.
> >
> >I *think* what's going on is that the M64 window must be a power-of-two
> >size.  If the VF(n) BAR space doesn't completely fill it, we might allocate
> >the leftover space to another device.  Then the M64 window for *this*
> >device may cause the other device to be associated with a PE it didn't
> >expect.
> 
> Yes, this is the exact reason.

Can you include some of this text in your changelog, then?  I can wordsmith
it and try to make it fit together better.

> >> +#ifdef CONFIG_PCI_IOV
> >> +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> >> +{
> >> +	struct pci_controller *hose;
> >> +	struct pnv_phb *phb;
> >> +	struct resource *res;
> >> +	int i;
> >> +	resource_size_t size;
> >> +	struct pci_dn *pdn;
> >> +
> >> +	if (!pdev->is_physfn || pdev->is_added)
> >> +		return;
> >> +
> >> +	hose = pci_bus_to_host(pdev->bus);
> >> +	phb = hose->private_data;
> >> +
> >> +	pdn = pci_get_pdn(pdev);
> >> +	pdn->max_vfs = 0;
> >> +
> >> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> >> +		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> >> +		if (!res->flags || res->parent)
> >> +			continue;
> >> +		if (!pnv_pci_is_mem_pref_64(res->flags)) {
> >> +			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
> >> +				 i, res);
> >> +			continue;
> >> +		}
> >> +
> >> +		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
> >> +		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
> >> +		res->end = res->start + size * phb->ioda.total_pe - 1;
> >> +		dev_dbg(&pdev->dev, "                       %pR\n", res);
> >> +		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
> >> +				i, res, phb->ioda.total_pe);
> >> +	}
> >> +	pdn->max_vfs = phb->ioda.total_pe;
> >> +}
> >> +
> >> +static void pnv_pci_ioda_fixup_sriov(struct pci_bus *bus)
> >> +{
> >> +	struct pci_dev *pdev;
> >> +	struct pci_bus *b;
> >> +
> >> +	list_for_each_entry(pdev, &bus->devices, bus_list) {
> >> +		b = pdev->subordinate;
> >> +
> >> +		if (b)
> >> +			pnv_pci_ioda_fixup_sriov(b);
> >> +
> >> +		pnv_pci_ioda_fixup_iov_resources(pdev);
> >
> >I'm not sure this happens at the right time.  We have this call chain:
> >
> >  pcibios_scan_phb
> >    pci_create_root_bus
> >    pci_scan_child_bus
> >    pnv_pci_ioda_fixup_sriov
> >      pnv_pci_ioda_fixup_iov_resources
> >	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
> >	  increase res->size to accomodate 256 PEs (or roundup(totalVFs)
> >
> >so we only do the fixup_iov_resources() when we scan the PHB, and we
> >wouldn't do it at all for hot-added devices.
> 
> Yep, you are right :-)
> 
> I had a separate patch to do this in pcibios_add_pci_devices(). Looks we could
> merge them.

Did you fix this in v13?  I don't see the change if you did.

> >> +	}
> >> +}
> >> +#endif /* CONFIG_PCI_IOV */
> >> +
> >>  /*
> >>   * This function is supposed to be called on basis of PE from top
> >>   * to bottom style. So the the I/O or MMIO segment assigned to
> >> @@ -2125,6 +2180,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
> >>  	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
> >>  	ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
> >>  	ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
> >> +#ifdef CONFIG_PCI_IOV
> >> +	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_sriov;
> >> +#endif /* CONFIG_PCI_IOV */
> >>  	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
> >>  
> >>  	/* Reset IODA tables to a clean state */
> >> 
> 
> -- 
> Richard Yang
> Help you, Help me
>
Wei Yang March 11, 2015, 6:22 a.m. UTC | #4
On Tue, Mar 10, 2015 at 09:51:25PM -0500, Bjorn Helgaas wrote:
>On Mon, Mar 02, 2015 at 03:41:32PM +0800, Wei Yang wrote:
>> On Tue, Feb 24, 2015 at 02:52:34AM -0600, Bjorn Helgaas wrote:
>> >On Tue, Feb 24, 2015 at 02:34:42AM -0600, Bjorn Helgaas wrote:
>> >> From: Wei Yang <weiyang@linux.vnet.ibm.com>
>> >> 
>> >> On PHB3, PF IOV BAR will be covered by M64 window to have better PE
>> >> isolation.  The total_pe number is usually different from total_VFs, which
>> >> can lead to a conflict between MMIO space and the PE number.
>> >> 
>> >> For example, if total_VFs is 128 and total_pe is 256, the second half of
>> >> M64 window will be part of other PCI device, which may already belong
>> >> to other PEs.
>> >
>> >I'm still trying to wrap my mind around the explanation here.
>> >
>> >I *think* what's going on is that the M64 window must be a power-of-two
>> >size.  If the VF(n) BAR space doesn't completely fill it, we might allocate
>> >the leftover space to another device.  Then the M64 window for *this*
>> >device may cause the other device to be associated with a PE it didn't
>> >expect.
>> 
>> Yes, this is the exact reason.
>
>Can you include some of this text in your changelog, then?  I can wordsmith
>it and try to make it fit together better.
>

Sure, I will do this.

>> >> +#ifdef CONFIG_PCI_IOV
>> >> +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> >> +{
>> >> +	struct pci_controller *hose;
>> >> +	struct pnv_phb *phb;
>> >> +	struct resource *res;
>> >> +	int i;
>> >> +	resource_size_t size;
>> >> +	struct pci_dn *pdn;
>> >> +
>> >> +	if (!pdev->is_physfn || pdev->is_added)
>> >> +		return;
>> >> +
>> >> +	hose = pci_bus_to_host(pdev->bus);
>> >> +	phb = hose->private_data;
>> >> +
>> >> +	pdn = pci_get_pdn(pdev);
>> >> +	pdn->max_vfs = 0;
>> >> +
>> >> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> >> +		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>> >> +		if (!res->flags || res->parent)
>> >> +			continue;
>> >> +		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>> >> +			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>> >> +				 i, res);
>> >> +			continue;
>> >> +		}
>> >> +
>> >> +		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>> >> +		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>> >> +		res->end = res->start + size * phb->ioda.total_pe - 1;
>> >> +		dev_dbg(&pdev->dev, "                       %pR\n", res);
>> >> +		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
>> >> +				i, res, phb->ioda.total_pe);
>> >> +	}
>> >> +	pdn->max_vfs = phb->ioda.total_pe;
>> >> +}
>> >> +
>> >> +static void pnv_pci_ioda_fixup_sriov(struct pci_bus *bus)
>> >> +{
>> >> +	struct pci_dev *pdev;
>> >> +	struct pci_bus *b;
>> >> +
>> >> +	list_for_each_entry(pdev, &bus->devices, bus_list) {
>> >> +		b = pdev->subordinate;
>> >> +
>> >> +		if (b)
>> >> +			pnv_pci_ioda_fixup_sriov(b);
>> >> +
>> >> +		pnv_pci_ioda_fixup_iov_resources(pdev);
>> >
>> >I'm not sure this happens at the right time.  We have this call chain:
>> >
>> >  pcibios_scan_phb
>> >    pci_create_root_bus
>> >    pci_scan_child_bus
>> >    pnv_pci_ioda_fixup_sriov
>> >      pnv_pci_ioda_fixup_iov_resources
>> >	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>> >	  increase res->size to accomodate 256 PEs (or roundup(totalVFs)
>> >
>> >so we only do the fixup_iov_resources() when we scan the PHB, and we
>> >wouldn't do it at all for hot-added devices.
>> 
>> Yep, you are right :-)
>> 
>> I had a separate patch to do this in pcibios_add_pci_devices(). Looks we could
>> merge them.
>
>Did you fix this in v13?  I don't see the change if you did.
>

I add this in [PATCH V13 15/21].

In the file arch/powerpc/kernel/pci-hotplug.c, when hotplug a device, the
fixup will be called on that bus too.

>> >> +	}
>> >> +}
>> >> +#endif /* CONFIG_PCI_IOV */
>> >> +
>> >>  /*
>> >>   * This function is supposed to be called on basis of PE from top
>> >>   * to bottom style. So the the I/O or MMIO segment assigned to
>> >> @@ -2125,6 +2180,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>> >>  	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
>> >>  	ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
>> >>  	ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
>> >> +#ifdef CONFIG_PCI_IOV
>> >> +	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_sriov;
>> >> +#endif /* CONFIG_PCI_IOV */
>> >>  	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
>> >>  
>> >>  	/* Reset IODA tables to a clean state */
>> >> 
>> 
>> -- 
>> Richard Yang
>> Help you, Help me
>>
Bjorn Helgaas March 11, 2015, 1:40 p.m. UTC | #5
On Wed, Mar 11, 2015 at 1:22 AM, Wei Yang <weiyang@linux.vnet.ibm.com> wrote:
> On Tue, Mar 10, 2015 at 09:51:25PM -0500, Bjorn Helgaas wrote:
>>On Mon, Mar 02, 2015 at 03:41:32PM +0800, Wei Yang wrote:
>>> On Tue, Feb 24, 2015 at 02:52:34AM -0600, Bjorn Helgaas wrote:
>>> >On Tue, Feb 24, 2015 at 02:34:42AM -0600, Bjorn Helgaas wrote:
>>> >> From: Wei Yang <weiyang@linux.vnet.ibm.com>
>>> >>
>>> >> On PHB3, PF IOV BAR will be covered by M64 window to have better PE
>>> >> isolation.  The total_pe number is usually different from total_VFs, which
>>> >> can lead to a conflict between MMIO space and the PE number.
>>> >>
>>> >> For example, if total_VFs is 128 and total_pe is 256, the second half of
>>> >> M64 window will be part of other PCI device, which may already belong
>>> >> to other PEs.
>>> >
>>> >I'm still trying to wrap my mind around the explanation here.
>>> >
>>> >I *think* what's going on is that the M64 window must be a power-of-two
>>> >size.  If the VF(n) BAR space doesn't completely fill it, we might allocate
>>> >the leftover space to another device.  Then the M64 window for *this*
>>> >device may cause the other device to be associated with a PE it didn't
>>> >expect.
>>>
>>> Yes, this is the exact reason.
>>
>>Can you include some of this text in your changelog, then?  I can wordsmith
>>it and try to make it fit together better.
>>
>
> Sure, I will do this.
>
>>> >> +#ifdef CONFIG_PCI_IOV
>>> >> +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>> >> +{
>>> >> + struct pci_controller *hose;
>>> >> + struct pnv_phb *phb;
>>> >> + struct resource *res;
>>> >> + int i;
>>> >> + resource_size_t size;
>>> >> + struct pci_dn *pdn;
>>> >> +
>>> >> + if (!pdev->is_physfn || pdev->is_added)
>>> >> +         return;
>>> >> +
>>> >> + hose = pci_bus_to_host(pdev->bus);
>>> >> + phb = hose->private_data;
>>> >> +
>>> >> + pdn = pci_get_pdn(pdev);
>>> >> + pdn->max_vfs = 0;
>>> >> +
>>> >> + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>> >> +         res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>> >> +         if (!res->flags || res->parent)
>>> >> +                 continue;
>>> >> +         if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>> >> +                 dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>> >> +                          i, res);
>>> >> +                 continue;
>>> >> +         }
>>> >> +
>>> >> +         dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>> >> +         size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>> >> +         res->end = res->start + size * phb->ioda.total_pe - 1;
>>> >> +         dev_dbg(&pdev->dev, "                       %pR\n", res);
>>> >> +         dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
>>> >> +                         i, res, phb->ioda.total_pe);
>>> >> + }
>>> >> + pdn->max_vfs = phb->ioda.total_pe;
>>> >> +}
>>> >> +
>>> >> +static void pnv_pci_ioda_fixup_sriov(struct pci_bus *bus)
>>> >> +{
>>> >> + struct pci_dev *pdev;
>>> >> + struct pci_bus *b;
>>> >> +
>>> >> + list_for_each_entry(pdev, &bus->devices, bus_list) {
>>> >> +         b = pdev->subordinate;
>>> >> +
>>> >> +         if (b)
>>> >> +                 pnv_pci_ioda_fixup_sriov(b);
>>> >> +
>>> >> +         pnv_pci_ioda_fixup_iov_resources(pdev);
>>> >
>>> >I'm not sure this happens at the right time.  We have this call chain:
>>> >
>>> >  pcibios_scan_phb
>>> >    pci_create_root_bus
>>> >    pci_scan_child_bus
>>> >    pnv_pci_ioda_fixup_sriov
>>> >      pnv_pci_ioda_fixup_iov_resources
>>> >    for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>> >      increase res->size to accomodate 256 PEs (or roundup(totalVFs)
>>> >
>>> >so we only do the fixup_iov_resources() when we scan the PHB, and we
>>> >wouldn't do it at all for hot-added devices.
>>>
>>> Yep, you are right :-)
>>>
>>> I had a separate patch to do this in pcibios_add_pci_devices(). Looks we could
>>> merge them.
>>
>>Did you fix this in v13?  I don't see the change if you did.
>>
>
> I add this in [PATCH V13 15/21].
>
> In the file arch/powerpc/kernel/pci-hotplug.c, when hotplug a device, the
> fixup will be called on that bus too.

Ah, OK, thanks for the pointer.

>>> >> + }
>>> >> +}
>>> >> +#endif /* CONFIG_PCI_IOV */
>>> >> +
>>> >>  /*
>>> >>   * This function is supposed to be called on basis of PE from top
>>> >>   * to bottom style. So the the I/O or MMIO segment assigned to
>>> >> @@ -2125,6 +2180,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>>> >>   ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
>>> >>   ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
>>> >>   ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
>>> >> +#ifdef CONFIG_PCI_IOV
>>> >> + ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_sriov;
>>> >> +#endif /* CONFIG_PCI_IOV */
>>> >>   pci_add_flags(PCI_REASSIGN_ALL_RSRC);
>>> >>
>>> >>   /* Reset IODA tables to a clean state */
>>> >>
>>>
>>> --
>>> Richard Yang
>>> Help you, Help me
>>>
>
> --
> Richard Yang
> Help you, Help me
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index c8175a3fe560..965547c58497 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -250,6 +250,10 @@  struct machdep_calls {
 	/* Reset the secondary bus of bridge */
 	void  (*pcibios_reset_secondary_bus)(struct pci_dev *dev);
 
+#ifdef CONFIG_PCI_IOV
+	void (*pcibios_fixup_sriov)(struct pci_bus *bus);
+#endif /* CONFIG_PCI_IOV */
+
 	/* Called to shutdown machine specific hardware not already controlled
 	 * by other drivers.
 	 */
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 513f8f27060d..de11de7d4547 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -175,6 +175,9 @@  struct pci_dn {
 #define IODA_INVALID_PE		(-1)
 #ifdef CONFIG_PPC_POWERNV
 	int	pe_number;
+#ifdef CONFIG_PCI_IOV
+	u16     max_vfs;		/* number of VFs IOV BAR expended */
+#endif /* CONFIG_PCI_IOV */
 #endif
 	struct list_head child_list;
 	struct list_head list;
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 82031011522f..022e9feeb1f2 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1646,6 +1646,11 @@  void pcibios_scan_phb(struct pci_controller *hose)
 	if (ppc_md.pcibios_fixup_phb)
 		ppc_md.pcibios_fixup_phb(hose);
 
+#ifdef CONFIG_PCI_IOV
+	if (ppc_md.pcibios_fixup_sriov)
+		ppc_md.pcibios_fixup_sriov(bus);
+#endif /* CONFIG_PCI_IOV */
+
 	/* Configure PCI Express settings */
 	if (bus && !pci_has_flag(PCI_PROBE_ONLY)) {
 		struct pci_bus *child;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index cd1a56160ded..36c533da5ccb 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1749,6 +1749,61 @@  static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
 #endif /* CONFIG_PCI_MSI */
 
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct resource *res;
+	int i;
+	resource_size_t size;
+	struct pci_dn *pdn;
+
+	if (!pdev->is_physfn || pdev->is_added)
+		return;
+
+	hose = pci_bus_to_host(pdev->bus);
+	phb = hose->private_data;
+
+	pdn = pci_get_pdn(pdev);
+	pdn->max_vfs = 0;
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || res->parent)
+			continue;
+		if (!pnv_pci_is_mem_pref_64(res->flags)) {
+			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
+				 i, res);
+			continue;
+		}
+
+		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
+		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+		res->end = res->start + size * phb->ioda.total_pe - 1;
+		dev_dbg(&pdev->dev, "                       %pR\n", res);
+		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
+				i, res, phb->ioda.total_pe);
+	}
+	pdn->max_vfs = phb->ioda.total_pe;
+}
+
+static void pnv_pci_ioda_fixup_sriov(struct pci_bus *bus)
+{
+	struct pci_dev *pdev;
+	struct pci_bus *b;
+
+	list_for_each_entry(pdev, &bus->devices, bus_list) {
+		b = pdev->subordinate;
+
+		if (b)
+			pnv_pci_ioda_fixup_sriov(b);
+
+		pnv_pci_ioda_fixup_iov_resources(pdev);
+	}
+}
+#endif /* CONFIG_PCI_IOV */
+
 /*
  * This function is supposed to be called on basis of PE from top
  * to bottom style. So the the I/O or MMIO segment assigned to
@@ -2125,6 +2180,9 @@  static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
 	ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
 	ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
+#ifdef CONFIG_PCI_IOV
+	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_sriov;
+#endif /* CONFIG_PCI_IOV */
 	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
 
 	/* Reset IODA tables to a clean state */