diff mbox

[V9,08/18] powrepc/pci: Refactor pci_dn

Message ID 1414942894-17034-9-git-send-email-weiyang@linux.vnet.ibm.com
State Changes Requested
Headers show

Commit Message

Wei Yang Nov. 2, 2014, 3:41 p.m. UTC
From: Gavin Shan <gwshan@linux.vnet.ibm.com>

pci_dn is the extension of PCI device node and it's created from
device node. Unfortunately, VFs that are enabled dynamically by
PF's driver and they don't have corresponding device nodes, and
pci_dn. The patch refactors pci_dn to support VFs:

   * pci_dn is organized as a hierarchy tree. VF's pci_dn is put
     to the child list of pci_dn of PF's bridge. pci_dn of other
     device put to the child list of pci_dn of its upstream bridge.

   * VF's pci_dn is expected to be created dynamically when applying
     final fixup to PF. VF's pci_dn will be destroyed when releasing
     PF's pci_dev instance. pci_dn of other device is still created
     from device node as before.

   * For one particular PCI device (VF or not), its pci_dn can be
     found from pdev->dev.archdata.firmware_data, PCI_DN(devnode),
     or parent's list. The fast path (fetching pci_dn through PCI
     device instance) is populated during early fixup time.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/device.h     |    3 +
 arch/powerpc/include/asm/pci-bridge.h |   14 +-
 arch/powerpc/kernel/pci-hotplug.c     |    3 +
 arch/powerpc/kernel/pci_dn.c          |  246 ++++++++++++++++++++++++++++++++-
 4 files changed, 261 insertions(+), 5 deletions(-)

Comments

Bjorn Helgaas Nov. 19, 2014, 11:30 p.m. UTC | #1
On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote:
> From: Gavin Shan <gwshan@linux.vnet.ibm.com>
> 
> pci_dn is the extension of PCI device node and it's created from
> device node. Unfortunately, VFs that are enabled dynamically by
> PF's driver and they don't have corresponding device nodes, and
> pci_dn. The patch refactors pci_dn to support VFs:
> 
>    * pci_dn is organized as a hierarchy tree. VF's pci_dn is put
>      to the child list of pci_dn of PF's bridge. pci_dn of other
>      device put to the child list of pci_dn of its upstream bridge.
> 
>    * VF's pci_dn is expected to be created dynamically when applying
>      final fixup to PF. VF's pci_dn will be destroyed when releasing
>      PF's pci_dev instance. pci_dn of other device is still created
>      from device node as before.
> 
>    * For one particular PCI device (VF or not), its pci_dn can be
>      found from pdev->dev.archdata.firmware_data, PCI_DN(devnode),
>      or parent's list. The fast path (fetching pci_dn through PCI
>      device instance) is populated during early fixup time.
> 
> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> ---
> ...

> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev)
> +{
> +#ifdef CONFIG_PCI_IOV
> +	struct pci_dn *parent, *pdn;
> +	int i;
> +
> +	/* Only support IOV for now */
> +	if (!pdev->is_physfn)
> +		return pci_get_pdn(pdev);
> +
> +	/* Check if VFs have been populated */
> +	pdn = pci_get_pdn(pdev);
> +	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
> +		return NULL;
> +
> +	pdn->flags |= PCI_DN_FLAG_IOV_VF;
> +	parent = pci_bus_to_pdn(pdev->bus);
> +	if (!parent)
> +		return NULL;
> +
> +	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
> +		pdn = add_one_dev_pci_info(parent, NULL,
> +					   pci_iov_virtfn_bus(pdev, i),
> +					   pci_iov_virtfn_devfn(pdev, i));

I'm not sure this makes sense, but I certainly don't know this code, so
maybe I'm missing something.

pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on
pdev->sriov->stride and pdev->sriov->offset.  These are read from VF Stride
and First VF Offset in the SR-IOV capability by sriov_init(), which is
called before add_dev_pci_info():

  pci_scan_child_bus
    pci_scan_slot
      pci_scan_single_device
	pci_device_add
	  pci_init_capabilities
	    pci_iov_init(PF)
	      sriov_init(PF, pos)
		pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0)
		pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset)
		pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride)
		iov->offset = offset
		iov->stride = stride

  pci_bus_add_devices
    pci_bus_add_device
      pci_fixup_device(pci_fixup_final)
	add_dev_pci_info
	  pci_iov_virtfn_bus
	    return ... + sriov->offset + (sriov->stride * id) ...

But both First VF Offset and VF Stride change when ARI Capable Hierarchy or
NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10).  We set NumVFs to zero in
sriov_init() above.  We will change NumVFs to something different when a
driver calls pci_enable_sriov():

  pci_enable_sriov
    sriov_enable
      pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn)

Now First VF Offset and VF Stride have changed from what they were when we
called pci_iov_virtfn_bus() above.

> +		if (!pdn) {
> +			pr_warn("%s: Cannot create firmware data "
> +				"for VF#%d of %s\n",
> +				__func__, i, pci_name(pdev));
> +			return NULL;
> +		}
> +	}
> +#endif
> +
> +	return pci_get_pdn(pdev);
> +}
> ...

> +static void pci_dev_pdn_create(struct pci_dev *pdev)
> +{
> +	add_dev_pci_info(pdev);
> +}
> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_create);

There are no other callers of add_dev_pci_info(), so it seems pointless to
declare it in arch/powerpc/include/asm/pci-bridge.h and add this wrapper
around it.

> +
> +static void pci_dev_pdn_setup(struct pci_dev *pdev)
> +{
> +	struct pci_dn *pdn;
> +
> +	if (pdev->dev.archdata.firmware_data)
> +		return;
> +
> +	/* Setup the fast path */
> +	pdn = pci_get_pdn(pdev);
> +	pdev->dev.archdata.firmware_data = pdn;
> +}
> +DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_setup);
> -- 
> 1.7.9.5
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan Nov. 20, 2014, 1:02 a.m. UTC | #2
On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote:
>On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote:
>> From: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> 
>> pci_dn is the extension of PCI device node and it's created from
>> device node. Unfortunately, VFs that are enabled dynamically by
>> PF's driver and they don't have corresponding device nodes, and
>> pci_dn. The patch refactors pci_dn to support VFs:
>> 
>>    * pci_dn is organized as a hierarchy tree. VF's pci_dn is put
>>      to the child list of pci_dn of PF's bridge. pci_dn of other
>>      device put to the child list of pci_dn of its upstream bridge.
>> 
>>    * VF's pci_dn is expected to be created dynamically when applying
>>      final fixup to PF. VF's pci_dn will be destroyed when releasing
>>      PF's pci_dev instance. pci_dn of other device is still created
>>      from device node as before.
>> 
>>    * For one particular PCI device (VF or not), its pci_dn can be
>>      found from pdev->dev.archdata.firmware_data, PCI_DN(devnode),
>>      or parent's list. The fast path (fetching pci_dn through PCI
>>      device instance) is populated during early fixup time.
>> 
>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> ---
>> ...
>
>> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev)
>> +{
>> +#ifdef CONFIG_PCI_IOV
>> +	struct pci_dn *parent, *pdn;
>> +	int i;
>> +
>> +	/* Only support IOV for now */
>> +	if (!pdev->is_physfn)
>> +		return pci_get_pdn(pdev);
>> +
>> +	/* Check if VFs have been populated */
>> +	pdn = pci_get_pdn(pdev);
>> +	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
>> +		return NULL;
>> +
>> +	pdn->flags |= PCI_DN_FLAG_IOV_VF;
>> +	parent = pci_bus_to_pdn(pdev->bus);
>> +	if (!parent)
>> +		return NULL;
>> +
>> +	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
>> +		pdn = add_one_dev_pci_info(parent, NULL,
>> +					   pci_iov_virtfn_bus(pdev, i),
>> +					   pci_iov_virtfn_devfn(pdev, i));
>
>I'm not sure this makes sense, but I certainly don't know this code, so
>maybe I'm missing something.
>

For ARI, Richard had some patches to fix the issue from firmware side.

>pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on
>pdev->sriov->stride and pdev->sriov->offset.  These are read from VF Stride
>and First VF Offset in the SR-IOV capability by sriov_init(), which is
>called before add_dev_pci_info():
>
>  pci_scan_child_bus
>    pci_scan_slot
>      pci_scan_single_device
>	pci_device_add
>	  pci_init_capabilities
>	    pci_iov_init(PF)
>	      sriov_init(PF, pos)
>		pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0)
>		pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset)
>		pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride)
>		iov->offset = offset
>		iov->stride = stride
>
>  pci_bus_add_devices
>    pci_bus_add_device
>      pci_fixup_device(pci_fixup_final)
>	add_dev_pci_info
>	  pci_iov_virtfn_bus
>	    return ... + sriov->offset + (sriov->stride * id) ...
>
>But both First VF Offset and VF Stride change when ARI Capable Hierarchy or
>NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10).  We set NumVFs to zero in
>sriov_init() above.  We will change NumVFs to something different when a
>driver calls pci_enable_sriov():
>
>  pci_enable_sriov
>    sriov_enable
>      pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn)
>
>Now First VF Offset and VF Stride have changed from what they were when we
>called pci_iov_virtfn_bus() above.
>

It's the case we missed: First VF Offset and VF Stride can change when
PF's number of VFs is changed. It means the BDFN (Bus/Device/Function
number) for one VF can't be determined until PF's number of VFs is
populated and updated to HW (before calling to virtfn_add()).

The dynamically created pci_dn is used in PCI config accessors currently.
That means we have to get it ready before first PCI config request to the
VF in pci_setup_device(). In the code of old revision, we had some weak
function called in pci_alloc_dev(), which gave platform chance to create
pci_dn. I think we have to switch back to the old way in order to fix
the problem you catched. However, the old way is implemented with cost
of more weak function, which you're probably unhappy to see.

  sriov_enable()
    virtfn_add()
      virtfn_add_bus()
      pci_alloc_dev()
      pci_setup_device()

>> +		if (!pdn) {
>> +			pr_warn("%s: Cannot create firmware data "
>> +				"for VF#%d of %s\n",
>> +				__func__, i, pci_name(pdev));
>> +			return NULL;
>> +		}
>> +	}
>> +#endif
>> +
>> +	return pci_get_pdn(pdev);
>> +}
>> ...
>
>> +static void pci_dev_pdn_create(struct pci_dev *pdev)
>> +{
>> +	add_dev_pci_info(pdev);
>> +}
>> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_create);
>
>There are no other callers of add_dev_pci_info(), so it seems pointless to
>declare it in arch/powerpc/include/asm/pci-bridge.h and add this wrapper
>around it.
>

Yep and will fix.

Thanks,
Gavin

>> +
>> +static void pci_dev_pdn_setup(struct pci_dev *pdev)
>> +{
>> +	struct pci_dn *pdn;
>> +
>> +	if (pdev->dev.archdata.firmware_data)
>> +		return;
>> +
>> +	/* Setup the fast path */
>> +	pdn = pci_get_pdn(pdev);
>> +	pdev->dev.archdata.firmware_data = pdn;
>> +}
>> +DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_setup);
>> -- 
>> 1.7.9.5
>> 
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wei Yang Nov. 20, 2014, 7:20 a.m. UTC | #3
On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote:
>On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote:
>> From: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> 
>> pci_dn is the extension of PCI device node and it's created from
>> device node. Unfortunately, VFs that are enabled dynamically by
>> PF's driver and they don't have corresponding device nodes, and
>> pci_dn. The patch refactors pci_dn to support VFs:
>> 
>>    * pci_dn is organized as a hierarchy tree. VF's pci_dn is put
>>      to the child list of pci_dn of PF's bridge. pci_dn of other
>>      device put to the child list of pci_dn of its upstream bridge.
>> 
>>    * VF's pci_dn is expected to be created dynamically when applying
>>      final fixup to PF. VF's pci_dn will be destroyed when releasing
>>      PF's pci_dev instance. pci_dn of other device is still created
>>      from device node as before.
>> 
>>    * For one particular PCI device (VF or not), its pci_dn can be
>>      found from pdev->dev.archdata.firmware_data, PCI_DN(devnode),
>>      or parent's list. The fast path (fetching pci_dn through PCI
>>      device instance) is populated during early fixup time.
>> 
>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> ---
>> ...
>
>> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev)
>> +{
>> +#ifdef CONFIG_PCI_IOV
>> +	struct pci_dn *parent, *pdn;
>> +	int i;
>> +
>> +	/* Only support IOV for now */
>> +	if (!pdev->is_physfn)
>> +		return pci_get_pdn(pdev);
>> +
>> +	/* Check if VFs have been populated */
>> +	pdn = pci_get_pdn(pdev);
>> +	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
>> +		return NULL;
>> +
>> +	pdn->flags |= PCI_DN_FLAG_IOV_VF;
>> +	parent = pci_bus_to_pdn(pdev->bus);
>> +	if (!parent)
>> +		return NULL;
>> +
>> +	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
>> +		pdn = add_one_dev_pci_info(parent, NULL,
>> +					   pci_iov_virtfn_bus(pdev, i),
>> +					   pci_iov_virtfn_devfn(pdev, i));
>
>I'm not sure this makes sense, but I certainly don't know this code, so
>maybe I'm missing something.
>
>pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on
>pdev->sriov->stride and pdev->sriov->offset.  These are read from VF Stride
>and First VF Offset in the SR-IOV capability by sriov_init(), which is
>called before add_dev_pci_info():
>
>  pci_scan_child_bus
>    pci_scan_slot
>      pci_scan_single_device
>	pci_device_add
>	  pci_init_capabilities
>	    pci_iov_init(PF)
>	      sriov_init(PF, pos)
>		pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0)
>		pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset)
>		pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride)
>		iov->offset = offset
>		iov->stride = stride
>
>  pci_bus_add_devices
>    pci_bus_add_device
>      pci_fixup_device(pci_fixup_final)
>	add_dev_pci_info
>	  pci_iov_virtfn_bus
>	    return ... + sriov->offset + (sriov->stride * id) ...
>
>But both First VF Offset and VF Stride change when ARI Capable Hierarchy or
>NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10).  We set NumVFs to zero in
>sriov_init() above.  We will change NumVFs to something different when a
>driver calls pci_enable_sriov():
>
>  pci_enable_sriov
>    sriov_enable
>      pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn)
>
>Now First VF Offset and VF Stride have changed from what they were when we
>called pci_iov_virtfn_bus() above.
>

Oops, I see the ARI would affect those value, while missed the NumVFs also
would.

Let's look at the problem one by one.

1. The ARI capability.
===============================================================================
The kernel initialize the capability like this:

pci_init_capabilities()
	pci_configure_ari()
	pci_iov_init()
		iov->offset = offset
		iov->stride = stride

When offset/stride is retrieved at this point, the ARI capability is taken
into consideration.

2. The PF's NumVFs field
===============================================================================
2.1 Potential problem in current code
===============================================================================
First, is current pci code has some potential problem?

sriov_enable()
	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
	iov->offset = offset;
	iov->stride = stride;
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
	virtfn_add()
		...
		virtfn->devfn = pci_iov_virtfn_devfn(dev, id);

The sriov_enable() retrieve the offset/stride then write the NumVFs. According
to the SPEC, at this moment the offset/stride may change. While I don't see
some code to retrieve and store those value again. And these fields will be
used in virtfn_add().

If my understanding is correct, I suggest to move the retrieve and store
operation after NumVFs is written.

2.2 The IOV bus range may not be correct in pci_scan_child_bus()?
===============================================================================
In current pci core, when enumerating the pci tree, we do like this:

pci_scan_child_bus()
	pci_scan_slot()
		pci_scan_single_device()
			pci_device_add()
				pci_init_capabilities()
					pci_iov_init()
	max += pci_iov_bus_range(bus);
		busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1);
	max = pci_scan_bridge(bus, dev, max, pass);

From this point, we see pci core reserve some bus range for VFs. This
calculation is based on the offset/stride at this moment. And do the
enumeration with the new bus number.

sriov_enable() could be called several times from driver to enable SRIOV, and
with different nr_virtfn. If each time the NumVFs written, the offset/stride
will change. This means we may try to address an extra bus we didn't reserve?
Or this means it is out of control?

Do I miss something?

2.3 How can I reserve bus range in FW?
===============================================================================
This question comes from the previous one.

Based on my understanding, current pci core will rely on the bus number in HW
if pcibios_assign_all_busses() is not set. If we want to support those VFs
sits on different bus with PF, we need to reserve bus range and write the
correct secondary/subordinate in bridge. Otherwise, those VFs on different bus
may not be addressed.

Currently I am writing the code in FW to reserve the range with the same
mechanism in pci core. While as you mentioned the offset/stride may change
after sriov_enable(), I am confused whether this is the correct way.

2.4 The potential problem for [Patch 08/18]
===============================================================================
According to the SPEC, the offset/stride will change after each
sriov_enable(). This means the bus/devfn will change after each
sriov_enable().

My current thought is to fix it up in virtfn_add(). If the total VF number
will not change, we could create those pci_dn at the beginning and fix the
bus/devfn at each time the VF is truely created.
Wei Yang Nov. 20, 2014, 7:25 a.m. UTC | #4
On Thu, Nov 20, 2014 at 12:02:13PM +1100, Gavin Shan wrote:
>On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote:
>>On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote:
>>> From: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>> 
>>> pci_dn is the extension of PCI device node and it's created from
>>> device node. Unfortunately, VFs that are enabled dynamically by
>>> PF's driver and they don't have corresponding device nodes, and
>>> pci_dn. The patch refactors pci_dn to support VFs:
>>> 
>>>    * pci_dn is organized as a hierarchy tree. VF's pci_dn is put
>>>      to the child list of pci_dn of PF's bridge. pci_dn of other
>>>      device put to the child list of pci_dn of its upstream bridge.
>>> 
>>>    * VF's pci_dn is expected to be created dynamically when applying
>>>      final fixup to PF. VF's pci_dn will be destroyed when releasing
>>>      PF's pci_dev instance. pci_dn of other device is still created
>>>      from device node as before.
>>> 
>>>    * For one particular PCI device (VF or not), its pci_dn can be
>>>      found from pdev->dev.archdata.firmware_data, PCI_DN(devnode),
>>>      or parent's list. The fast path (fetching pci_dn through PCI
>>>      device instance) is populated during early fixup time.
>>> 
>>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>> ---
>>> ...
>>
>>> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev)
>>> +{
>>> +#ifdef CONFIG_PCI_IOV
>>> +	struct pci_dn *parent, *pdn;
>>> +	int i;
>>> +
>>> +	/* Only support IOV for now */
>>> +	if (!pdev->is_physfn)
>>> +		return pci_get_pdn(pdev);
>>> +
>>> +	/* Check if VFs have been populated */
>>> +	pdn = pci_get_pdn(pdev);
>>> +	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
>>> +		return NULL;
>>> +
>>> +	pdn->flags |= PCI_DN_FLAG_IOV_VF;
>>> +	parent = pci_bus_to_pdn(pdev->bus);
>>> +	if (!parent)
>>> +		return NULL;
>>> +
>>> +	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
>>> +		pdn = add_one_dev_pci_info(parent, NULL,
>>> +					   pci_iov_virtfn_bus(pdev, i),
>>> +					   pci_iov_virtfn_devfn(pdev, i));
>>
>>I'm not sure this makes sense, but I certainly don't know this code, so
>>maybe I'm missing something.
>>
>
>For ARI, Richard had some patches to fix the issue from firmware side.
>
>>pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on
>>pdev->sriov->stride and pdev->sriov->offset.  These are read from VF Stride
>>and First VF Offset in the SR-IOV capability by sriov_init(), which is
>>called before add_dev_pci_info():
>>
>>  pci_scan_child_bus
>>    pci_scan_slot
>>      pci_scan_single_device
>>	pci_device_add
>>	  pci_init_capabilities
>>	    pci_iov_init(PF)
>>	      sriov_init(PF, pos)
>>		pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0)
>>		pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset)
>>		pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride)
>>		iov->offset = offset
>>		iov->stride = stride
>>
>>  pci_bus_add_devices
>>    pci_bus_add_device
>>      pci_fixup_device(pci_fixup_final)
>>	add_dev_pci_info
>>	  pci_iov_virtfn_bus
>>	    return ... + sriov->offset + (sriov->stride * id) ...
>>
>>But both First VF Offset and VF Stride change when ARI Capable Hierarchy or
>>NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10).  We set NumVFs to zero in
>>sriov_init() above.  We will change NumVFs to something different when a
>>driver calls pci_enable_sriov():
>>
>>  pci_enable_sriov
>>    sriov_enable
>>      pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn)
>>
>>Now First VF Offset and VF Stride have changed from what they were when we
>>called pci_iov_virtfn_bus() above.
>>
>
>It's the case we missed: First VF Offset and VF Stride can change when
>PF's number of VFs is changed. It means the BDFN (Bus/Device/Function
>number) for one VF can't be determined until PF's number of VFs is
>populated and updated to HW (before calling to virtfn_add()).
>
>The dynamically created pci_dn is used in PCI config accessors currently.
>That means we have to get it ready before first PCI config request to the
>VF in pci_setup_device(). In the code of old revision, we had some weak
>function called in pci_alloc_dev(), which gave platform chance to create
>pci_dn. I think we have to switch back to the old way in order to fix
>the problem you catched. However, the old way is implemented with cost
>of more weak function, which you're probably unhappy to see.
>
>  sriov_enable()
>    virtfn_add()
>      virtfn_add_bus()
>      pci_alloc_dev()
>      pci_setup_device()

Ok, sounds my solution in previous reply can't work. We need the pci_dn ready
before access the configuration space of VFs.
Bjorn Helgaas Nov. 20, 2014, 7:05 p.m. UTC | #5
On Thu, Nov 20, 2014 at 03:20:57PM +0800, Wei Yang wrote:
> On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote:
> >On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote:
> >> From: Gavin Shan <gwshan@linux.vnet.ibm.com>
> >> 
> >> pci_dn is the extension of PCI device node and it's created from
> >> device node. Unfortunately, VFs that are enabled dynamically by
> >> PF's driver and they don't have corresponding device nodes, and
> >> pci_dn. The patch refactors pci_dn to support VFs:
> >> 
> >>    * pci_dn is organized as a hierarchy tree. VF's pci_dn is put
> >>      to the child list of pci_dn of PF's bridge. pci_dn of other
> >>      device put to the child list of pci_dn of its upstream bridge.
> >> 
> >>    * VF's pci_dn is expected to be created dynamically when applying
> >>      final fixup to PF. VF's pci_dn will be destroyed when releasing
> >>      PF's pci_dev instance. pci_dn of other device is still created
> >>      from device node as before.
> >> 
> >>    * For one particular PCI device (VF or not), its pci_dn can be
> >>      found from pdev->dev.archdata.firmware_data, PCI_DN(devnode),
> >>      or parent's list. The fast path (fetching pci_dn through PCI
> >>      device instance) is populated during early fixup time.
> >> 
> >> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> >> ---
> >> ...
> >
> >> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev)
> >> +{
> >> +#ifdef CONFIG_PCI_IOV
> >> +	struct pci_dn *parent, *pdn;
> >> +	int i;
> >> +
> >> +	/* Only support IOV for now */
> >> +	if (!pdev->is_physfn)
> >> +		return pci_get_pdn(pdev);
> >> +
> >> +	/* Check if VFs have been populated */
> >> +	pdn = pci_get_pdn(pdev);
> >> +	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
> >> +		return NULL;
> >> +
> >> +	pdn->flags |= PCI_DN_FLAG_IOV_VF;
> >> +	parent = pci_bus_to_pdn(pdev->bus);
> >> +	if (!parent)
> >> +		return NULL;
> >> +
> >> +	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
> >> +		pdn = add_one_dev_pci_info(parent, NULL,
> >> +					   pci_iov_virtfn_bus(pdev, i),
> >> +					   pci_iov_virtfn_devfn(pdev, i));
> >
> >I'm not sure this makes sense, but I certainly don't know this code, so
> >maybe I'm missing something.
> >
> >pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on
> >pdev->sriov->stride and pdev->sriov->offset.  These are read from VF Stride
> >and First VF Offset in the SR-IOV capability by sriov_init(), which is
> >called before add_dev_pci_info():
> >
> >  pci_scan_child_bus
> >    pci_scan_slot
> >      pci_scan_single_device
> >	pci_device_add
> >	  pci_init_capabilities
> >	    pci_iov_init(PF)
> >	      sriov_init(PF, pos)
> >		pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0)
> >		pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset)
> >		pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride)
> >		iov->offset = offset
> >		iov->stride = stride
> >
> >  pci_bus_add_devices
> >    pci_bus_add_device
> >      pci_fixup_device(pci_fixup_final)
> >	add_dev_pci_info
> >	  pci_iov_virtfn_bus
> >	    return ... + sriov->offset + (sriov->stride * id) ...
> >
> >But both First VF Offset and VF Stride change when ARI Capable Hierarchy or
> >NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10).  We set NumVFs to zero in
> >sriov_init() above.  We will change NumVFs to something different when a
> >driver calls pci_enable_sriov():
> >
> >  pci_enable_sriov
> >    sriov_enable
> >      pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn)
> >
> >Now First VF Offset and VF Stride have changed from what they were when we
> >called pci_iov_virtfn_bus() above.
> >
> 
> Oops, I see the ARI would affect those value, while missed the NumVFs also
> would.
> 
> Let's look at the problem one by one.
> 
> 1. The ARI capability.
> ===============================================================================
> The kernel initialize the capability like this:
> 
> pci_init_capabilities()
> 	pci_configure_ari()
> 	pci_iov_init()
> 		iov->offset = offset
> 		iov->stride = stride
> 
> When offset/stride is retrieved at this point, the ARI capability is taken
> into consideration.

PCI_SRIOV_CTRL_ARI is currently only changed at the time we enumerate the
PF, so I don't think this is really a problem.

> 2. The PF's NumVFs field
> ===============================================================================
> 2.1 Potential problem in current code
> ===============================================================================
> First, is current pci code has some potential problem?
> 
> sriov_enable()
> 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
> 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
> 	iov->offset = offset;
> 	iov->stride = stride;
> 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
> 	virtfn_add()
> 		...
> 		virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
> 
> The sriov_enable() retrieve the offset/stride then write the NumVFs. According
> to the SPEC, at this moment the offset/stride may change. While I don't see
> some code to retrieve and store those value again. And these fields will be
> used in virtfn_add().
> 
> If my understanding is correct, I suggest to move the retrieve and store
> operation after NumVFs is written.

Yep, it looks like the existing code has similar problems.  We might want
to add a simple function that writes PCI_SRIOV_NUM_VF, then reads
PCI_SRIOV_VF_OFFSET and PCI_SRIOV_VF_STRIDE and refreshes the cached values
in dev->sriov.

Then we'd at least know that virtfn_bus() and virtfn_devfn() return values
that are correct until the next NumVFs change.

> 2.2 The IOV bus range may not be correct in pci_scan_child_bus()?
> ===============================================================================
> In current pci core, when enumerating the pci tree, we do like this:
> 
> pci_scan_child_bus()
> 	pci_scan_slot()
> 		pci_scan_single_device()
> 			pci_device_add()
> 				pci_init_capabilities()
> 					pci_iov_init()
> 	max += pci_iov_bus_range(bus);
> 		busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1);
> 	max = pci_scan_bridge(bus, dev, max, pass);
> 
> From this point, we see pci core reserve some bus range for VFs. This
> calculation is based on the offset/stride at this moment. And do the
> enumeration with the new bus number.
> 
> sriov_enable() could be called several times from driver to enable SRIOV, and
> with different nr_virtfn. If each time the NumVFs written, the offset/stride
> will change. This means we may try to address an extra bus we didn't reserve?
> Or this means it is out of control?

This looks like a problem, too.  I don't have a good suggestion for fixing
it.

> 2.3 How can I reserve bus range in FW?
> ===============================================================================
> This question comes from the previous one.
> 
> Based on my understanding, current pci core will rely on the bus number in HW
> if pcibios_assign_all_busses() is not set. If we want to support those VFs
> sits on different bus with PF, we need to reserve bus range and write the
> correct secondary/subordinate in bridge. Otherwise, those VFs on different bus
> may not be addressed.
> 
> Currently I am writing the code in FW to reserve the range with the same
> mechanism in pci core. While as you mentioned the offset/stride may change
> after sriov_enable(), I am confused whether this is the correct way.

If your firmware knows something about the device and can compute the
number of buses it will likely need, it can set up bridges with appropriate
bus number ranges, and Linux will generally leave those alone.

I don't know the best way to figure out the number of buses, though.  It
seems like you almost need to experimentally set NumVFs and read the
resulting offset/stride, because I think it's really up to the device to
decide how to number the VFs.  Maybe pci_iov_bus_range() needs to do
something similar.

> 2.4 The potential problem for [Patch 08/18]
> ===============================================================================
> According to the SPEC, the offset/stride will change after each
> sriov_enable(). This means the bus/devfn will change after each
> sriov_enable().
> 
> My current thought is to fix it up in virtfn_add(). If the total VF number
> will not change, we could create those pci_dn at the beginning and fix the
> bus/devfn at each time the VF is truely created.

By "fix it up," I assume you mean call an arch function that does the
pci_pdn setup you need.

It sounds reasonable to do this either in virtfn_add()/virtfn_remove() or
at the points where we write PCI_SRIOV_CTRL_VFE, i.e., in sriov_init(),
sriov_enable(), sriov_disable(), and sriov_restore_state().  From a
hardware point of view, the VFs exist whenever PCI_SRIOV_CTRL_VFE is set,
so it might be nice to have this setup connected to that.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan Nov. 21, 2014, 12:04 a.m. UTC | #6
On Thu, Nov 20, 2014 at 12:05:41PM -0700, Bjorn Helgaas wrote:
>On Thu, Nov 20, 2014 at 03:20:57PM +0800, Wei Yang wrote:
>> On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote:
>> >On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote:
>> >> From: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> >> 
>> >> pci_dn is the extension of PCI device node and it's created from
>> >> device node. Unfortunately, VFs that are enabled dynamically by
>> >> PF's driver and they don't have corresponding device nodes, and
>> >> pci_dn. The patch refactors pci_dn to support VFs:
>> >> 
>> >>    * pci_dn is organized as a hierarchy tree. VF's pci_dn is put
>> >>      to the child list of pci_dn of PF's bridge. pci_dn of other
>> >>      device put to the child list of pci_dn of its upstream bridge.
>> >> 
>> >>    * VF's pci_dn is expected to be created dynamically when applying
>> >>      final fixup to PF. VF's pci_dn will be destroyed when releasing
>> >>      PF's pci_dev instance. pci_dn of other device is still created
>> >>      from device node as before.
>> >> 
>> >>    * For one particular PCI device (VF or not), its pci_dn can be
>> >>      found from pdev->dev.archdata.firmware_data, PCI_DN(devnode),
>> >>      or parent's list. The fast path (fetching pci_dn through PCI
>> >>      device instance) is populated during early fixup time.
>> >> 
>> >> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> >> ---
>> >> ...
>> >
>> >> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev)
>> >> +{
>> >> +#ifdef CONFIG_PCI_IOV
>> >> +	struct pci_dn *parent, *pdn;
>> >> +	int i;
>> >> +
>> >> +	/* Only support IOV for now */
>> >> +	if (!pdev->is_physfn)
>> >> +		return pci_get_pdn(pdev);
>> >> +
>> >> +	/* Check if VFs have been populated */
>> >> +	pdn = pci_get_pdn(pdev);
>> >> +	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
>> >> +		return NULL;
>> >> +
>> >> +	pdn->flags |= PCI_DN_FLAG_IOV_VF;
>> >> +	parent = pci_bus_to_pdn(pdev->bus);
>> >> +	if (!parent)
>> >> +		return NULL;
>> >> +
>> >> +	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
>> >> +		pdn = add_one_dev_pci_info(parent, NULL,
>> >> +					   pci_iov_virtfn_bus(pdev, i),
>> >> +					   pci_iov_virtfn_devfn(pdev, i));
>> >
>> >I'm not sure this makes sense, but I certainly don't know this code, so
>> >maybe I'm missing something.
>> >
>> >pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on
>> >pdev->sriov->stride and pdev->sriov->offset.  These are read from VF Stride
>> >and First VF Offset in the SR-IOV capability by sriov_init(), which is
>> >called before add_dev_pci_info():
>> >
>> >  pci_scan_child_bus
>> >    pci_scan_slot
>> >      pci_scan_single_device
>> >	pci_device_add
>> >	  pci_init_capabilities
>> >	    pci_iov_init(PF)
>> >	      sriov_init(PF, pos)
>> >		pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0)
>> >		pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset)
>> >		pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride)
>> >		iov->offset = offset
>> >		iov->stride = stride
>> >
>> >  pci_bus_add_devices
>> >    pci_bus_add_device
>> >      pci_fixup_device(pci_fixup_final)
>> >	add_dev_pci_info
>> >	  pci_iov_virtfn_bus
>> >	    return ... + sriov->offset + (sriov->stride * id) ...
>> >
>> >But both First VF Offset and VF Stride change when ARI Capable Hierarchy or
>> >NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10).  We set NumVFs to zero in
>> >sriov_init() above.  We will change NumVFs to something different when a
>> >driver calls pci_enable_sriov():
>> >
>> >  pci_enable_sriov
>> >    sriov_enable
>> >      pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn)
>> >
>> >Now First VF Offset and VF Stride have changed from what they were when we
>> >called pci_iov_virtfn_bus() above.
>> >
>> 
>> Oops, I see the ARI would affect those value, while missed the NumVFs also
>> would.
>> 
>> Let's look at the problem one by one.
>> 
>> 1. The ARI capability.
>> ===============================================================================
>> The kernel initialize the capability like this:
>> 
>> pci_init_capabilities()
>> 	pci_configure_ari()
>> 	pci_iov_init()
>> 		iov->offset = offset
>> 		iov->stride = stride
>> 
>> When offset/stride is retrieved at this point, the ARI capability is taken
>> into consideration.
>
>PCI_SRIOV_CTRL_ARI is currently only changed at the time we enumerate the
>PF, so I don't think this is really a problem.
>
>> 2. The PF's NumVFs field
>> ===============================================================================
>> 2.1 Potential problem in current code
>> ===============================================================================
>> First, is current pci code has some potential problem?
>> 
>> sriov_enable()
>> 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
>> 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
>> 	iov->offset = offset;
>> 	iov->stride = stride;
>> 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
>> 	virtfn_add()
>> 		...
>> 		virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
>> 
>> The sriov_enable() retrieve the offset/stride then write the NumVFs. According
>> to the SPEC, at this moment the offset/stride may change. While I don't see
>> some code to retrieve and store those value again. And these fields will be
>> used in virtfn_add().
>> 
>> If my understanding is correct, I suggest to move the retrieve and store
>> operation after NumVFs is written.
>
>Yep, it looks like the existing code has similar problems.  We might want
>to add a simple function that writes PCI_SRIOV_NUM_VF, then reads
>PCI_SRIOV_VF_OFFSET and PCI_SRIOV_VF_STRIDE and refreshes the cached values
>in dev->sriov.
>
>Then we'd at least know that virtfn_bus() and virtfn_devfn() return values
>that are correct until the next NumVFs change.
>
>> 2.2 The IOV bus range may not be correct in pci_scan_child_bus()?
>> ===============================================================================
>> In current pci core, when enumerating the pci tree, we do like this:
>> 
>> pci_scan_child_bus()
>> 	pci_scan_slot()
>> 		pci_scan_single_device()
>> 			pci_device_add()
>> 				pci_init_capabilities()
>> 					pci_iov_init()
>> 	max += pci_iov_bus_range(bus);
>> 		busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1);
>> 	max = pci_scan_bridge(bus, dev, max, pass);
>> 
>> From this point, we see pci core reserve some bus range for VFs. This
>> calculation is based on the offset/stride at this moment. And do the
>> enumeration with the new bus number.
>> 
>> sriov_enable() could be called several times from driver to enable SRIOV, and
>> with different nr_virtfn. If each time the NumVFs written, the offset/stride
>> will change. This means we may try to address an extra bus we didn't reserve?
>> Or this means it is out of control?
>
>This looks like a problem, too.  I don't have a good suggestion for fixing
>it.
>
>> 2.3 How can I reserve bus range in FW?
>> ===============================================================================
>> This question comes from the previous one.
>> 
>> Based on my understanding, current pci core will rely on the bus number in HW
>> if pcibios_assign_all_busses() is not set. If we want to support those VFs
>> sits on different bus with PF, we need to reserve bus range and write the
>> correct secondary/subordinate in bridge. Otherwise, those VFs on different bus
>> may not be addressed.
>> 
>> Currently I am writing the code in FW to reserve the range with the same
>> mechanism in pci core. While as you mentioned the offset/stride may change
>> after sriov_enable(), I am confused whether this is the correct way.
>
>If your firmware knows something about the device and can compute the
>number of buses it will likely need, it can set up bridges with appropriate
>bus number ranges, and Linux will generally leave those alone.
>
>I don't know the best way to figure out the number of buses, though.  It
>seems like you almost need to experimentally set NumVFs and read the
>resulting offset/stride, because I think it's really up to the device to
>decide how to number the VFs.  Maybe pci_iov_bus_range() needs to do
>something similar.
>

Yep, it's the reasonable way to probe maximal number of buses for the
upstream bridge of PF. I guess Richard need implement similar thing in
firmware.

>> 2.4 The potential problem for [Patch 08/18]
>> ===============================================================================
>> According to the SPEC, the offset/stride will change after each
>> sriov_enable(). This means the bus/devfn will change after each
>> sriov_enable().
>> 
>> My current thought is to fix it up in virtfn_add(). If the total VF number
>> will not change, we could create those pci_dn at the beginning and fix the
>> bus/devfn at each time the VF is truely created.
>
>By "fix it up," I assume you mean call an arch function that does the
>pci_pdn setup you need.
>
>It sounds reasonable to do this either in virtfn_add()/virtfn_remove() or
>at the points where we write PCI_SRIOV_CTRL_VFE, i.e., in sriov_init(),
>sriov_enable(), sriov_disable(), and sriov_restore_state().  From a
>hardware point of view, the VFs exist whenever PCI_SRIOV_CTRL_VFE is set,
>so it might be nice to have this setup connected to that.
>

Yes, Both ways can fix the issue. For couple reasons, I want add weak
pcibios_virtfn_add(), which is called in virtfn_add() if you agree.

- PCI_SRIOV_CTRL_VFE might be set somewhere except the functions you pointed.
  Set/clear PCI_SRIOV_CTRL_VFE will invoke background work to check pci_dn
  and add/remove accordingly. It would be overhead which we can avoid.
- We plan to support EEH for VFs in future. virtfn_add() way matches with
  current EEH implementation better. EEH device and PE are created based
  on (struct pci_dev), and EEH devices and PE can be destroied in time in
  pcibios_release_device(), which is invoked by virtfn_remove(). So we only
  need one weak function. In contrast, we have to create EEH device and PE
  for VFs a bit early before any VFs are instantiated, and destroy them a
  bit late after all VFs are offline.

Thanks,
Gavin

>Bjorn
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wei Yang Nov. 21, 2014, 1:46 a.m. UTC | #7
On Thu, Nov 20, 2014 at 12:05:41PM -0700, Bjorn Helgaas wrote:
>On Thu, Nov 20, 2014 at 03:20:57PM +0800, Wei Yang wrote:
>> On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote:
>> >On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote:
>> >> From: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> >> 
>> >> pci_dn is the extension of PCI device node and it's created from
>> >> device node. Unfortunately, VFs that are enabled dynamically by
>> >> PF's driver and they don't have corresponding device nodes, and
>> >> pci_dn. The patch refactors pci_dn to support VFs:
>> >> 
>> >>    * pci_dn is organized as a hierarchy tree. VF's pci_dn is put
>> >>      to the child list of pci_dn of PF's bridge. pci_dn of other
>> >>      device put to the child list of pci_dn of its upstream bridge.
>> >> 
>> >>    * VF's pci_dn is expected to be created dynamically when applying
>> >>      final fixup to PF. VF's pci_dn will be destroyed when releasing
>> >>      PF's pci_dev instance. pci_dn of other device is still created
>> >>      from device node as before.
>> >> 
>> >>    * For one particular PCI device (VF or not), its pci_dn can be
>> >>      found from pdev->dev.archdata.firmware_data, PCI_DN(devnode),
>> >>      or parent's list. The fast path (fetching pci_dn through PCI
>> >>      device instance) is populated during early fixup time.
>> >> 
>> >> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> >> ---
>> >> ...
>> >
>> >> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev)
>> >> +{
>> >> +#ifdef CONFIG_PCI_IOV
>> >> +	struct pci_dn *parent, *pdn;
>> >> +	int i;
>> >> +
>> >> +	/* Only support IOV for now */
>> >> +	if (!pdev->is_physfn)
>> >> +		return pci_get_pdn(pdev);
>> >> +
>> >> +	/* Check if VFs have been populated */
>> >> +	pdn = pci_get_pdn(pdev);
>> >> +	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
>> >> +		return NULL;
>> >> +
>> >> +	pdn->flags |= PCI_DN_FLAG_IOV_VF;
>> >> +	parent = pci_bus_to_pdn(pdev->bus);
>> >> +	if (!parent)
>> >> +		return NULL;
>> >> +
>> >> +	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
>> >> +		pdn = add_one_dev_pci_info(parent, NULL,
>> >> +					   pci_iov_virtfn_bus(pdev, i),
>> >> +					   pci_iov_virtfn_devfn(pdev, i));
>> >
>> >I'm not sure this makes sense, but I certainly don't know this code, so
>> >maybe I'm missing something.
>> >
>> >pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on
>> >pdev->sriov->stride and pdev->sriov->offset.  These are read from VF Stride
>> >and First VF Offset in the SR-IOV capability by sriov_init(), which is
>> >called before add_dev_pci_info():
>> >
>> >  pci_scan_child_bus
>> >    pci_scan_slot
>> >      pci_scan_single_device
>> >	pci_device_add
>> >	  pci_init_capabilities
>> >	    pci_iov_init(PF)
>> >	      sriov_init(PF, pos)
>> >		pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0)
>> >		pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset)
>> >		pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride)
>> >		iov->offset = offset
>> >		iov->stride = stride
>> >
>> >  pci_bus_add_devices
>> >    pci_bus_add_device
>> >      pci_fixup_device(pci_fixup_final)
>> >	add_dev_pci_info
>> >	  pci_iov_virtfn_bus
>> >	    return ... + sriov->offset + (sriov->stride * id) ...
>> >
>> >But both First VF Offset and VF Stride change when ARI Capable Hierarchy or
>> >NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10).  We set NumVFs to zero in
>> >sriov_init() above.  We will change NumVFs to something different when a
>> >driver calls pci_enable_sriov():
>> >
>> >  pci_enable_sriov
>> >    sriov_enable
>> >      pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn)
>> >
>> >Now First VF Offset and VF Stride have changed from what they were when we
>> >called pci_iov_virtfn_bus() above.
>> >
>> 
>> Oops, I see the ARI would affect those value, while missed the NumVFs also
>> would.
>> 
>> Let's look at the problem one by one.
>> 
>> 1. The ARI capability.
>> ===============================================================================
>> The kernel initialize the capability like this:
>> 
>> pci_init_capabilities()
>> 	pci_configure_ari()
>> 	pci_iov_init()
>> 		iov->offset = offset
>> 		iov->stride = stride
>> 
>> When offset/stride is retrieved at this point, the ARI capability is taken
>> into consideration.
>
>PCI_SRIOV_CTRL_ARI is currently only changed at the time we enumerate the
>PF, so I don't think this is really a problem.
>
>> 2. The PF's NumVFs field
>> ===============================================================================
>> 2.1 Potential problem in current code
>> ===============================================================================
>> First, is current pci code has some potential problem?
>> 
>> sriov_enable()
>> 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
>> 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
>> 	iov->offset = offset;
>> 	iov->stride = stride;
>> 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
>> 	virtfn_add()
>> 		...
>> 		virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
>> 
>> The sriov_enable() retrieve the offset/stride then write the NumVFs. According
>> to the SPEC, at this moment the offset/stride may change. While I don't see
>> some code to retrieve and store those value again. And these fields will be
>> used in virtfn_add().
>> 
>> If my understanding is correct, I suggest to move the retrieve and store
>> operation after NumVFs is written.
>
>Yep, it looks like the existing code has similar problems.  We might want
>to add a simple function that writes PCI_SRIOV_NUM_VF, then reads
>PCI_SRIOV_VF_OFFSET and PCI_SRIOV_VF_STRIDE and refreshes the cached values
>in dev->sriov.
>
>Then we'd at least know that virtfn_bus() and virtfn_devfn() return values
>that are correct until the next NumVFs change.
>

Ok, I will write a function to wrap it.

>> 2.2 The IOV bus range may not be correct in pci_scan_child_bus()?
>> ===============================================================================
>> In current pci core, when enumerating the pci tree, we do like this:
>> 
>> pci_scan_child_bus()
>> 	pci_scan_slot()
>> 		pci_scan_single_device()
>> 			pci_device_add()
>> 				pci_init_capabilities()
>> 					pci_iov_init()
>> 	max += pci_iov_bus_range(bus);
>> 		busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1);
>> 	max = pci_scan_bridge(bus, dev, max, pass);
>> 
>> From this point, we see pci core reserve some bus range for VFs. This
>> calculation is based on the offset/stride at this moment. And do the
>> enumeration with the new bus number.
>> 
>> sriov_enable() could be called several times from driver to enable SRIOV, and
>> with different nr_virtfn. If each time the NumVFs written, the offset/stride
>> will change. This means we may try to address an extra bus we didn't reserve?
>> Or this means it is out of control?
>
>This looks like a problem, too.  I don't have a good suggestion for fixing
>it.

How about enumerating all possible NumVFs and select the maximum?

I am not sure what will happen if the FW sets a different number? So FW should
listen to kernel, right?

I could write a code with this logic and test, while I am afraid this will
break some platfrom in some case.

>
>> 2.3 How can I reserve bus range in FW?
>> ===============================================================================
>> This question comes from the previous one.
>> 
>> Based on my understanding, current pci core will rely on the bus number in HW
>> if pcibios_assign_all_busses() is not set. If we want to support those VFs
>> sits on different bus with PF, we need to reserve bus range and write the
>> correct secondary/subordinate in bridge. Otherwise, those VFs on different bus
>> may not be addressed.
>> 
>> Currently I am writing the code in FW to reserve the range with the same
>> mechanism in pci core. While as you mentioned the offset/stride may change
>> after sriov_enable(), I am confused whether this is the correct way.
>
>If your firmware knows something about the device and can compute the
>number of buses it will likely need, it can set up bridges with appropriate
>bus number ranges, and Linux will generally leave those alone.
>

Yep, this is what I am trying to do.

>I don't know the best way to figure out the number of buses, though.  It
>seems like you almost need to experimentally set NumVFs and read the
>resulting offset/stride, because I think it's really up to the device to
>decide how to number the VFs.  Maybe pci_iov_bus_range() needs to do
>something similar.

Got it, I will add this logic.

>
>> 2.4 The potential problem for [Patch 08/18]
>> ===============================================================================
>> According to the SPEC, the offset/stride will change after each
>> sriov_enable(). This means the bus/devfn will change after each
>> sriov_enable().
>> 
>> My current thought is to fix it up in virtfn_add(). If the total VF number
>> will not change, we could create those pci_dn at the beginning and fix the
>> bus/devfn at each time the VF is truely created.
>
>By "fix it up," I assume you mean call an arch function that does the
>pci_pdn setup you need.
>
>It sounds reasonable to do this either in virtfn_add()/virtfn_remove() or
>at the points where we write PCI_SRIOV_CTRL_VFE, i.e., in sriov_init(),
>sriov_enable(), sriov_disable(), and sriov_restore_state().  From a
>hardware point of view, the VFs exist whenever PCI_SRIOV_CTRL_VFE is set,
>so it might be nice to have this setup connected to that.

If my understanding is correct, we could wrap up the configuration write/read
on PCI_SRIOV_CTRL and when it involves PCI_SRIOV_CTRL_VFE, do the fix up?

>
>Bjorn
Wei Yang Nov. 25, 2014, 9:28 a.m. UTC | #8
On Fri, Nov 21, 2014 at 11:04:11AM +1100, Gavin Shan wrote:
>>> 2.4 The potential problem for [Patch 08/18]
>>> ===============================================================================
>>> According to the SPEC, the offset/stride will change after each
>>> sriov_enable(). This means the bus/devfn will change after each
>>> sriov_enable().
>>> 
>>> My current thought is to fix it up in virtfn_add(). If the total VF number
>>> will not change, we could create those pci_dn at the beginning and fix the
>>> bus/devfn at each time the VF is truely created.
>>
>>By "fix it up," I assume you mean call an arch function that does the
>>pci_pdn setup you need.
>>
>>It sounds reasonable to do this either in virtfn_add()/virtfn_remove() or
>>at the points where we write PCI_SRIOV_CTRL_VFE, i.e., in sriov_init(),
>>sriov_enable(), sriov_disable(), and sriov_restore_state().  From a
>>hardware point of view, the VFs exist whenever PCI_SRIOV_CTRL_VFE is set,
>>so it might be nice to have this setup connected to that.
>>
>
>Yes, Both ways can fix the issue. For couple reasons, I want add weak
>pcibios_virtfn_add(), which is called in virtfn_add() if you agree.
>
>- PCI_SRIOV_CTRL_VFE might be set somewhere except the functions you pointed.
>  Set/clear PCI_SRIOV_CTRL_VFE will invoke background work to check pci_dn
>  and add/remove accordingly. It would be overhead which we can avoid.
>- We plan to support EEH for VFs in future. virtfn_add() way matches with
>  current EEH implementation better. EEH device and PE are created based
>  on (struct pci_dev), and EEH devices and PE can be destroied in time in
>  pcibios_release_device(), which is invoked by virtfn_remove(). So we only
>  need one weak function. In contrast, we have to create EEH device and PE
>  for VFs a bit early before any VFs are instantiated, and destroy them a
>  bit late after all VFs are offline.
>

Bjorn,

Since my mail box got some problem in the last few days, I am not sure you
agree with Gavin's proposal or not?

And if you agree to enumerate the maximum VF bus range, I will add this logic
in the next versin.

>Thanks,
>Gavin
>
>>Bjorn
>>
>
>--
>To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 38faede..29992cd 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -34,6 +34,9 @@  struct dev_archdata {
 #ifdef CONFIG_SWIOTLB
 	dma_addr_t		max_direct_dma_addr;
 #endif
+#ifdef CONFIG_PPC64
+	void			*firmware_data;
+#endif
 #ifdef CONFIG_EEH
 	struct eeh_dev		*edev;
 #endif
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 4ca90a3..757d7bb 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -89,6 +89,7 @@  struct pci_controller {
 
 #ifdef CONFIG_PPC64
 	unsigned long buid;
+	void *firmware_data;
 #endif	/* CONFIG_PPC64 */
 
 	void *private_data;
@@ -150,9 +151,13 @@  static inline int isa_vaddr_is_ioport(void __iomem *address)
 struct iommu_table;
 
 struct pci_dn {
+	int     flags;
+#define PCI_DN_FLAG_IOV_VF     0x01
+
 	int	busno;			/* pci bus number */
 	int	devfn;			/* pci device and function number */
 
+	struct  pci_dn *parent;
 	struct  pci_controller *phb;	/* for pci devices */
 	struct	iommu_table *iommu_table;	/* for phb's or bridges */
 	struct	device_node *node;	/* back-pointer to the device_node */
@@ -169,14 +174,19 @@  struct pci_dn {
 #ifdef CONFIG_PPC_POWERNV
 	int	pe_number;
 #endif
+	struct list_head child_list;
+	struct list_head list;
 };
 
 /* Get the pointer to a device_node's pci_dn */
 #define PCI_DN(dn)	((struct pci_dn *) (dn)->data)
 
+extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
+					   int devfn);
 extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev);
-
-extern void * update_dn_pci_info(struct device_node *dn, void *data);
+extern struct pci_dn *add_dev_pci_info(struct pci_dev *pdev);
+extern void remove_dev_pci_info(struct pci_dev *pdev);
+extern void *update_dn_pci_info(struct device_node *dn, void *data);
 
 static inline int pci_device_from_OF_node(struct device_node *np,
 					  u8 *bus, u8 *devfn)
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 5b78917..af60efe 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -30,6 +30,9 @@ 
 void pcibios_release_device(struct pci_dev *dev)
 {
 	eeh_remove_device(dev);
+
+	/* Release firmware data */
+	remove_dev_pci_info(dev);
 }
 
 /**
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 1f61fab..fa966ae 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -32,12 +32,222 @@ 
 #include <asm/ppc-pci.h>
 #include <asm/firmware.h>
 
+/*
+ * The function is used to find the firmware data of one
+ * specific PCI device, which is attached to the indicated
+ * PCI bus. For VFs, their firmware data is linked to that
+ * one of PF's bridge. For other devices, their firmware
+ * data is linked to that of their bridge.
+ */
+static struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus)
+{
+	struct pci_bus *pbus;
+	struct device_node *dn;
+	struct pci_dn *pdn;
+
+	/*
+	 * We probably have virtual bus which doesn't
+	 * have associated bridge.
+	 */
+	pbus = bus;
+	while (pbus) {
+		if (pci_is_root_bus(pbus) || pbus->self)
+			break;
+
+		pbus = pbus->parent;
+	}
+
+	/*
+	 * Except virtual bus, all PCI buses should
+	 * have device nodes.
+	 */
+	dn = pci_bus_to_OF_node(pbus);
+	pdn = dn ? PCI_DN(dn) : NULL;
+
+	return pdn;
+}
+
+struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
+				    int devfn)
+{
+	struct device_node *dn = NULL;
+	struct pci_dn *parent, *pdn;
+	struct pci_dev *pdev = NULL;
+
+	/* Fast path: fetch from PCI device */
+	list_for_each_entry(pdev, &bus->devices, bus_list) {
+		if (pdev->devfn == devfn) {
+			if (pdev->dev.archdata.firmware_data)
+				return pdev->dev.archdata.firmware_data;
+
+			dn = pci_device_to_OF_node(pdev);
+			break;
+		}
+	}
+
+	/* Fast path: fetch from device node */
+	pdn = dn ? PCI_DN(dn) : NULL;
+	if (pdn)
+		return pdn;
+
+	/* Slow path: fetch from firmware data hierarchy */
+	parent = pci_bus_to_pdn(bus);
+	if (!parent)
+		return NULL;
+
+	list_for_each_entry(pdn, &parent->child_list, list) {
+		if (pdn->busno == bus->number &&
+                    pdn->devfn == devfn)
+                        return pdn;
+        }
+
+	return NULL;
+}
+
 struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
 {
-	struct device_node *dn = pci_device_to_OF_node(pdev);
-	if (!dn)
+	struct device_node *dn;
+	struct pci_dn *parent, *pdn;
+
+	/* Search device directly */
+	if (pdev->dev.archdata.firmware_data)
+		return pdev->dev.archdata.firmware_data;
+
+	/* Check device node */
+	dn = pci_device_to_OF_node(pdev);
+	pdn = dn ? PCI_DN(dn) : NULL;
+	if (pdn)
+		return pdn;
+
+	/*
+	 * VFs don't have device nodes. We hook their
+	 * firmware data to PF's bridge.
+	 */
+	parent = pci_bus_to_pdn(pdev->bus);
+	if (!parent)
 		return NULL;
-	return PCI_DN(dn);
+
+	list_for_each_entry(pdn, &parent->child_list, list) {
+		if (pdn->busno == pdev->bus->number &&
+		    pdn->devfn == pdev->devfn)
+			return pdn;
+	}
+
+	return NULL;
+}
+
+static struct pci_dn *add_one_dev_pci_info(struct pci_dn *parent,
+					   struct pci_dev *pdev,
+					   int busno, int devfn)
+{
+	struct pci_dn *pdn;
+
+	/* Except PHB, we always have parent firmware data */
+	if (!parent)
+		return NULL;
+
+	pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
+	if (!pdn) {
+		pr_warn("%s: Out of memory !\n", __func__);
+		return NULL;
+	}
+
+	pdn->phb = parent->phb;
+	pdn->parent = parent;
+	pdn->busno = busno;
+	pdn->devfn = devfn;
+#ifdef CONFIG_PPC_POWERNV
+	pdn->pe_number = IODA_INVALID_PE;
+#endif
+	INIT_LIST_HEAD(&pdn->child_list);
+	INIT_LIST_HEAD(&pdn->list);
+	list_add_tail(&pdn->list, &parent->child_list);
+
+	/*
+	 * If we already have PCI device instance, lets
+	 * bind them.
+	 */
+	if (pdev)
+		pdev->dev.archdata.firmware_data = pdn;
+
+	return pdn;
+}
+
+struct pci_dn *add_dev_pci_info(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+	struct pci_dn *parent, *pdn;
+	int i;
+
+	/* Only support IOV for now */
+	if (!pdev->is_physfn)
+		return pci_get_pdn(pdev);
+
+	/* Check if VFs have been populated */
+	pdn = pci_get_pdn(pdev);
+	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
+		return NULL;
+
+	pdn->flags |= PCI_DN_FLAG_IOV_VF;
+	parent = pci_bus_to_pdn(pdev->bus);
+	if (!parent)
+		return NULL;
+
+	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
+		pdn = add_one_dev_pci_info(parent, NULL,
+					   pci_iov_virtfn_bus(pdev, i),
+					   pci_iov_virtfn_devfn(pdev, i));
+		if (!pdn) {
+			pr_warn("%s: Cannot create firmware data "
+				"for VF#%d of %s\n",
+				__func__, i, pci_name(pdev));
+			return NULL;
+		}
+	}
+#endif
+
+	return pci_get_pdn(pdev);
+}
+
+void remove_dev_pci_info(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+	struct pci_dn *parent;
+	struct pci_dn *pdn, *tmp;
+	int i;
+
+	/* Only support IOV PF for now */
+	if (!pdev->is_physfn)
+		return;
+
+	/* Check if VFs have been populated */
+	pdn = pci_get_pdn(pdev);
+	if (!pdn || !(pdn->flags & PCI_DN_FLAG_IOV_VF))
+		return;
+
+	pdn->flags &= ~PCI_DN_FLAG_IOV_VF;
+	parent = pci_bus_to_pdn(pdev->bus);
+	if (!parent)
+		return;
+
+	/*
+	 * We might introduce flag to pci_dn in future
+	 * so that we can release VF's firmware data in
+	 * a batch mode.
+	 */
+	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
+		list_for_each_entry_safe(pdn, tmp,
+			&parent->child_list, list) {
+			if (pdn->busno != pci_iov_virtfn_bus(pdev, i) ||
+			    pdn->devfn != pci_iov_virtfn_devfn(pdev, i))
+				continue;
+
+			if (!list_empty(&pdn->list))
+				list_del(&pdn->list);
+			kfree(pdn);
+		}
+	}
+#endif
 }
 
 /*
@@ -49,6 +259,7 @@  void *update_dn_pci_info(struct device_node *dn, void *data)
 	struct pci_controller *phb = data;
 	const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL);
 	const __be32 *regs;
+	struct device_node *parent;
 	struct pci_dn *pdn;
 
 	pdn = zalloc_maybe_bootmem(sizeof(*pdn), GFP_KERNEL);
@@ -70,6 +281,15 @@  void *update_dn_pci_info(struct device_node *dn, void *data)
 	}
 
 	pdn->pci_ext_config_space = (type && of_read_number(type, 1) == 1);
+
+	/* Attach to parent node */
+	INIT_LIST_HEAD(&pdn->child_list);
+	INIT_LIST_HEAD(&pdn->list);
+	parent = of_get_parent(dn);
+	pdn->parent = parent ? PCI_DN(parent) : NULL;
+	if (pdn->parent)
+		list_add_tail(&pdn->list, &pdn->parent->child_list);
+
 	return NULL;
 }
 
@@ -150,6 +370,7 @@  void pci_devs_phb_init_dynamic(struct pci_controller *phb)
 	if (pdn) {
 		pdn->devfn = pdn->busno = -1;
 		pdn->phb = phb;
+		phb->firmware_data = pdn;
 	}
 
 	/* Update dn->phb ptrs for new phb and children devices */
@@ -173,3 +394,22 @@  void __init pci_devs_phb_init(void)
 	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
 		pci_devs_phb_init_dynamic(phb);
 }
+
+static void pci_dev_pdn_create(struct pci_dev *pdev)
+{
+	add_dev_pci_info(pdev);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_create);
+
+static void pci_dev_pdn_setup(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn;
+
+	if (pdev->dev.archdata.firmware_data)
+		return;
+
+	/* Setup the fast path */
+	pdn = pci_get_pdn(pdev);
+	pdev->dev.archdata.firmware_data = pdn;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_setup);