
PCI: allow drivers to limit the number of VFs to 0

Message ID 20180402224652.4058-1-jakub.kicinski@netronome.com
State Not Applicable, archived
Delegated to: David Miller

Commit Message

Jakub Kicinski April 2, 2018, 10:46 p.m. UTC
Some user space depends on being able to enable the sriov_totalvfs number
of VFs without failure, e.g.:

$ cat .../sriov_totalvfs > .../sriov_numvfs

For devices whose VF support depends on the loaded FW we have the
pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
a special "unset" value, meaning drivers can't limit sriov_totalvfs
to 0.  Remove the special value completely and simply initialize
driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
Add a helper for drivers to reset the VF limit back to total.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c |  6 +++---
 drivers/pci/iov.c                             | 27 +++++++++++++++++++++------
 include/linux/pci.h                           |  2 ++
 3 files changed, 26 insertions(+), 9 deletions(-)
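
A rough sketch of how a PF driver might use the API after this change,
following the flow described later in the thread (the wrapper function and
its name are hypothetical):

#include <linux/pci.h>

/* Hypothetical PF-driver helper: limit VF enablement to what the loaded
 * FW reports.  fw_max_vfs < 0 means the FW gave no limit.
 */
static int example_limit_vfs(struct pci_dev *pdev, int fw_max_vfs)
{
	if (fw_max_vfs >= 0)
		/* may legitimately be 0 for FW without SR-IOV support */
		return pci_sriov_set_totalvfs(pdev, fw_max_vfs);

	/* no limit from FW; fall back to TotalVFs from the SR-IOV capability */
	pci_sriov_reset_totalvfs(pdev);
	return 0;
}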

Comments

Bjorn Helgaas May 24, 2018, 11:57 p.m. UTC | #1
Hi Jakub,

On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
> Some user space depends on enabling sriov_totalvfs number of VFs
> to not fail, e.g.:
> 
> $ cat .../sriov_totalvfs > .../sriov_numvfs
> 
> For devices which VF support depends on loaded FW we have the
> pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> a special "unset" value, meaning drivers can't limit sriov_totalvfs
> to 0.  Remove the special values completely and simply initialize
> driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> Add a helper for drivers to reset the VF limit back to total.

I still can't really make sense out of the changelog.

I think part of the reason it's confusing is because there are two
things going on:

  1) You want this:
  
       pci_sriov_set_totalvfs(dev, 0);
       x = pci_sriov_get_totalvfs(dev) 

     to return 0 instead of total_VFs.  That seems to connect with
     your subject line.  It means "sriov_totalvfs" in sysfs could be
     0, but I don't know how that is useful (I'm sure it is; just
     educate me :))

  2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
     sure what you intend for this.  Is *every* driver supposed to
     call it in .remove()?  Could/should this be done in the core
     somehow instead of depending on every driver?

I'm also having a hard time connecting your user-space command example
with the rest of this.  Maybe it will make more sense to me tomorrow
after some coffee.

> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> ---
>  drivers/net/ethernet/netronome/nfp/nfp_main.c |  6 +++---
>  drivers/pci/iov.c                             | 27 +++++++++++++++++++++------
>  include/linux/pci.h                           |  2 ++
>  3 files changed, 26 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
> index c4b1f344b4da..a76d177e40dd 100644
> --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
> +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
> @@ -123,7 +123,7 @@ static int nfp_pcie_sriov_read_nfd_limit(struct nfp_pf *pf)
>  		return pci_sriov_set_totalvfs(pf->pdev, pf->limit_vfs);
>  
>  	pf->limit_vfs = ~0;
> -	pci_sriov_set_totalvfs(pf->pdev, 0); /* 0 is unset */
> +	pci_sriov_reset_totalvfs(pf->pdev);
>  	/* Allow any setting for backwards compatibility if symbol not found */
>  	if (err == -ENOENT)
>  		return 0;
> @@ -537,7 +537,7 @@ static int nfp_pci_probe(struct pci_dev *pdev,
>  err_net_remove:
>  	nfp_net_pci_remove(pf);
>  err_sriov_unlimit:
> -	pci_sriov_set_totalvfs(pf->pdev, 0);
> +	pci_sriov_reset_totalvfs(pf->pdev);
>  err_fw_unload:
>  	kfree(pf->rtbl);
>  	nfp_mip_close(pf->mip);
> @@ -570,7 +570,7 @@ static void nfp_pci_remove(struct pci_dev *pdev)
>  	nfp_hwmon_unregister(pf);
>  
>  	nfp_pcie_sriov_disable(pdev);
> -	pci_sriov_set_totalvfs(pf->pdev, 0);
> +	pci_sriov_reset_totalvfs(pf->pdev);
>  
>  	nfp_net_pci_remove(pf);
>  
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 677924ae0350..c63ea870d8be 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -443,6 +443,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	iov->nres = nres;
>  	iov->ctrl = ctrl;
>  	iov->total_VFs = total;
> +	iov->driver_max_VFs = total;
>  	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
>  	iov->pgsz = pgsz;
>  	iov->self = dev;
> @@ -788,12 +789,29 @@ int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs)
>  }
>  EXPORT_SYMBOL_GPL(pci_sriov_set_totalvfs);
>  
> +/**
> + * pci_sriov_reset_totalvfs -- return the TotalVFs value to the default
> + * @dev: the PCI PF device
> + *
> + * Should be called from PF driver's remove routine with
> + * device's mutex held.
> + */
> +void pci_sriov_reset_totalvfs(struct pci_dev *dev)
> +{
> +	/* Shouldn't change if VFs already enabled */
> +	if (!dev->is_physfn || dev->sriov->ctrl & PCI_SRIOV_CTRL_VFE)
> +		return;
> +
> +	dev->sriov->driver_max_VFs = dev->sriov->total_VFs;
> +}
> +EXPORT_SYMBOL_GPL(pci_sriov_reset_totalvfs);
> +
>  /**
>   * pci_sriov_get_totalvfs -- get total VFs supported on this device
>   * @dev: the PCI PF device
>   *
> - * For a PCIe device with SRIOV support, return the PCIe
> - * SRIOV capability value of TotalVFs or the value of driver_max_VFs
> + * For a PCIe device with SRIOV support, return the value of driver_max_VFs
> + * which can be equal to the PCIe SRIOV capability value of TotalVFs or lower
>   * if the driver reduced it.  Otherwise 0.
>   */
>  int pci_sriov_get_totalvfs(struct pci_dev *dev)
> @@ -801,9 +819,6 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>  	if (!dev->is_physfn)
>  		return 0;
>  
> -	if (dev->sriov->driver_max_VFs)
> -		return dev->sriov->driver_max_VFs;
> -
> -	return dev->sriov->total_VFs;
> +	return dev->sriov->driver_max_VFs;
>  }
>  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 024a1beda008..95fde8850393 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1952,6 +1952,7 @@ void pci_iov_remove_virtfn(struct pci_dev *dev, int id);
>  int pci_num_vf(struct pci_dev *dev);
>  int pci_vfs_assigned(struct pci_dev *dev);
>  int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
> +void pci_sriov_reset_totalvfs(struct pci_dev *dev);
>  int pci_sriov_get_totalvfs(struct pci_dev *dev);
>  resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
>  void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
> @@ -1978,6 +1979,7 @@ static inline int pci_vfs_assigned(struct pci_dev *dev)
>  { return 0; }
>  static inline int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs)
>  { return 0; }
> +static inline void pci_sriov_reset_totalvfs(struct pci_dev *dev) { }
>  static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
>  { return 0; }
>  static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
> -- 
> 2.16.2
>
Jakub Kicinski May 25, 2018, 1:20 a.m. UTC | #2
Hi Bjorn!

On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
> On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
> > Some user space depends on enabling sriov_totalvfs number of VFs
> > to not fail, e.g.:
> > 
> > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > 
> > For devices which VF support depends on loaded FW we have the
> > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > to 0.  Remove the special values completely and simply initialize
> > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > Add a helper for drivers to reset the VF limit back to total.  
> 
> I still can't really make sense out of the changelog.
>
> I think part of the reason it's confusing is because there are two
> things going on:
> 
>   1) You want this:
>   
>        pci_sriov_set_totalvfs(dev, 0);
>        x = pci_sriov_get_totalvfs(dev) 
> 
>      to return 0 instead of total_VFs.  That seems to connect with
>      your subject line.  It means "sriov_totalvfs" in sysfs could be
>      0, but I don't know how that is useful (I'm sure it is; just
>      educate me :))

Let me just quote the bug report that got filed on our internal bug
tracker :)

  When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
  errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
  then tries to set that as the sriov_numvfs parameter.

  For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
  but it's set to max.  When FW is switched to flower*, the correct 
  sriov_totalvfs value is presented.

* flower is a project name

My understanding is OpenStack uses sriov_totalvfs to determine how many
VFs can be enabled; this looks like the relevant code:

http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464

>   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
>      sure what you intend for this.  Is *every* driver supposed to
>      call it in .remove()?  Could/should this be done in the core
>      somehow instead of depending on every driver?

Good question, I was just thinking yesterday we may want to call it
from the core, but I don't think it's strictly necessary nor always
sufficient (we may reload FW without re-probing).

We have a device which supports a different number of VFs depending on the
FW loaded.  Some legacy FWs do not inform the driver how many VFs they can
support, because they support the max.  So the flow in our driver is this:

load_fw(dev);
...
max_vfs = ask_fw_for_max_vfs(dev);
if (max_vfs >= 0)
	return pci_sriov_set_totalvfs(dev, max_vfs);
else /* FW didn't tell us, assume max */
	return pci_sriov_reset_totalvfs(dev); 

We also reset the max on device remove, but that's not strictly
necessary.

Other users of pci_sriov_set_totalvfs() always know the value to set
the total to (either always get it from FW or it's a constant).

If you prefer we can work out the correct max for those legacy cases in
the driver as well, although it seemed cleaner to just ask the core,
since it already has total_VFs value handy :)

> I'm also having a hard time connecting your user-space command example
> with the rest of this.  Maybe it will make more sense to me tomorrow
> after some coffee.

OpenStack assumes it will always be able to set sriov_numvfs to
sriov_totalvfs, see this 'if':

http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512

I tried to morph that into a concise bash command, but clearly failed.
Sorry about the lack of clarity! :(
Bjorn Helgaas May 25, 2018, 2:02 p.m. UTC | #3
On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> Hi Bjorn!
> 
> On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
> > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
> > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > to not fail, e.g.:
> > > 
> > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > 
> > > For devices which VF support depends on loaded FW we have the
> > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > to 0.  Remove the special values completely and simply initialize
> > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > Add a helper for drivers to reset the VF limit back to total.  
> > 
> > I still can't really make sense out of the changelog.
> >
> > I think part of the reason it's confusing is because there are two
> > things going on:
> > 
> >   1) You want this:
> >   
> >        pci_sriov_set_totalvfs(dev, 0);
> >        x = pci_sriov_get_totalvfs(dev) 
> > 
> >      to return 0 instead of total_VFs.  That seems to connect with
> >      your subject line.  It means "sriov_totalvfs" in sysfs could be
> >      0, but I don't know how that is useful (I'm sure it is; just
> >      educate me :))
> 
> Let me just quote the bug report that got filed on our internal bug
> tracker :)
> 
>   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
>   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
>   then tries to set that as the sriov_numvfs parameter.
> 
>   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
>   but it's set to max.  When FW is switched to flower*, the correct 
>   sriov_totalvfs value is presented.
> 
> * flower is a project name

From the point of view of the PCI core (which knows nothing about
device firmware and relies on the architected config space described
by the PCIe spec), this sounds like an erratum: with some firmware
installed, the device is not capable of SR-IOV, but still advertises
an SR-IOV capability with "TotalVFs > 0".

Regardless of whether that's an erratum, we do allow PF drivers to use
pci_sriov_set_totalvfs() to limit the number of VFs that may be
enabled by writing to the PF's "sriov_numvfs" sysfs file.

But the current implementation does not allow a PF driver to limit VFs
to 0, and that does seem nonsensical.
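
For illustration, the limit matters because the sysfs write path rejects
requests above the PF driver's limit; roughly (a simplified sketch modeled on
sriov_numvfs_store(), not the verbatim kernel code):

#include <linux/pci.h>

static int example_check_numvfs(struct pci_dev *pdev, u16 num_vfs)
{
	if (num_vfs > pci_sriov_get_totalvfs(pdev))
		return -ERANGE;	/* with a limit of 0, any non-zero request fails */
	return 0;
}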

> My understanding is OpenStack uses sriov_totalvfs to determine how many
> VFs can be enabled, looks like this is the code:
> 
> http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
> 
> >   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
> >      sure what you intend for this.  Is *every* driver supposed to
> >      call it in .remove()?  Could/should this be done in the core
> >      somehow instead of depending on every driver?
> 
> Good question, I was just thinking yesterday we may want to call it
> from the core, but I don't think it's strictly necessary nor always
> sufficient (we may reload FW without re-probing).
> 
> We have a device which supports different number of VFs based on the FW
> loaded.  Some legacy FWs does not inform the driver how many VFs it can
> support, because it supports max.  So the flow in our driver is this:
> 
> load_fw(dev);
> ...
> max_vfs = ask_fw_for_max_vfs(dev);
> if (max_vfs >= 0)
> 	return pci_sriov_set_totalvfs(dev, max_vfs);
> else /* FW didn't tell us, assume max */
> 	return pci_sriov_reset_totalvfs(dev); 
> 
> We also reset the max on device remove, but that's not strictly
> necessary.
> 
> Other users of pci_sriov_set_totalvfs() always know the value to set
> the total to (either always get it from FW or it's a constant).
> 
> If you prefer we can work out the correct max for those legacy cases in
> the driver as well, although it seemed cleaner to just ask the core,
> since it already has total_VFs value handy :)
> 
> > I'm also having a hard time connecting your user-space command example
> > with the rest of this.  Maybe it will make more sense to me tomorrow
> > after some coffee.
> 
> OpenStack assumes it will always be able to set sriov_numvfs to
> sriov_totalvfs, see this 'if':
> 
> http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512

Thanks for educating me.  I think there are two issues here that we
can separate.  I extracted the patch below for the first.

The second is the question of resetting driver_max_VFs.  I think we
currently have a general issue in the core:

  - load PF driver 1
  - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
  - unload PF driver 1
  - load PF driver 2

Now driver_max_VFs is still stuck at the lower value set by driver 1.
I don't think that's the way this should work.

I guess this is partly a consequence of setting driver_max_VFs in
sriov_init(), which is called before driver attach and should only
depend on hardware characteristics, so it is related to the patch
below.  But I think we should fix it in general, not just for
netronome.


commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
Author: Jakub Kicinski <jakub.kicinski@netronome.com>
Date:   Fri May 25 08:18:34 2018 -0500

    PCI/IOV: Allow PF drivers to limit total_VFs to 0
    
    Some SR-IOV PF drivers implement .sriov_configure(), which allows
    user-space to enable VFs by writing the desired number of VFs to the sysfs
    "sriov_numvfs" file (see sriov_numvfs_store()).
    
    The PCI core limits the number of VFs to the TotalVFs advertised by the
    device in its SR-IOV capability.  The PF driver can limit the number of VFs
    to even fewer (it may have pre-allocated data structures or knowledge of
    device limitations) by calling pci_sriov_set_totalvfs(), but previously it
    could not limit the VFs to 0.
    
    Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
    by the PF driver, even if the limit is 0.
    
    This sequence:
    
      pci_sriov_set_totalvfs(dev, 0);
      x = pci_sriov_get_totalvfs(dev);
    
    previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
    "x" to 0.
    
    Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
    Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 192b82898a38..d0d73dbbd5ca 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
 	iov->nres = nres;
 	iov->ctrl = ctrl;
 	iov->total_VFs = total;
+	iov->driver_max_VFs = total;
 	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
 	iov->pgsz = pgsz;
 	iov->self = dev;
@@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
 	if (!dev->is_physfn)
 		return 0;
 
-	if (dev->sriov->driver_max_VFs)
-		return dev->sriov->driver_max_VFs;
-
-	return dev->sriov->total_VFs;
+	return dev->sriov->driver_max_VFs;
 }
 EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
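
(For reference, a minimal .sriov_configure() sketch, simplified and not taken
from any particular driver, showing where a "sriov_numvfs" write ends up once
it passes the core's totalvfs check:)

#include <linux/pci.h>

static int example_sriov_configure(struct pci_dev *pdev, int num_vfs)
{
	if (num_vfs == 0) {
		pci_disable_sriov(pdev);
		return 0;
	}

	/* return num_vfs on success, negative errno on failure */
	return pci_enable_sriov(pdev, num_vfs) ?: num_vfs;
}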
Bjorn Helgaas May 25, 2018, 5:01 p.m. UTC | #4
[+cc liquidio, benet, fm10k maintainers:

  The patch below will affect you if your driver calls
    pci_sriov_set_totalvfs(dev, 0);

  Previously that caused a subsequent pci_sriov_get_totalvfs() to return
  the totalVFs value from the SR-IOV capability.  After this patch, it will
  return 0, which has implications for VF enablement via the sysfs
  "sriov_numvfs" file.]

On Fri, May 25, 2018 at 09:02:23AM -0500, Bjorn Helgaas wrote:
> On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> > Hi Bjorn!
> > 
> > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
> > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
> > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > to not fail, e.g.:
> > > > 
> > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > 
> > > > For devices which VF support depends on loaded FW we have the
> > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > to 0.  Remove the special values completely and simply initialize
> > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > Add a helper for drivers to reset the VF limit back to total.  
> > > 
> > > I still can't really make sense out of the changelog.
> > >
> > > I think part of the reason it's confusing is because there are two
> > > things going on:
> > > 
> > >   1) You want this:
> > >   
> > >        pci_sriov_set_totalvfs(dev, 0);
> > >        x = pci_sriov_get_totalvfs(dev) 
> > > 
> > >      to return 0 instead of total_VFs.  That seems to connect with
> > >      your subject line.  It means "sriov_totalvfs" in sysfs could be
> > >      0, but I don't know how that is useful (I'm sure it is; just
> > >      educate me :))
> > 
> > Let me just quote the bug report that got filed on our internal bug
> > tracker :)
> > 
> >   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> >   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> >   then tries to set that as the sriov_numvfs parameter.
> > 
> >   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
> >   but it's set to max.  When FW is switched to flower*, the correct 
> >   sriov_totalvfs value is presented.
> > 
> > * flower is a project name
> 
> From the point of view of the PCI core (which knows nothing about
> device firmware and relies on the architected config space described
> by the PCIe spec), this sounds like an erratum: with some firmware
> installed, the device is not capable of SR-IOV, but still advertises
> an SR-IOV capability with "TotalVFs > 0".
> 
> Regardless of whether that's an erratum, we do allow PF drivers to use
> pci_sriov_set_totalvfs() to limit the number of VFs that may be
> enabled by writing to the PF's "sriov_numvfs" sysfs file.
> 
> But the current implementation does not allow a PF driver to limit VFs
> to 0, and that does seem nonsensical.
> 
> > My understanding is OpenStack uses sriov_totalvfs to determine how many
> > VFs can be enabled, looks like this is the code:
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
> > 
> > >   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
> > >      sure what you intend for this.  Is *every* driver supposed to
> > >      call it in .remove()?  Could/should this be done in the core
> > >      somehow instead of depending on every driver?
> > 
> > Good question, I was just thinking yesterday we may want to call it
> > from the core, but I don't think it's strictly necessary nor always
> > sufficient (we may reload FW without re-probing).
> > 
> > We have a device which supports different number of VFs based on the FW
> > loaded.  Some legacy FWs does not inform the driver how many VFs it can
> > support, because it supports max.  So the flow in our driver is this:
> > 
> > load_fw(dev);
> > ...
> > max_vfs = ask_fw_for_max_vfs(dev);
> > if (max_vfs >= 0)
> > 	return pci_sriov_set_totalvfs(dev, max_vfs);
> > else /* FW didn't tell us, assume max */
> > 	return pci_sriov_reset_totalvfs(dev); 
> > 
> > We also reset the max on device remove, but that's not strictly
> > necessary.
> > 
> > Other users of pci_sriov_set_totalvfs() always know the value to set
> > the total to (either always get it from FW or it's a constant).
> > 
> > If you prefer we can work out the correct max for those legacy cases in
> > the driver as well, although it seemed cleaner to just ask the core,
> > since it already has total_VFs value handy :)
> > 
> > > I'm also having a hard time connecting your user-space command example
> > > with the rest of this.  Maybe it will make more sense to me tomorrow
> > > after some coffee.
> > 
> > OpenStack assumes it will always be able to set sriov_numvfs to
> > sriov_totalvfs, see this 'if':
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512
> 
> Thanks for educating me.  I think there are two issues here that we
> can separate.  I extracted the patch below for the first.
> 
> The second is the question of resetting driver_max_VFs.  I think we
> currently have a general issue in the core:
> 
>   - load PF driver 1
>   - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
>   - unload PF driver 1
>   - load PF driver 2
> 
> Now driver_max_VFs is still stuck at the lower value set by driver 1.
> I don't think that's the way this should work.
> 
> I guess this is partly a consequence of setting driver_max_VFs in
> sriov_init(), which is called before driver attach and should only
> depend on hardware characteristics, so it is related to the patch
> below.  But I think we should fix it in general, not just for
> netronome.
> 
> 
> commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
> Author: Jakub Kicinski <jakub.kicinski@netronome.com>
> Date:   Fri May 25 08:18:34 2018 -0500
> 
>     PCI/IOV: Allow PF drivers to limit total_VFs to 0
>     
>     Some SR-IOV PF drivers implement .sriov_configure(), which allows
>     user-space to enable VFs by writing the desired number of VFs to the sysfs
>     "sriov_numvfs" file (see sriov_numvfs_store()).
>     
>     The PCI core limits the number of VFs to the TotalVFs advertised by the
>     device in its SR-IOV capability.  The PF driver can limit the number of VFs
>     to even fewer (it may have pre-allocated data structures or knowledge of
>     device limitations) by calling pci_sriov_set_totalvfs(), but previously it
>     could not limit the VFs to 0.
>     
>     Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
>     by the PF driver, even if the limit is 0.
>     
>     This sequence:
>     
>       pci_sriov_set_totalvfs(dev, 0);
>       x = pci_sriov_get_totalvfs(dev);
>     
>     previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
>     "x" to 0.
>     
>     Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>     Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 192b82898a38..d0d73dbbd5ca 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	iov->nres = nres;
>  	iov->ctrl = ctrl;
>  	iov->total_VFs = total;
> +	iov->driver_max_VFs = total;
>  	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
>  	iov->pgsz = pgsz;
>  	iov->self = dev;
> @@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>  	if (!dev->is_physfn)
>  		return 0;
>  
> -	if (dev->sriov->driver_max_VFs)
> -		return dev->sriov->driver_max_VFs;
> -
> -	return dev->sriov->total_VFs;
> +	return dev->sriov->driver_max_VFs;
>  }
>  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
>
Jacob Keller May 25, 2018, 5:46 p.m. UTC | #5
> -----Original Message-----
> From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> Sent: Friday, May 25, 2018 10:01 AM
> To: Jakub Kicinski <jakub.kicinski@netronome.com>
> Cc: Bjorn Helgaas <bhelgaas@google.com>; linux-pci@vger.kernel.org;
> netdev@vger.kernel.org; Sathya Perla <sathya.perla@broadcom.com>;
> Felix Manlunas <felix.manlunas@caviumnetworks.com>;
> alexander.duyck@gmail.com; john.fastabend@gmail.com;
> Keller, Jacob E <jacob.e.keller@intel.com>; Donald Dutile <ddutile@redhat.com>;
> oss-drivers@netronome.com; Christoph Hellwig <hch@infradead.org>;
> Derek Chickles <derek.chickles@caviumnetworks.com>;
> Satanand Burla <satananda.burla@caviumnetworks.com>;
> Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>;
> Ajit Khaparde <ajit.khaparde@broadcom.com>;
> Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>;
> Somnath Kotur <somnath.kotur@broadcom.com>;
> Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; intel-wired-lan@lists.osuosl.org
> Subject: Re: [PATCH] PCI: allow drivers to limit the number of VFs to 0
> 
> [+cc liquidio, benet, fm10k maintainers:
> 
>   The patch below will affect you if your driver calls
>     pci_sriov_set_totalvfs(dev, 0);
> 
>   Previously that caused a subsequent pci_sriov_get_totalvfs() to return
>   the totalVFs value from the SR-IOV capability.  After this patch, it will
>   return 0, which has implications for VF enablement via the sysfs
>   "sriov_numvfs" file.]
> 

Thanks. I don't foresee any issues with fm10k regarding this.

Thanks,
Jake
Don Dutile May 25, 2018, 7:27 p.m. UTC | #6
On 05/25/2018 10:02 AM, Bjorn Helgaas wrote:
> On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
>> Hi Bjorn!
>>
>> On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
>>> On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
>>>> Some user space depends on enabling sriov_totalvfs number of VFs
>>>> to not fail, e.g.:
>>>>
>>>> $ cat .../sriov_totalvfs > .../sriov_numvfs
>>>>
>>>> For devices which VF support depends on loaded FW we have the
>>>> pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
>>>> a special "unset" value, meaning drivers can't limit sriov_totalvfs
>>>> to 0.  Remove the special values completely and simply initialize
>>>> driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
>>>> Add a helper for drivers to reset the VF limit back to total.
>>>
>>> I still can't really make sense out of the changelog.
>>>
>>> I think part of the reason it's confusing is because there are two
>>> things going on:
>>>
>>>    1) You want this:
>>>    
>>>         pci_sriov_set_totalvfs(dev, 0);
>>>         x = pci_sriov_get_totalvfs(dev)
>>>
>>>       to return 0 instead of total_VFs.  That seems to connect with
>>>       your subject line.  It means "sriov_totalvfs" in sysfs could be
>>>       0, but I don't know how that is useful (I'm sure it is; just
>>>       educate me :))
>>
>> Let me just quote the bug report that got filed on our internal bug
>> tracker :)
>>
>>    When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
>>    errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
>>    then tries to set that as the sriov_numvfs parameter.
>>
>>    For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0,
>>    but it's set to max.  When FW is switched to flower*, the correct
>>    sriov_totalvfs value is presented.
>>
>> * flower is a project name
> 
>  From the point of view of the PCI core (which knows nothing about
> device firmware and relies on the architected config space described
> by the PCIe spec), this sounds like an erratum: with some firmware
> installed, the device is not capable of SR-IOV, but still advertises
> an SR-IOV capability with "TotalVFs > 0".
> 
> Regardless of whether that's an erratum, we do allow PF drivers to use
> pci_sriov_set_totalvfs() to limit the number of VFs that may be
> enabled by writing to the PF's "sriov_numvfs" sysfs file.
> 
+1.

> But the current implementation does not allow a PF driver to limit VFs
> to 0, and that does seem nonsensical.
> 
Well, not really -- claiming to support VFs, and then wanting it to be 0...
I could certainly argue that's nonsensical.
From a sw perspective, sure, see if we can set VFs to 0 (and reset it to another value later).

/me wishes that implementers would follow the architecture vs torquing it into strange shapes.

>> My understanding is OpenStack uses sriov_totalvfs to determine how many
>> VFs can be enabled, looks like this is the code:
>>
>> http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
>>
>>>    2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
>>>       sure what you intend for this.  Is *every* driver supposed to
>>>       call it in .remove()?  Could/should this be done in the core
>>>       somehow instead of depending on every driver?
>>
>> Good question, I was just thinking yesterday we may want to call it
>> from the core, but I don't think it's strictly necessary nor always
>> sufficient (we may reload FW without re-probing).
>>
>> We have a device which supports different number of VFs based on the FW
>> loaded.  Some legacy FWs does not inform the driver how many VFs it can
>> support, because it supports max.  So the flow in our driver is this:
>>
>> load_fw(dev);
>> ...
>> max_vfs = ask_fw_for_max_vfs(dev);
>> if (max_vfs >= 0)
>> 	return pci_sriov_set_totalvfs(dev, max_vfs);
>> else /* FW didn't tell us, assume max */
>> 	return pci_sriov_reset_totalvfs(dev);
>>
>> We also reset the max on device remove, but that's not strictly
>> necessary.
>>
>> Other users of pci_sriov_set_totalvfs() always know the value to set
>> the total to (either always get it from FW or it's a constant).
>>
>> If you prefer we can work out the correct max for those legacy cases in
>> the driver as well, although it seemed cleaner to just ask the core,
>> since it already has total_VFs value handy :)
>>
>>> I'm also having a hard time connecting your user-space command example
>>> with the rest of this.  Maybe it will make more sense to me tomorrow
>>> after some coffee.
>>
>> OpenStack assumes it will always be able to set sriov_numvfs to
>> sriov_totalvfs, see this 'if':
>>
>> http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512
> 
> Thanks for educating me.  I think there are two issues here that we
> can separate.  I extracted the patch below for the first.
> 
> The second is the question of resetting driver_max_VFs.  I think we
> currently have a general issue in the core:
> 
>    - load PF driver 1
>    - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
>    - unload PF driver 1
>    - load PF driver 2
> 
> Now driver_max_VFs is still stuck at the lower value set by driver 1.
> I don't think that's the way this should work.
> 
> I guess this is partly a consequence of setting driver_max_VFs in
> sriov_init(), which is called before driver attach and should only
Um, if it's set at sriov_init(), how is the max changed by a PF driver?
Or am I missing something subtle (a new sysfs param) as to what is being changed?

> depend on hardware characteristics, so it is related to the patch
> below.  But I think we should fix it in general, not just for
> netronome.
> 
> 
> commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
> Author: Jakub Kicinski <jakub.kicinski@netronome.com>
> Date:   Fri May 25 08:18:34 2018 -0500
> 
>      PCI/IOV: Allow PF drivers to limit total_VFs to 0
>      
>      Some SR-IOV PF drivers implement .sriov_configure(), which allows
>      user-space to enable VFs by writing the desired number of VFs to the sysfs
>      "sriov_numvfs" file (see sriov_numvfs_store()).
>      
>      The PCI core limits the number of VFs to the TotalVFs advertised by the
>      device in its SR-IOV capability.  The PF driver can limit the number of VFs
>      to even fewer (it may have pre-allocated data structures or knowledge of
>      device limitations) by calling pci_sriov_set_totalvfs(), but previously it
>      could not limit the VFs to 0.
>      
>      Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
>      by the PF driver, even if the limit is 0.
>      
>      This sequence:
>      
>        pci_sriov_set_totalvfs(dev, 0);
>        x = pci_sriov_get_totalvfs(dev);
>      
>      previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
>      "x" to 0.
>      
>      Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>      Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 192b82898a38..d0d73dbbd5ca 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>   	iov->nres = nres;
>   	iov->ctrl = ctrl;
>   	iov->total_VFs = total;
> +	iov->driver_max_VFs = total;
>   	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
>   	iov->pgsz = pgsz;
>   	iov->self = dev;
> @@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>   	if (!dev->is_physfn)
>   		return 0;
>   
> -	if (dev->sriov->driver_max_VFs)
> -		return dev->sriov->driver_max_VFs;
> -
> -	return dev->sriov->total_VFs;
> +	return dev->sriov->driver_max_VFs;
>   }
>   EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
>   
>
Bjorn Helgaas May 25, 2018, 8:46 p.m. UTC | #7
On Fri, May 25, 2018 at 03:27:52PM -0400, Don Dutile wrote:
> On 05/25/2018 10:02 AM, Bjorn Helgaas wrote:
> > On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> > > Hi Bjorn!
> > > 
> > > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
> > > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
> > > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > > to not fail, e.g.:
> > > > > 
> > > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > > 
> > > > > For devices which VF support depends on loaded FW we have the
> > > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > > to 0.  Remove the special values completely and simply initialize
> > > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > > Add a helper for drivers to reset the VF limit back to total.
> > > > 
> > > > I still can't really make sense out of the changelog.
> > > > 
> > > > I think part of the reason it's confusing is because there are two
> > > > things going on:
> > > > 
> > > >    1) You want this:
> > > >         pci_sriov_set_totalvfs(dev, 0);
> > > >         x = pci_sriov_get_totalvfs(dev)
> > > > 
> > > >       to return 0 instead of total_VFs.  That seems to connect with
> > > >       your subject line.  It means "sriov_totalvfs" in sysfs could be
> > > >       0, but I don't know how that is useful (I'm sure it is; just
> > > >       educate me :))
> > > 
> > > Let me just quote the bug report that got filed on our internal bug
> > > tracker :)
> > > 
> > >    When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> > >    errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> > >    then tries to set that as the sriov_numvfs parameter.
> > > 
> > >    For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0,
> > >    but it's set to max.  When FW is switched to flower*, the correct
> > >    sriov_totalvfs value is presented.
> > > 
> > > * flower is a project name
> > 
> >  From the point of view of the PCI core (which knows nothing about
> > device firmware and relies on the architected config space described
> > by the PCIe spec), this sounds like an erratum: with some firmware
> > installed, the device is not capable of SR-IOV, but still advertises
> > an SR-IOV capability with "TotalVFs > 0".
> > 
> > Regardless of whether that's an erratum, we do allow PF drivers to use
> > pci_sriov_set_totalvfs() to limit the number of VFs that may be
> > enabled by writing to the PF's "sriov_numvfs" sysfs file.
> > 
> +1.
> 
> > But the current implementation does not allow a PF driver to limit VFs
> > to 0, and that does seem nonsensical.
> > 
> Well, not really -- claiming to support VFs, and then wanting it to be 0...
> I could certainly argue is non-sensical.
> From a sw perspective, sure, see if we can set VFs to 0 (and reset to another value later).
> 
> /me wishes that implementers would follow the architecture vs torquing it into strange shapes.
> 
> > > My understanding is OpenStack uses sriov_totalvfs to determine how many
> > > VFs can be enabled, looks like this is the code:
> > > 
> > > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
> > > 
> > > >    2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
> > > >       sure what you intend for this.  Is *every* driver supposed to
> > > >       call it in .remove()?  Could/should this be done in the core
> > > >       somehow instead of depending on every driver?
> > > 
> > > Good question, I was just thinking yesterday we may want to call it
> > > from the core, but I don't think it's strictly necessary nor always
> > > sufficient (we may reload FW without re-probing).
> > > 
> > > We have a device which supports different number of VFs based on the FW
> > > loaded.  Some legacy FWs does not inform the driver how many VFs it can
> > > support, because it supports max.  So the flow in our driver is this:
> > > 
> > > load_fw(dev);
> > > ...
> > > max_vfs = ask_fw_for_max_vfs(dev);
> > > if (max_vfs >= 0)
> > > 	return pci_sriov_set_totalvfs(dev, max_vfs);
> > > else /* FW didn't tell us, assume max */
> > > 	return pci_sriov_reset_totalvfs(dev);
> > > 
> > > We also reset the max on device remove, but that's not strictly
> > > necessary.
> > > 
> > > Other users of pci_sriov_set_totalvfs() always know the value to set
> > > the total to (either always get it from FW or it's a constant).
> > > 
> > > If you prefer we can work out the correct max for those legacy cases in
> > > the driver as well, although it seemed cleaner to just ask the core,
> > > since it already has total_VFs value handy :)
> > > 
> > > > I'm also having a hard time connecting your user-space command example
> > > > with the rest of this.  Maybe it will make more sense to me tomorrow
> > > > after some coffee.
> > > 
> > > OpenStack assumes it will always be able to set sriov_numvfs to
> > > sriov_totalvfs, see this 'if':
> > > 
> > > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512
> > 
> > Thanks for educating me.  I think there are two issues here that we
> > can separate.  I extracted the patch below for the first.
> > 
> > The second is the question of resetting driver_max_VFs.  I think we
> > currently have a general issue in the core:
> > 
> >    - load PF driver 1
> >    - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
> >    - unload PF driver 1
> >    - load PF driver 2
> > 
> > Now driver_max_VFs is still stuck at the lower value set by driver 1.
> > I don't think that's the way this should work.
> > 
> > I guess this is partly a consequence of setting driver_max_VFs in
> > sriov_init(), which is called before driver attach and should only
> um, if it's at sriov_init() how is max changed by a PF driver?
> or am I missing something subtle (a new sysfs param) as to what is being changed?

sriov_init() basically just sets the default driver_max_VFs to Total_VFs.

If the PF driver later calls pci_sriov_set_totalvfs(), it can reduce
driver_max_VFs.

My concern is that there's nothing that resets driver_max_VFs back to
Total_VFs if we unload and reload the PF driver.

> > depend on hardware characteristics, so it is related to the patch
> > below.  But I think we should fix it in general, not just for
> > netronome.
> > 
> > 
> > commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
> > Author: Jakub Kicinski <jakub.kicinski@netronome.com>
> > Date:   Fri May 25 08:18:34 2018 -0500
> > 
> >      PCI/IOV: Allow PF drivers to limit total_VFs to 0
> >      Some SR-IOV PF drivers implement .sriov_configure(), which allows
> >      user-space to enable VFs by writing the desired number of VFs to the sysfs
> >      "sriov_numvfs" file (see sriov_numvfs_store()).
> >      The PCI core limits the number of VFs to the TotalVFs advertised by the
> >      device in its SR-IOV capability.  The PF driver can limit the number of VFs
> >      to even fewer (it may have pre-allocated data structures or knowledge of
> >      device limitations) by calling pci_sriov_set_totalvfs(), but previously it
> >      could not limit the VFs to 0.
> >      Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
> >      by the PF driver, even if the limit is 0.
> >      This sequence:
> >        pci_sriov_set_totalvfs(dev, 0);
> >        x = pci_sriov_get_totalvfs(dev);
> >      previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
> >      "x" to 0.
> >      Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> >      Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> > 
> > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > index 192b82898a38..d0d73dbbd5ca 100644
> > --- a/drivers/pci/iov.c
> > +++ b/drivers/pci/iov.c
> > @@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
> >   	iov->nres = nres;
> >   	iov->ctrl = ctrl;
> >   	iov->total_VFs = total;
> > +	iov->driver_max_VFs = total;
> >   	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
> >   	iov->pgsz = pgsz;
> >   	iov->self = dev;
> > @@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
> >   	if (!dev->is_physfn)
> >   		return 0;
> > -	if (dev->sriov->driver_max_VFs)
> > -		return dev->sriov->driver_max_VFs;
> > -
> > -	return dev->sriov->total_VFs;
> > +	return dev->sriov->driver_max_VFs;
> >   }
> >   EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> > 
>
Jakub Kicinski May 25, 2018, 9:05 p.m. UTC | #8
On Fri, 25 May 2018 09:02:23 -0500, Bjorn Helgaas wrote:
> On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:  
> > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:  
> > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > to not fail, e.g.:
> > > > 
> > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > 
> > > > For devices which VF support depends on loaded FW we have the
> > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > to 0.  Remove the special values completely and simply initialize
> > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > Add a helper for drivers to reset the VF limit back to total.    
> > > 
> > > I still can't really make sense out of the changelog.
> > >
> > > I think part of the reason it's confusing is because there are two
> > > things going on:
> > > 
> > >   1) You want this:
> > >   
> > >        pci_sriov_set_totalvfs(dev, 0);
> > >        x = pci_sriov_get_totalvfs(dev) 
> > > 
> > >      to return 0 instead of total_VFs.  That seems to connect with
> > >      your subject line.  It means "sriov_totalvfs" in sysfs could be
> > >      0, but I don't know how that is useful (I'm sure it is; just
> > >      educate me :))  
> > 
> > Let me just quote the bug report that got filed on our internal bug
> > tracker :)
> > 
> >   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> >   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> >   then tries to set that as the sriov_numvfs parameter.
> > 
> >   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
> >   but it's set to max.  When FW is switched to flower*, the correct 
> >   sriov_totalvfs value is presented.
> > 
> > * flower is a project name  
> 
> From the point of view of the PCI core (which knows nothing about
> device firmware and relies on the architected config space described
> by the PCIe spec), this sounds like an erratum: with some firmware
> installed, the device is not capable of SR-IOV, but still advertises
> an SR-IOV capability with "TotalVFs > 0".
> 
> Regardless of whether that's an erratum, we do allow PF drivers to use
> pci_sriov_set_totalvfs() to limit the number of VFs that may be
> enabled by writing to the PF's "sriov_numvfs" sysfs file.

Think of it more as an FPGA which can be reprogrammed at runtime to have
different capabilities, rather than as an erratum.  Some FWs simply have no
use for VFs and save resources (and validation time) by not supporting them.

> But the current implementation does not allow a PF driver to limit VFs
> to 0, and that does seem nonsensical.
> 
> > My understanding is OpenStack uses sriov_totalvfs to determine how many
> > VFs can be enabled, looks like this is the code:
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
> >   
> > >   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
> > >      sure what you intend for this.  Is *every* driver supposed to
> > >      call it in .remove()?  Could/should this be done in the core
> > >      somehow instead of depending on every driver?  
> > 
> > Good question, I was just thinking yesterday we may want to call it
> > from the core, but I don't think it's strictly necessary nor always
> > sufficient (we may reload FW without re-probing).
> > 
> > We have a device which supports different number of VFs based on the FW
> > loaded.  Some legacy FWs does not inform the driver how many VFs it can
> > support, because it supports max.  So the flow in our driver is this:
> > 
> > load_fw(dev);
> > ...
> > max_vfs = ask_fw_for_max_vfs(dev);
> > if (max_vfs >= 0)
> > 	return pci_sriov_set_totalvfs(dev, max_vfs);
> > else /* FW didn't tell us, assume max */
> > 	return pci_sriov_reset_totalvfs(dev); 
> > 
> > We also reset the max on device remove, but that's not strictly
> > necessary.
> > 
> > Other users of pci_sriov_set_totalvfs() always know the value to set
> > the total to (either always get it from FW or it's a constant).
> > 
> > If you prefer we can work out the correct max for those legacy cases in
> > the driver as well, although it seemed cleaner to just ask the core,
> > since it already has total_VFs value handy :)
> >   
> > > I'm also having a hard time connecting your user-space command example
> > > with the rest of this.  Maybe it will make more sense to me tomorrow
> > > after some coffee.  
> > 
> > OpenStack assumes it will always be able to set sriov_numvfs to
> > sriov_totalvfs, see this 'if':
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512  
> 
> Thanks for educating me.  I think there are two issues here that we
> can separate.  I extracted the patch below for the first.
> 
> The second is the question of resetting driver_max_VFs.  I think we
> currently have a general issue in the core:
> 
>   - load PF driver 1
>   - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
>   - unload PF driver 1
>   - load PF driver 2
> 
> Now driver_max_VFs is still stuck at the lower value set by driver 1.
> I don't think that's the way this should work.
> 
> I guess this is partly a consequence of setting driver_max_VFs in
> sriov_init(), which is called before driver attach and should only
> depend on hardware characteristics, so it is related to the patch
> below.  But I think we should fix it in general, not just for
> netronome.

Okay, perfect.  That makes sense.  The patch below certainly fixes the
first issue for us.  Thank you!

As far as the second issue goes - agreed, having the core reset the
number of VFs to total_VFs definitely makes sense.  It doesn't cater to
the case where FW is reloaded without reprobing, but we don't do this
today anyway.

Should I try to come up with a patch to reset total_VFs after detach?
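
For illustration, such a reset could be a small helper in drivers/pci/iov.c
along these lines (a sketch only; where exactly the core would call it on PF
driver detach is the open question):

/* Sketch: restore the default VF limit once the PF driver is gone, unless
 * VFs are currently enabled (mirrors the check in pci_sriov_reset_totalvfs()
 * from the original patch).
 */
static void example_reset_driver_max_vfs(struct pci_dev *dev)
{
	if (dev->is_physfn && !(dev->sriov->ctrl & PCI_SRIOV_CTRL_VFE))
		dev->sriov->driver_max_VFs = dev->sriov->total_VFs;
}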

> commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
> Author: Jakub Kicinski <jakub.kicinski@netronome.com>
> Date:   Fri May 25 08:18:34 2018 -0500
> 
>     PCI/IOV: Allow PF drivers to limit total_VFs to 0
>     
>     Some SR-IOV PF drivers implement .sriov_configure(), which allows
>     user-space to enable VFs by writing the desired number of VFs to the sysfs
>     "sriov_numvfs" file (see sriov_numvfs_store()).
>     
>     The PCI core limits the number of VFs to the TotalVFs advertised by the
>     device in its SR-IOV capability.  The PF driver can limit the number of VFs
>     to even fewer (it may have pre-allocated data structures or knowledge of
>     device limitations) by calling pci_sriov_set_totalvfs(), but previously it
>     could not limit the VFs to 0.
>     
>     Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
>     by the PF driver, even if the limit is 0.
>     
>     This sequence:
>     
>       pci_sriov_set_totalvfs(dev, 0);
>       x = pci_sriov_get_totalvfs(dev);
>     
>     previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
>     "x" to 0.
>     
>     Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>     Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 192b82898a38..d0d73dbbd5ca 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	iov->nres = nres;
>  	iov->ctrl = ctrl;
>  	iov->total_VFs = total;
> +	iov->driver_max_VFs = total;
>  	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
>  	iov->pgsz = pgsz;
>  	iov->self = dev;
> @@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>  	if (!dev->is_physfn)
>  		return 0;
>  
> -	if (dev->sriov->driver_max_VFs)
> -		return dev->sriov->driver_max_VFs;
> -
> -	return dev->sriov->total_VFs;
> +	return dev->sriov->driver_max_VFs;
>  }
>  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
>
Bjorn Helgaas May 25, 2018, 9:45 p.m. UTC | #9
On Fri, May 25, 2018 at 02:05:21PM -0700, Jakub Kicinski wrote:
> On Fri, 25 May 2018 09:02:23 -0500, Bjorn Helgaas wrote:
> > On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> > > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:  
> > > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:  
> > > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > > to not fail, e.g.:
> > > > > 
> > > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > > 
> > > > > For devices which VF support depends on loaded FW we have the
> > > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > > to 0.  Remove the special values completely and simply initialize
> > > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > > Add a helper for drivers to reset the VF limit back to total.    
> > > > 
> > > > I still can't really make sense out of the changelog.
> > > >
> > > > I think part of the reason it's confusing is because there are two
> > > > things going on:
> > > > 
> > > >   1) You want this:
> > > >   
> > > >        pci_sriov_set_totalvfs(dev, 0);
> > > >        x = pci_sriov_get_totalvfs(dev) 
> > > > 
> > > >      to return 0 instead of total_VFs.  That seems to connect with
> > > >      your subject line.  It means "sriov_totalvfs" in sysfs could be
> > > >      0, but I don't know how that is useful (I'm sure it is; just
> > > >      educate me :))  
> > > 
> > > Let me just quote the bug report that got filed on our internal bug
> > > tracker :)
> > > 
> > >   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> > >   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> > >   then tries to set that as the sriov_numvfs parameter.
> > > 
> > >   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
> > >   but it's set to max.  When FW is switched to flower*, the correct 
> > >   sriov_totalvfs value is presented.
> > > 
> > > * flower is a project name  
> > 
> > From the point of view of the PCI core (which knows nothing about
> > device firmware and relies on the architected config space described
> > by the PCIe spec), this sounds like an erratum: with some firmware
> > installed, the device is not capable of SR-IOV, but still advertises
> > an SR-IOV capability with "TotalVFs > 0".
> > 
> > Regardless of whether that's an erratum, we do allow PF drivers to use
> > pci_sriov_set_totalvfs() to limit the number of VFs that may be
> > enabled by writing to the PF's "sriov_numvfs" sysfs file.
> 
> Think more of an FPGA which can be reprogrammed at runtime to have
> different capabilities than an erratum.  Some FWs simply have no use
> for VFs and save resources (and validation time) by not supporting it.

This is a bit of a gray area.  Reloading firmware or reprogramming an
FPGA has the potential to create a new and different device than we
had before, but the PCI core doesn't know that.  The typical sequence
is:

  - PCI core enumerates device
  - driver binds to device (we call .probe())
  - driver loads new firmware to device
  - driver resets device with pci_reset_function() or similar
  - pci_reset_function() saves config space
  - pci_reset_function() resets device
  - device uses new firmware when it comes out of reset
  - pci_reset_function() restores config space

Loading the new firmware might change what the device looks like in
config space -- it could change the number or size of BARs, the
capabilities advertised, etc.  We currently sweep that under the rug
and blindly restore the old config space.
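
A rough driver-side sketch of that sequence (load_device_fw() here is a
hypothetical stand-in for the driver's FW download step):

#include <linux/pci.h>

static int example_fw_reload(struct pci_dev *pdev)
{
	int err;

	err = load_device_fw(pdev);	/* hypothetical: push new FW to the device */
	if (err)
		return err;

	/* saves config space, resets the function, restores config space;
	 * the device comes back up running the new FW
	 */
	return pci_reset_function(pdev);
}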

It looks like your driver does the reset differently, so maybe it
keeps the original config space setup.

But all that said, I agree that we should allow a PF driver to prevent
VF enablement, whether because the firmware doesn't support it or the
PF driver just wants to prevent use of VFs for whatever reason (maybe
we don't have enough MMIO resources, we don't need the VFs, etc.)

> Okay, perfect.  That makes sense.  The patch below certainly fixes the
> first issue for us.  Thank you!
> 
> As far as the second issue goes - agreed, having the core reset the
> number of VFs to total_VFs definitely makes sense.  It doesn't cater to
> the case where FW is reloaded without reprobing, but we don't do this
> today anyway.
> 
> Should I try to come up with a patch to reset total_VFs after detach?

Yes, please.

Bjorn
Don Dutile May 29, 2018, 2:29 p.m. UTC | #10
On 05/25/2018 04:46 PM, Bjorn Helgaas wrote:
> On Fri, May 25, 2018 at 03:27:52PM -0400, Don Dutile wrote:
>> On 05/25/2018 10:02 AM, Bjorn Helgaas wrote:
>>> On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
>>>> Hi Bjorn!
>>>>
>>>> On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
>>>>> On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
>>>>>> Some user space depends on enabling sriov_totalvfs number of VFs
>>>>>> to not fail, e.g.:
>>>>>>
>>>>>> $ cat .../sriov_totalvfs > .../sriov_numvfs
>>>>>>
>>>>>> For devices which VF support depends on loaded FW we have the
>>>>>> pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
>>>>>> a special "unset" value, meaning drivers can't limit sriov_totalvfs
>>>>>> to 0.  Remove the special values completely and simply initialize
>>>>>> driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
>>>>>> Add a helper for drivers to reset the VF limit back to total.
>>>>>
>>>>> I still can't really make sense out of the changelog.
>>>>>
>>>>> I think part of the reason it's confusing is because there are two
>>>>> things going on:
>>>>>
>>>>>     1) You want this:
>>>>>          pci_sriov_set_totalvfs(dev, 0);
>>>>>          x = pci_sriov_get_totalvfs(dev)
>>>>>
>>>>>        to return 0 instead of total_VFs.  That seems to connect with
>>>>>        your subject line.  It means "sriov_totalvfs" in sysfs could be
>>>>>        0, but I don't know how that is useful (I'm sure it is; just
>>>>>        educate me :))
>>>>
>>>> Let me just quote the bug report that got filed on our internal bug
>>>> tracker :)
>>>>
>>>>     When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
>>>>     errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
>>>>     then tries to set that as the sriov_numvfs parameter.
>>>>
>>>>     For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0,
>>>>     but it's set to max.  When FW is switched to flower*, the correct
>>>>     sriov_totalvfs value is presented.
>>>>
>>>> * flower is a project name
>>>
>>>   From the point of view of the PCI core (which knows nothing about
>>> device firmware and relies on the architected config space described
>>> by the PCIe spec), this sounds like an erratum: with some firmware
>>> installed, the device is not capable of SR-IOV, but still advertises
>>> an SR-IOV capability with "TotalVFs > 0".
>>>
>>> Regardless of whether that's an erratum, we do allow PF drivers to use
>>> pci_sriov_set_totalvfs() to limit the number of VFs that may be
>>> enabled by writing to the PF's "sriov_numvfs" sysfs file.
>>>
>> +1.
>>
>>> But the current implementation does not allow a PF driver to limit VFs
>>> to 0, and that does seem nonsensical.
>>>
>> Well, not really -- claiming to support VFs, and then wanting it to be 0...
>> I could certainly argue is non-sensical.
>>  From a sw perspective, sure, see if we can set VFs to 0 (and reset to another value later).
>>
>> /me wishes that implementers would follow the architecture vs torquing it into strange shapes.
>>
>>>> My understanding is OpenStack uses sriov_totalvfs to determine how many
>>>> VFs can be enabled, looks like this is the code:
>>>>
>>>> http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
>>>>
>>>>>     2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
>>>>>        sure what you intend for this.  Is *every* driver supposed to
>>>>>        call it in .remove()?  Could/should this be done in the core
>>>>>        somehow instead of depending on every driver?
>>>>
>>>> Good question, I was just thinking yesterday we may want to call it
>>>> from the core, but I don't think it's strictly necessary nor always
>>>> sufficient (we may reload FW without re-probing).
>>>>
>>>> We have a device which supports different number of VFs based on the FW
>>>> loaded.  Some legacy FWs does not inform the driver how many VFs it can
>>>> support, because it supports max.  So the flow in our driver is this:
>>>>
>>>> load_fw(dev);
>>>> ...
>>>> max_vfs = ask_fw_for_max_vfs(dev);
>>>> if (max_vfs >= 0)
>>>> 	return pci_sriov_set_totalvfs(dev, max_vfs);
>>>> else /* FW didn't tell us, assume max */
>>>> 	return pci_sriov_reset_totalvfs(dev);
>>>>
>>>> We also reset the max on device remove, but that's not strictly
>>>> necessary.
>>>>
>>>> Other users of pci_sriov_set_totalvfs() always know the value to set
>>>> the total to (either always get it from FW or it's a constant).
>>>>
>>>> If you prefer we can work out the correct max for those legacy cases in
>>>> the driver as well, although it seemed cleaner to just ask the core,
>>>> since it already has total_VFs value handy :)
>>>>
>>>>> I'm also having a hard time connecting your user-space command example
>>>>> with the rest of this.  Maybe it will make more sense to me tomorrow
>>>>> after some coffee.
>>>>
>>>> OpenStack assumes it will always be able to set sriov_numvfs to
>>>> sriov_totalvfs, see this 'if':
>>>>
>>>> http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512
>>>
>>> Thanks for educating me.  I think there are two issues here that we
>>> can separate.  I extracted the patch below for the first.
>>>
>>> The second is the question of resetting driver_max_VFs.  I think we
>>> currently have a general issue in the core:
>>>
>>>     - load PF driver 1
>>>     - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
>>>     - unload PF driver 1
>>>     - load PF driver 2
>>>
>>> Now driver_max_VFs is still stuck at the lower value set by driver 1.
>>> I don't think that's the way this should work.
>>>
>>> I guess this is partly a consequence of setting driver_max_VFs in
>>> sriov_init(), which is called before driver attach and should only
>> um, if it's at sriov_init() how is max changed by a PF driver?
>> or am I missing something subtle (a new sysfs param) as to what is being changed?
> 
> sriov_init() basically just sets the default driver_max_VFs to Total_VFs.
> 
> If the PF driver later calls pci_sriov_set_totalvfs(), it can reduce
> driver_max_VFs.
> 
> My concern is that there's nothing that resets driver_max_VFs back to
> Total_VFs if we unload and reload the PF driver.
> 
ok, gotcha.
Yet another complication of this non-architected quirk. :-/

>>> depend on hardware characteristics, so it is related to the patch
>>> below.  But I think we should fix it in general, not just for
>>> netronome.
>>>
>>>
>>> commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
>>> Author: Jakub Kicinski <jakub.kicinski@netronome.com>
>>> Date:   Fri May 25 08:18:34 2018 -0500
>>>
>>>       PCI/IOV: Allow PF drivers to limit total_VFs to 0
>>>       Some SR-IOV PF drivers implement .sriov_configure(), which allows
>>>       user-space to enable VFs by writing the desired number of VFs to the sysfs
>>>       "sriov_numvfs" file (see sriov_numvfs_store()).
>>>       The PCI core limits the number of VFs to the TotalVFs advertised by the
>>>       device in its SR-IOV capability.  The PF driver can limit the number of VFs
>>>       to even fewer (it may have pre-allocated data structures or knowledge of
>>>       device limitations) by calling pci_sriov_set_totalvfs(), but previously it
>>>       could not limit the VFs to 0.
>>>       Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
>>>       by the PF driver, even if the limit is 0.
>>>       This sequence:
>>>         pci_sriov_set_totalvfs(dev, 0);
>>>         x = pci_sriov_get_totalvfs(dev);
>>>       previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
>>>       "x" to 0.
>>>       Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>>>       Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>>>
>>> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
>>> index 192b82898a38..d0d73dbbd5ca 100644
>>> --- a/drivers/pci/iov.c
>>> +++ b/drivers/pci/iov.c
>>> @@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>>>    	iov->nres = nres;
>>>    	iov->ctrl = ctrl;
>>>    	iov->total_VFs = total;
>>> +	iov->driver_max_VFs = total;
>>>    	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
>>>    	iov->pgsz = pgsz;
>>>    	iov->self = dev;
>>> @@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>>>    	if (!dev->is_physfn)
>>>    		return 0;
>>> -	if (dev->sriov->driver_max_VFs)
>>> -		return dev->sriov->driver_max_VFs;
>>> -
>>> -	return dev->sriov->total_VFs;
>>> +	return dev->sriov->driver_max_VFs;
>>>    }
>>>    EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
>>>
>>
Don Dutile May 29, 2018, 2:34 p.m. UTC | #11
On 05/25/2018 05:05 PM, Jakub Kicinski wrote:
> On Fri, 25 May 2018 09:02:23 -0500, Bjorn Helgaas wrote:
>> On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
>>> On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
>>>> On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
>>>>> Some user space depends on enabling sriov_totalvfs number of VFs
>>>>> to not fail, e.g.:
>>>>>
>>>>> $ cat .../sriov_totalvfs > .../sriov_numvfs
>>>>>
>>>>> For devices which VF support depends on loaded FW we have the
>>>>> pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
>>>>> a special "unset" value, meaning drivers can't limit sriov_totalvfs
>>>>> to 0.  Remove the special values completely and simply initialize
>>>>> driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
>>>>> Add a helper for drivers to reset the VF limit back to total.
>>>>
>>>> I still can't really make sense out of the changelog.
>>>>
>>>> I think part of the reason it's confusing is because there are two
>>>> things going on:
>>>>
>>>>    1) You want this:
>>>>    
>>>>         pci_sriov_set_totalvfs(dev, 0);
>>>>         x = pci_sriov_get_totalvfs(dev)
>>>>
>>>>       to return 0 instead of total_VFs.  That seems to connect with
>>>>       your subject line.  It means "sriov_totalvfs" in sysfs could be
>>>>       0, but I don't know how that is useful (I'm sure it is; just
>>>>       educate me :))
>>>
>>> Let me just quote the bug report that got filed on our internal bug
>>> tracker :)
>>>
>>>    When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
>>>    errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
>>>    then tries to set that as the sriov_numvfs parameter.
>>>
>>>    For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0,
>>>    but it's set to max.  When FW is switched to flower*, the correct
>>>    sriov_totalvfs value is presented.
>>>
>>> * flower is a project name
>>
>>  From the point of view of the PCI core (which knows nothing about
>> device firmware and relies on the architected config space described
>> by the PCIe spec), this sounds like an erratum: with some firmware
>> installed, the device is not capable of SR-IOV, but still advertises
>> an SR-IOV capability with "TotalVFs > 0".
>>
>> Regardless of whether that's an erratum, we do allow PF drivers to use
>> pci_sriov_set_totalvfs() to limit the number of VFs that may be
>> enabled by writing to the PF's "sriov_numvfs" sysfs file.
> 
> Think more of an FPGA which can be reprogrammed at runtime to have
> different capabilities than an erratum.  Some FWs simply have no use
> for VFs and save resources (and validation time) by not supporting it.
> 
Sure, then the steps should be:
a) (re-)program FPGA
b) invoke hot-plug for new device.
    -- by default, VFs aren't configured (enabled) in a Linux kernel;
       -- some drivers provide boot-time enablement, but that becomes
          system-wide and can cause major config issues when multiple
          instances of a device are installed in the system.
       -- otherwise, configure via sysfs
    -- this should clear/reset the VF values too.

>> But the current implementation does not allow a PF driver to limit VFs
>> to 0, and that does seem nonsensical.
>>
>>> My understanding is OpenStack uses sriov_totalvfs to determine how many
>>> VFs can be enabled, looks like this is the code:
>>>
>>> http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
>>>    
>>>>    2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
>>>>       sure what you intend for this.  Is *every* driver supposed to
>>>>       call it in .remove()?  Could/should this be done in the core
>>>>       somehow instead of depending on every driver?
>>>
>>> Good question, I was just thinking yesterday we may want to call it
>>> from the core, but I don't think it's strictly necessary nor always
>>> sufficient (we may reload FW without re-probing).
>>>
>>> We have a device which supports different number of VFs based on the FW
>>> loaded.  Some legacy FWs does not inform the driver how many VFs it can
>>> support, because it supports max.  So the flow in our driver is this:
>>>
>>> load_fw(dev);
>>> ...
>>> max_vfs = ask_fw_for_max_vfs(dev);
>>> if (max_vfs >= 0)
>>> 	return pci_sriov_set_totalvfs(dev, max_vfs);
>>> else /* FW didn't tell us, assume max */
>>> 	return pci_sriov_reset_totalvfs(dev);
>>>
>>> We also reset the max on device remove, but that's not strictly
>>> necessary.
>>>
>>> Other users of pci_sriov_set_totalvfs() always know the value to set
>>> the total to (either always get it from FW or it's a constant).
>>>
>>> If you prefer we can work out the correct max for those legacy cases in
>>> the driver as well, although it seemed cleaner to just ask the core,
>>> since it already has total_VFs value handy :)
>>>    
>>>> I'm also having a hard time connecting your user-space command example
>>>> with the rest of this.  Maybe it will make more sense to me tomorrow
>>>> after some coffee.
>>>
>>> OpenStack assumes it will always be able to set sriov_numvfs to
>>> sriov_totalvfs, see this 'if':
>>>
>>> http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512
>>
>> Thanks for educating me.  I think there are two issues here that we
>> can separate.  I extracted the patch below for the first.
>>
>> The second is the question of resetting driver_max_VFs.  I think we
>> currently have a general issue in the core:
>>
>>    - load PF driver 1
>>    - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
>>    - unload PF driver 1
>>    - load PF driver 2
>>
>> Now driver_max_VFs is still stuck at the lower value set by driver 1.
>> I don't think that's the way this should work.
>>
>> I guess this is partly a consequence of setting driver_max_VFs in
>> sriov_init(), which is called before driver attach and should only
>> depend on hardware characteristics, so it is related to the patch
>> below.  But I think we should fix it in general, not just for
>> netronome.
> 
> Okay, perfect.  That makes sense.  The patch below certainly fixes the
> first issue for us.  Thank you!
> 
> As far as the second issue goes - agreed, having the core reset the
> number of VFs to total_VFs definitely makes sense.  It doesn't cater to
> the case where FW is reloaded without reprobing, but we don't do this
> today anyway.
> 
> Should I try to come up with a patch to reset total_VFs after detach?
> 
>> commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
>> Author: Jakub Kicinski <jakub.kicinski@netronome.com>
>> Date:   Fri May 25 08:18:34 2018 -0500
>>
>>      PCI/IOV: Allow PF drivers to limit total_VFs to 0
>>      
>>      Some SR-IOV PF drivers implement .sriov_configure(), which allows
>>      user-space to enable VFs by writing the desired number of VFs to the sysfs
>>      "sriov_numvfs" file (see sriov_numvfs_store()).
>>      
>>      The PCI core limits the number of VFs to the TotalVFs advertised by the
>>      device in its SR-IOV capability.  The PF driver can limit the number of VFs
>>      to even fewer (it may have pre-allocated data structures or knowledge of
>>      device limitations) by calling pci_sriov_set_totalvfs(), but previously it
>>      could not limit the VFs to 0.
>>      
>>      Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
>>      by the PF driver, even if the limit is 0.
>>      
>>      This sequence:
>>      
>>        pci_sriov_set_totalvfs(dev, 0);
>>        x = pci_sriov_get_totalvfs(dev);
>>      
>>      previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
>>      "x" to 0.
>>      
>>      Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>>      Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>>
>> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
>> index 192b82898a38..d0d73dbbd5ca 100644
>> --- a/drivers/pci/iov.c
>> +++ b/drivers/pci/iov.c
>> @@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>>   	iov->nres = nres;
>>   	iov->ctrl = ctrl;
>>   	iov->total_VFs = total;
>> +	iov->driver_max_VFs = total;
>>   	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
>>   	iov->pgsz = pgsz;
>>   	iov->self = dev;
>> @@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>>   	if (!dev->is_physfn)
>>   		return 0;
>>   
>> -	if (dev->sriov->driver_max_VFs)
>> -		return dev->sriov->driver_max_VFs;
>> -
>> -	return dev->sriov->total_VFs;
>> +	return dev->sriov->driver_max_VFs;
>>   }
>>   EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
>>   
>
Bjorn Helgaas June 19, 2018, 9:37 p.m. UTC | #12
On Fri, May 25, 2018 at 09:02:23AM -0500, Bjorn Helgaas wrote:
> On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> > Hi Bjorn!
> > 
> > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
> > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
> > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > to not fail, e.g.:
> > > > 
> > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > 
> > > > For devices which VF support depends on loaded FW we have the
> > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > to 0.  Remove the special values completely and simply initialize
> > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > Add a helper for drivers to reset the VF limit back to total.  
> > > 
> > > I still can't really make sense out of the changelog.
> > >
> > > I think part of the reason it's confusing is because there are two
> > > things going on:
> > > 
> > >   1) You want this:
> > >   
> > >        pci_sriov_set_totalvfs(dev, 0);
> > >        x = pci_sriov_get_totalvfs(dev) 
> > > 
> > >      to return 0 instead of total_VFs.  That seems to connect with
> > >      your subject line.  It means "sriov_totalvfs" in sysfs could be
> > >      0, but I don't know how that is useful (I'm sure it is; just
> > >      educate me :))
> > 
> > Let me just quote the bug report that got filed on our internal bug
> > tracker :)
> > 
> >   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> >   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> >   then tries to set that as the sriov_numvfs parameter.
> > 
> >   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
> >   but it's set to max.  When FW is switched to flower*, the correct 
> >   sriov_totalvfs value is presented.
> > 
> > * flower is a project name
> 
> From the point of view of the PCI core (which knows nothing about
> device firmware and relies on the architected config space described
> by the PCIe spec), this sounds like an erratum: with some firmware
> installed, the device is not capable of SR-IOV, but still advertises
> an SR-IOV capability with "TotalVFs > 0".
> 
> Regardless of whether that's an erratum, we do allow PF drivers to use
> pci_sriov_set_totalvfs() to limit the number of VFs that may be
> enabled by writing to the PF's "sriov_numvfs" sysfs file.
> 
> But the current implementation does not allow a PF driver to limit VFs
> to 0, and that does seem nonsensical.
> 
> > My understanding is OpenStack uses sriov_totalvfs to determine how many
> > VFs can be enabled, looks like this is the code:
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
> > 
> > >   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
> > >      sure what you intend for this.  Is *every* driver supposed to
> > >      call it in .remove()?  Could/should this be done in the core
> > >      somehow instead of depending on every driver?
> > 
> > Good question, I was just thinking yesterday we may want to call it
> > from the core, but I don't think it's strictly necessary nor always
> > sufficient (we may reload FW without re-probing).
> > 
> > We have a device which supports different number of VFs based on the FW
> > loaded.  Some legacy FWs does not inform the driver how many VFs it can
> > support, because it supports max.  So the flow in our driver is this:
> > 
> > load_fw(dev);
> > ...
> > max_vfs = ask_fw_for_max_vfs(dev);
> > if (max_vfs >= 0)
> > 	return pci_sriov_set_totalvfs(dev, max_vfs);
> > else /* FW didn't tell us, assume max */
> > 	return pci_sriov_reset_totalvfs(dev); 
> > 
> > We also reset the max on device remove, but that's not strictly
> > necessary.
> > 
> > Other users of pci_sriov_set_totalvfs() always know the value to set
> > the total to (either always get it from FW or it's a constant).
> > 
> > If you prefer we can work out the correct max for those legacy cases in
> > the driver as well, although it seemed cleaner to just ask the core,
> > since it already has total_VFs value handy :)
> > 
> > > I'm also having a hard time connecting your user-space command example
> > > with the rest of this.  Maybe it will make more sense to me tomorrow
> > > after some coffee.
> > 
> > OpenStack assumes it will always be able to set sriov_numvfs to
> > sriov_totalvfs, see this 'if':
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512
> 
> Thanks for educating me.  I think there are two issues here that we
> can separate.  I extracted the patch below for the first.
> 
> The second is the question of resetting driver_max_VFs.  I think we
> currently have a general issue in the core:
> 
>   - load PF driver 1
>   - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
>   - unload PF driver 1
>   - load PF driver 2
> 
> Now driver_max_VFs is still stuck at the lower value set by driver 1.
> I don't think that's the way this should work.
> 
> I guess this is partly a consequence of setting driver_max_VFs in
> sriov_init(), which is called before driver attach and should only
> depend on hardware characteristics, so it is related to the patch
> below.  But I think we should fix it in general, not just for
> netronome.

Hi Jakub, the patch below is in v4.18-rc1 as 8d85a7a4f2c9 ("PCI/IOV:
Allow PF drivers to limit total_VFs to 0").  If there's more we need
to do here, would you mind rebasing what's left to v4.18-rc1 and
reposting it?

> commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
> Author: Jakub Kicinski <jakub.kicinski@netronome.com>
> Date:   Fri May 25 08:18:34 2018 -0500
> 
>     PCI/IOV: Allow PF drivers to limit total_VFs to 0
>     
>     Some SR-IOV PF drivers implement .sriov_configure(), which allows
>     user-space to enable VFs by writing the desired number of VFs to the sysfs
>     "sriov_numvfs" file (see sriov_numvfs_store()).
>     
>     The PCI core limits the number of VFs to the TotalVFs advertised by the
>     device in its SR-IOV capability.  The PF driver can limit the number of VFs
>     to even fewer (it may have pre-allocated data structures or knowledge of
>     device limitations) by calling pci_sriov_set_totalvfs(), but previously it
>     could not limit the VFs to 0.
>     
>     Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
>     by the PF driver, even if the limit is 0.
>     
>     This sequence:
>     
>       pci_sriov_set_totalvfs(dev, 0);
>       x = pci_sriov_get_totalvfs(dev);
>     
>     previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
>     "x" to 0.
>     
>     Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>     Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 192b82898a38..d0d73dbbd5ca 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	iov->nres = nres;
>  	iov->ctrl = ctrl;
>  	iov->total_VFs = total;
> +	iov->driver_max_VFs = total;
>  	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
>  	iov->pgsz = pgsz;
>  	iov->self = dev;
> @@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>  	if (!dev->is_physfn)
>  		return 0;
>  
> -	if (dev->sriov->driver_max_VFs)
> -		return dev->sriov->driver_max_VFs;
> -
> -	return dev->sriov->total_VFs;
> +	return dev->sriov->driver_max_VFs;
>  }
>  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
>
Jakub Kicinski June 20, 2018, 2:56 a.m. UTC | #13
On Tue, 19 Jun 2018 16:37:15 -0500, Bjorn Helgaas wrote:
> On Fri, May 25, 2018 at 09:02:23AM -0500, Bjorn Helgaas wrote:
> > On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:  
> > > Hi Bjorn!
> > > 
> > > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:  
> > > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:  
> > > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > > to not fail, e.g.:
> > > > > 
> > > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > > 
> > > > > For devices which VF support depends on loaded FW we have the
> > > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > > to 0.  Remove the special values completely and simply initialize
> > > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > > Add a helper for drivers to reset the VF limit back to total.    
> > > > 
> > > > I still can't really make sense out of the changelog.
> > > >
> > > > I think part of the reason it's confusing is because there are two
> > > > things going on:
> > > > 
> > > >   1) You want this:
> > > >   
> > > >        pci_sriov_set_totalvfs(dev, 0);
> > > >        x = pci_sriov_get_totalvfs(dev) 
> > > > 
> > > >      to return 0 instead of total_VFs.  That seems to connect with
> > > >      your subject line.  It means "sriov_totalvfs" in sysfs could be
> > > >      0, but I don't know how that is useful (I'm sure it is; just
> > > >      educate me :))  
> > > 
> > > Let me just quote the bug report that got filed on our internal bug
> > > tracker :)
> > > 
> > >   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> > >   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> > >   then tries to set that as the sriov_numvfs parameter.
> > > 
> > >   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
> > >   but it's set to max.  When FW is switched to flower*, the correct 
> > >   sriov_totalvfs value is presented.
> > > 
> > > * flower is a project name  
> > 
> > From the point of view of the PCI core (which knows nothing about
> > device firmware and relies on the architected config space described
> > by the PCIe spec), this sounds like an erratum: with some firmware
> > installed, the device is not capable of SR-IOV, but still advertises
> > an SR-IOV capability with "TotalVFs > 0".
> > 
> > Regardless of whether that's an erratum, we do allow PF drivers to use
> > pci_sriov_set_totalvfs() to limit the number of VFs that may be
> > enabled by writing to the PF's "sriov_numvfs" sysfs file.
> > 
> > But the current implementation does not allow a PF driver to limit VFs
> > to 0, and that does seem nonsensical.
> >   
> > > My understanding is OpenStack uses sriov_totalvfs to determine how many
> > > VFs can be enabled, looks like this is the code:
> > > 
> > > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
> > >   
> > > >   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
> > > >      sure what you intend for this.  Is *every* driver supposed to
> > > >      call it in .remove()?  Could/should this be done in the core
> > > >      somehow instead of depending on every driver?  
> > > 
> > > Good question, I was just thinking yesterday we may want to call it
> > > from the core, but I don't think it's strictly necessary nor always
> > > sufficient (we may reload FW without re-probing).
> > > 
> > > We have a device which supports different number of VFs based on the FW
> > > loaded.  Some legacy FWs does not inform the driver how many VFs it can
> > > support, because it supports max.  So the flow in our driver is this:
> > > 
> > > load_fw(dev);
> > > ...
> > > max_vfs = ask_fw_for_max_vfs(dev);
> > > if (max_vfs >= 0)
> > > 	return pci_sriov_set_totalvfs(dev, max_vfs);
> > > else /* FW didn't tell us, assume max */
> > > 	return pci_sriov_reset_totalvfs(dev); 
> > > 
> > > We also reset the max on device remove, but that's not strictly
> > > necessary.
> > > 
> > > Other users of pci_sriov_set_totalvfs() always know the value to set
> > > the total to (either always get it from FW or it's a constant).
> > > 
> > > If you prefer we can work out the correct max for those legacy cases in
> > > the driver as well, although it seemed cleaner to just ask the core,
> > > since it already has total_VFs value handy :)
> > >   
> > > > I'm also having a hard time connecting your user-space command example
> > > > with the rest of this.  Maybe it will make more sense to me tomorrow
> > > > after some coffee.  
> > > 
> > > OpenStack assumes it will always be able to set sriov_numvfs to
> > > sriov_totalvfs, see this 'if':
> > > 
> > > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512  
> > 
> > Thanks for educating me.  I think there are two issues here that we
> > can separate.  I extracted the patch below for the first.
> > 
> > The second is the question of resetting driver_max_VFs.  I think we
> > currently have a general issue in the core:
> > 
> >   - load PF driver 1
> >   - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
> >   - unload PF driver 1
> >   - load PF driver 2
> > 
> > Now driver_max_VFs is still stuck at the lower value set by driver 1.
> > I don't think that's the way this should work.
> > 
> > I guess this is partly a consequence of setting driver_max_VFs in
> > sriov_init(), which is called before driver attach and should only
> > depend on hardware characteristics, so it is related to the patch
> > below.  But I think we should fix it in general, not just for
> > netronome.  
> 
> Hi Jakub, the patch below is in v4.18-rc1 as 8d85a7a4f2c9 ("PCI/IOV:
> Allow PF drivers to limit total_VFs to 0").  If there's more we need
> to do here, would you mind rebasing what's left to v4.18-rc1 and
> reposting it?

Hi Bjorn!

Thanks a lot for looking into this!  My understanding is that we have
two ways forward:
 - add a pci_sriov_reset_totalvfs() helper for drivers to call;
 - make the core reset the TotalVFs limit after the PF driver is detached.

IMHO the second option is better.  I went ahead and posted:

https://patchwork.ozlabs.org/patch/931210/

This works very well for the nfp driver (modulo minor clean-ups, but I'd
rather route those via the networking trees to avoid conflicts).
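
Roughly, the idea there (just a sketch of the concept, not necessarily
what the posted patch does; pci_sriov_restore_defaults() is a made-up
name) is for the core to undo any driver-imposed limit once the PF
driver is unbound:

	/* called from the PCI core's unbind path for a PF */
	static void pci_sriov_restore_defaults(struct pci_dev *dev)
	{
		/* don't touch the limit while VFs are still enabled */
		if (dev->is_physfn && !(dev->sriov->ctrl & PCI_SRIOV_CTRL_VFE))
			dev->sriov->driver_max_VFs = dev->sriov->total_VFs;
	}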

> > commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
> > Author: Jakub Kicinski <jakub.kicinski@netronome.com>
> > Date:   Fri May 25 08:18:34 2018 -0500
> > 
> >     PCI/IOV: Allow PF drivers to limit total_VFs to 0
> >     
> >     Some SR-IOV PF drivers implement .sriov_configure(), which allows
> >     user-space to enable VFs by writing the desired number of VFs to the sysfs
> >     "sriov_numvfs" file (see sriov_numvfs_store()).
> >     
> >     The PCI core limits the number of VFs to the TotalVFs advertised by the
> >     device in its SR-IOV capability.  The PF driver can limit the number of VFs
> >     to even fewer (it may have pre-allocated data structures or knowledge of
> >     device limitations) by calling pci_sriov_set_totalvfs(), but previously it
> >     could not limit the VFs to 0.
> >     
> >     Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
> >     by the PF driver, even if the limit is 0.
> >     
> >     This sequence:
> >     
> >       pci_sriov_set_totalvfs(dev, 0);
> >       x = pci_sriov_get_totalvfs(dev);
> >     
> >     previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
> >     "x" to 0.
> >     
> >     Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> >     Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> > 
> > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > index 192b82898a38..d0d73dbbd5ca 100644
> > --- a/drivers/pci/iov.c
> > +++ b/drivers/pci/iov.c
> > @@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
> >  	iov->nres = nres;
> >  	iov->ctrl = ctrl;
> >  	iov->total_VFs = total;
> > +	iov->driver_max_VFs = total;
> >  	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
> >  	iov->pgsz = pgsz;
> >  	iov->self = dev;
> > @@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
> >  	if (!dev->is_physfn)
> >  		return 0;
> >  
> > -	if (dev->sriov->driver_max_VFs)
> > -		return dev->sriov->driver_max_VFs;
> > -
> > -	return dev->sriov->total_VFs;
> > +	return dev->sriov->driver_max_VFs;
> >  }
> >  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> >

Patch

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index c4b1f344b4da..a76d177e40dd 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -123,7 +123,7 @@  static int nfp_pcie_sriov_read_nfd_limit(struct nfp_pf *pf)
 		return pci_sriov_set_totalvfs(pf->pdev, pf->limit_vfs);
 
 	pf->limit_vfs = ~0;
-	pci_sriov_set_totalvfs(pf->pdev, 0); /* 0 is unset */
+	pci_sriov_reset_totalvfs(pf->pdev);
 	/* Allow any setting for backwards compatibility if symbol not found */
 	if (err == -ENOENT)
 		return 0;
@@ -537,7 +537,7 @@  static int nfp_pci_probe(struct pci_dev *pdev,
 err_net_remove:
 	nfp_net_pci_remove(pf);
 err_sriov_unlimit:
-	pci_sriov_set_totalvfs(pf->pdev, 0);
+	pci_sriov_reset_totalvfs(pf->pdev);
 err_fw_unload:
 	kfree(pf->rtbl);
 	nfp_mip_close(pf->mip);
@@ -570,7 +570,7 @@  static void nfp_pci_remove(struct pci_dev *pdev)
 	nfp_hwmon_unregister(pf);
 
 	nfp_pcie_sriov_disable(pdev);
-	pci_sriov_set_totalvfs(pf->pdev, 0);
+	pci_sriov_reset_totalvfs(pf->pdev);
 
 	nfp_net_pci_remove(pf);
 
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 677924ae0350..c63ea870d8be 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -443,6 +443,7 @@  static int sriov_init(struct pci_dev *dev, int pos)
 	iov->nres = nres;
 	iov->ctrl = ctrl;
 	iov->total_VFs = total;
+	iov->driver_max_VFs = total;
 	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
 	iov->pgsz = pgsz;
 	iov->self = dev;
@@ -788,12 +789,29 @@  int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs)
 }
 EXPORT_SYMBOL_GPL(pci_sriov_set_totalvfs);
 
+/**
+ * pci_sriov_reset_totalvfs -- return the TotalVFs value to the default
+ * @dev: the PCI PF device
+ *
+ * Should be called from the PF driver's remove routine with the
+ * device's mutex held.
+ */
+void pci_sriov_reset_totalvfs(struct pci_dev *dev)
+{
+	/* Shouldn't change if VFs already enabled */
+	if (!dev->is_physfn || dev->sriov->ctrl & PCI_SRIOV_CTRL_VFE)
+		return;
+
+	dev->sriov->driver_max_VFs = dev->sriov->total_VFs;
+}
+EXPORT_SYMBOL_GPL(pci_sriov_reset_totalvfs);
+
 /**
  * pci_sriov_get_totalvfs -- get total VFs supported on this device
  * @dev: the PCI PF device
  *
- * For a PCIe device with SRIOV support, return the PCIe
- * SRIOV capability value of TotalVFs or the value of driver_max_VFs
+ * For a PCIe device with SRIOV support, return the value of driver_max_VFs,
+ * which is the PCIe SRIOV capability value of TotalVFs, or a lower value
  * if the driver reduced it.  Otherwise 0.
  */
 int pci_sriov_get_totalvfs(struct pci_dev *dev)
@@ -801,9 +819,6 @@  int pci_sriov_get_totalvfs(struct pci_dev *dev)
 	if (!dev->is_physfn)
 		return 0;
 
-	if (dev->sriov->driver_max_VFs)
-		return dev->sriov->driver_max_VFs;
-
-	return dev->sriov->total_VFs;
+	return dev->sriov->driver_max_VFs;
 }
 EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 024a1beda008..95fde8850393 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1952,6 +1952,7 @@  void pci_iov_remove_virtfn(struct pci_dev *dev, int id);
 int pci_num_vf(struct pci_dev *dev);
 int pci_vfs_assigned(struct pci_dev *dev);
 int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
+void pci_sriov_reset_totalvfs(struct pci_dev *dev);
 int pci_sriov_get_totalvfs(struct pci_dev *dev);
 resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
 void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
@@ -1978,6 +1979,7 @@  static inline int pci_vfs_assigned(struct pci_dev *dev)
 { return 0; }
 static inline int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs)
 { return 0; }
+static inline void pci_sriov_reset_totalvfs(struct pci_dev *dev) { }
 static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
 { return 0; }
 static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)