diff mbox series

[mlx5-next,v1,2/5] PCI: Add SR-IOV sysfs entry to read number of MSI-X vectors

Message ID 20210110150727.1965295-3-leon@kernel.org
State New
Headers show
Series Dynamically assign MSI-X vectors count | expand

Commit Message

Leon Romanovsky Jan. 10, 2021, 3:07 p.m. UTC
From: Leon Romanovsky <leonro@nvidia.com>

Some SR-IOV capable devices provide an ability to configure specific
number of MSI-X vectors on their VF prior driver is probed on that VF.

In order to make management easy, provide new read-only sysfs file that
returns a total number of possible to configure MSI-X vectors.

cat /sys/bus/pci/devices/.../sriov_vf_total_msix
  = 0 - feature is not supported
  > 0 - total number of MSI-X vectors to consume by the VFs

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
 drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
 drivers/pci/pci.h                       |  3 +++
 include/linux/pci.h                     |  2 ++
 4 files changed, 50 insertions(+)

--
2.29.2

Comments

Alexander Duyck Jan. 11, 2021, 7:30 p.m. UTC | #1
On Sun, Jan 10, 2021 at 7:10 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> From: Leon Romanovsky <leonro@nvidia.com>
>
> Some SR-IOV capable devices provide an ability to configure specific
> number of MSI-X vectors on their VF prior driver is probed on that VF.
>
> In order to make management easy, provide new read-only sysfs file that
> returns a total number of possible to configure MSI-X vectors.
>
> cat /sys/bus/pci/devices/.../sriov_vf_total_msix
>   = 0 - feature is not supported
>   > 0 - total number of MSI-X vectors to consume by the VFs
>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
>  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
>  drivers/pci/pci.h                       |  3 +++
>  include/linux/pci.h                     |  2 ++
>  4 files changed, 50 insertions(+)
>
> diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> index 05e26e5da54e..64e9b700acc9 100644
> --- a/Documentation/ABI/testing/sysfs-bus-pci
> +++ b/Documentation/ABI/testing/sysfs-bus-pci
> @@ -395,3 +395,17 @@ Description:
>                 The file is writable if the PF is bound to a driver that
>                 supports the ->sriov_set_msix_vec_count() callback and there
>                 is no driver bound to the VF.
> +
> +What:          /sys/bus/pci/devices/.../sriov_vf_total_msix

In this case I would drop the "vf" and just go with sriov_total_msix
since now you are referring to a global value instead of a per VF
value.

> +Date:          January 2021
> +Contact:       Leon Romanovsky <leonro@nvidia.com>
> +Description:
> +               This file is associated with the SR-IOV PFs.
> +               It returns a total number of possible to configure MSI-X
> +               vectors on the enabled VFs.
> +
> +               The values returned are:
> +                * > 0 - this will be total number possible to consume by VFs,
> +                * = 0 - feature is not supported
> +
> +               If no SR-IOV VFs are enabled, this value will return 0.
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 42c0df4158d1..0a6ddf3230fd 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
>         return count;
>  }
>
> +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> +                                       struct device_attribute *attr,
> +                                       char *buf)
> +{
> +       struct pci_dev *pdev = to_pci_dev(dev);
> +
> +       return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
> +}
> +

You display it as a signed value, but unsigned values are not
supported, correct?

>  static DEVICE_ATTR_RO(sriov_totalvfs);
>  static DEVICE_ATTR_RW(sriov_numvfs);
>  static DEVICE_ATTR_RO(sriov_offset);
>  static DEVICE_ATTR_RO(sriov_stride);
>  static DEVICE_ATTR_RO(sriov_vf_device);
>  static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
> +static DEVICE_ATTR_RO(sriov_vf_total_msix);
>
>  static struct attribute *sriov_dev_attrs[] = {
>         &dev_attr_sriov_totalvfs.attr,
> @@ -408,6 +418,7 @@ static struct attribute *sriov_dev_attrs[] = {
>         &dev_attr_sriov_stride.attr,
>         &dev_attr_sriov_vf_device.attr,
>         &dev_attr_sriov_drivers_autoprobe.attr,
> +       &dev_attr_sriov_vf_total_msix.attr,
>         NULL,
>  };
>
> @@ -658,6 +669,7 @@ static void sriov_disable(struct pci_dev *dev)
>                 sysfs_remove_link(&dev->dev.kobj, "dep_link");
>
>         iov->num_VFs = 0;
> +       iov->vf_total_msix = 0;
>         pci_iov_set_numvfs(dev, 0);
>  }
>
> @@ -1116,6 +1128,25 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
>
> +/**
> + * pci_sriov_set_vf_total_msix - set total number of MSI-X vectors for the VFs
> + * @dev: the PCI PF device
> + * @numb: the total number of MSI-X vector to consume by the VFs
> + *
> + * Sets the number of MSI-X vectors that is possible to consume by the VFs.
> + * This interface is complimentary part of the pci_set_msix_vec_count()
> + * that will be used to configure the required number on the VF.
> + */
> +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb)
> +{
> +       if (!dev->is_physfn || !dev->driver ||
> +           !dev->driver->sriov_set_msix_vec_count)
> +               return;
> +
> +       dev->sriov->vf_total_msix = numb;
> +}
> +EXPORT_SYMBOL_GPL(pci_sriov_set_vf_total_msix);
> +

This seems broken. What validation is being done on the numb value?
You pass it as int, and your documentation all refers to tests for >=
0, but isn't a signed input a possibility as well? Also "numb" doesn't
make for a good abbreviation as it is already a word of its own. It
might make more sense to use count or something like that rather than
trying to abbreviate number.


>  /**
>   * pci_sriov_configure_simple - helper to configure SR-IOV
>   * @dev: the PCI device
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 1fd273077637..0fbe291eb0f2 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -327,6 +327,9 @@ struct pci_sriov {
>         u16             subsystem_device; /* VF subsystem device */
>         resource_size_t barsz[PCI_SRIOV_NUM_BARS];      /* VF BAR size */
>         bool            drivers_autoprobe; /* Auto probing of VFs by driver */
> +       int             vf_total_msix;  /* Total number of MSI-X vectors the VFs
> +                                        * can consume
> +                                        */
>  };
>
>  /**
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index a17cfc28eb66..fd9ff1f42a09 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -2074,6 +2074,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev);
>  int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
>  resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
>  void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
> +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb);
>
>  /* Arch may override these (weak) */
>  int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
> @@ -2114,6 +2115,7 @@ static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
>  static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
>  { return 0; }
>  static inline void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe) { }
> +static inline void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb) {}
>  #endif
>
>  #if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
> --
> 2.29.2
>
Leon Romanovsky Jan. 12, 2021, 6:56 a.m. UTC | #2
On Mon, Jan 11, 2021 at 11:30:39AM -0800, Alexander Duyck wrote:
> On Sun, Jan 10, 2021 at 7:10 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Some SR-IOV capable devices provide an ability to configure specific
> > number of MSI-X vectors on their VF prior driver is probed on that VF.
> >
> > In order to make management easy, provide new read-only sysfs file that
> > returns a total number of possible to configure MSI-X vectors.
> >
> > cat /sys/bus/pci/devices/.../sriov_vf_total_msix
> >   = 0 - feature is not supported
> >   > 0 - total number of MSI-X vectors to consume by the VFs
> >
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> >  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
> >  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
> >  drivers/pci/pci.h                       |  3 +++
> >  include/linux/pci.h                     |  2 ++
> >  4 files changed, 50 insertions(+)
> >
> > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > index 05e26e5da54e..64e9b700acc9 100644
> > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > @@ -395,3 +395,17 @@ Description:
> >                 The file is writable if the PF is bound to a driver that
> >                 supports the ->sriov_set_msix_vec_count() callback and there
> >                 is no driver bound to the VF.
> > +
> > +What:          /sys/bus/pci/devices/.../sriov_vf_total_msix
>
> In this case I would drop the "vf" and just go with sriov_total_msix
> since now you are referring to a global value instead of a per VF
> value.

This field indicates the amount of MSI-X available for VFs, it doesn't
include PFs. The missing "_vf_" will mislead users who will believe that
it is all MSI-X vectors available for this device. They will need to take
into consideration amount of PF MSI-X in order to calculate the VF distribution.

So I would leave "_vf_" here.

>
> > +Date:          January 2021
> > +Contact:       Leon Romanovsky <leonro@nvidia.com>
> > +Description:
> > +               This file is associated with the SR-IOV PFs.
> > +               It returns a total number of possible to configure MSI-X
> > +               vectors on the enabled VFs.
> > +
> > +               The values returned are:
> > +                * > 0 - this will be total number possible to consume by VFs,
> > +                * = 0 - feature is not supported
> > +
> > +               If no SR-IOV VFs are enabled, this value will return 0.
> > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > index 42c0df4158d1..0a6ddf3230fd 100644
> > --- a/drivers/pci/iov.c
> > +++ b/drivers/pci/iov.c
> > @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
> >         return count;
> >  }
> >
> > +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> > +                                       struct device_attribute *attr,
> > +                                       char *buf)
> > +{
> > +       struct pci_dev *pdev = to_pci_dev(dev);
> > +
> > +       return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
> > +}
> > +
>
> You display it as a signed value, but unsigned values are not
> supported, correct?

Right, I made it similar to the vf_msix_set. I can change.

>
> >  static DEVICE_ATTR_RO(sriov_totalvfs);
> >  static DEVICE_ATTR_RW(sriov_numvfs);
> >  static DEVICE_ATTR_RO(sriov_offset);
> >  static DEVICE_ATTR_RO(sriov_stride);
> >  static DEVICE_ATTR_RO(sriov_vf_device);
> >  static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
> > +static DEVICE_ATTR_RO(sriov_vf_total_msix);
> >
> >  static struct attribute *sriov_dev_attrs[] = {
> >         &dev_attr_sriov_totalvfs.attr,
> > @@ -408,6 +418,7 @@ static struct attribute *sriov_dev_attrs[] = {
> >         &dev_attr_sriov_stride.attr,
> >         &dev_attr_sriov_vf_device.attr,
> >         &dev_attr_sriov_drivers_autoprobe.attr,
> > +       &dev_attr_sriov_vf_total_msix.attr,
> >         NULL,
> >  };
> >
> > @@ -658,6 +669,7 @@ static void sriov_disable(struct pci_dev *dev)
> >                 sysfs_remove_link(&dev->dev.kobj, "dep_link");
> >
> >         iov->num_VFs = 0;
> > +       iov->vf_total_msix = 0;
> >         pci_iov_set_numvfs(dev, 0);
> >  }
> >
> > @@ -1116,6 +1128,25 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
> >  }
> >  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> >
> > +/**
> > + * pci_sriov_set_vf_total_msix - set total number of MSI-X vectors for the VFs
> > + * @dev: the PCI PF device
> > + * @numb: the total number of MSI-X vector to consume by the VFs
> > + *
> > + * Sets the number of MSI-X vectors that is possible to consume by the VFs.
> > + * This interface is complimentary part of the pci_set_msix_vec_count()
> > + * that will be used to configure the required number on the VF.
> > + */
> > +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb)
> > +{
> > +       if (!dev->is_physfn || !dev->driver ||
> > +           !dev->driver->sriov_set_msix_vec_count)
> > +               return;
> > +
> > +       dev->sriov->vf_total_msix = numb;
> > +}
> > +EXPORT_SYMBOL_GPL(pci_sriov_set_vf_total_msix);
> > +
>
> This seems broken. What validation is being done on the numb value?
> You pass it as int, and your documentation all refers to tests for >=
> 0, but isn't a signed input a possibility as well? Also "numb" doesn't
> make for a good abbreviation as it is already a word of its own. It
> might make more sense to use count or something like that rather than
> trying to abbreviate number.

"Broken" is a nice word to describe misunderstanding.

The vf_total_msix is not set by the users and used solely by the drivers
to advertise their capability. This field is needed to give a way to
calculate how much MSI-X VFs can get. The driver code is part of the
kernel and like any other kernel code, it is trusted.

I'm checking < 0 in another _set_ routine to make sure that we will be
able to extend this sysfs entry if at some point of time negative vector
count will make sense.

"Count" instead of "numb" is fine by me.

>
>
> >  /**
> >   * pci_sriov_configure_simple - helper to configure SR-IOV
> >   * @dev: the PCI device
> > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> > index 1fd273077637..0fbe291eb0f2 100644
> > --- a/drivers/pci/pci.h
> > +++ b/drivers/pci/pci.h
> > @@ -327,6 +327,9 @@ struct pci_sriov {
> >         u16             subsystem_device; /* VF subsystem device */
> >         resource_size_t barsz[PCI_SRIOV_NUM_BARS];      /* VF BAR size */
> >         bool            drivers_autoprobe; /* Auto probing of VFs by driver */
> > +       int             vf_total_msix;  /* Total number of MSI-X vectors the VFs
> > +                                        * can consume
> > +                                        */
> >  };
> >
> >  /**
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index a17cfc28eb66..fd9ff1f42a09 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -2074,6 +2074,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev);
> >  int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
> >  resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
> >  void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
> > +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb);
> >
> >  /* Arch may override these (weak) */
> >  int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
> > @@ -2114,6 +2115,7 @@ static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
> >  static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
> >  { return 0; }
> >  static inline void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe) { }
> > +static inline void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb) {}
> >  #endif
> >
> >  #if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
> > --
> > 2.29.2
> >
Alexander Duyck Jan. 12, 2021, 9:34 p.m. UTC | #3
On Mon, Jan 11, 2021 at 10:56 PM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Mon, Jan 11, 2021 at 11:30:39AM -0800, Alexander Duyck wrote:
> > On Sun, Jan 10, 2021 at 7:10 AM Leon Romanovsky <leon@kernel.org> wrote:
> > >
> > > From: Leon Romanovsky <leonro@nvidia.com>
> > >
> > > Some SR-IOV capable devices provide an ability to configure specific
> > > number of MSI-X vectors on their VF prior driver is probed on that VF.
> > >
> > > In order to make management easy, provide new read-only sysfs file that
> > > returns a total number of possible to configure MSI-X vectors.
> > >
> > > cat /sys/bus/pci/devices/.../sriov_vf_total_msix
> > >   = 0 - feature is not supported
> > >   > 0 - total number of MSI-X vectors to consume by the VFs
> > >
> > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > > ---
> > >  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
> > >  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
> > >  drivers/pci/pci.h                       |  3 +++
> > >  include/linux/pci.h                     |  2 ++
> > >  4 files changed, 50 insertions(+)
> > >
> > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > index 05e26e5da54e..64e9b700acc9 100644
> > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > @@ -395,3 +395,17 @@ Description:
> > >                 The file is writable if the PF is bound to a driver that
> > >                 supports the ->sriov_set_msix_vec_count() callback and there
> > >                 is no driver bound to the VF.
> > > +
> > > +What:          /sys/bus/pci/devices/.../sriov_vf_total_msix
> >
> > In this case I would drop the "vf" and just go with sriov_total_msix
> > since now you are referring to a global value instead of a per VF
> > value.
>
> This field indicates the amount of MSI-X available for VFs, it doesn't
> include PFs. The missing "_vf_" will mislead users who will believe that
> it is all MSI-X vectors available for this device. They will need to take
> into consideration amount of PF MSI-X in order to calculate the VF distribution.
>
> So I would leave "_vf_" here.

The problem is you aren't indicating how many are available for an
individual VF though, you are indicating how many are available for
use by SR-IOV to give to the VFs. The fact that you are dealing with a
pool makes things confusing in my opinion. For example sriov_vf_device
describes the device ID that will be given to each VF.

> >
> > > +Date:          January 2021
> > > +Contact:       Leon Romanovsky <leonro@nvidia.com>
> > > +Description:
> > > +               This file is associated with the SR-IOV PFs.
> > > +               It returns a total number of possible to configure MSI-X
> > > +               vectors on the enabled VFs.
> > > +
> > > +               The values returned are:
> > > +                * > 0 - this will be total number possible to consume by VFs,
> > > +                * = 0 - feature is not supported
> > > +
> > > +               If no SR-IOV VFs are enabled, this value will return 0.
> > > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > > index 42c0df4158d1..0a6ddf3230fd 100644
> > > --- a/drivers/pci/iov.c
> > > +++ b/drivers/pci/iov.c
> > > @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
> > >         return count;
> > >  }
> > >
> > > +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> > > +                                       struct device_attribute *attr,
> > > +                                       char *buf)
> > > +{
> > > +       struct pci_dev *pdev = to_pci_dev(dev);
> > > +
> > > +       return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
> > > +}
> > > +
> >
> > You display it as a signed value, but unsigned values are not
> > supported, correct?
>
> Right, I made it similar to the vf_msix_set. I can change.
>
> >
> > >  static DEVICE_ATTR_RO(sriov_totalvfs);
> > >  static DEVICE_ATTR_RW(sriov_numvfs);
> > >  static DEVICE_ATTR_RO(sriov_offset);
> > >  static DEVICE_ATTR_RO(sriov_stride);
> > >  static DEVICE_ATTR_RO(sriov_vf_device);
> > >  static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
> > > +static DEVICE_ATTR_RO(sriov_vf_total_msix);
> > >
> > >  static struct attribute *sriov_dev_attrs[] = {
> > >         &dev_attr_sriov_totalvfs.attr,
> > > @@ -408,6 +418,7 @@ static struct attribute *sriov_dev_attrs[] = {
> > >         &dev_attr_sriov_stride.attr,
> > >         &dev_attr_sriov_vf_device.attr,
> > >         &dev_attr_sriov_drivers_autoprobe.attr,
> > > +       &dev_attr_sriov_vf_total_msix.attr,
> > >         NULL,
> > >  };
> > >
> > > @@ -658,6 +669,7 @@ static void sriov_disable(struct pci_dev *dev)
> > >                 sysfs_remove_link(&dev->dev.kobj, "dep_link");
> > >
> > >         iov->num_VFs = 0;
> > > +       iov->vf_total_msix = 0;
> > >         pci_iov_set_numvfs(dev, 0);
> > >  }
> > >
> > > @@ -1116,6 +1128,25 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
> > >  }
> > >  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> > >
> > > +/**
> > > + * pci_sriov_set_vf_total_msix - set total number of MSI-X vectors for the VFs
> > > + * @dev: the PCI PF device
> > > + * @numb: the total number of MSI-X vector to consume by the VFs
> > > + *
> > > + * Sets the number of MSI-X vectors that is possible to consume by the VFs.
> > > + * This interface is complimentary part of the pci_set_msix_vec_count()
> > > + * that will be used to configure the required number on the VF.
> > > + */
> > > +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb)
> > > +{
> > > +       if (!dev->is_physfn || !dev->driver ||
> > > +           !dev->driver->sriov_set_msix_vec_count)
> > > +               return;
> > > +
> > > +       dev->sriov->vf_total_msix = numb;
> > > +}
> > > +EXPORT_SYMBOL_GPL(pci_sriov_set_vf_total_msix);
> > > +
> >
> > This seems broken. What validation is being done on the numb value?
> > You pass it as int, and your documentation all refers to tests for >=
> > 0, but isn't a signed input a possibility as well? Also "numb" doesn't
> > make for a good abbreviation as it is already a word of its own. It
> > might make more sense to use count or something like that rather than
> > trying to abbreviate number.
>
> "Broken" is a nice word to describe misunderstanding.

Would you prefer "lacking input validation".

I see all this code in there checking for is_physfn and driver and
sriov_set_msix_vec_count before allowing the setting of vf_total_msix.
It just seems like a lot of validation is taking place on the wrong
things if you are just going to be setting a value reporting the total
number of MSI-X vectors in use for SR-IOV.

In addition this value seems like a custom purpose being pushed into
the PCIe code since there isn't anything that defaults the value. It
seems like at a minimum there should be something that programs a
default value for both of these new fields that are being added so
that you pull the maximum number of VFs when SR-IOV is enabled, the
maximum number of MSI-X vectors from a single VF, and then the default
value for this should be the multiple of the two which can then be
overridden later.

> The vf_total_msix is not set by the users and used solely by the drivers
> to advertise their capability. This field is needed to give a way to
> calculate how much MSI-X VFs can get. The driver code is part of the
> kernel and like any other kernel code, it is trusted.
>
> I'm checking < 0 in another _set_ routine to make sure that we will be
> able to extend this sysfs entry if at some point of time negative vector
> count will make sense.

I would rather have a strict interface that doesn't allow for
unintended flexibility. Out-of-tree drivers tend to exploit that kind
of stuff and it is problematic when it can occur.

> "Count" instead of "numb" is fine by me.
> >
> >
> > >  /**
> > >   * pci_sriov_configure_simple - helper to configure SR-IOV
> > >   * @dev: the PCI device
> > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> > > index 1fd273077637..0fbe291eb0f2 100644
> > > --- a/drivers/pci/pci.h
> > > +++ b/drivers/pci/pci.h
> > > @@ -327,6 +327,9 @@ struct pci_sriov {
> > >         u16             subsystem_device; /* VF subsystem device */
> > >         resource_size_t barsz[PCI_SRIOV_NUM_BARS];      /* VF BAR size */
> > >         bool            drivers_autoprobe; /* Auto probing of VFs by driver */
> > > +       int             vf_total_msix;  /* Total number of MSI-X vectors the VFs
> > > +                                        * can consume
> > > +                                        */
> > >  };
> > >
> > >  /**
> > > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > > index a17cfc28eb66..fd9ff1f42a09 100644
> > > --- a/include/linux/pci.h
> > > +++ b/include/linux/pci.h
> > > @@ -2074,6 +2074,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev);
> > >  int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
> > >  resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
> > >  void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
> > > +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb);
> > >
> > >  /* Arch may override these (weak) */
> > >  int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
> > > @@ -2114,6 +2115,7 @@ static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
> > >  static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
> > >  { return 0; }
> > >  static inline void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe) { }
> > > +static inline void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb) {}
> > >  #endif
> > >
> > >  #if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
> > > --
> > > 2.29.2
> > >
Leon Romanovsky Jan. 13, 2021, 6:19 a.m. UTC | #4
On Tue, Jan 12, 2021 at 01:34:50PM -0800, Alexander Duyck wrote:
> On Mon, Jan 11, 2021 at 10:56 PM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Mon, Jan 11, 2021 at 11:30:39AM -0800, Alexander Duyck wrote:
> > > On Sun, Jan 10, 2021 at 7:10 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > >
> > > > From: Leon Romanovsky <leonro@nvidia.com>
> > > >
> > > > Some SR-IOV capable devices provide an ability to configure specific
> > > > number of MSI-X vectors on their VF prior driver is probed on that VF.
> > > >
> > > > In order to make management easy, provide new read-only sysfs file that
> > > > returns a total number of possible to configure MSI-X vectors.
> > > >
> > > > cat /sys/bus/pci/devices/.../sriov_vf_total_msix
> > > >   = 0 - feature is not supported
> > > >   > 0 - total number of MSI-X vectors to consume by the VFs
> > > >
> > > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > > > ---
> > > >  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
> > > >  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
> > > >  drivers/pci/pci.h                       |  3 +++
> > > >  include/linux/pci.h                     |  2 ++
> > > >  4 files changed, 50 insertions(+)
> > > >
> > > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > > index 05e26e5da54e..64e9b700acc9 100644
> > > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > > @@ -395,3 +395,17 @@ Description:
> > > >                 The file is writable if the PF is bound to a driver that
> > > >                 supports the ->sriov_set_msix_vec_count() callback and there
> > > >                 is no driver bound to the VF.
> > > > +
> > > > +What:          /sys/bus/pci/devices/.../sriov_vf_total_msix
> > >
> > > In this case I would drop the "vf" and just go with sriov_total_msix
> > > since now you are referring to a global value instead of a per VF
> > > value.
> >
> > This field indicates the amount of MSI-X available for VFs, it doesn't
> > include PFs. The missing "_vf_" will mislead users who will believe that
> > it is all MSI-X vectors available for this device. They will need to take
> > into consideration amount of PF MSI-X in order to calculate the VF distribution.
> >
> > So I would leave "_vf_" here.
>
> The problem is you aren't indicating how many are available for an
> individual VF though, you are indicating how many are available for
> use by SR-IOV to give to the VFs. The fact that you are dealing with a
> pool makes things confusing in my opinion. For example sriov_vf_device
> describes the device ID that will be given to each VF.

sriov_vf_device is different and is implemented accordingly to the PCI
spec, 9.3.3.11 VF Device ID (Offset 1Ah)
"This field contains the Device ID that should be presented for every VF
to the SI."

It is one ID for all VFs.

>
> > >
> > > > +Date:          January 2021
> > > > +Contact:       Leon Romanovsky <leonro@nvidia.com>
> > > > +Description:
> > > > +               This file is associated with the SR-IOV PFs.
> > > > +               It returns a total number of possible to configure MSI-X
> > > > +               vectors on the enabled VFs.
> > > > +
> > > > +               The values returned are:
> > > > +                * > 0 - this will be total number possible to consume by VFs,
> > > > +                * = 0 - feature is not supported
> > > > +
> > > > +               If no SR-IOV VFs are enabled, this value will return 0.
> > > > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > > > index 42c0df4158d1..0a6ddf3230fd 100644
> > > > --- a/drivers/pci/iov.c
> > > > +++ b/drivers/pci/iov.c
> > > > @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
> > > >         return count;
> > > >  }
> > > >
> > > > +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> > > > +                                       struct device_attribute *attr,
> > > > +                                       char *buf)
> > > > +{
> > > > +       struct pci_dev *pdev = to_pci_dev(dev);
> > > > +
> > > > +       return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
> > > > +}
> > > > +
> > >
> > > You display it as a signed value, but unsigned values are not
> > > supported, correct?
> >
> > Right, I made it similar to the vf_msix_set. I can change.
> >
> > >
> > > >  static DEVICE_ATTR_RO(sriov_totalvfs);
> > > >  static DEVICE_ATTR_RW(sriov_numvfs);
> > > >  static DEVICE_ATTR_RO(sriov_offset);
> > > >  static DEVICE_ATTR_RO(sriov_stride);
> > > >  static DEVICE_ATTR_RO(sriov_vf_device);
> > > >  static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
> > > > +static DEVICE_ATTR_RO(sriov_vf_total_msix);
> > > >
> > > >  static struct attribute *sriov_dev_attrs[] = {
> > > >         &dev_attr_sriov_totalvfs.attr,
> > > > @@ -408,6 +418,7 @@ static struct attribute *sriov_dev_attrs[] = {
> > > >         &dev_attr_sriov_stride.attr,
> > > >         &dev_attr_sriov_vf_device.attr,
> > > >         &dev_attr_sriov_drivers_autoprobe.attr,
> > > > +       &dev_attr_sriov_vf_total_msix.attr,
> > > >         NULL,
> > > >  };
> > > >
> > > > @@ -658,6 +669,7 @@ static void sriov_disable(struct pci_dev *dev)
> > > >                 sysfs_remove_link(&dev->dev.kobj, "dep_link");
> > > >
> > > >         iov->num_VFs = 0;
> > > > +       iov->vf_total_msix = 0;
> > > >         pci_iov_set_numvfs(dev, 0);
> > > >  }
> > > >
> > > > @@ -1116,6 +1128,25 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> > > >
> > > > +/**
> > > > + * pci_sriov_set_vf_total_msix - set total number of MSI-X vectors for the VFs
> > > > + * @dev: the PCI PF device
> > > > + * @numb: the total number of MSI-X vector to consume by the VFs
> > > > + *
> > > > + * Sets the number of MSI-X vectors that is possible to consume by the VFs.
> > > > + * This interface is complimentary part of the pci_set_msix_vec_count()
> > > > + * that will be used to configure the required number on the VF.
> > > > + */
> > > > +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb)
> > > > +{
> > > > +       if (!dev->is_physfn || !dev->driver ||
> > > > +           !dev->driver->sriov_set_msix_vec_count)
> > > > +               return;
> > > > +
> > > > +       dev->sriov->vf_total_msix = numb;
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(pci_sriov_set_vf_total_msix);
> > > > +
> > >
> > > This seems broken. What validation is being done on the numb value?
> > > You pass it as int, and your documentation all refers to tests for >=
> > > 0, but isn't a signed input a possibility as well? Also "numb" doesn't
> > > make for a good abbreviation as it is already a word of its own. It
> > > might make more sense to use count or something like that rather than
> > > trying to abbreviate number.
> >
> > "Broken" is a nice word to describe misunderstanding.
>
> Would you prefer "lacking input validation".
>
> I see all this code in there checking for is_physfn and driver and
> sriov_set_msix_vec_count before allowing the setting of vf_total_msix.
> It just seems like a lot of validation is taking place on the wrong
> things if you are just going to be setting a value reporting the total
> number of MSI-X vectors in use for SR-IOV.

All those checks are in place to ensure that we are not overwriting the
default value, which is 0.

>
> In addition this value seems like a custom purpose being pushed into
> the PCIe code since there isn't anything that defaults the value. It
> seems like at a minimum there should be something that programs a
> default value for both of these new fields that are being added so
> that you pull the maximum number of VFs when SR-IOV is enabled, the
> maximum number of MSI-X vectors from a single VF, and then the default
> value for this should be the multiple of the two which can then be
> overridden later.

The default is 0, because most SR-IOV doesn't have proper support of
setting VF MSI-X.

Regarding the calculation, it is not correct for the mlx5. We have large
pool of MSI-X vectors, but setting small number of them. This allows us
to increase that number on specific VF without need to decrease on
others "to free" the vectors.

Thanks
Alexander Duyck Jan. 13, 2021, 10:44 p.m. UTC | #5
On Tue, Jan 12, 2021 at 10:19 PM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Tue, Jan 12, 2021 at 01:34:50PM -0800, Alexander Duyck wrote:
> > On Mon, Jan 11, 2021 at 10:56 PM Leon Romanovsky <leon@kernel.org> wrote:
> > >
> > > On Mon, Jan 11, 2021 at 11:30:39AM -0800, Alexander Duyck wrote:
> > > > On Sun, Jan 10, 2021 at 7:10 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > > >
> > > > > From: Leon Romanovsky <leonro@nvidia.com>
> > > > >
> > > > > Some SR-IOV capable devices provide an ability to configure specific
> > > > > number of MSI-X vectors on their VF prior driver is probed on that VF.
> > > > >
> > > > > In order to make management easy, provide new read-only sysfs file that
> > > > > returns a total number of possible to configure MSI-X vectors.
> > > > >
> > > > > cat /sys/bus/pci/devices/.../sriov_vf_total_msix
> > > > >   = 0 - feature is not supported
> > > > >   > 0 - total number of MSI-X vectors to consume by the VFs
> > > > >
> > > > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > > > > ---
> > > > >  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
> > > > >  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
> > > > >  drivers/pci/pci.h                       |  3 +++
> > > > >  include/linux/pci.h                     |  2 ++
> > > > >  4 files changed, 50 insertions(+)
> > > > >
> > > > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > index 05e26e5da54e..64e9b700acc9 100644
> > > > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > @@ -395,3 +395,17 @@ Description:
> > > > >                 The file is writable if the PF is bound to a driver that
> > > > >                 supports the ->sriov_set_msix_vec_count() callback and there
> > > > >                 is no driver bound to the VF.
> > > > > +
> > > > > +What:          /sys/bus/pci/devices/.../sriov_vf_total_msix
> > > >
> > > > In this case I would drop the "vf" and just go with sriov_total_msix
> > > > since now you are referring to a global value instead of a per VF
> > > > value.
> > >
> > > This field indicates the amount of MSI-X available for VFs, it doesn't
> > > include PFs. The missing "_vf_" will mislead users who will believe that
> > > it is all MSI-X vectors available for this device. They will need to take
> > > into consideration amount of PF MSI-X in order to calculate the VF distribution.
> > >
> > > So I would leave "_vf_" here.
> >
> > The problem is you aren't indicating how many are available for an
> > individual VF though, you are indicating how many are available for
> > use by SR-IOV to give to the VFs. The fact that you are dealing with a
> > pool makes things confusing in my opinion. For example sriov_vf_device
> > describes the device ID that will be given to each VF.
>
> sriov_vf_device is different and is implemented accordingly to the PCI
> spec, 9.3.3.11 VF Device ID (Offset 1Ah)
> "This field contains the Device ID that should be presented for every VF
> to the SI."
>
> It is one ID for all VFs.

Yes, but that is what I am getting at. It is also what the device
configuration will be for one VF. So when I read sriov_vf_total_msix
it reads as the total for a single VF, not all of the the VFs. That is
why I think dropping the "vf_" part of the name would make sense, as
what you are describing is the total number of MSI-X vectors for use
by SR-IOV VFs.

> >
> > > >
> > > > > +Date:          January 2021
> > > > > +Contact:       Leon Romanovsky <leonro@nvidia.com>
> > > > > +Description:
> > > > > +               This file is associated with the SR-IOV PFs.
> > > > > +               It returns a total number of possible to configure MSI-X
> > > > > +               vectors on the enabled VFs.
> > > > > +
> > > > > +               The values returned are:
> > > > > +                * > 0 - this will be total number possible to consume by VFs,
> > > > > +                * = 0 - feature is not supported
> > > > > +
> > > > > +               If no SR-IOV VFs are enabled, this value will return 0.
> > > > > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > > > > index 42c0df4158d1..0a6ddf3230fd 100644
> > > > > --- a/drivers/pci/iov.c
> > > > > +++ b/drivers/pci/iov.c
> > > > > @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
> > > > >         return count;
> > > > >  }
> > > > >
> > > > > +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> > > > > +                                       struct device_attribute *attr,
> > > > > +                                       char *buf)
> > > > > +{
> > > > > +       struct pci_dev *pdev = to_pci_dev(dev);
> > > > > +
> > > > > +       return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
> > > > > +}
> > > > > +
> > > >
> > > > You display it as a signed value, but unsigned values are not
> > > > supported, correct?
> > >
> > > Right, I made it similar to the vf_msix_set. I can change.
> > >
> > > >
> > > > >  static DEVICE_ATTR_RO(sriov_totalvfs);
> > > > >  static DEVICE_ATTR_RW(sriov_numvfs);
> > > > >  static DEVICE_ATTR_RO(sriov_offset);
> > > > >  static DEVICE_ATTR_RO(sriov_stride);
> > > > >  static DEVICE_ATTR_RO(sriov_vf_device);
> > > > >  static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
> > > > > +static DEVICE_ATTR_RO(sriov_vf_total_msix);
> > > > >
> > > > >  static struct attribute *sriov_dev_attrs[] = {
> > > > >         &dev_attr_sriov_totalvfs.attr,
> > > > > @@ -408,6 +418,7 @@ static struct attribute *sriov_dev_attrs[] = {
> > > > >         &dev_attr_sriov_stride.attr,
> > > > >         &dev_attr_sriov_vf_device.attr,
> > > > >         &dev_attr_sriov_drivers_autoprobe.attr,
> > > > > +       &dev_attr_sriov_vf_total_msix.attr,
> > > > >         NULL,
> > > > >  };
> > > > >
> > > > > @@ -658,6 +669,7 @@ static void sriov_disable(struct pci_dev *dev)
> > > > >                 sysfs_remove_link(&dev->dev.kobj, "dep_link");
> > > > >
> > > > >         iov->num_VFs = 0;
> > > > > +       iov->vf_total_msix = 0;
> > > > >         pci_iov_set_numvfs(dev, 0);
> > > > >  }
> > > > >
> > > > > @@ -1116,6 +1128,25 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
> > > > >  }
> > > > >  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> > > > >
> > > > > +/**
> > > > > + * pci_sriov_set_vf_total_msix - set total number of MSI-X vectors for the VFs
> > > > > + * @dev: the PCI PF device
> > > > > + * @numb: the total number of MSI-X vector to consume by the VFs
> > > > > + *
> > > > > + * Sets the number of MSI-X vectors that is possible to consume by the VFs.
> > > > > + * This interface is complimentary part of the pci_set_msix_vec_count()
> > > > > + * that will be used to configure the required number on the VF.
> > > > > + */
> > > > > +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb)
> > > > > +{
> > > > > +       if (!dev->is_physfn || !dev->driver ||
> > > > > +           !dev->driver->sriov_set_msix_vec_count)
> > > > > +               return;
> > > > > +
> > > > > +       dev->sriov->vf_total_msix = numb;
> > > > > +}
> > > > > +EXPORT_SYMBOL_GPL(pci_sriov_set_vf_total_msix);
> > > > > +
> > > >
> > > > This seems broken. What validation is being done on the numb value?
> > > > You pass it as int, and your documentation all refers to tests for >=
> > > > 0, but isn't a signed input a possibility as well? Also "numb" doesn't
> > > > make for a good abbreviation as it is already a word of its own. It
> > > > might make more sense to use count or something like that rather than
> > > > trying to abbreviate number.
> > >
> > > "Broken" is a nice word to describe misunderstanding.
> >
> > Would you prefer "lacking input validation".
> >
> > I see all this code in there checking for is_physfn and driver and
> > sriov_set_msix_vec_count before allowing the setting of vf_total_msix.
> > It just seems like a lot of validation is taking place on the wrong
> > things if you are just going to be setting a value reporting the total
> > number of MSI-X vectors in use for SR-IOV.
>
> All those checks are in place to ensure that we are not overwriting the
> default value, which is 0.

Okay, so what you really have is surplus interrupts that you are
wanting to give out to VF devices. So when we indicate 0 here as the
default it really means we have no additional interrupts to give out.
Am I understanding that correctly?

The problem is this is very vendor specific so I am doing my best to
understand it as it is different then the other NICs I have worked
with.

So this value is the size of the total pool of interrupt vectors you
have to split up between the functions, or just the spare ones you
could add to individual VFs? Since you say "total" I am assuming it is
the total pool which means that in order to figure out how many are
available to be reserved we would have to run through all the VFs and
figure out what has already been assigned, correct? If so it wouldn't
hurt to also think about having a free and in-use count somewhere as
well.

> >
> > In addition this value seems like a custom purpose being pushed into
> > the PCIe code since there isn't anything that defaults the value. It
> > seems like at a minimum there should be something that programs a
> > default value for both of these new fields that are being added so
> > that you pull the maximum number of VFs when SR-IOV is enabled, the
> > maximum number of MSI-X vectors from a single VF, and then the default
> > value for this should be the multiple of the two which can then be
> > overridden later.
>
> The default is 0, because most SR-IOV doesn't have proper support of
> setting VF MSI-X.

It wasn't designed to work this way. That is why it doesn't really work.

> Regarding the calculation, it is not correct for the mlx5. We have large
> pool of MSI-X vectors, but setting small number of them. This allows us
> to increase that number on specific VF without need to decrease on
> others "to free" the vectors.

I think I am finally starting to grok what is going on here, but I
really don't like the approach.

Is there any reason why you couldn't have configured your VF to
support whatever the maximum number of MSI-X vectors you wanted to use
was, and then just internally masked off or disabled the ones that you
couldn't allocate to the VF and communicate that to the VF via some
sort of firmware message so it wouldn't use them? If I am not mistaken
that is the approach that has been taken in the past for at least this
portion of things in the Intel drivers.

Then the matter is how to configure it. I'm not a big fan of adding
sysfs to the VF to manage resources that are meant to be controlled by
the PF. Especially when you are having to add sysfs to the PF as well
which creates an asymmetric setup where you are having to read the PF
to find out what resources you can move to the VF. I wonder if
something like the Devlink Resource interface wouldn't make more sense
in this case. Then you would just manage "vf-interrupts" or something
like that with the resource split between each of the VFs with each VF
uniquely identified as a separate sub resource.
Leon Romanovsky Jan. 14, 2021, 6:50 a.m. UTC | #6
On Wed, Jan 13, 2021 at 02:44:45PM -0800, Alexander Duyck wrote:
> On Tue, Jan 12, 2021 at 10:19 PM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Tue, Jan 12, 2021 at 01:34:50PM -0800, Alexander Duyck wrote:
> > > On Mon, Jan 11, 2021 at 10:56 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > >
> > > > On Mon, Jan 11, 2021 at 11:30:39AM -0800, Alexander Duyck wrote:
> > > > > On Sun, Jan 10, 2021 at 7:10 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > > > >
> > > > > > From: Leon Romanovsky <leonro@nvidia.com>
> > > > > >
> > > > > > Some SR-IOV capable devices provide an ability to configure specific
> > > > > > number of MSI-X vectors on their VF prior driver is probed on that VF.
> > > > > >
> > > > > > In order to make management easy, provide new read-only sysfs file that
> > > > > > returns a total number of possible to configure MSI-X vectors.
> > > > > >
> > > > > > cat /sys/bus/pci/devices/.../sriov_vf_total_msix
> > > > > >   = 0 - feature is not supported
> > > > > >   > 0 - total number of MSI-X vectors to consume by the VFs
> > > > > >
> > > > > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > > > > > ---
> > > > > >  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
> > > > > >  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
> > > > > >  drivers/pci/pci.h                       |  3 +++
> > > > > >  include/linux/pci.h                     |  2 ++
> > > > > >  4 files changed, 50 insertions(+)
> > > > > >
> > > > > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > > index 05e26e5da54e..64e9b700acc9 100644
> > > > > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > > > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > > @@ -395,3 +395,17 @@ Description:
> > > > > >                 The file is writable if the PF is bound to a driver that
> > > > > >                 supports the ->sriov_set_msix_vec_count() callback and there
> > > > > >                 is no driver bound to the VF.
> > > > > > +
> > > > > > +What:          /sys/bus/pci/devices/.../sriov_vf_total_msix
> > > > >
> > > > > In this case I would drop the "vf" and just go with sriov_total_msix
> > > > > since now you are referring to a global value instead of a per VF
> > > > > value.
> > > >
> > > > This field indicates the amount of MSI-X available for VFs, it doesn't
> > > > include PFs. The missing "_vf_" will mislead users who will believe that
> > > > it is all MSI-X vectors available for this device. They will need to take
> > > > into consideration amount of PF MSI-X in order to calculate the VF distribution.
> > > >
> > > > So I would leave "_vf_" here.
> > >
> > > The problem is you aren't indicating how many are available for an
> > > individual VF though, you are indicating how many are available for
> > > use by SR-IOV to give to the VFs. The fact that you are dealing with a
> > > pool makes things confusing in my opinion. For example sriov_vf_device
> > > describes the device ID that will be given to each VF.
> >
> > sriov_vf_device is different and is implemented accordingly to the PCI
> > spec, 9.3.3.11 VF Device ID (Offset 1Ah)
> > "This field contains the Device ID that should be presented for every VF
> > to the SI."
> >
> > It is one ID for all VFs.
>
> Yes, but that is what I am getting at. It is also what the device
> configuration will be for one VF. So when I read sriov_vf_total_msix
> it reads as the total for a single VF, not all of the the VFs. That is
> why I think dropping the "vf_" part of the name would make sense, as
> what you are describing is the total number of MSI-X vectors for use
> by SR-IOV VFs.

I can change to anything as long as new name will give clear indication
that this total number is for VFs and doesn't include SR-IOV PF MSI-X.

>
> > >
> > > > >
> > > > > > +Date:          January 2021
> > > > > > +Contact:       Leon Romanovsky <leonro@nvidia.com>
> > > > > > +Description:
> > > > > > +               This file is associated with the SR-IOV PFs.
> > > > > > +               It returns a total number of possible to configure MSI-X
> > > > > > +               vectors on the enabled VFs.
> > > > > > +
> > > > > > +               The values returned are:
> > > > > > +                * > 0 - this will be total number possible to consume by VFs,
> > > > > > +                * = 0 - feature is not supported
> > > > > > +
> > > > > > +               If no SR-IOV VFs are enabled, this value will return 0.
> > > > > > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > > > > > index 42c0df4158d1..0a6ddf3230fd 100644
> > > > > > --- a/drivers/pci/iov.c
> > > > > > +++ b/drivers/pci/iov.c
> > > > > > @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
> > > > > >         return count;
> > > > > >  }
> > > > > >
> > > > > > +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> > > > > > +                                       struct device_attribute *attr,
> > > > > > +                                       char *buf)
> > > > > > +{
> > > > > > +       struct pci_dev *pdev = to_pci_dev(dev);
> > > > > > +
> > > > > > +       return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
> > > > > > +}
> > > > > > +
> > > > >
> > > > > You display it as a signed value, but unsigned values are not
> > > > > supported, correct?
> > > >
> > > > Right, I made it similar to the vf_msix_set. I can change.
> > > >
> > > > >
> > > > > >  static DEVICE_ATTR_RO(sriov_totalvfs);
> > > > > >  static DEVICE_ATTR_RW(sriov_numvfs);
> > > > > >  static DEVICE_ATTR_RO(sriov_offset);
> > > > > >  static DEVICE_ATTR_RO(sriov_stride);
> > > > > >  static DEVICE_ATTR_RO(sriov_vf_device);
> > > > > >  static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
> > > > > > +static DEVICE_ATTR_RO(sriov_vf_total_msix);
> > > > > >
> > > > > >  static struct attribute *sriov_dev_attrs[] = {
> > > > > >         &dev_attr_sriov_totalvfs.attr,
> > > > > > @@ -408,6 +418,7 @@ static struct attribute *sriov_dev_attrs[] = {
> > > > > >         &dev_attr_sriov_stride.attr,
> > > > > >         &dev_attr_sriov_vf_device.attr,
> > > > > >         &dev_attr_sriov_drivers_autoprobe.attr,
> > > > > > +       &dev_attr_sriov_vf_total_msix.attr,
> > > > > >         NULL,
> > > > > >  };
> > > > > >
> > > > > > @@ -658,6 +669,7 @@ static void sriov_disable(struct pci_dev *dev)
> > > > > >                 sysfs_remove_link(&dev->dev.kobj, "dep_link");
> > > > > >
> > > > > >         iov->num_VFs = 0;
> > > > > > +       iov->vf_total_msix = 0;
> > > > > >         pci_iov_set_numvfs(dev, 0);
> > > > > >  }
> > > > > >
> > > > > > @@ -1116,6 +1128,25 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
> > > > > >  }
> > > > > >  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> > > > > >
> > > > > > +/**
> > > > > > + * pci_sriov_set_vf_total_msix - set total number of MSI-X vectors for the VFs
> > > > > > + * @dev: the PCI PF device
> > > > > > + * @numb: the total number of MSI-X vector to consume by the VFs
> > > > > > + *
> > > > > > + * Sets the number of MSI-X vectors that is possible to consume by the VFs.
> > > > > > + * This interface is complimentary part of the pci_set_msix_vec_count()
> > > > > > + * that will be used to configure the required number on the VF.
> > > > > > + */
> > > > > > +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb)
> > > > > > +{
> > > > > > +       if (!dev->is_physfn || !dev->driver ||
> > > > > > +           !dev->driver->sriov_set_msix_vec_count)
> > > > > > +               return;
> > > > > > +
> > > > > > +       dev->sriov->vf_total_msix = numb;
> > > > > > +}
> > > > > > +EXPORT_SYMBOL_GPL(pci_sriov_set_vf_total_msix);
> > > > > > +
> > > > >
> > > > > This seems broken. What validation is being done on the numb value?
> > > > > You pass it as int, and your documentation all refers to tests for >=
> > > > > 0, but isn't a signed input a possibility as well? Also "numb" doesn't
> > > > > make for a good abbreviation as it is already a word of its own. It
> > > > > might make more sense to use count or something like that rather than
> > > > > trying to abbreviate number.
> > > >
> > > > "Broken" is a nice word to describe misunderstanding.
> > >
> > > Would you prefer "lacking input validation".
> > >
> > > I see all this code in there checking for is_physfn and driver and
> > > sriov_set_msix_vec_count before allowing the setting of vf_total_msix.
> > > It just seems like a lot of validation is taking place on the wrong
> > > things if you are just going to be setting a value reporting the total
> > > number of MSI-X vectors in use for SR-IOV.
> >
> > All those checks are in place to ensure that we are not overwriting the
> > default value, which is 0.
>
> Okay, so what you really have is surplus interrupts that you are
> wanting to give out to VF devices. So when we indicate 0 here as the
> default it really means we have no additional interrupts to give out.
> Am I understanding that correctly?

The vf_total_msix is static value and shouldn't be recalculated after
every MSI-X vector number change. So 0 means that driver doesn't support
at all this feature. The operator is responsible to make proper assignment
calculations, because he is already doing it for the CPUs and netdev queues.

>
> The problem is this is very vendor specific so I am doing my best to
> understand it as it is different then the other NICs I have worked
> with.

There is nothing vendor specific here. There are two types of devices:
1. Support this feature. - The vf_total_msix will be greater than 0 for them
and their FW will do sanity checks when user overwrites their default number
that they sat in the VF creation stage.
2. Doesn't support this feature - The vf_total_msix will be 0.

It is PCI spec, so those "other NICs" that didn't implement the PCI spec
will stay with option #2. It is not different from current situation.

>
> So this value is the size of the total pool of interrupt vectors you
> have to split up between the functions, or just the spare ones you
> could add to individual VFs? Since you say "total" I am assuming it is
> the total pool which means that in order to figure out how many are
> available to be reserved we would have to run through all the VFs and
> figure out what has already been assigned, correct? If so it wouldn't
> hurt to also think about having a free and in-use count somewhere as
> well.

It is not really necessary, the VFs are created with some defaults,
because FW needs to ensure that after "echo X > sriov_numvfs" everything
is operable.

As we already discussed, FW has no other way but create VFs with same
configuration. It means that user needs to check only first VF MSI-X
number and multiply by number of VFs to get total number of already
consumed.

The reserved vectors are implemented by checks in FW to ensure that user
can't write number below certain level.

I prefer to stay "lean" as much as possible and add extra fields only
after the real need arise. Right now and for seen future, it is not needed.

>
> > >
> > > In addition this value seems like a custom purpose being pushed into
> > > the PCIe code since there isn't anything that defaults the value. It
> > > seems like at a minimum there should be something that programs a
> > > default value for both of these new fields that are being added so
> > > that you pull the maximum number of VFs when SR-IOV is enabled, the
> > > maximum number of MSI-X vectors from a single VF, and then the default
> > > value for this should be the multiple of the two which can then be
> > > overridden later.
> >
> > The default is 0, because most SR-IOV doesn't have proper support of
> > setting VF MSI-X.
>
> It wasn't designed to work this way. That is why it doesn't really work.

It can be true for everything that doesn't work.
I will rewrite my sentence above:
"The default is 0, because I don't have other than mlx5 devices at my disposal,
so leave it to other driver authors to implement their callbacks"

>
> > Regarding the calculation, it is not correct for the mlx5. We have large
> > pool of MSI-X vectors, but setting small number of them. This allows us
> > to increase that number on specific VF without need to decrease on
> > others "to free" the vectors.
>
> I think I am finally starting to grok what is going on here, but I
> really don't like the approach.
>
> Is there any reason why you couldn't have configured your VF to
> support whatever the maximum number of MSI-X vectors you wanted to use
> was, and then just internally masked off or disabled the ones that you
> couldn't allocate to the VF and communicate that to the VF via some
> sort of firmware message so it wouldn't use them? If I am not mistaken
> that is the approach that has been taken in the past for at least this
> portion of things in the Intel drivers.

I'm not proficient in Intel drivers and can't comment it, but the idea
looks very controversial and hacky to me.

There are so many issues with proposed model:
1. During driver probe, the __pci_enable_msix() is called and device MSI-X
number is used to set internal to the kernel and device configuration.
It means that vfio will try to set it to be large (our default is huge),
so we will need to provide some module parameter or sysfs to limit in VFIO
and it should be done per-VM.
2. Without limiting in VFIO during VM attach, the first VM can and will
grab everything and we are returning to square one.
3. The difference in lspci output on the hypervisor and inside VM in
regards of MSI-X vector count will be source of endless bug reports.
4. I afraid that is not how orchestration works.

This supports even more than before the need to do it properly in
pci/core and make user experience clean, easy and reliable.

>
> Then the matter is how to configure it. I'm not a big fan of adding
> sysfs to the VF to manage resources that are meant to be controlled by
> the PF. Especially when you are having to add sysfs to the PF as well
> which creates an asymmetric setup where you are having to read the PF
> to find out what resources you can move to the VF. I wonder if
> something like the Devlink Resource interface wouldn't make more sense
> in this case. Then you would just manage "vf-interrupts" or something
> like that with the resource split between each of the VFs with each VF
> uniquely identified as a separate sub resource.

I remind you that this feature is applicable to all SR-IOV devices, we have
RDMA, NVMe, crypto, FPGA and netdev VFs. The devlink is not supported
outside of netdev world and implementation there will make this feature
is far from being usable.

Right now, the configuration of PCI/core is done through sysfs, so let's
review this feature from PCI/core perspective and not from netdev where
sysfs is not widely used.

Thanks
Alexander Duyck Jan. 14, 2021, 4:40 p.m. UTC | #7
On Wed, Jan 13, 2021 at 10:50 PM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Wed, Jan 13, 2021 at 02:44:45PM -0800, Alexander Duyck wrote:
> > On Tue, Jan 12, 2021 at 10:19 PM Leon Romanovsky <leon@kernel.org> wrote:
> > >
> > > On Tue, Jan 12, 2021 at 01:34:50PM -0800, Alexander Duyck wrote:
> > > > On Mon, Jan 11, 2021 at 10:56 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > > >
> > > > > On Mon, Jan 11, 2021 at 11:30:39AM -0800, Alexander Duyck wrote:
> > > > > > On Sun, Jan 10, 2021 at 7:10 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > > > > >
> > > > > > > From: Leon Romanovsky <leonro@nvidia.com>
> > > > > > >
> > > > > > > Some SR-IOV capable devices provide an ability to configure specific
> > > > > > > number of MSI-X vectors on their VF prior driver is probed on that VF.
> > > > > > >
> > > > > > > In order to make management easy, provide new read-only sysfs file that
> > > > > > > returns a total number of possible to configure MSI-X vectors.
> > > > > > >
> > > > > > > cat /sys/bus/pci/devices/.../sriov_vf_total_msix
> > > > > > >   = 0 - feature is not supported
> > > > > > >   > 0 - total number of MSI-X vectors to consume by the VFs
> > > > > > >
> > > > > > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > > > > > > ---
> > > > > > >  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
> > > > > > >  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
> > > > > > >  drivers/pci/pci.h                       |  3 +++
> > > > > > >  include/linux/pci.h                     |  2 ++
> > > > > > >  4 files changed, 50 insertions(+)
> > > > > > >
> > > > > > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > > > index 05e26e5da54e..64e9b700acc9 100644
> > > > > > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > > > > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > > > @@ -395,3 +395,17 @@ Description:
> > > > > > >                 The file is writable if the PF is bound to a driver that
> > > > > > >                 supports the ->sriov_set_msix_vec_count() callback and there
> > > > > > >                 is no driver bound to the VF.
> > > > > > > +
> > > > > > > +What:          /sys/bus/pci/devices/.../sriov_vf_total_msix
> > > > > >
> > > > > > In this case I would drop the "vf" and just go with sriov_total_msix
> > > > > > since now you are referring to a global value instead of a per VF
> > > > > > value.
> > > > >
> > > > > This field indicates the amount of MSI-X available for VFs, it doesn't
> > > > > include PFs. The missing "_vf_" will mislead users who will believe that
> > > > > it is all MSI-X vectors available for this device. They will need to take
> > > > > into consideration amount of PF MSI-X in order to calculate the VF distribution.
> > > > >
> > > > > So I would leave "_vf_" here.
> > > >
> > > > The problem is you aren't indicating how many are available for an
> > > > individual VF though, you are indicating how many are available for
> > > > use by SR-IOV to give to the VFs. The fact that you are dealing with a
> > > > pool makes things confusing in my opinion. For example sriov_vf_device
> > > > describes the device ID that will be given to each VF.
> > >
> > > sriov_vf_device is different and is implemented accordingly to the PCI
> > > spec, 9.3.3.11 VF Device ID (Offset 1Ah)
> > > "This field contains the Device ID that should be presented for every VF
> > > to the SI."
> > >
> > > It is one ID for all VFs.
> >
> > Yes, but that is what I am getting at. It is also what the device
> > configuration will be for one VF. So when I read sriov_vf_total_msix
> > it reads as the total for a single VF, not all of the the VFs. That is
> > why I think dropping the "vf_" part of the name would make sense, as
> > what you are describing is the total number of MSI-X vectors for use
> > by SR-IOV VFs.
>
> I can change to anything as long as new name will give clear indication
> that this total number is for VFs and doesn't include SR-IOV PF MSI-X.

It is interesting that you make that distinction.

So in the case of the Intel hardware we had one pool of MSI-X
interrupts that was available for the entire port, both PF and VF.
When we enabled SR-IOV we had to repartition that pool in order to
assign interrupts to devices. So it sounds like in your case you don't
do that and instead the PF is static and the VFs are the only piece
that is flexible. Do I have that correct?

> >
> > > >
> > > > > >
> > > > > > > +Date:          January 2021
> > > > > > > +Contact:       Leon Romanovsky <leonro@nvidia.com>
> > > > > > > +Description:
> > > > > > > +               This file is associated with the SR-IOV PFs.
> > > > > > > +               It returns a total number of possible to configure MSI-X
> > > > > > > +               vectors on the enabled VFs.
> > > > > > > +
> > > > > > > +               The values returned are:
> > > > > > > +                * > 0 - this will be total number possible to consume by VFs,
> > > > > > > +                * = 0 - feature is not supported
> > > > > > > +
> > > > > > > +               If no SR-IOV VFs are enabled, this value will return 0.
> > > > > > > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > > > > > > index 42c0df4158d1..0a6ddf3230fd 100644
> > > > > > > --- a/drivers/pci/iov.c
> > > > > > > +++ b/drivers/pci/iov.c
> > > > > > > @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
> > > > > > >         return count;
> > > > > > >  }
> > > > > > >
> > > > > > > +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> > > > > > > +                                       struct device_attribute *attr,
> > > > > > > +                                       char *buf)
> > > > > > > +{
> > > > > > > +       struct pci_dev *pdev = to_pci_dev(dev);
> > > > > > > +
> > > > > > > +       return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
> > > > > > > +}
> > > > > > > +
> > > > > >
> > > > > > You display it as a signed value, but unsigned values are not
> > > > > > supported, correct?
> > > > >
> > > > > Right, I made it similar to the vf_msix_set. I can change.
> > > > >
> > > > > >
> > > > > > >  static DEVICE_ATTR_RO(sriov_totalvfs);
> > > > > > >  static DEVICE_ATTR_RW(sriov_numvfs);
> > > > > > >  static DEVICE_ATTR_RO(sriov_offset);
> > > > > > >  static DEVICE_ATTR_RO(sriov_stride);
> > > > > > >  static DEVICE_ATTR_RO(sriov_vf_device);
> > > > > > >  static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
> > > > > > > +static DEVICE_ATTR_RO(sriov_vf_total_msix);
> > > > > > >
> > > > > > >  static struct attribute *sriov_dev_attrs[] = {
> > > > > > >         &dev_attr_sriov_totalvfs.attr,
> > > > > > > @@ -408,6 +418,7 @@ static struct attribute *sriov_dev_attrs[] = {
> > > > > > >         &dev_attr_sriov_stride.attr,
> > > > > > >         &dev_attr_sriov_vf_device.attr,
> > > > > > >         &dev_attr_sriov_drivers_autoprobe.attr,
> > > > > > > +       &dev_attr_sriov_vf_total_msix.attr,
> > > > > > >         NULL,
> > > > > > >  };
> > > > > > >
> > > > > > > @@ -658,6 +669,7 @@ static void sriov_disable(struct pci_dev *dev)
> > > > > > >                 sysfs_remove_link(&dev->dev.kobj, "dep_link");
> > > > > > >
> > > > > > >         iov->num_VFs = 0;
> > > > > > > +       iov->vf_total_msix = 0;
> > > > > > >         pci_iov_set_numvfs(dev, 0);
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1116,6 +1128,25 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
> > > > > > >  }
> > > > > > >  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> > > > > > >
> > > > > > > +/**
> > > > > > > + * pci_sriov_set_vf_total_msix - set total number of MSI-X vectors for the VFs
> > > > > > > + * @dev: the PCI PF device
> > > > > > > + * @numb: the total number of MSI-X vector to consume by the VFs
> > > > > > > + *
> > > > > > > + * Sets the number of MSI-X vectors that is possible to consume by the VFs.
> > > > > > > + * This interface is complimentary part of the pci_set_msix_vec_count()
> > > > > > > + * that will be used to configure the required number on the VF.
> > > > > > > + */
> > > > > > > +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb)
> > > > > > > +{
> > > > > > > +       if (!dev->is_physfn || !dev->driver ||
> > > > > > > +           !dev->driver->sriov_set_msix_vec_count)
> > > > > > > +               return;
> > > > > > > +
> > > > > > > +       dev->sriov->vf_total_msix = numb;
> > > > > > > +}
> > > > > > > +EXPORT_SYMBOL_GPL(pci_sriov_set_vf_total_msix);
> > > > > > > +
> > > > > >
> > > > > > This seems broken. What validation is being done on the numb value?
> > > > > > You pass it as int, and your documentation all refers to tests for >=
> > > > > > 0, but isn't a signed input a possibility as well? Also "numb" doesn't
> > > > > > make for a good abbreviation as it is already a word of its own. It
> > > > > > might make more sense to use count or something like that rather than
> > > > > > trying to abbreviate number.
> > > > >
> > > > > "Broken" is a nice word to describe misunderstanding.
> > > >
> > > > Would you prefer "lacking input validation".
> > > >
> > > > I see all this code in there checking for is_physfn and driver and
> > > > sriov_set_msix_vec_count before allowing the setting of vf_total_msix.
> > > > It just seems like a lot of validation is taking place on the wrong
> > > > things if you are just going to be setting a value reporting the total
> > > > number of MSI-X vectors in use for SR-IOV.
> > >
> > > All those checks are in place to ensure that we are not overwriting the
> > > default value, which is 0.
> >
> > Okay, so what you really have is surplus interrupts that you are
> > wanting to give out to VF devices. So when we indicate 0 here as the
> > default it really means we have no additional interrupts to give out.
> > Am I understanding that correctly?
>
> The vf_total_msix is static value and shouldn't be recalculated after
> every MSI-X vector number change. So 0 means that driver doesn't support
> at all this feature. The operator is responsible to make proper assignment
> calculations, because he is already doing it for the CPUs and netdev queues.

Honestly that makes things even uglier. So basically this is a feature
where if it isn't supported it will make it look like the SR-IOV
device doesn't support MSI-X. I realize it is just the way it is
worded but it isn't very pretty.

> >
> > The problem is this is very vendor specific so I am doing my best to
> > understand it as it is different then the other NICs I have worked
> > with.
>
> There is nothing vendor specific here. There are two types of devices:
> 1. Support this feature. - The vf_total_msix will be greater than 0 for them
> and their FW will do sanity checks when user overwrites their default number
> that they sat in the VF creation stage.
> 2. Doesn't support this feature - The vf_total_msix will be 0.
>
> It is PCI spec, so those "other NICs" that didn't implement the PCI spec
> will stay with option #2. It is not different from current situation.

Where in the spec is this?

I know in the PCI spec it says that the MSI-X table size is read-only
and is not supposed to be written by system software. That is what is
being overwritten right now by your patches that has me concerned.

As far as the logic behind the approach described above. You can
define up to the MSI-X table size worth of interrupt vectors, however
nothing says the device has to make use of those vectors. So defining
a table with unused entries works within the spec since all the table
is defining is interrupts you can use, not interrupts you must use. It
basically comes down to the fact that an interrupt can be broken into
target actions which are defined in the table, and "initiators" or
"sources" which are controlled by the internal configuration of the
device and must be associated with the target action for something to
occur. You can have a fixed number of initiators that can be shared
between the targets.

> >
> > So this value is the size of the total pool of interrupt vectors you
> > have to split up between the functions, or just the spare ones you
> > could add to individual VFs? Since you say "total" I am assuming it is
> > the total pool which means that in order to figure out how many are
> > available to be reserved we would have to run through all the VFs and
> > figure out what has already been assigned, correct? If so it wouldn't
> > hurt to also think about having a free and in-use count somewhere as
> > well.
>
> It is not really necessary, the VFs are created with some defaults,
> because FW needs to ensure that after "echo X > sriov_numvfs" everything
> is operable.
>
> As we already discussed, FW has no other way but create VFs with same
> configuration. It means that user needs to check only first VF MSI-X
> number and multiply by number of VFs to get total number of already
> consumed.

That is only at the start though. Once they start modifying VF
interrupts the individual numbers could be all over the place.

> The reserved vectors are implemented by checks in FW to ensure that user
> can't write number below certain level.
>
> I prefer to stay "lean" as much as possible and add extra fields only
> after the real need arise. Right now and for seen future, it is not needed.

It isn't so much about extra fields as the availability of data. When
you have data placed in multiple locations and only a single error
return type if things go wrong it makes it kind of ugly in terms of a
user interface.

> >
> > > >
> > > > In addition this value seems like a custom purpose being pushed into
> > > > the PCIe code since there isn't anything that defaults the value. It
> > > > seems like at a minimum there should be something that programs a
> > > > default value for both of these new fields that are being added so
> > > > that you pull the maximum number of VFs when SR-IOV is enabled, the
> > > > maximum number of MSI-X vectors from a single VF, and then the default
> > > > value for this should be the multiple of the two which can then be
> > > > overridden later.
> > >
> > > The default is 0, because most SR-IOV doesn't have proper support of
> > > setting VF MSI-X.
> >
> > It wasn't designed to work this way. That is why it doesn't really work.
>
> It can be true for everything that doesn't work.
> I will rewrite my sentence above:
> "The default is 0, because I don't have other than mlx5 devices at my disposal,
> so leave it to other driver authors to implement their callbacks"

So you are saying this is a vendor specific interface then? Until we
have another party willing to sign on as this being an approach they
are going to take with their NIC I would question if this is really
the way we want to go about handling this.

I would be curious if any other vendor supports editing the VF MSI-X
table size on the fly. If so then maybe we will have to accept it as a
quirk of SR-IOV. However we really should be trying to avoid this as a
supported method if possible.

> >
> > > Regarding the calculation, it is not correct for the mlx5. We have large
> > > pool of MSI-X vectors, but setting small number of them. This allows us
> > > to increase that number on specific VF without need to decrease on
> > > others "to free" the vectors.
> >
> > I think I am finally starting to grok what is going on here, but I
> > really don't like the approach.
> >
> > Is there any reason why you couldn't have configured your VF to
> > support whatever the maximum number of MSI-X vectors you wanted to use
> > was, and then just internally masked off or disabled the ones that you
> > couldn't allocate to the VF and communicate that to the VF via some
> > sort of firmware message so it wouldn't use them? If I am not mistaken
> > that is the approach that has been taken in the past for at least this
> > portion of things in the Intel drivers.
>
> I'm not proficient in Intel drivers and can't comment it, but the idea
> looks very controversial and hacky to me.
>
> There are so many issues with proposed model:
> 1. During driver probe, the __pci_enable_msix() is called and device MSI-X
> number is used to set internal to the kernel and device configuration.
> It means that vfio will try to set it to be large (our default is huge),
> so we will need to provide some module parameter or sysfs to limit in VFIO
> and it should be done per-VM.

How would that be much different than what you have now? What I was
suggesting is that you could expose your sysfs value either as a part
of binding the VFIO driver, or as a module parameter. You said that
your orchestration was managing this per VM so I see this behaving the
same way. Your orchestration layer would be setting this value to
limit what is exposed in the VM.

The extra call to the PF may be needed in order to guarantee that the
device/firmware knows that it is supposed to ignore the interrupts
past a certain point.

> 2. Without limiting in VFIO during VM attach, the first VM can and will
> grab everything and we are returning to square one.

It depends on how you have this implemented. As I mentioned earlier
the editing of the VF configuration space is troubling since you are
essentially using the firmware to circumvent the read-only protection
that is provided by the spec in order to change the MSI-X table size
on the fly.

Where I think you and I disagree is that I really think the MSI-X
table size should be fixed at one value for the life of the VF.
Instead of changing the table size it should be the number of vectors
that are functional that should be the limit. Basically there should
be only some that are functional and some that are essentially just
dummy vectors that you can write values into that will never be used.

> 3. The difference in lspci output on the hypervisor and inside VM in
> regards of MSI-X vector count will be source of endless bug reports.

Nothing says the two have to differ other than the fact that at a
certain point the MSI-X table stops being written to the hardware and
those vectors beyond that point are internally masked and disabled.

> 4. I afraid that is not how orchestration works.
>
> This supports even more than before the need to do it properly in
> pci/core and make user experience clean, easy and reliable.

I would think that would be pretty straight forward. Basically the
hardware would ignore interrupt vectors entries past a certain point,
and the PBA bits for those entries would always read 0.

> >
> > Then the matter is how to configure it. I'm not a big fan of adding
> > sysfs to the VF to manage resources that are meant to be controlled by
> > the PF. Especially when you are having to add sysfs to the PF as well
> > which creates an asymmetric setup where you are having to read the PF
> > to find out what resources you can move to the VF. I wonder if
> > something like the Devlink Resource interface wouldn't make more sense
> > in this case. Then you would just manage "vf-interrupts" or something
> > like that with the resource split between each of the VFs with each VF
> > uniquely identified as a separate sub resource.
>
> I remind you that this feature is applicable to all SR-IOV devices, we have
> RDMA, NVMe, crypto, FPGA and netdev VFs. The devlink is not supported
> outside of netdev world and implementation there will make this feature
> is far from being usable.

To quote the documentation:
"devlink is an API to expose device information and resources not directly
related to any device class, such as chip-wide/switch-ASIC-wide configuration."

> Right now, the configuration of PCI/core is done through sysfs, so let's
> review this feature from PCI/core perspective and not from netdev where
> sysfs is not widely used.

The problem is what you are configuring is not necessarily PCI device
specific. You are configuring the firmware which operates at a level
above the PCI device. You just have it manifesting as a PCI behavior
as you are editing a read-only configuration space field.

Also as I mentioned a few times now the approach you are taking
violates the PCIe spec by essentially providing a means of making a
read-only field writable.
Jason Gunthorpe Jan. 14, 2021, 4:48 p.m. UTC | #8
On Thu, Jan 14, 2021 at 08:40:14AM -0800, Alexander Duyck wrote:

> Where I think you and I disagree is that I really think the MSI-X
> table size should be fixed at one value for the life of the VF.
> Instead of changing the table size it should be the number of vectors
> that are functional that should be the limit. Basically there should
> be only some that are functional and some that are essentially just
> dummy vectors that you can write values into that will never be used.

Ignoring the PCI config space to learn the # of available MSI-X
vectors is big break on the how the device's programming ABI works.

Or stated another way, that isn't compatible with any existing drivers
so it is basically not a useful approach as it can't be deployed.

I don't know why you think that is better.

Jason
Leon Romanovsky Jan. 14, 2021, 5:26 p.m. UTC | #9
On Thu, Jan 14, 2021 at 08:40:14AM -0800, Alexander Duyck wrote:
> On Wed, Jan 13, 2021 at 10:50 PM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Wed, Jan 13, 2021 at 02:44:45PM -0800, Alexander Duyck wrote:
> > > On Tue, Jan 12, 2021 at 10:19 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > >
> > > > On Tue, Jan 12, 2021 at 01:34:50PM -0800, Alexander Duyck wrote:
> > > > > On Mon, Jan 11, 2021 at 10:56 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > > > >
> > > > > > On Mon, Jan 11, 2021 at 11:30:39AM -0800, Alexander Duyck wrote:
> > > > > > > On Sun, Jan 10, 2021 at 7:10 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > > > > > >
> > > > > > > > From: Leon Romanovsky <leonro@nvidia.com>
> > > > > > > >
> > > > > > > > Some SR-IOV capable devices provide an ability to configure specific
> > > > > > > > number of MSI-X vectors on their VF prior driver is probed on that VF.
> > > > > > > >
> > > > > > > > In order to make management easy, provide new read-only sysfs file that
> > > > > > > > returns a total number of possible to configure MSI-X vectors.
> > > > > > > >
> > > > > > > > cat /sys/bus/pci/devices/.../sriov_vf_total_msix
> > > > > > > >   = 0 - feature is not supported
> > > > > > > >   > 0 - total number of MSI-X vectors to consume by the VFs
> > > > > > > >
> > > > > > > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > > > > > > > ---
> > > > > > > >  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
> > > > > > > >  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
> > > > > > > >  drivers/pci/pci.h                       |  3 +++
> > > > > > > >  include/linux/pci.h                     |  2 ++
> > > > > > > >  4 files changed, 50 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > > > > index 05e26e5da54e..64e9b700acc9 100644
> > > > > > > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > > > > > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > > > > @@ -395,3 +395,17 @@ Description:
> > > > > > > >                 The file is writable if the PF is bound to a driver that
> > > > > > > >                 supports the ->sriov_set_msix_vec_count() callback and there
> > > > > > > >                 is no driver bound to the VF.
> > > > > > > > +
> > > > > > > > +What:          /sys/bus/pci/devices/.../sriov_vf_total_msix
> > > > > > >
> > > > > > > In this case I would drop the "vf" and just go with sriov_total_msix
> > > > > > > since now you are referring to a global value instead of a per VF
> > > > > > > value.
> > > > > >
> > > > > > This field indicates the amount of MSI-X available for VFs, it doesn't
> > > > > > include PFs. The missing "_vf_" will mislead users who will believe that
> > > > > > it is all MSI-X vectors available for this device. They will need to take
> > > > > > into consideration amount of PF MSI-X in order to calculate the VF distribution.
> > > > > >
> > > > > > So I would leave "_vf_" here.
> > > > >
> > > > > The problem is you aren't indicating how many are available for an
> > > > > individual VF though, you are indicating how many are available for
> > > > > use by SR-IOV to give to the VFs. The fact that you are dealing with a
> > > > > pool makes things confusing in my opinion. For example sriov_vf_device
> > > > > describes the device ID that will be given to each VF.
> > > >
> > > > sriov_vf_device is different and is implemented accordingly to the PCI
> > > > spec, 9.3.3.11 VF Device ID (Offset 1Ah)
> > > > "This field contains the Device ID that should be presented for every VF
> > > > to the SI."
> > > >
> > > > It is one ID for all VFs.
> > >
> > > Yes, but that is what I am getting at. It is also what the device
> > > configuration will be for one VF. So when I read sriov_vf_total_msix
> > > it reads as the total for a single VF, not all of the the VFs. That is
> > > why I think dropping the "vf_" part of the name would make sense, as
> > > what you are describing is the total number of MSI-X vectors for use
> > > by SR-IOV VFs.
> >
> > I can change to anything as long as new name will give clear indication
> > that this total number is for VFs and doesn't include SR-IOV PF MSI-X.
>
> It is interesting that you make that distinction.
>
> So in the case of the Intel hardware we had one pool of MSI-X
> interrupts that was available for the entire port, both PF and VF.
> When we enabled SR-IOV we had to repartition that pool in order to
> assign interrupts to devices. So it sounds like in your case you don't
> do that and instead the PF is static and the VFs are the only piece
> that is flexible. Do I have that correct?

It is partially correct. The mlx5 devices have ability to change MSI-X
vectors of PF too, but to do so, you will need driver reload and much
more complex user interface. So we (SW) leave it as static.

>
> > >
> > > > >
> > > > > > >
> > > > > > > > +Date:          January 2021
> > > > > > > > +Contact:       Leon Romanovsky <leonro@nvidia.com>
> > > > > > > > +Description:
> > > > > > > > +               This file is associated with the SR-IOV PFs.
> > > > > > > > +               It returns a total number of possible to configure MSI-X
> > > > > > > > +               vectors on the enabled VFs.
> > > > > > > > +
> > > > > > > > +               The values returned are:
> > > > > > > > +                * > 0 - this will be total number possible to consume by VFs,
> > > > > > > > +                * = 0 - feature is not supported
> > > > > > > > +
> > > > > > > > +               If no SR-IOV VFs are enabled, this value will return 0.
> > > > > > > > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > > > > > > > index 42c0df4158d1..0a6ddf3230fd 100644
> > > > > > > > --- a/drivers/pci/iov.c
> > > > > > > > +++ b/drivers/pci/iov.c
> > > > > > > > @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
> > > > > > > >         return count;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> > > > > > > > +                                       struct device_attribute *attr,
> > > > > > > > +                                       char *buf)
> > > > > > > > +{
> > > > > > > > +       struct pci_dev *pdev = to_pci_dev(dev);
> > > > > > > > +
> > > > > > > > +       return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
> > > > > > > > +}
> > > > > > > > +
> > > > > > >
> > > > > > > You display it as a signed value, but unsigned values are not
> > > > > > > supported, correct?
> > > > > >
> > > > > > Right, I made it similar to the vf_msix_set. I can change.
> > > > > >
> > > > > > >
> > > > > > > >  static DEVICE_ATTR_RO(sriov_totalvfs);
> > > > > > > >  static DEVICE_ATTR_RW(sriov_numvfs);
> > > > > > > >  static DEVICE_ATTR_RO(sriov_offset);
> > > > > > > >  static DEVICE_ATTR_RO(sriov_stride);
> > > > > > > >  static DEVICE_ATTR_RO(sriov_vf_device);
> > > > > > > >  static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
> > > > > > > > +static DEVICE_ATTR_RO(sriov_vf_total_msix);
> > > > > > > >
> > > > > > > >  static struct attribute *sriov_dev_attrs[] = {
> > > > > > > >         &dev_attr_sriov_totalvfs.attr,
> > > > > > > > @@ -408,6 +418,7 @@ static struct attribute *sriov_dev_attrs[] = {
> > > > > > > >         &dev_attr_sriov_stride.attr,
> > > > > > > >         &dev_attr_sriov_vf_device.attr,
> > > > > > > >         &dev_attr_sriov_drivers_autoprobe.attr,
> > > > > > > > +       &dev_attr_sriov_vf_total_msix.attr,
> > > > > > > >         NULL,
> > > > > > > >  };
> > > > > > > >
> > > > > > > > @@ -658,6 +669,7 @@ static void sriov_disable(struct pci_dev *dev)
> > > > > > > >                 sysfs_remove_link(&dev->dev.kobj, "dep_link");
> > > > > > > >
> > > > > > > >         iov->num_VFs = 0;
> > > > > > > > +       iov->vf_total_msix = 0;
> > > > > > > >         pci_iov_set_numvfs(dev, 0);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1116,6 +1128,25 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
> > > > > > > >  }
> > > > > > > >  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> > > > > > > >
> > > > > > > > +/**
> > > > > > > > + * pci_sriov_set_vf_total_msix - set total number of MSI-X vectors for the VFs
> > > > > > > > + * @dev: the PCI PF device
> > > > > > > > + * @numb: the total number of MSI-X vector to consume by the VFs
> > > > > > > > + *
> > > > > > > > + * Sets the number of MSI-X vectors that is possible to consume by the VFs.
> > > > > > > > + * This interface is complimentary part of the pci_set_msix_vec_count()
> > > > > > > > + * that will be used to configure the required number on the VF.
> > > > > > > > + */
> > > > > > > > +void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb)
> > > > > > > > +{
> > > > > > > > +       if (!dev->is_physfn || !dev->driver ||
> > > > > > > > +           !dev->driver->sriov_set_msix_vec_count)
> > > > > > > > +               return;
> > > > > > > > +
> > > > > > > > +       dev->sriov->vf_total_msix = numb;
> > > > > > > > +}
> > > > > > > > +EXPORT_SYMBOL_GPL(pci_sriov_set_vf_total_msix);
> > > > > > > > +
> > > > > > >
> > > > > > > This seems broken. What validation is being done on the numb value?
> > > > > > > You pass it as int, and your documentation all refers to tests for >=
> > > > > > > 0, but isn't a signed input a possibility as well? Also "numb" doesn't
> > > > > > > make for a good abbreviation as it is already a word of its own. It
> > > > > > > might make more sense to use count or something like that rather than
> > > > > > > trying to abbreviate number.
> > > > > >
> > > > > > "Broken" is a nice word to describe misunderstanding.
> > > > >
> > > > > Would you prefer "lacking input validation".
> > > > >
> > > > > I see all this code in there checking for is_physfn and driver and
> > > > > sriov_set_msix_vec_count before allowing the setting of vf_total_msix.
> > > > > It just seems like a lot of validation is taking place on the wrong
> > > > > things if you are just going to be setting a value reporting the total
> > > > > number of MSI-X vectors in use for SR-IOV.
> > > >
> > > > All those checks are in place to ensure that we are not overwriting the
> > > > default value, which is 0.
> > >
> > > Okay, so what you really have is surplus interrupts that you are
> > > wanting to give out to VF devices. So when we indicate 0 here as the
> > > default it really means we have no additional interrupts to give out.
> > > Am I understanding that correctly?
> >
> > The vf_total_msix is static value and shouldn't be recalculated after
> > every MSI-X vector number change. So 0 means that driver doesn't support
> > at all this feature. The operator is responsible to make proper assignment
> > calculations, because he is already doing it for the CPUs and netdev queues.
>
> Honestly that makes things even uglier. So basically this is a feature
> where if it isn't supported it will make it look like the SR-IOV
> device doesn't support MSI-X. I realize it is just the way it is
> worded but it isn't very pretty.

I'm not native English speaker, we can work together later on the
Documentation to make it more clear.

>
> > >
> > > The problem is this is very vendor specific so I am doing my best to
> > > understand it as it is different then the other NICs I have worked
> > > with.
> >
> > There is nothing vendor specific here. There are two types of devices:
> > 1. Support this feature. - The vf_total_msix will be greater than 0 for them
> > and their FW will do sanity checks when user overwrites their default number
> > that they sat in the VF creation stage.
> > 2. Doesn't support this feature - The vf_total_msix will be 0.
> >
> > It is PCI spec, so those "other NICs" that didn't implement the PCI spec
> > will stay with option #2. It is not different from current situation.
>
> Where in the spec is this?
>
> I know in the PCI spec it says that the MSI-X table size is read-only
> and is not supposed to be written by system software. That is what is
> being overwritten right now by your patches that has me concerned.

My patches are not over-writting anything, they are asking from FW to
set more appropriate value. The field stays read-only.

<...>

> >
> > I remind you that this feature is applicable to all SR-IOV devices, we have
> > RDMA, NVMe, crypto, FPGA and netdev VFs. The devlink is not supported
> > outside of netdev world and implementation there will make this feature
> > is far from being usable.
>
> To quote the documentation:
> "devlink is an API to expose device information and resources not directly
> related to any device class, such as chip-wide/switch-ASIC-wide configuration."

There is a great world outside of netdev and it doesn't include devlink. :)

>
> > Right now, the configuration of PCI/core is done through sysfs, so let's
> > review this feature from PCI/core perspective and not from netdev where
> > sysfs is not widely used.
>
> The problem is what you are configuring is not necessarily PCI device
> specific. You are configuring the firmware which operates at a level
> above the PCI device. You just have it manifesting as a PCI behavior
> as you are editing a read-only configuration space field.

In our devices, PCI config space is managed by FW and it represents HW.

>
> Also as I mentioned a few times now the approach you are taking
> violates the PCIe spec by essentially providing a means of making a
> read-only field writable.

AS you said, we see the same picture differently.

Thansk
Alexander Duyck Jan. 14, 2021, 5:55 p.m. UTC | #10
On Thu, Jan 14, 2021 at 8:49 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Jan 14, 2021 at 08:40:14AM -0800, Alexander Duyck wrote:
>
> > Where I think you and I disagree is that I really think the MSI-X
> > table size should be fixed at one value for the life of the VF.
> > Instead of changing the table size it should be the number of vectors
> > that are functional that should be the limit. Basically there should
> > be only some that are functional and some that are essentially just
> > dummy vectors that you can write values into that will never be used.
>
> Ignoring the PCI config space to learn the # of available MSI-X
> vectors is big break on the how the device's programming ABI works.
>
> Or stated another way, that isn't compatible with any existing drivers
> so it is basically not a useful approach as it can't be deployed.
>
> I don't know why you think that is better.
>
> Jason

First off, this is technically violating the PCIe spec section 7.7.2.2
because this is the device driver modifying the Message Control
register for a device, even if it is the PF firmware modifying the VF.
The table size is something that should be set and fixed at device
creation and not changed.

The MSI-X table is essentially just an MMIO resource, and I believe it
should not be resized, just as you wouldn't expect any MMIO BAR to be
dynamically resized. Many drivers don't make use of the full MSI-X
table nor do they bother reading the size. We just populate a subset
of the table based on the number of interrupt causes we will need to
associate to interrupt handlers. You can check for yourself. There are
only a handful of drivers such as vfio that ever bother reading at the
offset "PCI_MSIX_FLAGS". Normally it is the number of interrupt causes
that determine how many we need, not the size of the table. In
addition the OS may restrict us further since there are only so many
MSI-X interrupts that are supported per system/socket.

As far as the programming ABI, having support for some number of MSI-X
vectors isn't the same as having that number of MSI-X vectors. The
MSI-X vector table entry is nothing more than a spot to write an
address/data pair with the ability to mask the value to prevent it
from being triggered. The driver/OS will associate some handler to
that address/data pair. Populating an entry doesn't guarantee the
interrupt will ever be used. The programming model for the device is
what defines what trigger will be associated with it and when/how it
will be used.

What I see this patch doing is trying to push driver PF policy onto
the VF PCIe device configuration space dynamically. Having some
limited number of interrupt causes should really be what is limiting
things here. I see that being mostly a thing between the firmware and
the VF in terms of configuration and not something that necessarily
has to be pushed down onto the PCIe configuration space itself.
Jason Gunthorpe Jan. 14, 2021, 6:29 p.m. UTC | #11
On Thu, Jan 14, 2021 at 09:55:24AM -0800, Alexander Duyck wrote:
> On Thu, Jan 14, 2021 at 8:49 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Jan 14, 2021 at 08:40:14AM -0800, Alexander Duyck wrote:
> >
> > > Where I think you and I disagree is that I really think the MSI-X
> > > table size should be fixed at one value for the life of the VF.
> > > Instead of changing the table size it should be the number of vectors
> > > that are functional that should be the limit. Basically there should
> > > be only some that are functional and some that are essentially just
> > > dummy vectors that you can write values into that will never be used.
> >
> > Ignoring the PCI config space to learn the # of available MSI-X
> > vectors is big break on the how the device's programming ABI works.
> >
> > Or stated another way, that isn't compatible with any existing drivers
> > so it is basically not a useful approach as it can't be deployed.
> >
> > I don't know why you think that is better.
> >
> > Jason
> 
> First off, this is technically violating the PCIe spec section 7.7.2.2
> because this is the device driver modifying the Message Control
> register for a device, even if it is the PF firmware modifying the VF.
> The table size is something that should be set and fixed at device
> creation and not changed.

The word "violating" is rather an over-reaction, at worst this is an
extension.

> The MSI-X table is essentially just an MMIO resource, and I believe it
> should not be resized, just as you wouldn't expect any MMIO BAR to be
> dynamically resized. 

Resizing the BAR is already defined see commit 276b738deb5b ("PCI:
Add resizable BAR infrastructure")

As you say BAR and MSI vector table are not so different, it seems
perfectly in line with current PCI sig thinking to allow resizing the
MSI as well

> Many drivers don't make use of the full MSI-X table nor do they
> bother reading the size. We just populate a subset of the table
> based on the number of interrupt causes we will need to associate to
> interrupt handlers. 

This isn't about "many drivers" this is about what mlx5 does in all
the various OS drivers it has, and mlx5 has a sophisticated use of
MSI-X.

> What I see this patch doing is trying to push driver PF policy onto
> the VF PCIe device configuration space dynamically.

Huh? This is using the PF to dynamically reconfigure a child VF beyond
what the PCI spec defined. This is done safely under Linux because no
driver is bound when it is reconfigured, and any stale config data is
flushed out of any OS caches.

This is also why there is not a strong desire to standardize an ECN at
PCI-sig, the rules for how resizing can work are complicated and OS
specific.

> Having some limited number of interrupt causes should really be what
> is limiting things here. 

MSI inherently requires dedicated on-die resources to implement, so
every device has a maximum # of MSI vectors it can currently
expose. This is some consequence of various PCI rules and applies to
all devices.

To make effective use of this limited pool requires a hard restriction
enforced by the secure domain (hypervisor and FW) onto every
user. Every driver attached to the function needs to be aware of the
hard enforced limit by the secure domain to operate properly. It has
nothing to do with "limited number of interrupt causes".

The standards based way to communicate that limit is the MSI table cap
size.

To complain that changing the MSI table cap size dynamically is
non-standard then offer up a completely non-standard way to operate
MSI instead seems to miss the entire point.

The important standard is to keep the PCI config space acting per-spec
so all the various consumers can work as-is. The extension is to only
modify the rare hypervisor to support a dynamic MSI resizing extension
for VFs.

As far as applicability, any device working at high scale with MSI and
VMs is going to need this. Dynamically assigning the limited MSI HW is
really required to support the universe of VM configurations people
want. eg generally I would expect a VM to receive the number of MSI
vectors equal to the number of CPUs the VM gets.

> I see that being mostly a thing between the firmware and the VF in
> terms of configuration and not something that necessarily has to be
> pushed down onto the PCIe configuration space itself.

If mlx5 drivers had been designed long ago to never use standard based
MSI and instead did something internal with FW you might have a point,
but they weren't. All the mlx5 drivers use standards based MSI and
expect the config space to be correct.

Jason
Alexander Duyck Jan. 14, 2021, 7:24 p.m. UTC | #12
On Thu, Jan 14, 2021 at 10:29 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Jan 14, 2021 at 09:55:24AM -0800, Alexander Duyck wrote:
> > On Thu, Jan 14, 2021 at 8:49 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Thu, Jan 14, 2021 at 08:40:14AM -0800, Alexander Duyck wrote:
> > >
> > > > Where I think you and I disagree is that I really think the MSI-X
> > > > table size should be fixed at one value for the life of the VF.
> > > > Instead of changing the table size it should be the number of vectors
> > > > that are functional that should be the limit. Basically there should
> > > > be only some that are functional and some that are essentially just
> > > > dummy vectors that you can write values into that will never be used.
> > >
> > > Ignoring the PCI config space to learn the # of available MSI-X
> > > vectors is big break on the how the device's programming ABI works.
> > >
> > > Or stated another way, that isn't compatible with any existing drivers
> > > so it is basically not a useful approach as it can't be deployed.
> > >
> > > I don't know why you think that is better.
> > >
> > > Jason
> >
> > First off, this is technically violating the PCIe spec section 7.7.2.2
> > because this is the device driver modifying the Message Control
> > register for a device, even if it is the PF firmware modifying the VF.
> > The table size is something that should be set and fixed at device
> > creation and not changed.
>
> The word "violating" is rather an over-reaction, at worst this is an
> extension.
>
> > The MSI-X table is essentially just an MMIO resource, and I believe it
> > should not be resized, just as you wouldn't expect any MMIO BAR to be
> > dynamically resized.
>
> Resizing the BAR is already defined see commit 276b738deb5b ("PCI:
> Add resizable BAR infrastructure")
>
> As you say BAR and MSI vector table are not so different, it seems
> perfectly in line with current PCI sig thinking to allow resizing the
> MSI as well

The resizing of a BAR has an extended capability that goes with it and
isn't something that the device can just do on a whim. This patch set
is not based on implementing some ECN for resizable MSI-X tables. I
view it as arbitrarily rewriting the table size for a device after it
is created.

In addition Leon still hasn't answered my question on preventing the
VF driver from altering entries beyond the ones listed in the table.
My concern is that this may just be glossing over things and
introducing potential issues in the process if a VF can access
resources that don't belong to it.

> > Many drivers don't make use of the full MSI-X table nor do they
> > bother reading the size. We just populate a subset of the table
> > based on the number of interrupt causes we will need to associate to
> > interrupt handlers.
>
> This isn't about "many drivers" this is about what mlx5 does in all
> the various OS drivers it has, and mlx5 has a sophisticated use of
> MSI-X.

Can you please cite an example for me? The problem here is you are
claiming things without any proof. I feel like the requirement for
changing the VF MSI-X table size is coming from something outside of
Linux and without having any info on that I cannot really understand
the issue this is trying to resolve.

From what I can tell, the mlx5 Linux driver never reads the MSI-X
flags register so it isn't reading the MSI-X size either.

> > What I see this patch doing is trying to push driver PF policy onto
> > the VF PCIe device configuration space dynamically.
>
> Huh? This is using the PF to dynamically reconfigure a child VF beyond
> what the PCI spec defined. This is done safely under Linux because no
> driver is bound when it is reconfigured, and any stale config data is
> flushed out of any OS caches.
>
> This is also why there is not a strong desire to standardize an ECN at
> PCI-sig, the rules for how resizing can work are complicated and OS
> specific.

At a minimum I really think we need to go through and have a clear
definition on when updating the MSI-X table size is okay and when it
is not. I am not sure just saying to not update it when a driver isn't
attached is enough to guarantee all that.

On top of that the interface as defined here is rather ugly. It is
just providing a sysfs front end for a vendor proprietary
circumvention of the fact that the MSI-X table size is read-only. If
we are going to do that we might as well allow any vendor that has a
backdoor to their PCIe config go through and edit it.

> > Having some limited number of interrupt causes should really be what
> > is limiting things here.
>
> MSI inherently requires dedicated on-die resources to implement, so
> every device has a maximum # of MSI vectors it can currently
> expose. This is some consequence of various PCI rules and applies to
> all devices.
>
> To make effective use of this limited pool requires a hard restriction
> enforced by the secure domain (hypervisor and FW) onto every
> user. Every driver attached to the function needs to be aware of the
> hard enforced limit by the secure domain to operate properly. It has
> nothing to do with "limited number of interrupt causes".

What we are talking about is the MSI-X table size. Not the number of
MSI-X vectors being requested by the device driver. Those are normally
two seperate things.

I can only assume you have some out-of-tree issue or some other OS
that is the problem. If you can describe the issue in more detail for
me we have something to work with. Otherwise the request so far seems
unreasonable to me.

> The standards based way to communicate that limit is the MSI table cap
> size.

This only defines the maximum number of entries, not how many have to be used.

> To complain that changing the MSI table cap size dynamically is
> non-standard then offer up a completely non-standard way to operate
> MSI instead seems to miss the entire point.

I'm not offering up a non-standard way to do this. Just think about
it. If I have a device that defines an MSI-X table size of 2048 but
makes use of only one vector how would that be any different than what
I am suggesting where you size your VF to whatever the maximum is you
need but only make use of some fixed number from the hardware.

> The important standard is to keep the PCI config space acting per-spec
> so all the various consumers can work as-is. The extension is to only
> modify the rare hypervisor to support a dynamic MSI resizing extension
> for VFs.

I will repeat what I said before. Why can't this be handled via the
vfio interface? You just need to add the ability to specify the upper
limit on vdev->msix_size so that it is a number between 1 and whatever
your max table size is. It sounds like something that probably belongs
in the vfio_pci_ioctl somewhere. Then if you have an OS running in a
guest that cannot help itself and will allocate an interrupt for every
vector, you can simply modify that value so that it puts a cap on the
total number of vectors it will try to allocate in the guest.

> As far as applicability, any device working at high scale with MSI and
> VMs is going to need this. Dynamically assigning the limited MSI HW is
> really required to support the universe of VM configurations people
> want. eg generally I would expect a VM to receive the number of MSI
> vectors equal to the number of CPUs the VM gets.

Again, you are pushing VM requirements on to PCI. That really seems
like the realm of vfio rather than PCI. Especially since your answer
to the problem is to update a value that is only really being read by
vfio on the host. It really just seems like it would make more sense
to maybe make this part of the vfio ioctl call so you could limit the
use of the MSI-X table in the guest.

> > I see that being mostly a thing between the firmware and the VF in
> > terms of configuration and not something that necessarily has to be
> > pushed down onto the PCIe configuration space itself.
>
> If mlx5 drivers had been designed long ago to never use standard based
> MSI and instead did something internal with FW you might have a point,
> but they weren't. All the mlx5 drivers use standards based MSI and
> expect the config space to be correct.

Again, from what I can tell you are updating a field that isn't read
by the mlx5 driver. It is read/written by the OS and the field itself
is only supposed to be updated by the OS according to the PCIe spec.
While it is all well and good that the firmware can circumvent this
and modify the MMIO space I really feel like it shouldn't be doing
that.

In my mind it sounds a lot like this is something that really should
have been configured in the VFIO driver as the problems you have
described all seem to be issues either with some unknown userspace app
or OSes other than Linux.
Leon Romanovsky Jan. 14, 2021, 7:56 p.m. UTC | #13
On Thu, Jan 14, 2021 at 11:24:12AM -0800, Alexander Duyck wrote:
> On Thu, Jan 14, 2021 at 10:29 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Jan 14, 2021 at 09:55:24AM -0800, Alexander Duyck wrote:
> > > On Thu, Jan 14, 2021 at 8:49 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > >
> > > > On Thu, Jan 14, 2021 at 08:40:14AM -0800, Alexander Duyck wrote:
> > > >
> > > > > Where I think you and I disagree is that I really think the MSI-X
> > > > > table size should be fixed at one value for the life of the VF.
> > > > > Instead of changing the table size it should be the number of vectors
> > > > > that are functional that should be the limit. Basically there should
> > > > > be only some that are functional and some that are essentially just
> > > > > dummy vectors that you can write values into that will never be used.
> > > >
> > > > Ignoring the PCI config space to learn the # of available MSI-X
> > > > vectors is big break on the how the device's programming ABI works.
> > > >
> > > > Or stated another way, that isn't compatible with any existing drivers
> > > > so it is basically not a useful approach as it can't be deployed.
> > > >
> > > > I don't know why you think that is better.
> > > >
> > > > Jason
> > >
> > > First off, this is technically violating the PCIe spec section 7.7.2.2
> > > because this is the device driver modifying the Message Control
> > > register for a device, even if it is the PF firmware modifying the VF.
> > > The table size is something that should be set and fixed at device
> > > creation and not changed.
> >
> > The word "violating" is rather an over-reaction, at worst this is an
> > extension.
> >
> > > The MSI-X table is essentially just an MMIO resource, and I believe it
> > > should not be resized, just as you wouldn't expect any MMIO BAR to be
> > > dynamically resized.
> >
> > Resizing the BAR is already defined see commit 276b738deb5b ("PCI:
> > Add resizable BAR infrastructure")
> >
> > As you say BAR and MSI vector table are not so different, it seems
> > perfectly in line with current PCI sig thinking to allow resizing the
> > MSI as well
>
> The resizing of a BAR has an extended capability that goes with it and
> isn't something that the device can just do on a whim. This patch set
> is not based on implementing some ECN for resizable MSI-X tables. I
> view it as arbitrarily rewriting the table size for a device after it
> is created.
>
> In addition Leon still hasn't answered my question on preventing the
> VF driver from altering entries beyond the ones listed in the table.
> My concern is that this may just be glossing over things and
> introducing potential issues in the process if a VF can access
> resources that don't belong to it.
>
> > > Many drivers don't make use of the full MSI-X table nor do they
> > > bother reading the size. We just populate a subset of the table
> > > based on the number of interrupt causes we will need to associate to
> > > interrupt handlers.
> >
> > This isn't about "many drivers" this is about what mlx5 does in all
> > the various OS drivers it has, and mlx5 has a sophisticated use of
> > MSI-X.
>
> Can you please cite an example for me? The problem here is you are
> claiming things without any proof. I feel like the requirement for
> changing the VF MSI-X table size is coming from something outside of
> Linux and without having any info on that I cannot really understand
> the issue this is trying to resolve.
>
> From what I can tell, the mlx5 Linux driver never reads the MSI-X
> flags register so it isn't reading the MSI-X size either.
>
> > > What I see this patch doing is trying to push driver PF policy onto
> > > the VF PCIe device configuration space dynamically.
> >
> > Huh? This is using the PF to dynamically reconfigure a child VF beyond
> > what the PCI spec defined. This is done safely under Linux because no
> > driver is bound when it is reconfigured, and any stale config data is
> > flushed out of any OS caches.
> >
> > This is also why there is not a strong desire to standardize an ECN at
> > PCI-sig, the rules for how resizing can work are complicated and OS
> > specific.
>
> At a minimum I really think we need to go through and have a clear
> definition on when updating the MSI-X table size is okay and when it
> is not. I am not sure just saying to not update it when a driver isn't
> attached is enough to guarantee all that.
>
> On top of that the interface as defined here is rather ugly. It is
> just providing a sysfs front end for a vendor proprietary
> circumvention of the fact that the MSI-X table size is read-only. If
> we are going to do that we might as well allow any vendor that has a
> backdoor to their PCIe config go through and edit it.
>
> > > Having some limited number of interrupt causes should really be what
> > > is limiting things here.
> >
> > MSI inherently requires dedicated on-die resources to implement, so
> > every device has a maximum # of MSI vectors it can currently
> > expose. This is some consequence of various PCI rules and applies to
> > all devices.
> >
> > To make effective use of this limited pool requires a hard restriction
> > enforced by the secure domain (hypervisor and FW) onto every
> > user. Every driver attached to the function needs to be aware of the
> > hard enforced limit by the secure domain to operate properly. It has
> > nothing to do with "limited number of interrupt causes".
>
> What we are talking about is the MSI-X table size. Not the number of
> MSI-X vectors being requested by the device driver. Those are normally
> two seperate things.
>
> I can only assume you have some out-of-tree issue or some other OS
> that is the problem. If you can describe the issue in more detail for
> me we have something to work with. Otherwise the request so far seems
> unreasonable to me.
>
> > The standards based way to communicate that limit is the MSI table cap
> > size.
>
> This only defines the maximum number of entries, not how many have to be used.
>
> > To complain that changing the MSI table cap size dynamically is
> > non-standard then offer up a completely non-standard way to operate
> > MSI instead seems to miss the entire point.
>
> I'm not offering up a non-standard way to do this. Just think about
> it. If I have a device that defines an MSI-X table size of 2048 but
> makes use of only one vector how would that be any different than what
> I am suggesting where you size your VF to whatever the maximum is you
> need but only make use of some fixed number from the hardware.
>
> > The important standard is to keep the PCI config space acting per-spec
> > so all the various consumers can work as-is. The extension is to only
> > modify the rare hypervisor to support a dynamic MSI resizing extension
> > for VFs.
>
> I will repeat what I said before. Why can't this be handled via the
> vfio interface? You just need to add the ability to specify the upper
> limit on vdev->msix_size so that it is a number between 1 and whatever
> your max table size is. It sounds like something that probably belongs
> in the vfio_pci_ioctl somewhere. Then if you have an OS running in a
> guest that cannot help itself and will allocate an interrupt for every
> vector, you can simply modify that value so that it puts a cap on the
> total number of vectors it will try to allocate in the guest.
>
> > As far as applicability, any device working at high scale with MSI and
> > VMs is going to need this. Dynamically assigning the limited MSI HW is
> > really required to support the universe of VM configurations people
> > want. eg generally I would expect a VM to receive the number of MSI
> > vectors equal to the number of CPUs the VM gets.
>
> Again, you are pushing VM requirements on to PCI. That really seems
> like the realm of vfio rather than PCI. Especially since your answer
> to the problem is to update a value that is only really being read by
> vfio on the host. It really just seems like it would make more sense
> to maybe make this part of the vfio ioctl call so you could limit the
> use of the MSI-X table in the guest.
>
> > > I see that being mostly a thing between the firmware and the VF in
> > > terms of configuration and not something that necessarily has to be
> > > pushed down onto the PCIe configuration space itself.
> >
> > If mlx5 drivers had been designed long ago to never use standard based
> > MSI and instead did something internal with FW you might have a point,
> > but they weren't. All the mlx5 drivers use standards based MSI and
> > expect the config space to be correct.
>
> Again, from what I can tell you are updating a field that isn't read
> by the mlx5 driver. It is read/written by the OS and the field itself
> is only supposed to be updated by the OS according to the PCIe spec.
> While it is all well and good that the firmware can circumvent this
> and modify the MMIO space I really feel like it shouldn't be doing
> that.
>
> In my mind it sounds a lot like this is something that really should
> have been configured in the VFIO driver as the problems you have
> described all seem to be issues either with some unknown userspace app
> or OSes other than Linux.

Let me summarize:
1. Our FW treats this new MSI-X table size value in the same manner as "default" one.
If user tries to access outside of this table, it will be denied by HW.
2. This feature is for Linux and OSes based Linux. High performance
devices needs that that number of CPUs == number of channels == number of MSI-X vectors
to achieve maximum performance.
3. Device should be operable after SR-IOV ennoblement, it means lspci
over newly created VF should present real and working MSI-X table size.
It can't set to all new VFs some arbitrary max value.
4. This is not for "decreasing" number of vectors, but for increasing.
5. The field was read-only and it stays read-only.

Thanks
Jason Gunthorpe Jan. 14, 2021, 8:08 p.m. UTC | #14
On Thu, Jan 14, 2021 at 11:24:12AM -0800, Alexander Duyck wrote:

> > As you say BAR and MSI vector table are not so different, it seems
> > perfectly in line with current PCI sig thinking to allow resizing the
> > MSI as well
> 
> The resizing of a BAR has an extended capability that goes with it and
> isn't something that the device can just do on a whim. This patch set
> is not based on implementing some ECN for resizable MSI-X tables. I
> view it as arbitrarily rewriting the table size for a device after it
> is created.

The only difference is resizing the BAR is backed by an ECN, and this
is an extension. The device does not "do it on a whim" the OS tells it
when to change the size, exactly like for BAR resizing.

> In addition Leon still hasn't answered my question on preventing the
> VF driver from altering entries beyond the ones listed in the table.

Of course this is blocked, the FW completely revokes the HW resource
backing the vectors.

> From what I can tell, the mlx5 Linux driver never reads the MSI-X
> flags register so it isn't reading the MSI-X size either.

I don't know why you say that. All Linux drivers call into something
like pci_alloc_irq_vectors() requesting a maximum # of vectors and
that call returns the actual allocated. Drivers can request more
vectors than the system provides, which is what mlx5 does.

Under the call chain of pci_alloc_irq_vectors() it calls
pci_msix_vec_count() which does

	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
	return msix_table_size(control);

And eventually uses that to return the result to the driver.

So, yes, it reads the config space and ensures it doesn't allocate
more vectors than that.

Every driver using MSI does this in Linux.

Adjusting config space *directly* limits the number of vectors the
driver allocates.

You should be able to find the call chain in mlx5 based on the above
guidance.

> At a minimum I really think we need to go through and have a clear
> definition on when updating the MSI-X table size is okay and when it
> is not. I am not sure just saying to not update it when a driver isn't
> attached is enough to guarantee all that.

If you know of a real issue then please state it, other don't fear
monger "maybe" issues that don't exist.

> What we are talking about is the MSI-X table size. Not the number of
> MSI-X vectors being requested by the device driver. Those are normally
> two seperate things.

Yes, table size is what is critical. The number of entries in that BAR
memory is what needs to be controlled.

> > The standards based way to communicate that limit is the MSI table cap
> > size.
> 
> This only defines the maximum number of entries, not how many have to be used.

A driver can't use entries beyond the cap. We are not trying to
reclaim vectors that are available but not used by the OS.

> I'm not offering up a non-standard way to do this. Just think about
> it. If I have a device that defines an MSI-X table size of 2048 but
> makes use of only one vector how would that be any different than what
> I am suggesting where you size your VF to whatever the maximum is you
> need but only make use of some fixed number from the hardware.

That is completely different, in the hypervisor there is no idea how
many vectors a guest OS will create. The FW is told to only permit 1
vector. How is the guest to know this if we don't update the config
space *as the standard requires* ?

> I will repeat what I said before. Why can't this be handled via the
> vfio interface? 

1) The FW has to be told of the limit and everything has to be in sync
   If the FW is out of sync with the config space then everything
   breaks if the user makes even a small mistake - for instance
   forgetting to use the ioctl to override vfio. This is needlessly
   frail and complicated.

2) VFIO needs to know how to tell the FW the limit so it can override
   the config space with emulation. This is all device specific and I
   don't see that adding an extension to vfio is any better than
   adding one here.

3) VFIO doesn't cover any other driver that binds to the VF, so
   this "solution" leaves the normal mlx5_core functionally broken on
   VFs which is a major regression.

Overall the entire idea to have the config space not reflect the
actual current device configuration seems completely wrong to me - why
do this? For what gain? It breaks everything.

Jason
Alexander Duyck Jan. 14, 2021, 9:43 p.m. UTC | #15
On Thu, Jan 14, 2021 at 12:08 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Jan 14, 2021 at 11:24:12AM -0800, Alexander Duyck wrote:
>
> > > As you say BAR and MSI vector table are not so different, it seems
> > > perfectly in line with current PCI sig thinking to allow resizing the
> > > MSI as well
> >
> > The resizing of a BAR has an extended capability that goes with it and
> > isn't something that the device can just do on a whim. This patch set
> > is not based on implementing some ECN for resizable MSI-X tables. I
> > view it as arbitrarily rewriting the table size for a device after it
> > is created.
>
> The only difference is resizing the BAR is backed by an ECN, and this
> is an extension. The device does not "do it on a whim" the OS tells it
> when to change the size, exactly like for BAR resizing.
>
> > In addition Leon still hasn't answered my question on preventing the
> > VF driver from altering entries beyond the ones listed in the table.
>
> Of course this is blocked, the FW completely revokes the HW resource
> backing the vectors.

One of the troubles with this is that I am having to take your word
for it. The worst case scenario in my mind is that this is just was
Leon described it earlier and the firmware interface is doing nothing
more than altering the table size in the MSI-X config space so that
the value can be picked up by VFIO and advertised to the guest. In
such a situation you end up opening up a backdoor as now there are
vectors that could be configured by userspace since the protections
provided by VFIO are disabled as you could be reporting a size that is
smaller than the actual number or vectors.

These are the kind of things I am worried about with this interface.
It just seems like this is altering the VF PCIe device config to
address an issue that would be better handled by the vfio interface
ath VM creation time. At least if this was left to vfio it could
prevent the VM from being able to access the unused entries and just
limit the guest to the ones that we want to have the VM access.
Instead we are just being expected to trust the firmware for security
from the VF should it decide to try and be malicious.

> > From what I can tell, the mlx5 Linux driver never reads the MSI-X
> > flags register so it isn't reading the MSI-X size either.
>
> I don't know why you say that. All Linux drivers call into something
> like pci_alloc_irq_vectors() requesting a maximum # of vectors and
> that call returns the actual allocated. Drivers can request more
> vectors than the system provides, which is what mlx5 does.
>
> Under the call chain of pci_alloc_irq_vectors() it calls
> pci_msix_vec_count() which does
>
>         pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
>         return msix_table_size(control);
>
> And eventually uses that to return the result to the driver.
>
> So, yes, it reads the config space and ensures it doesn't allocate
> more vectors than that.
>
> Every driver using MSI does this in Linux.
>
> Adjusting config space *directly* limits the number of vectors the
> driver allocates.
>
> You should be able to find the call chain in mlx5 based on the above
> guidance.

Yes, but at the same time you also end up passing a max_vecs into the
function as you have multiple limitations that come into account from
both the driver, the system, and the table. The MSI-X table size is
just one piece. Specifically the bit in the code for the mlx5 driver
is:

nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
      MLX5_IRQ_VEC_COMP_BASE;
nvec = min_t(int, nvec, num_eqs);

So saying that the MSI-X table size is what defines the number of
interrupts you can use is only partially true. What it defines is the
aperture available in MMIO to define the possible addresses and values
to be written to trigger the interrupts. The device itself plays a
large role in defining the number of interrupts ultimately requested.

> > At a minimum I really think we need to go through and have a clear
> > definition on when updating the MSI-X table size is okay and when it
> > is not. I am not sure just saying to not update it when a driver isn't
> > attached is enough to guarantee all that.
>
> If you know of a real issue then please state it, other don't fear
> monger "maybe" issues that don't exist.

Well I don't have visibility into your firmware so I am not sure what
is going on in response to this command so forgive me when I do a bit
of fear mongering when somebody tells me that all this patch set does
is modify the VF configuration space.

As I have said my main concern is somebody really screwing something
like this up and creating a security vulnerability where they do
something exactly like updating just the MSI-X table size without
protecting the MMIO region that contains the remaining MSI-X table
entries. This is why I would be much more comfortable with something
like a vfio ioctl that says that while the device supports the number
reported in the MSI-X table size the vfio will only present some
subset of those entries to the guest. With that I could at least
review the code and verify that it is providing the expected
protections.

> > What we are talking about is the MSI-X table size. Not the number of
> > MSI-X vectors being requested by the device driver. Those are normally
> > two seperate things.
>
> Yes, table size is what is critical. The number of entries in that BAR
> memory is what needs to be controlled.

That is where we disagree. My past experience is that in the device
you have to be able to associate an interrupt cause to an interrupt
vector. Normally as a part of that the device itself will place some
limit on how many causes and vectors you can associate before you even
get to the MSI-X table. The MSI-X table size is usually a formality
that defines the upper limit on the number of entries the device might
request. The reason why most drivers don't bother asking for it or
reading it is because it is defined early as a part of the definition
of the device itself.

Going back to my earlier example I am not going to size a MSI-X table
at 2048 for a device that only has a few interrupt sources. Odds are I
would size it such that I will have enough entries in the table to
cover all my interrupt sources. Usually the limiting factor for an
MSI-X request is the system itself as too many devices requesting a
large number of interrupts may end up eating up all the vectors
available for a given CPU.

> > > The standards based way to communicate that limit is the MSI table cap
> > > size.
> >
> > This only defines the maximum number of entries, not how many have to be used.
>
> A driver can't use entries beyond the cap. We are not trying to
> reclaim vectors that are available but not used by the OS.

The MSI-X table consists of a MMIO region in one of the BARs on the
device. It is easily possible to code something up in a driver that
would go in and be able to access the region. Most sensible devices
place it in a separate BAR but even then you have to worry about them
possibly sharing the memory region internally among several devices.

The big thing I see as an issue with all this is that arbitrarily
reducing the size of the MSI-X table doesn't have any actual effect on
the MMIO resources. They are still there. So a bad firmware doing
something like reducing the table size without also restricting access
to the resources in the BAR potentially opens up a security hole as
the MSI-X vector is really nothing more than a pre-programmed PCIe
write waiting for something to trigger it. Odds are an IOMMU would
block it, but still not necessarily a good thing.

As such my preference would be to leave the MSI-X table size static at
the size of possible vectors that could be modified, and instead have
the firmware guarantee that writing to those registers has no effect.
Then when you do something like a direct assignment vfio_pci will also
guard that region of MMIO instead of only guarding a portion of it.

> > I'm not offering up a non-standard way to do this. Just think about
> > it. If I have a device that defines an MSI-X table size of 2048 but
> > makes use of only one vector how would that be any different than what
> > I am suggesting where you size your VF to whatever the maximum is you
> > need but only make use of some fixed number from the hardware.
>
> That is completely different, in the hypervisor there is no idea how
> many vectors a guest OS will create. The FW is told to only permit 1
> vector. How is the guest to know this if we don't update the config
> space *as the standard requires* ?

Doesn't the guest driver talk to the firmware? Last I knew it had to
request additional resources such as queues and those come from the
firmware don't they?

> > I will repeat what I said before. Why can't this be handled via the
> > vfio interface?
>
> 1) The FW has to be told of the limit and everything has to be in sync
>    If the FW is out of sync with the config space then everything
>    breaks if the user makes even a small mistake - for instance
>    forgetting to use the ioctl to override vfio. This is needlessly
>    frail and complicated.

That is also the way I feel about the sysfs solution.

I just see VFIO making the call to the device to notify it that it
only needs X number of vectors instead of having the VF sysfs doing
it.

> 2) VFIO needs to know how to tell the FW the limit so it can override
>    the config space with emulation. This is all device specific and I
>    don't see that adding an extension to vfio is any better than
>    adding one here.

So it depends on the setup. In my suggestion I was suggesting VFIO
defines the maximum, not the minimum. So the guest would automatically
know exactly how many it would have as the table size would be
specified inside the guest.

> 3) VFIO doesn't cover any other driver that binds to the VF, so
>    this "solution" leaves the normal mlx5_core functionally broken on
>    VFs which is a major regression.
>
> Overall the entire idea to have the config space not reflect the
> actual current device configuration seems completely wrong to me - why
> do this? For what gain? It breaks everything.

Your configuration is admittedly foreign to me as well. So if I am
understanding correctly your limiting factor isn't the number of
interrupt causes in the platform, but the firmware deciding to provide
too few interrupt vectors in the MSI-X table. I'm just kind of
surprised the firmware itself isn't providing some sort of messaging
on the number of interrupt vectors that a given device has since I
assume that it is already providing you with information on the number
of queues and such since that isn't provided by any other mechanism.
Alex Williamson Jan. 14, 2021, 11:28 p.m. UTC | #16
On Thu, 14 Jan 2021 13:43:57 -0800
Alexander Duyck <alexander.duyck@gmail.com> wrote:

> On Thu, Jan 14, 2021 at 12:08 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Jan 14, 2021 at 11:24:12AM -0800, Alexander Duyck wrote:
> >  
> > > > As you say BAR and MSI vector table are not so different, it seems
> > > > perfectly in line with current PCI sig thinking to allow resizing the
> > > > MSI as well  
> > >
> > > The resizing of a BAR has an extended capability that goes with it and
> > > isn't something that the device can just do on a whim. This patch set
> > > is not based on implementing some ECN for resizable MSI-X tables. I
> > > view it as arbitrarily rewriting the table size for a device after it
> > > is created.  
> >
> > The only difference is resizing the BAR is backed by an ECN, and this
> > is an extension. The device does not "do it on a whim" the OS tells it
> > when to change the size, exactly like for BAR resizing.
> >  
> > > In addition Leon still hasn't answered my question on preventing the
> > > VF driver from altering entries beyond the ones listed in the table.  
> >
> > Of course this is blocked, the FW completely revokes the HW resource
> > backing the vectors.  
> 
> One of the troubles with this is that I am having to take your word
> for it. The worst case scenario in my mind is that this is just was
> Leon described it earlier and the firmware interface is doing nothing
> more than altering the table size in the MSI-X config space so that
> the value can be picked up by VFIO and advertised to the guest. In
> such a situation you end up opening up a backdoor as now there are
> vectors that could be configured by userspace since the protections
> provided by VFIO are disabled as you could be reporting a size that is
> smaller than the actual number or vectors.

This is why we have interrupt remappers.  We gave up trying to actually
prevent vfio users from having access to write the MSI-X vector table,
there are too many backdoors and page-size issues that this is
impractical.  Instead we rely on interrupt remappers or equivalent
IOMMU hardware to make sure that a device can only signal the interrupts
we've programmed, which occurs via vfio ioctls making use of the MSI-X
capability information.  So if vfio thinks the limit is lower than
actually implemented in the vector table, the user shouldn't be able to
do anything with the extra vectors anyway.

Regarding the read-only nature of the MSI-X capability, would it help
if this interface posted a value which was enabled on FLR of the VF?
I think that would be legal relative to the spec, so really we're
talking about the semantics of whether we really need those a device
reset to change the value report in a read-only field.

> These are the kind of things I am worried about with this interface.
> It just seems like this is altering the VF PCIe device config to
> address an issue that would be better handled by the vfio interface
> ath VM creation time. At least if this was left to vfio it could
> prevent the VM from being able to access the unused entries and just
> limit the guest to the ones that we want to have the VM access.
> Instead we are just being expected to trust the firmware for security
> from the VF should it decide to try and be malicious.

It is possible for vfio to fake the MSI-X capability and limit what a
user can access, but I don't think that's what is being done here.
There was a previous comment that the goal is actually to expand the
number of MSI-X vectors for some devices.  vfio can't do that.  That
requires interaction and coordination with a PF driver managing a pool
of resources where we might require not only device ownership, but host
administrative privileges to prevent a user exhausting the pool.  That's
not ioctl material for vfio.

> > > From what I can tell, the mlx5 Linux driver never reads the MSI-X
> > > flags register so it isn't reading the MSI-X size either.  
> >
> > I don't know why you say that. All Linux drivers call into something
> > like pci_alloc_irq_vectors() requesting a maximum # of vectors and
> > that call returns the actual allocated. Drivers can request more
> > vectors than the system provides, which is what mlx5 does.
> >
> > Under the call chain of pci_alloc_irq_vectors() it calls
> > pci_msix_vec_count() which does
> >
> >         pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
> >         return msix_table_size(control);
> >
> > And eventually uses that to return the result to the driver.
> >
> > So, yes, it reads the config space and ensures it doesn't allocate
> > more vectors than that.
> >
> > Every driver using MSI does this in Linux.
> >
> > Adjusting config space *directly* limits the number of vectors the
> > driver allocates.
> >
> > You should be able to find the call chain in mlx5 based on the above
> > guidance.  
> 
> Yes, but at the same time you also end up passing a max_vecs into the
> function as you have multiple limitations that come into account from
> both the driver, the system, and the table. The MSI-X table size is
> just one piece. Specifically the bit in the code for the mlx5 driver
> is:
> 
> nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
>       MLX5_IRQ_VEC_COMP_BASE;
> nvec = min_t(int, nvec, num_eqs);
> 
> So saying that the MSI-X table size is what defines the number of
> interrupts you can use is only partially true. What it defines is the
> aperture available in MMIO to define the possible addresses and values
> to be written to trigger the interrupts. The device itself plays a
> large role in defining the number of interrupts ultimately requested.
> 
> > > At a minimum I really think we need to go through and have a clear
> > > definition on when updating the MSI-X table size is okay and when it
> > > is not. I am not sure just saying to not update it when a driver isn't
> > > attached is enough to guarantee all that.  
> >
> > If you know of a real issue then please state it, other don't fear
> > monger "maybe" issues that don't exist.  
> 
> Well I don't have visibility into your firmware so I am not sure what
> is going on in response to this command so forgive me when I do a bit
> of fear mongering when somebody tells me that all this patch set does
> is modify the VF configuration space.
> 
> As I have said my main concern is somebody really screwing something
> like this up and creating a security vulnerability where they do
> something exactly like updating just the MSI-X table size without
> protecting the MMIO region that contains the remaining MSI-X table
> entries. This is why I would be much more comfortable with something
> like a vfio ioctl that says that while the device supports the number
> reported in the MSI-X table size the vfio will only present some
> subset of those entries to the guest. With that I could at least
> review the code and verify that it is providing the expected
> protections.

We put a lot of trust in VF firmware otherwise, perhaps too much, but I
don't see how this is outside of how we already trust VFs.  MSIs are
just a DMA write, users don't need to use the vector table to make
devices perform arbitrary DMA writes.  If we trust VF firmware to
signal MSIs with the VF RID, then the IOMMU should provide protection.

> > > What we are talking about is the MSI-X table size. Not the number of
> > > MSI-X vectors being requested by the device driver. Those are normally
> > > two seperate things.  
> >
> > Yes, table size is what is critical. The number of entries in that BAR
> > memory is what needs to be controlled.  
> 
> That is where we disagree. My past experience is that in the device
> you have to be able to associate an interrupt cause to an interrupt
> vector. Normally as a part of that the device itself will place some
> limit on how many causes and vectors you can associate before you even
> get to the MSI-X table. The MSI-X table size is usually a formality
> that defines the upper limit on the number of entries the device might
> request. The reason why most drivers don't bother asking for it or
> reading it is because it is defined early as a part of the definition
> of the device itself.
> 
> Going back to my earlier example I am not going to size a MSI-X table
> at 2048 for a device that only has a few interrupt sources. Odds are I
> would size it such that I will have enough entries in the table to
> cover all my interrupt sources. Usually the limiting factor for an
> MSI-X request is the system itself as too many devices requesting a
> large number of interrupts may end up eating up all the vectors
> available for a given CPU.
> 
> > > > The standards based way to communicate that limit is the MSI table cap
> > > > size.  
> > >
> > > This only defines the maximum number of entries, not how many have to be used.  
> >
> > A driver can't use entries beyond the cap. We are not trying to
> > reclaim vectors that are available but not used by the OS.  
> 
> The MSI-X table consists of a MMIO region in one of the BARs on the
> device. It is easily possible to code something up in a driver that
> would go in and be able to access the region. Most sensible devices
> place it in a separate BAR but even then you have to worry about them
> possibly sharing the memory region internally among several devices.
> 
> The big thing I see as an issue with all this is that arbitrarily
> reducing the size of the MSI-X table doesn't have any actual effect on
> the MMIO resources. They are still there. So a bad firmware doing
> something like reducing the table size without also restricting access
> to the resources in the BAR potentially opens up a security hole as
> the MSI-X vector is really nothing more than a pre-programmed PCIe
> write waiting for something to trigger it. Odds are an IOMMU would
> block it, but still not necessarily a good thing.

Exactly, it's nothing more than a DMA write, which a malicious user can
trigger the device to perform in other ways if not through the vector
table.  We rely on the IOMMU for protection.

> As such my preference would be to leave the MSI-X table size static at
> the size of possible vectors that could be modified, and instead have
> the firmware guarantee that writing to those registers has no effect.
> Then when you do something like a direct assignment vfio_pci will also
> guard that region of MMIO instead of only guarding a portion of it.
> 
> > > I'm not offering up a non-standard way to do this. Just think about
> > > it. If I have a device that defines an MSI-X table size of 2048 but
> > > makes use of only one vector how would that be any different than what
> > > I am suggesting where you size your VF to whatever the maximum is you
> > > need but only make use of some fixed number from the hardware.  
> >
> > That is completely different, in the hypervisor there is no idea how
> > many vectors a guest OS will create. The FW is told to only permit 1
> > vector. How is the guest to know this if we don't update the config
> > space *as the standard requires* ?  
> 
> Doesn't the guest driver talk to the firmware? Last I knew it had to
> request additional resources such as queues and those come from the
> firmware don't they?
> 
> > > I will repeat what I said before. Why can't this be handled via the
> > > vfio interface?  
> >
> > 1) The FW has to be told of the limit and everything has to be in sync
> >    If the FW is out of sync with the config space then everything
> >    breaks if the user makes even a small mistake - for instance
> >    forgetting to use the ioctl to override vfio. This is needlessly
> >    frail and complicated.  
> 
> That is also the way I feel about the sysfs solution.
> 
> I just see VFIO making the call to the device to notify it that it
> only needs X number of vectors instead of having the VF sysfs doing
> it.

Where does this occur?  As above, the device owner shouldn't
necessarily have privileges to exhaust the entire PF of vectors.  Are
we really going to go to the trouble of creating cgroups for MSI-X
vectors?  Reducing vectors through emulation is easy, but that's only a
partial solution to the goal here I think.  Thanks,

Alex

> > 2) VFIO needs to know how to tell the FW the limit so it can override
> >    the config space with emulation. This is all device specific and I
> >    don't see that adding an extension to vfio is any better than
> >    adding one here.  
> 
> So it depends on the setup. In my suggestion I was suggesting VFIO
> defines the maximum, not the minimum. So the guest would automatically
> know exactly how many it would have as the table size would be
> specified inside the guest.
> 
> > 3) VFIO doesn't cover any other driver that binds to the VF, so
> >    this "solution" leaves the normal mlx5_core functionally broken on
> >    VFs which is a major regression.
> >
> > Overall the entire idea to have the config space not reflect the
> > actual current device configuration seems completely wrong to me - why
> > do this? For what gain? It breaks everything.  
> 
> Your configuration is admittedly foreign to me as well. So if I am
> understanding correctly your limiting factor isn't the number of
> interrupt causes in the platform, but the firmware deciding to provide
> too few interrupt vectors in the MSI-X table. I'm just kind of
> surprised the firmware itself isn't providing some sort of messaging
> on the number of interrupt vectors that a given device has since I
> assume that it is already providing you with information on the number
> of queues and such since that isn't provided by any other mechanism.
>
Alexander Duyck Jan. 15, 2021, 1:56 a.m. UTC | #17
On Thu, Jan 14, 2021 at 3:28 PM Alex Williamson
<alex.williamson@redhat.com> wrote:
>
> On Thu, 14 Jan 2021 13:43:57 -0800
> Alexander Duyck <alexander.duyck@gmail.com> wrote:
>
> > On Thu, Jan 14, 2021 at 12:08 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Thu, Jan 14, 2021 at 11:24:12AM -0800, Alexander Duyck wrote:
> > >
> > > > > As you say BAR and MSI vector table are not so different, it seems
> > > > > perfectly in line with current PCI sig thinking to allow resizing the
> > > > > MSI as well
> > > >
> > > > The resizing of a BAR has an extended capability that goes with it and
> > > > isn't something that the device can just do on a whim. This patch set
> > > > is not based on implementing some ECN for resizable MSI-X tables. I
> > > > view it as arbitrarily rewriting the table size for a device after it
> > > > is created.
> > >
> > > The only difference is resizing the BAR is backed by an ECN, and this
> > > is an extension. The device does not "do it on a whim" the OS tells it
> > > when to change the size, exactly like for BAR resizing.
> > >
> > > > In addition Leon still hasn't answered my question on preventing the
> > > > VF driver from altering entries beyond the ones listed in the table.
> > >
> > > Of course this is blocked, the FW completely revokes the HW resource
> > > backing the vectors.
> >
> > One of the troubles with this is that I am having to take your word
> > for it. The worst case scenario in my mind is that this is just was
> > Leon described it earlier and the firmware interface is doing nothing
> > more than altering the table size in the MSI-X config space so that
> > the value can be picked up by VFIO and advertised to the guest. In
> > such a situation you end up opening up a backdoor as now there are
> > vectors that could be configured by userspace since the protections
> > provided by VFIO are disabled as you could be reporting a size that is
> > smaller than the actual number or vectors.
>
> This is why we have interrupt remappers.  We gave up trying to actually
> prevent vfio users from having access to write the MSI-X vector table,
> there are too many backdoors and page-size issues that this is
> impractical.  Instead we rely on interrupt remappers or equivalent
> IOMMU hardware to make sure that a device can only signal the interrupts
> we've programmed, which occurs via vfio ioctls making use of the MSI-X
> capability information.  So if vfio thinks the limit is lower than
> actually implemented in the vector table, the user shouldn't be able to
> do anything with the extra vectors anyway.
>
> Regarding the read-only nature of the MSI-X capability, would it help
> if this interface posted a value which was enabled on FLR of the VF?
> I think that would be legal relative to the spec, so really we're
> talking about the semantics of whether we really need those a device
> reset to change the value report in a read-only field.

I realized this kind of got derailed when we started arguing about the
MSI-X table size. I am okay with the table size changing, however I am
still not a fan of the VF driver having to probe the PF driver to do
so. I really think it would work much better the other way around with
the PF having to probe the VF.

That said, it only works at the driver level. So if the firmware is
the one that is having to do this it also occured to me that if this
update happened on FLR that would probably be preferred. Specifically
if it were tied in with the devlink resources interface I think it
would be a good fit for this issue as you could have a global pool of
vf-interrupt resources that are distributed though individual
subresources where each subresource would indicate an individual VF.
In addition the resources functionality for devlink has the concept of
a posted update where changes do not apply immediately if they cannot
be updated until some event occurs such as an FLR.

Since the mlx5 already supports devlink I don't see any reason why the
driver couldn't be extended to also support the devlink resource
interface and apply it to interrupts.

> > These are the kind of things I am worried about with this interface.
> > It just seems like this is altering the VF PCIe device config to
> > address an issue that would be better handled by the vfio interface
> > ath VM creation time. At least if this was left to vfio it could
> > prevent the VM from being able to access the unused entries and just
> > limit the guest to the ones that we want to have the VM access.
> > Instead we are just being expected to trust the firmware for security
> > from the VF should it decide to try and be malicious.
>
> It is possible for vfio to fake the MSI-X capability and limit what a
> user can access, but I don't think that's what is being done here.

Yeah, I am assuming that is what is being done here. However some of
the things Leon said gave me pause when he made a comment earlier that
it was just updating the MSI-X config. That may not have been what he
meant, but what worries me is an interface like this sets up things
for something like that in the future where someone decides to
short-cut the interface.

> There was a previous comment that the goal is actually to expand the
> number of MSI-X vectors for some devices.  vfio can't do that.  That
> requires interaction and coordination with a PF driver managing a pool
> of resources where we might require not only device ownership, but host
> administrative privileges to prevent a user exhausting the pool.  That's
> not ioctl material for vfio.

In this case the PF driver isn't really managing that, it is the
firmware. Basically it is an entity outside the kernel as the PF
doesn't even have control over its own resources, it is the firmware
that decides what it gets. Thus why the mlx5 driver is apparently at
the mercy of the MSI-X table size to determine how many interrupts it
can actually have.. :)

> > > > From what I can tell, the mlx5 Linux driver never reads the MSI-X
> > > > flags register so it isn't reading the MSI-X size either.
> > >
> > > I don't know why you say that. All Linux drivers call into something
> > > like pci_alloc_irq_vectors() requesting a maximum # of vectors and
> > > that call returns the actual allocated. Drivers can request more
> > > vectors than the system provides, which is what mlx5 does.
> > >
> > > Under the call chain of pci_alloc_irq_vectors() it calls
> > > pci_msix_vec_count() which does
> > >
> > >         pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
> > >         return msix_table_size(control);
> > >
> > > And eventually uses that to return the result to the driver.
> > >
> > > So, yes, it reads the config space and ensures it doesn't allocate
> > > more vectors than that.
> > >
> > > Every driver using MSI does this in Linux.
> > >
> > > Adjusting config space *directly* limits the number of vectors the
> > > driver allocates.
> > >
> > > You should be able to find the call chain in mlx5 based on the above
> > > guidance.
> >
> > Yes, but at the same time you also end up passing a max_vecs into the
> > function as you have multiple limitations that come into account from
> > both the driver, the system, and the table. The MSI-X table size is
> > just one piece. Specifically the bit in the code for the mlx5 driver
> > is:
> >
> > nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
> >       MLX5_IRQ_VEC_COMP_BASE;
> > nvec = min_t(int, nvec, num_eqs);
> >
> > So saying that the MSI-X table size is what defines the number of
> > interrupts you can use is only partially true. What it defines is the
> > aperture available in MMIO to define the possible addresses and values
> > to be written to trigger the interrupts. The device itself plays a
> > large role in defining the number of interrupts ultimately requested.
> >
> > > > At a minimum I really think we need to go through and have a clear
> > > > definition on when updating the MSI-X table size is okay and when it
> > > > is not. I am not sure just saying to not update it when a driver isn't
> > > > attached is enough to guarantee all that.
> > >
> > > If you know of a real issue then please state it, other don't fear
> > > monger "maybe" issues that don't exist.
> >
> > Well I don't have visibility into your firmware so I am not sure what
> > is going on in response to this command so forgive me when I do a bit
> > of fear mongering when somebody tells me that all this patch set does
> > is modify the VF configuration space.
> >
> > As I have said my main concern is somebody really screwing something
> > like this up and creating a security vulnerability where they do
> > something exactly like updating just the MSI-X table size without
> > protecting the MMIO region that contains the remaining MSI-X table
> > entries. This is why I would be much more comfortable with something
> > like a vfio ioctl that says that while the device supports the number
> > reported in the MSI-X table size the vfio will only present some
> > subset of those entries to the guest. With that I could at least
> > review the code and verify that it is providing the expected
> > protections.
>
> We put a lot of trust in VF firmware otherwise, perhaps too much, but I
> don't see how this is outside of how we already trust VFs.  MSIs are
> just a DMA write, users don't need to use the vector table to make
> devices perform arbitrary DMA writes.  If we trust VF firmware to
> signal MSIs with the VF RID, then the IOMMU should provide protection.

Agreed. The IOMMU should take care of this. I'm not a fan of the
dynamic table resizing, however I can live with it. The one part that
is a hard no for me is the VF sysfs file. I would much rather see this
managed through the Devlink Resource interface.
https://www.kernel.org/doc/html/latest/networking/devlink/devlink-resource.html

That combined with the update taking place on FLR would likely be a
perfect fit in my eyes.

> > > > What we are talking about is the MSI-X table size. Not the number of
> > > > MSI-X vectors being requested by the device driver. Those are normally
> > > > two seperate things.
> > >
> > > Yes, table size is what is critical. The number of entries in that BAR
> > > memory is what needs to be controlled.
> >
> > That is where we disagree. My past experience is that in the device
> > you have to be able to associate an interrupt cause to an interrupt
> > vector. Normally as a part of that the device itself will place some
> > limit on how many causes and vectors you can associate before you even
> > get to the MSI-X table. The MSI-X table size is usually a formality
> > that defines the upper limit on the number of entries the device might
> > request. The reason why most drivers don't bother asking for it or
> > reading it is because it is defined early as a part of the definition
> > of the device itself.
> >
> > Going back to my earlier example I am not going to size a MSI-X table
> > at 2048 for a device that only has a few interrupt sources. Odds are I
> > would size it such that I will have enough entries in the table to
> > cover all my interrupt sources. Usually the limiting factor for an
> > MSI-X request is the system itself as too many devices requesting a
> > large number of interrupts may end up eating up all the vectors
> > available for a given CPU.
> >
> > > > > The standards based way to communicate that limit is the MSI table cap
> > > > > size.
> > > >
> > > > This only defines the maximum number of entries, not how many have to be used.
> > >
> > > A driver can't use entries beyond the cap. We are not trying to
> > > reclaim vectors that are available but not used by the OS.
> >
> > The MSI-X table consists of a MMIO region in one of the BARs on the
> > device. It is easily possible to code something up in a driver that
> > would go in and be able to access the region. Most sensible devices
> > place it in a separate BAR but even then you have to worry about them
> > possibly sharing the memory region internally among several devices.
> >
> > The big thing I see as an issue with all this is that arbitrarily
> > reducing the size of the MSI-X table doesn't have any actual effect on
> > the MMIO resources. They are still there. So a bad firmware doing
> > something like reducing the table size without also restricting access
> > to the resources in the BAR potentially opens up a security hole as
> > the MSI-X vector is really nothing more than a pre-programmed PCIe
> > write waiting for something to trigger it. Odds are an IOMMU would
> > block it, but still not necessarily a good thing.
>
> Exactly, it's nothing more than a DMA write, which a malicious user can
> trigger the device to perform in other ways if not through the vector
> table.  We rely on the IOMMU for protection.

I'm just not a fan of opening up more vectors, but as I said above I
would be willing to concede this part of the argument since it seems
like the mlx5 programming model requires the MSI-X table size to match
the limit.

> > As such my preference would be to leave the MSI-X table size static at
> > the size of possible vectors that could be modified, and instead have
> > the firmware guarantee that writing to those registers has no effect.
> > Then when you do something like a direct assignment vfio_pci will also
> > guard that region of MMIO instead of only guarding a portion of it.
> >
> > > > I'm not offering up a non-standard way to do this. Just think about
> > > > it. If I have a device that defines an MSI-X table size of 2048 but
> > > > makes use of only one vector how would that be any different than what
> > > > I am suggesting where you size your VF to whatever the maximum is you
> > > > need but only make use of some fixed number from the hardware.
> > >
> > > That is completely different, in the hypervisor there is no idea how
> > > many vectors a guest OS will create. The FW is told to only permit 1
> > > vector. How is the guest to know this if we don't update the config
> > > space *as the standard requires* ?
> >
> > Doesn't the guest driver talk to the firmware? Last I knew it had to
> > request additional resources such as queues and those come from the
> > firmware don't they?
> >
> > > > I will repeat what I said before. Why can't this be handled via the
> > > > vfio interface?
> > >
> > > 1) The FW has to be told of the limit and everything has to be in sync
> > >    If the FW is out of sync with the config space then everything
> > >    breaks if the user makes even a small mistake - for instance
> > >    forgetting to use the ioctl to override vfio. This is needlessly
> > >    frail and complicated.
> >
> > That is also the way I feel about the sysfs solution.
> >
> > I just see VFIO making the call to the device to notify it that it
> > only needs X number of vectors instead of having the VF sysfs doing
> > it.
>
> Where does this occur?  As above, the device owner shouldn't
> necessarily have privileges to exhaust the entire PF of vectors.  Are
> we really going to go to the trouble of creating cgroups for MSI-X
> vectors?  Reducing vectors through emulation is easy, but that's only a
> partial solution to the goal here I think.  Thanks,
>
> Alex

In my mind this would have been in response to having an ioctl call
come down indicating that we want to limit the number of MSI-X vectors
to some value. Keep in mind that also in this model I was suggesting
that the device just advertise whatever the maximum MSI-X table size
could be for the VF. So we would only ever be shrinking the device,
not growing it.

I was just kind of thinking that something like this might have uses
outside of the mlx5 case as I seem to recall, and not my memory could
be faulty, that there were some OSes such as Windows that would do the
kind of setup where it would allocate as many vectors as a device
could handle when the CPU count got high. In such a VM it might be
useful to have an option for the hypervisor to override the MSI-X
table size and tell the OS that it cannot allocate asw many vectors.

Anyway I was just kind of wandering into solution space on this trying
to come up with an alternate solution. As I said the ideal solution
for me at this point would be to see the Devlink Resource interface be
used to assign the interrupts, and then apply the changes on FLR.
Jason Gunthorpe Jan. 15, 2021, 1:51 p.m. UTC | #18
On Thu, Jan 14, 2021 at 01:43:57PM -0800, Alexander Duyck wrote:

> > > In addition Leon still hasn't answered my question on preventing the
> > > VF driver from altering entries beyond the ones listed in the table.
> >
> > Of course this is blocked, the FW completely revokes the HW resource
> > backing the vectors.
> 
> One of the troubles with this is that I am having to take your word
> for it.

This is a Linux patch review, not a security review of a HW
implementation. There are million ways to screw up a PCI device
implementation and in SRIOV the PCI device HW implementation forms
part of the trust base of the hypervisor.

If the HW API can be implemented securely and the Linux code is
appropriate is the only question here.

In this case mlx5 HW is implemented correctly and securely, if you
don't belive then you are free not to use it.

> What it defines is the aperture available in MMIO to define the
> possible addresses and values to be written to trigger the
> interrupts. The device itself plays a large role in defining the
> number of interrupts ultimately requested.

Again you are confused about what is going on here - this is about
reconfiguring the HW so that MSI vector entries exist or not - it has
absoultely nothing to do with the driver. We are not optimizing for
the case where the driver does not use MSI vectors the VF has
available.

> > > At a minimum I really think we need to go through and have a clear
> > > definition on when updating the MSI-X table size is okay and when it
> > > is not. I am not sure just saying to not update it when a driver isn't
> > > attached is enough to guarantee all that.
> >
> > If you know of a real issue then please state it, other don't fear
> > monger "maybe" issues that don't exist.
>
> Well I don't have visibility into your firmware so I am not sure what
> is going on in response to this command so forgive me when I do a bit
> of fear mongering when somebody tells me that all this patch set does
> is modify the VF configuration space.

You were not talking about the FW, "is okay and when it is not" is a
*Linux* question.

> > > What we are talking about is the MSI-X table size. Not the number of
> > > MSI-X vectors being requested by the device driver. Those are normally
> > > two seperate things.
> >
> > Yes, table size is what is critical. The number of entries in that BAR
> > memory is what needs to be controlled.
> 
> That is where we disagree. 

Huh? You are disagreeing this is how the mlx5 device works?

> Normally as a part of that the device itself will place some
> limit on how many causes and vectors you can associate before you even
> get to the MSI-X table.

For mlx5 this cause limit is huge. With IMS it can even be higher than
the 2K MSI-X limit. Remember on an x86 system you get 256 interrupt
vectors per CPU *and* per vCPU, so with interrupt remapping there can
be huge numbers of interrupts required.

Your "normally" is for simplistic fixed function HW devices not
intended for use at this scale.

> The MSI-X table size is usually a formality that defines the upper
> limit on the number of entries the device might request.

It is not a formality. PCI rules require *actual physical HW* to be
dedicated to the MSI vector entries.

Think of it like this - the device has a large global MSI-X table of
say 2K entires. This is the actual physical HW SRAM backing MSI
entires required by PCIe.

The HW will map the MSI-X table BAR space in every PF/VF to a slice of
that global table. If the PCI Cap says 8 entries then the MSI-X page has
only 8 entries, everything else is /dev/null.

Global MSI entries cannot be shared - the total of all PF/VFs cap
field must not be more than 2K.

One application requires 2K MSI-X on a single function because it uses
VDPA devices and VT-d interrupt remapping

Another application requires 16 MSI-X on 128 VFs because it is using
SRIOV with VMs having 16 vCPUs.

The HW is configured differently in both cases. It is not something
that can be faked with VFIO!

> > That is completely different, in the hypervisor there is no idea how
> > many vectors a guest OS will create. The FW is told to only permit 1
> > vector. How is the guest to know this if we don't update the config
> > space *as the standard requires* ?
> 
> Doesn't the guest driver talk to the firmware? Last I knew it had to
> request additional resources such as queues and those come from the
> firmware don't they?

That is not how things work. Because VFIO has to be involved in
setting up interrupt remapping through its MSI emulation we don't get
to use a dynamic FW only path as something like IMS might imagine.

That would be so much better, but lots of things are not ready for
that.

> > 1) The FW has to be told of the limit and everything has to be in sync
> >    If the FW is out of sync with the config space then everything
> >    breaks if the user makes even a small mistake - for instance
> >    forgetting to use the ioctl to override vfio. This is needlessly
> >    frail and complicated.
> 
> That is also the way I feel about the sysfs solution.

Huh? The sysfs is much safer. If the write() succeeds I can't think of
any way the system would be left broken? Why do you think it is frail?

> I'm just kind of surprised the firmware itself isn't providing some
> sort of messaging on the number of interrupt vectors that a given

It does, it is the PCI cap, just because you keep saying it isn't used
doesn't make that true :)

> device has since I assume that it is already providing you with
> information on the number of queues and such since that isn't
> provided by any other mechanism.

Queues are effectively unlimited.

Jason
Jason Gunthorpe Jan. 15, 2021, 2:06 p.m. UTC | #19
On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:

> That said, it only works at the driver level. So if the firmware is
> the one that is having to do this it also occured to me that if this
> update happened on FLR that would probably be preferred. 

FLR is not free, I'd prefer not to require it just for some
philosophical reason.

> Since the mlx5 already supports devlink I don't see any reason why the
> driver couldn't be extended to also support the devlink resource
> interface and apply it to interrupts.

So you are OK with the PF changing the VF as long as it is devlink not
sysfs? Seems rather arbitary?

Leon knows best, but if I recall devlink becomes wonky when the VF
driver doesn't provide a devlink instance. How does it do reload of a
VF then?

I think you end up with essentially the same logic as presented here
with sysfs.

> > It is possible for vfio to fake the MSI-X capability and limit what a
> > user can access, but I don't think that's what is being done here.
> 
> Yeah, I am assuming that is what is being done here. 

Just to be really clear, that assumption is wrong

Jason
Leon Romanovsky Jan. 15, 2021, 3:53 p.m. UTC | #20
On Fri, Jan 15, 2021 at 10:06:19AM -0400, Jason Gunthorpe wrote:
> On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:
>
> > That said, it only works at the driver level. So if the firmware is
> > the one that is having to do this it also occured to me that if this
> > update happened on FLR that would probably be preferred.
>
> FLR is not free, I'd prefer not to require it just for some
> philosophical reason.
>
> > Since the mlx5 already supports devlink I don't see any reason why the
> > driver couldn't be extended to also support the devlink resource
> > interface and apply it to interrupts.
>
> So you are OK with the PF changing the VF as long as it is devlink not
> sysfs? Seems rather arbitary?
>
> Leon knows best, but if I recall devlink becomes wonky when the VF
> driver doesn't provide a devlink instance. How does it do reload of a
> VF then?
>
> I think you end up with essentially the same logic as presented here
> with sysfs.

The reasons why I decided to go with sysfs are:
1. This MSI-X table size change is applicable to ALL devices in the world,
and not only netdev.
2. This is purely PCI field and apply equally with same logic to all
subsystems and not to netdev only.
3. The sysfs interface is the standard way of configuring PCI/core, not
devlink.
4. This is how orchestration software provisioning VFs already. It fits
real world usage of SR-IOV, not the artificial one that is proposed during
the discussion.

So the idea to use devlink just because mlx5 supports it, sound really
wrong to me. If it was other driver from another subsystem without
devlink support, the request to use devlink won't never come.

Thanks

>
> > > It is possible for vfio to fake the MSI-X capability and limit what a
> > > user can access, but I don't think that's what is being done here.
> >
> > Yeah, I am assuming that is what is being done here.
>
> Just to be really clear, that assumption is wrong
>
> Jason
Alexander Duyck Jan. 16, 2021, 1:48 a.m. UTC | #21
On Fri, Jan 15, 2021 at 7:53 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Fri, Jan 15, 2021 at 10:06:19AM -0400, Jason Gunthorpe wrote:
> > On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:
> >
> > > That said, it only works at the driver level. So if the firmware is
> > > the one that is having to do this it also occured to me that if this
> > > update happened on FLR that would probably be preferred.
> >
> > FLR is not free, I'd prefer not to require it just for some
> > philosophical reason.
> >
> > > Since the mlx5 already supports devlink I don't see any reason why the
> > > driver couldn't be extended to also support the devlink resource
> > > interface and apply it to interrupts.
> >
> > So you are OK with the PF changing the VF as long as it is devlink not
> > sysfs? Seems rather arbitary?
> >
> > Leon knows best, but if I recall devlink becomes wonky when the VF
> > driver doesn't provide a devlink instance. How does it do reload of a
> > VF then?
> >
> > I think you end up with essentially the same logic as presented here
> > with sysfs.
>
> The reasons why I decided to go with sysfs are:
> 1. This MSI-X table size change is applicable to ALL devices in the world,
> and not only netdev.

In the PCI world MSI-X table size is a read only value. That is why I
am pushing back on this as a PCI interface.

> 2. This is purely PCI field and apply equally with same logic to all
> subsystems and not to netdev only.

Again, calling this "purely PCI" is the sort of wording that has me
concerned. I would prefer it if we avoid that wording. There is much
more to this than just modifying the table size field. The firmware is
having to shift resources between devices and this potentially has an
effect on the entire part, not just one VF.

> 3. The sysfs interface is the standard way of configuring PCI/core, not
> devlink.

This isn't PCI core that is being configured. It is the firmware for
the device. You are working with resources that are shared between
multiple functions.

> 4. This is how orchestration software provisioning VFs already. It fits
> real world usage of SR-IOV, not the artificial one that is proposed during
> the discussion.

What do you mean this is how they are doing it already? Do you have
something out-of-tree and that is why you are fighting to keep the
sysfs? If so that isn't a valid argument.

> So the idea to use devlink just because mlx5 supports it, sound really
> wrong to me. If it was other driver from another subsystem without
> devlink support, the request to use devlink won't never come.
>
> Thanks

I am suggesting the devlink resources interface because it would be a
VERY good fit for something like this. By the definition of it:
``devlink`` provides the ability for drivers to register resources, which
can allow administrators to see the device restrictions for a given
resource, as well as how much of the given resource is currently
in use. Additionally, these resources can optionally have configurable size.
This could enable the administrator to limit the number of resources that
are used.

Even looking over the example usage I don't see there being much to
prevent you from applying it to this issue. In addition it has the
idea of handling changes that cannot be immediately applied already
included. Your current solution doesn't have a good way of handling
that and instead just aborts with an error.
Alexander Duyck Jan. 16, 2021, 4:32 a.m. UTC | #22
On Fri, Jan 15, 2021 at 6:06 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:
>
> > That said, it only works at the driver level. So if the firmware is
> > the one that is having to do this it also occured to me that if this
> > update happened on FLR that would probably be preferred.
>
> FLR is not free, I'd prefer not to require it just for some
> philosophical reason.

It wasn't so much a philosophical thing as the fact that it can sort
of take the place as a reload. Essentially with an FLR we are
rewriting the configuration so if the driver were involved it would be
a good time to pull in the MSI-X table update. However looking over
the mlx5 code I don't see any handling of FLR in there so I am
assuming that is handled by the firmware.

> > Since the mlx5 already supports devlink I don't see any reason why the
> > driver couldn't be extended to also support the devlink resource
> > interface and apply it to interrupts.
>
> So you are OK with the PF changing the VF as long as it is devlink not
> sysfs? Seems rather arbitary?

It is about the setup of things. The sysfs existing in the VF is kind
of ugly since it is a child device calling up to the parent and
telling it how it is supposed to be configured. I'm sure in theory we
could probably even have the VF request something like that itself
through some sort of mailbox and cut out the middle-man but that would
be even uglier.

If you take a look at the usage of pci_physfn it is usually in spots
where the PF is being looked for in order to find the policy that is
supposed to be applied to the VF. This is one of the first few cases
where it is being used to set the policy for the VF.

> Leon knows best, but if I recall devlink becomes wonky when the VF
> driver doesn't provide a devlink instance. How does it do reload of a
> VF then?

In my mind it was the PF driver providing a devlink instance for the
VF if a driver isn't loaded. Then if the mlx5 driver was loaded on the
VF you would replace that instance with one supported by the VF itself
in order to coordinate with the VF driver. That way if the mlx5 driver
is loaded on the VF you can still change the settings instead of being
blocked by your own driver.

As far as a reload the non-driver loaded case would probably look a
lot like how things are handled now with the taking of the device
lock, verifying no driver is loaded, notifying the firmware, and
releasing the lock when it is complete. If the mlx5 driver is loaded
on the VF it could be a more complete setup that would probably look
more like your standard driver reinit.

> I think you end up with essentially the same logic as presented here
> with sysfs.

It is similar, however the big difference is how the control is setup.
With the VF sysfs file running things it feels sort of like the tail
wagging the dog. You are having to go through and verify that this is
a VF, that the PF is present, that the PF supports this operation and
so on. If the PF is in charge of managing the configuration it should
be the one registering the interfaces, not the VF. That is my view on
this anyway as I feel it simplifies this quite a bit as the interface
won't be there if it isn't supported.:

> > > It is possible for vfio to fake the MSI-X capability and limit what a
> > > user can access, but I don't think that's what is being done here.
> >
> > Yeah, I am assuming that is what is being done here.
>
> Just to be really clear, that assumption is wrong

I misspoke and meant to agree with Alex's comment. If you are saying I
was wrong, then yes, I was wrong. I meant that I was assuming you were
resizing the actual table in the MMIO region where the MSI-X table and
PBA bits are present.
Leon Romanovsky Jan. 16, 2021, 8:20 a.m. UTC | #23
On Fri, Jan 15, 2021 at 05:48:59PM -0800, Alexander Duyck wrote:
> On Fri, Jan 15, 2021 at 7:53 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Fri, Jan 15, 2021 at 10:06:19AM -0400, Jason Gunthorpe wrote:
> > > On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:
> > >
> > > > That said, it only works at the driver level. So if the firmware is
> > > > the one that is having to do this it also occured to me that if this
> > > > update happened on FLR that would probably be preferred.
> > >
> > > FLR is not free, I'd prefer not to require it just for some
> > > philosophical reason.
> > >
> > > > Since the mlx5 already supports devlink I don't see any reason why the
> > > > driver couldn't be extended to also support the devlink resource
> > > > interface and apply it to interrupts.
> > >
> > > So you are OK with the PF changing the VF as long as it is devlink not
> > > sysfs? Seems rather arbitary?
> > >
> > > Leon knows best, but if I recall devlink becomes wonky when the VF
> > > driver doesn't provide a devlink instance. How does it do reload of a
> > > VF then?
> > >
> > > I think you end up with essentially the same logic as presented here
> > > with sysfs.
> >
> > The reasons why I decided to go with sysfs are:
> > 1. This MSI-X table size change is applicable to ALL devices in the world,
> > and not only netdev.
>
> In the PCI world MSI-X table size is a read only value. That is why I
> am pushing back on this as a PCI interface.

And it stays read-only.

>
> > 2. This is purely PCI field and apply equally with same logic to all
> > subsystems and not to netdev only.
>
> Again, calling this "purely PCI" is the sort of wording that has me
> concerned. I would prefer it if we avoid that wording. There is much
> more to this than just modifying the table size field. The firmware is
> having to shift resources between devices and this potentially has an
> effect on the entire part, not just one VF.

It is internal to HW implementation, dumb device can solve it differently.

>
> > 3. The sysfs interface is the standard way of configuring PCI/core, not
> > devlink.
>
> This isn't PCI core that is being configured. It is the firmware for
> the device. You are working with resources that are shared between
> multiple functions.

I'm ensuring that "lspci -vv .." will work correctly after such change.
It is PCI core responsibility.

>
> > 4. This is how orchestration software provisioning VFs already. It fits
> > real world usage of SR-IOV, not the artificial one that is proposed during
> > the discussion.
>
> What do you mean this is how they are doing it already? Do you have
> something out-of-tree and that is why you are fighting to keep the
> sysfs? If so that isn't a valid argument.

I have Kubernetes and OpenStack, indeed they are not part of the kernel tree.
They already use sriov_driver_autoprobe sysfs knob to disable autobind
before even starting. They configure MACs and bind VFs through sysfs/netlink
already. For them, the read/write of sysfs that is going to be bound to
the already created VM with known CPU properties, fits perfectly.

>
> > So the idea to use devlink just because mlx5 supports it, sound really
> > wrong to me. If it was other driver from another subsystem without
> > devlink support, the request to use devlink won't never come.
> >
> > Thanks
>
> I am suggesting the devlink resources interface because it would be a
> VERY good fit for something like this. By the definition of it:
> ``devlink`` provides the ability for drivers to register resources, which
> can allow administrators to see the device restrictions for a given
> resource, as well as how much of the given resource is currently
> in use. Additionally, these resources can optionally have configurable size.
> This could enable the administrator to limit the number of resources that
> are used.

It is not resource, but HW objects. The devlink doesn't even see the VFs
as long as they are not bound to the drivers.

This is an example:

[root@vm ~]# echo 0 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_drivers_autoprobe
[root@vm ~]# echo 0 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_numvfs
[ 2370.579711] mlx5_core 0000:01:00.0: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3)
[root@vm ~]# echo 2 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_numvfs
[ 2377.663666] mlx5_core 0000:01:00.0: E-Switch: Enable: mode(LEGACY), nvfs(2), active vports(3)
[ 2377.777010] pci 0000:01:00.1: [15b3:101c] type 00 class 0x020000
[ 2377.784903] pci 0000:01:00.2: [15b3:101c] type 00 class 0x020000
[root@vm ~]# devlink dev
pci/0000:01:00.0
[root@vm ~]# lspci |grep nox
01:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
01:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function]
01:00.2 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function]

So despite us having 2 VFs ready to be given to VMs, administrator doesn't
see them as devices.

>
> Even looking over the example usage I don't see there being much to
> prevent you from applying it to this issue. In addition it has the
> idea of handling changes that cannot be immediately applied already
> included. Your current solution doesn't have a good way of handling
> that and instead just aborts with an error.

Yes, because it is HW resource that should be applied immediately to
make sure that it is honored, before it is committed to the users.

It is very tempting to use devlink everywhere, but it is really wrong
tool for this scenario.

Thanks
Alexander Duyck Jan. 18, 2021, 3:16 a.m. UTC | #24
On Sat, Jan 16, 2021 at 12:20 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Fri, Jan 15, 2021 at 05:48:59PM -0800, Alexander Duyck wrote:
> > On Fri, Jan 15, 2021 at 7:53 AM Leon Romanovsky <leon@kernel.org> wrote:
> > >
> > > On Fri, Jan 15, 2021 at 10:06:19AM -0400, Jason Gunthorpe wrote:
> > > > On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:
> > > >
> > > > > That said, it only works at the driver level. So if the firmware is
> > > > > the one that is having to do this it also occured to me that if this
> > > > > update happened on FLR that would probably be preferred.
> > > >
> > > > FLR is not free, I'd prefer not to require it just for some
> > > > philosophical reason.
> > > >
> > > > > Since the mlx5 already supports devlink I don't see any reason why the
> > > > > driver couldn't be extended to also support the devlink resource
> > > > > interface and apply it to interrupts.
> > > >
> > > > So you are OK with the PF changing the VF as long as it is devlink not
> > > > sysfs? Seems rather arbitary?
> > > >
> > > > Leon knows best, but if I recall devlink becomes wonky when the VF
> > > > driver doesn't provide a devlink instance. How does it do reload of a
> > > > VF then?
> > > >
> > > > I think you end up with essentially the same logic as presented here
> > > > with sysfs.
> > >
> > > The reasons why I decided to go with sysfs are:
> > > 1. This MSI-X table size change is applicable to ALL devices in the world,
> > > and not only netdev.
> >
> > In the PCI world MSI-X table size is a read only value. That is why I
> > am pushing back on this as a PCI interface.
>
> And it stays read-only.

Only if you come at it directly. What this is adding is a back door
that is visible as a part of the VF sysfs.

> >
> > > 2. This is purely PCI field and apply equally with same logic to all
> > > subsystems and not to netdev only.
> >
> > Again, calling this "purely PCI" is the sort of wording that has me
> > concerned. I would prefer it if we avoid that wording. There is much
> > more to this than just modifying the table size field. The firmware is
> > having to shift resources between devices and this potentially has an
> > effect on the entire part, not just one VF.
>
> It is internal to HW implementation, dumb device can solve it differently.

That is my point. I am worried about "dumb devices" that may follow. I
would like to see the steps that should be taken to prevent these sort
of things called out specifically. Basically this isn't just modifying
the PCIe config space, it is actually resizing the PBA and MSI-X
table.

> >
> > > 3. The sysfs interface is the standard way of configuring PCI/core, not
> > > devlink.
> >
> > This isn't PCI core that is being configured. It is the firmware for
> > the device. You are working with resources that are shared between
> > multiple functions.
>
> I'm ensuring that "lspci -vv .." will work correctly after such change.
> It is PCI core responsibility.

The current code doesn't work on anything with a driver loaded on it.
In addition the messaging provided is fairly minimal which results in
an interface that will be difficult to understand when it doesn't
work. In addition there is currently only one piece of hardware that
works with this interface which is the mlx5. My concern is this is
adding overhead to all VFs that will not be used by most SR-IOV
capable devices. In my view it would make much more sense to have a
top-down approach instead of bottom-up where the PF is registering
interfaces for the VFs.

If you want yet another compromise I would be much happier with the PF
registering the sysfs interfaces on the VFs rather than the VFs
registering the interface and hoping the PF supports it. At least with
that you are guaranteed the PF will respond to the interface when it
is registered.

> >
> > > 4. This is how orchestration software provisioning VFs already. It fits
> > > real world usage of SR-IOV, not the artificial one that is proposed during
> > > the discussion.
> >
> > What do you mean this is how they are doing it already? Do you have
> > something out-of-tree and that is why you are fighting to keep the
> > sysfs? If so that isn't a valid argument.
>
> I have Kubernetes and OpenStack, indeed they are not part of the kernel tree.
> They already use sriov_driver_autoprobe sysfs knob to disable autobind
> before even starting. They configure MACs and bind VFs through sysfs/netlink
> already. For them, the read/write of sysfs that is going to be bound to
> the already created VM with known CPU properties, fits perfectly.

By that argument the same could be said about netlink. What I don't
get is why it is okay to configure the MAC through netlink but
suddenly when we are talking about interrupts it is out of the
question. As far as the binding that is the driver interface which is
more or less grandfathered in anyway as there aren't too many ways to
deal with them as there isn't an alternate interface for the drivers
to define support.

> >
> > > So the idea to use devlink just because mlx5 supports it, sound really
> > > wrong to me. If it was other driver from another subsystem without
> > > devlink support, the request to use devlink won't never come.
> > >
> > > Thanks
> >
> > I am suggesting the devlink resources interface because it would be a
> > VERY good fit for something like this. By the definition of it:
> > ``devlink`` provides the ability for drivers to register resources, which
> > can allow administrators to see the device restrictions for a given
> > resource, as well as how much of the given resource is currently
> > in use. Additionally, these resources can optionally have configurable size.
> > This could enable the administrator to limit the number of resources that
> > are used.
>
> It is not resource, but HW objects. The devlink doesn't even see the VFs
> as long as they are not bound to the drivers.
>
> This is an example:
>
> [root@vm ~]# echo 0 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_drivers_autoprobe
> [root@vm ~]# echo 0 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_numvfs
> [ 2370.579711] mlx5_core 0000:01:00.0: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3)
> [root@vm ~]# echo 2 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_numvfs
> [ 2377.663666] mlx5_core 0000:01:00.0: E-Switch: Enable: mode(LEGACY), nvfs(2), active vports(3)
> [ 2377.777010] pci 0000:01:00.1: [15b3:101c] type 00 class 0x020000
> [ 2377.784903] pci 0000:01:00.2: [15b3:101c] type 00 class 0x020000
> [root@vm ~]# devlink dev
> pci/0000:01:00.0
> [root@vm ~]# lspci |grep nox
> 01:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
> 01:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function]
> 01:00.2 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function]
>
> So despite us having 2 VFs ready to be given to VMs, administrator doesn't
> see them as devices.

The MSI-X vectors are a resource assigned to hardware objects. It just
depends on how you want to look at things. Right now you have the VFs
register an interface on behalf of the PF. I am arguing it would be
better to have the PF register an interface on behalf of the VFs.
Ultimately the PF is responsible for creating the VFs in the first
place. I don't see it as that much of a leap to have the
mlx5_sriov_enable call register interfaces for the VFs so that you can
configure the MSI-X vectors from the PF, and then tear them down
before it frees the VFs. Having the VFs do the work seems error prone
since it is assuming the interfaces are there on the PF when in all
cases but one (mlx5) it currently isn't.

> >
> > Even looking over the example usage I don't see there being much to
> > prevent you from applying it to this issue. In addition it has the
> > idea of handling changes that cannot be immediately applied already
> > included. Your current solution doesn't have a good way of handling
> > that and instead just aborts with an error.
>
> Yes, because it is HW resource that should be applied immediately to
> make sure that it is honored, before it is committed to the users.

The problem is you cannot do that at all if the driver is already
loaded. One advantage of using something like devlink is that you
could potentially have the VF driver help to coordinate things so you
could have the case where the VF has the mlx5 driver loaded work
correctly where you would update the MSI-X vector count and then
trigger the driver reload via devlink.

> It is very tempting to use devlink everywhere, but it is really wrong
> tool for this scenario.

We can agree to disagree there. I am not a fan of sysfs being applied
everywhere either. The problem is it is an easy goto when someone is
looking for a quick and dirty solution and often leads to more
problems later as it usually misses critical path locking issues and
the like.

Especially when it is making a subordinate interface look like the
MSI-X table size is somehow writable. I would much rather the creation
of any interface controlled more directly by the PF, or at a minimum
have the PF registering the interfaces rather than leaving this up to
the VF in the hopes that the PF provides the functionality needed to
service the request.
Leon Romanovsky Jan. 18, 2021, 7:20 a.m. UTC | #25
On Sun, Jan 17, 2021 at 07:16:30PM -0800, Alexander Duyck wrote:
> On Sat, Jan 16, 2021 at 12:20 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Fri, Jan 15, 2021 at 05:48:59PM -0800, Alexander Duyck wrote:
> > > On Fri, Jan 15, 2021 at 7:53 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > >
> > > > On Fri, Jan 15, 2021 at 10:06:19AM -0400, Jason Gunthorpe wrote:
> > > > > On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:
> > > > >
> > > > > > That said, it only works at the driver level. So if the firmware is
> > > > > > the one that is having to do this it also occured to me that if this
> > > > > > update happened on FLR that would probably be preferred.
> > > > >
> > > > > FLR is not free, I'd prefer not to require it just for some
> > > > > philosophical reason.
> > > > >
> > > > > > Since the mlx5 already supports devlink I don't see any reason why the
> > > > > > driver couldn't be extended to also support the devlink resource
> > > > > > interface and apply it to interrupts.
> > > > >
> > > > > So you are OK with the PF changing the VF as long as it is devlink not
> > > > > sysfs? Seems rather arbitary?
> > > > >
> > > > > Leon knows best, but if I recall devlink becomes wonky when the VF
> > > > > driver doesn't provide a devlink instance. How does it do reload of a
> > > > > VF then?
> > > > >
> > > > > I think you end up with essentially the same logic as presented here
> > > > > with sysfs.
> > > >
> > > > The reasons why I decided to go with sysfs are:
> > > > 1. This MSI-X table size change is applicable to ALL devices in the world,
> > > > and not only netdev.
> > >
> > > In the PCI world MSI-X table size is a read only value. That is why I
> > > am pushing back on this as a PCI interface.
> >
> > And it stays read-only.
>
> Only if you come at it directly. What this is adding is a back door
> that is visible as a part of the VF sysfs.
>
> > >
> > > > 2. This is purely PCI field and apply equally with same logic to all
> > > > subsystems and not to netdev only.
> > >
> > > Again, calling this "purely PCI" is the sort of wording that has me
> > > concerned. I would prefer it if we avoid that wording. There is much
> > > more to this than just modifying the table size field. The firmware is
> > > having to shift resources between devices and this potentially has an
> > > effect on the entire part, not just one VF.
> >
> > It is internal to HW implementation, dumb device can solve it differently.
>
> That is my point. I am worried about "dumb devices" that may follow. I
> would like to see the steps that should be taken to prevent these sort
> of things called out specifically. Basically this isn't just modifying
> the PCIe config space, it is actually resizing the PBA and MSI-X
> table.

Exactly the last line the dumb device can implement differently. The
request is simple - configure MSI-X table size to be the new size.

>
> > >
> > > > 3. The sysfs interface is the standard way of configuring PCI/core, not
> > > > devlink.
> > >
> > > This isn't PCI core that is being configured. It is the firmware for
> > > the device. You are working with resources that are shared between
> > > multiple functions.
> >
> > I'm ensuring that "lspci -vv .." will work correctly after such change.
> > It is PCI core responsibility.
>
> The current code doesn't work on anything with a driver loaded on it.

The problem that no one care about this case, because in opposite to
other devices that usually operates in the hypervisor and probed during
the boot, the VFs are used differently. They run in VMs, probed there
and (usually) not needed in hypervisor.

The driver reload would make sense if PF MSI-X table was changed.

> In addition the messaging provided is fairly minimal which results in
> an interface that will be difficult to understand when it doesn't
> work.

I'm fond of simple interfaces: 0, EBUSY and EINVAL are common way
to inform user. We must remember that this interface is for low-level
PCI property and is needed for expert users who needs to squeeze maximum
for their VMs out of expensive high speed network card that supports SR-IOV.

According to the ebay, the CX6 card costs between 1000 and 1700 USDs,
not really home equipment.

> In addition there is currently only one piece of hardware that
> works with this interface which is the mlx5.

It is not different from any other feature, someone should be first.
This has very clear purpose, scoped well and understandable when and
why it is needed.

Kernel is full of devices and features that exist in one device only.

> My concern is this is
> adding overhead to all VFs that will not be used by most SR-IOV
> capable devices. In my view it would make much more sense to have a
> top-down approach instead of bottom-up where the PF is registering
> interfaces for the VFs.
>
> If you want yet another compromise I would be much happier with the PF
> registering the sysfs interfaces on the VFs rather than the VFs
> registering the interface and hoping the PF supports it. At least with
> that you are guaranteed the PF will respond to the interface when it
> is registered.

Thanks a lot, I appreciate it, will take a look now.

>
> > >
> > > > 4. This is how orchestration software provisioning VFs already. It fits
> > > > real world usage of SR-IOV, not the artificial one that is proposed during
> > > > the discussion.
> > >
> > > What do you mean this is how they are doing it already? Do you have
> > > something out-of-tree and that is why you are fighting to keep the
> > > sysfs? If so that isn't a valid argument.
> >
> > I have Kubernetes and OpenStack, indeed they are not part of the kernel tree.
> > They already use sriov_driver_autoprobe sysfs knob to disable autobind
> > before even starting. They configure MACs and bind VFs through sysfs/netlink
> > already. For them, the read/write of sysfs that is going to be bound to
> > the already created VM with known CPU properties, fits perfectly.
>
> By that argument the same could be said about netlink. What I don't
> get is why it is okay to configure the MAC through netlink but
> suddenly when we are talking about interrupts it is out of the
> question.

They belong to different subsystems, while MAC is applicable to the
netdev (both PF and VFs), MSI-X is applicable to all devices.

I'm not arguing about netlink vs. sysfs, just saying that devlink doesn't
fit here.

> As far as the binding that is the driver interface which is
> more or less grandfathered in anyway as there aren't too many ways to
> deal with them as there isn't an alternate interface for the drivers
> to define support.
>
> > >
> > > > So the idea to use devlink just because mlx5 supports it, sound really
> > > > wrong to me. If it was other driver from another subsystem without
> > > > devlink support, the request to use devlink won't never come.
> > > >
> > > > Thanks
> > >
> > > I am suggesting the devlink resources interface because it would be a
> > > VERY good fit for something like this. By the definition of it:
> > > ``devlink`` provides the ability for drivers to register resources, which
> > > can allow administrators to see the device restrictions for a given
> > > resource, as well as how much of the given resource is currently
> > > in use. Additionally, these resources can optionally have configurable size.
> > > This could enable the administrator to limit the number of resources that
> > > are used.
> >
> > It is not resource, but HW objects. The devlink doesn't even see the VFs
> > as long as they are not bound to the drivers.
> >
> > This is an example:
> >
> > [root@vm ~]# echo 0 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_drivers_autoprobe
> > [root@vm ~]# echo 0 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_numvfs
> > [ 2370.579711] mlx5_core 0000:01:00.0: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3)
> > [root@vm ~]# echo 2 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_numvfs
> > [ 2377.663666] mlx5_core 0000:01:00.0: E-Switch: Enable: mode(LEGACY), nvfs(2), active vports(3)
> > [ 2377.777010] pci 0000:01:00.1: [15b3:101c] type 00 class 0x020000
> > [ 2377.784903] pci 0000:01:00.2: [15b3:101c] type 00 class 0x020000
> > [root@vm ~]# devlink dev
> > pci/0000:01:00.0
> > [root@vm ~]# lspci |grep nox
> > 01:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
> > 01:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function]
> > 01:00.2 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function]
> >
> > So despite us having 2 VFs ready to be given to VMs, administrator doesn't
> > see them as devices.
>
> The MSI-X vectors are a resource assigned to hardware objects. It just
> depends on how you want to look at things. Right now you have the VFs
> register an interface on behalf of the PF. I am arguing it would be
> better to have the PF register an interface on behalf of the VFs.
> Ultimately the PF is responsible for creating the VFs in the first
> place. I don't see it as that much of a leap to have the
> mlx5_sriov_enable call register interfaces for the VFs so that you can
> configure the MSI-X vectors from the PF, and then tear them down
> before it frees the VFs. Having the VFs do the work seems error prone
> since it is assuming the interfaces are there on the PF when in all
> cases but one (mlx5) it currently isn't.

I'm not sure that I understood your last sentence correctly. If VF device
is not on hypervisor, it will mean the device is already probed and
change of MSI-X table is prohibited. I don't know how you can configure
VF devices to be passthrough to the VMs without SR-IOV enable call first.

I would say that all devices start their life at the same place where PF
is located.

>
> > >
> > > Even looking over the example usage I don't see there being much to
> > > prevent you from applying it to this issue. In addition it has the
> > > idea of handling changes that cannot be immediately applied already
> > > included. Your current solution doesn't have a good way of handling
> > > that and instead just aborts with an error.
> >
> > Yes, because it is HW resource that should be applied immediately to
> > make sure that it is honored, before it is committed to the users.
>
> The problem is you cannot do that at all if the driver is already
> loaded. One advantage of using something like devlink is that you
> could potentially have the VF driver help to coordinate things so you
> could have the case where the VF has the mlx5 driver loaded work
> correctly where you would update the MSI-X vector count and then
> trigger the driver reload via devlink.

The thing is that it is not needed for VFs at all.

>
> > It is very tempting to use devlink everywhere, but it is really wrong
> > tool for this scenario.
>
> We can agree to disagree there. I am not a fan of sysfs being applied
> everywhere either. The problem is it is an easy goto when someone is
> looking for a quick and dirty solution and often leads to more
> problems later as it usually misses critical path locking issues and
> the like.

It is fun that you mentioned that devlink as an example of good locking scheme.
Without going into to much details, right now Parav and myself are trying to fix
devlink locking around reload functionality. It was close to DOA for me when I
worked on auxiliary bus patches.

So no, devlink is not better. It is another (good) tool that needs more love
and care to be real PCI configuration utility. At lest, it should step out of
netdev shadow.

The block subsystem built whole stack around sysfs and they doesn't seem
upset about it.

Thanks
Leon Romanovsky Jan. 18, 2021, 1:28 p.m. UTC | #26
On Mon, Jan 18, 2021 at 09:20:08AM +0200, Leon Romanovsky wrote:
> On Sun, Jan 17, 2021 at 07:16:30PM -0800, Alexander Duyck wrote:
> > On Sat, Jan 16, 2021 at 12:20 AM Leon Romanovsky <leon@kernel.org> wrote:
> > >
> > > On Fri, Jan 15, 2021 at 05:48:59PM -0800, Alexander Duyck wrote:
> > > > On Fri, Jan 15, 2021 at 7:53 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > > >
> > > > > On Fri, Jan 15, 2021 at 10:06:19AM -0400, Jason Gunthorpe wrote:
> > > > > > On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:

<...>

> > If you want yet another compromise I would be much happier with the PF
> > registering the sysfs interfaces on the VFs rather than the VFs
> > registering the interface and hoping the PF supports it. At least with
> > that you are guaranteed the PF will respond to the interface when it
> > is registered.
>
> Thanks a lot, I appreciate it, will take a look now.

I found only two solutions to implement it in this way.
Option 1.
Allow multi entry write to some new sysfs knob that will receive BDF (or another VF
identification) and vector count. Something like this:

 echo "0000:01:00.2 123" > sriov_vf_msix_count

From one side, that solution is unlikely to be welcomed by Greg KH and from another,
it will require a lot of boilerplate code to make it safe and correct.

Option 2.
Create directory under PF device with files writable and organized by VF numbers.
It is doable, but will cause to code bloat with no gain at all. Cleaner than now,
it won't be.

Why the current approach with one file per-proper VF device is not good enough?

Thanks
Jason Gunthorpe Jan. 18, 2021, 3:47 p.m. UTC | #27
On Fri, Jan 15, 2021 at 08:32:19PM -0800, Alexander Duyck wrote:
> On Fri, Jan 15, 2021 at 6:06 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:
> >
> > > That said, it only works at the driver level. So if the firmware is
> > > the one that is having to do this it also occured to me that if this
> > > update happened on FLR that would probably be preferred.
> >
> > FLR is not free, I'd prefer not to require it just for some
> > philosophical reason.
> 
> It wasn't so much a philosophical thing as the fact that it can sort
> of take the place as a reload. 

Asserting no driver is present and doing some SW-only "FLR" is pretty
much the same thing.

We can't issue FLR unless no driver is present anyhow, so really all
this does is add a useless step. If some HW needs FLR then it can do
it in here, but I don't see a value to inject it when not needed. 

Yes, if we were PCI-SIG we'd probably insist that a FLR be done, but
we are not PCI-SIG, this is just Linux, and asserting there are no
users of the MSI is sufficient.

> However looking over the mlx5 code I don't see any handling of FLR
> in there so I am assuming that is handled by the firmware.

The device does the device side of the FLR, the mlx5 driver should
trigger FLR during error recovery flows.

> It is about the setup of things. The sysfs existing in the VF is kind
> of ugly since it is a child device calling up to the parent and
> telling it how it is supposed to be configured. 

Well, the logical place to put that sysfs file is under the VF,
otherwise it becomes ugly in a different way. I agree it would be
nicer if the file only existed when the right driver is loaded, and
there was a better way to get from the PF to VF.

> I'm sure in theory we could probably even have the VF request
> something like that itself through some sort of mailbox and cut out
> the middle-man but that would be even uglier.

No, not ever. The VF is in a security domain that can't make those
kinds of changes to itself.

> In my mind it was the PF driver providing a devlink instance for the
> VF if a driver isn't loaded.

I think hacking up devlink to provide dummy devlink objects for VFs
that otherwise wouldn't exist and then ensuring handover to/from real
drivers that might want those objects natively, just for the sake of
using devlink to instead of the existing PCI sysfs is major overkill.

If we are even thinking of moving PCI to devlink I'd want to see
devlink taken out of net and a whole devlink PCI subsystem
infrastructure created to manage all this sanely.

Hacking a subystem into devlink on the side with some small niche
feature is not the way to approach such fundamental things.

I also don't know if PCI will get much value from netlinkification, or
if devlink is even the right netlink representation for PCI in the
first place.

Jason
Greg KH Jan. 18, 2021, 5:03 p.m. UTC | #28
On Sun, Jan 10, 2021 at 05:07:24PM +0200, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> Some SR-IOV capable devices provide an ability to configure specific
> number of MSI-X vectors on their VF prior driver is probed on that VF.
> 
> In order to make management easy, provide new read-only sysfs file that
> returns a total number of possible to configure MSI-X vectors.
> 
> cat /sys/bus/pci/devices/.../sriov_vf_total_msix
>   = 0 - feature is not supported
>   > 0 - total number of MSI-X vectors to consume by the VFs
> 
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
>  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
>  drivers/pci/pci.h                       |  3 +++
>  include/linux/pci.h                     |  2 ++
>  4 files changed, 50 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> index 05e26e5da54e..64e9b700acc9 100644
> --- a/Documentation/ABI/testing/sysfs-bus-pci
> +++ b/Documentation/ABI/testing/sysfs-bus-pci
> @@ -395,3 +395,17 @@ Description:
>  		The file is writable if the PF is bound to a driver that
>  		supports the ->sriov_set_msix_vec_count() callback and there
>  		is no driver bound to the VF.
> +
> +What:		/sys/bus/pci/devices/.../sriov_vf_total_msix
> +Date:		January 2021
> +Contact:	Leon Romanovsky <leonro@nvidia.com>
> +Description:
> +		This file is associated with the SR-IOV PFs.
> +		It returns a total number of possible to configure MSI-X
> +		vectors on the enabled VFs.
> +
> +		The values returned are:
> +		 * > 0 - this will be total number possible to consume by VFs,
> +		 * = 0 - feature is not supported
> +
> +		If no SR-IOV VFs are enabled, this value will return 0.
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 42c0df4158d1..0a6ddf3230fd 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
>  	return count;
>  }
> 
> +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +
> +	return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);

Nit, please use sysfs_emit() for new sysfs files.

thanks,

greg k-h
Alexander Duyck Jan. 18, 2021, 6:21 p.m. UTC | #29
On Mon, Jan 18, 2021 at 5:28 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Mon, Jan 18, 2021 at 09:20:08AM +0200, Leon Romanovsky wrote:
> > On Sun, Jan 17, 2021 at 07:16:30PM -0800, Alexander Duyck wrote:
> > > On Sat, Jan 16, 2021 at 12:20 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > >
> > > > On Fri, Jan 15, 2021 at 05:48:59PM -0800, Alexander Duyck wrote:
> > > > > On Fri, Jan 15, 2021 at 7:53 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > > > >
> > > > > > On Fri, Jan 15, 2021 at 10:06:19AM -0400, Jason Gunthorpe wrote:
> > > > > > > On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:
>
> <...>
>
> > > If you want yet another compromise I would be much happier with the PF
> > > registering the sysfs interfaces on the VFs rather than the VFs
> > > registering the interface and hoping the PF supports it. At least with
> > > that you are guaranteed the PF will respond to the interface when it
> > > is registered.
> >
> > Thanks a lot, I appreciate it, will take a look now.
>
> I found only two solutions to implement it in this way.
> Option 1.
> Allow multi entry write to some new sysfs knob that will receive BDF (or another VF
> identification) and vector count. Something like this:
>
>  echo "0000:01:00.2 123" > sriov_vf_msix_count
>
> From one side, that solution is unlikely to be welcomed by Greg KH and from another,
> it will require a lot of boilerplate code to make it safe and correct.

You are overthinking this. I didn't say the sysfs had to be in the PF
directory itself. My request was that the PF is what placed the sysfs
file in the directory since indirectly it is responsible for spawning
the VF anyway it shouldn't be too much of a lift to have the PF place
sysfs files in the VF hierarchy.

The main piece I am not a fan of is the fact that the VF is blindly
registering an interface and presenting it without knowing if it even
works.

The secondary issue that I see as important, but I am willing to
compromise on is that the interface makes it appear as though the VF
configuration space is writable via this sysfs file. My preference
would be to somehow make it transparent that the PF is providing this
functionality. I thought it might be easier to do with devlink rather
than with sysfs which is why I have been preferring devlink. However
based on your pushback I am willing to give up on that, but I think we
still need to restructure how the sysfs is being managed.

> Option 2.
> Create directory under PF device with files writable and organized by VF numbers.
> It is doable, but will cause to code bloat with no gain at all. Cleaner than now,
> it won't be.
>
> Why the current approach with one file per-proper VF device is not good enough?

Because it is muddying the waters in terms of what is control taking
place from the VF versus the PF. In my mind the ideal solution if you
insist on going with the VF sysfs route would be to look at spawning a
directory inside the VF sysfs specifically for all of the instances
that will be PF management controls. At least that would give some
hint that this is a backdoor control and not actually interacting with
the VF PCI device directly. Then if in the future you have to add more
to this you have a spot already laid out and the controls won't be
mistaken for standard PCI controls as they are PF management controls.

In addition you could probably even create a directory on the PF with
the new control you had added for getting the master count as well as
look at adding symlinks to the VF files so that you could manage all
of the resources in one spot. That would result in the controls being
nicely organized and easy to use.
Leon Romanovsky Jan. 19, 2021, 5:38 a.m. UTC | #30
On Mon, Jan 18, 2021 at 06:03:22PM +0100, Greg KH wrote:
> On Sun, Jan 10, 2021 at 05:07:24PM +0200, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Some SR-IOV capable devices provide an ability to configure specific
> > number of MSI-X vectors on their VF prior driver is probed on that VF.
> >
> > In order to make management easy, provide new read-only sysfs file that
> > returns a total number of possible to configure MSI-X vectors.
> >
> > cat /sys/bus/pci/devices/.../sriov_vf_total_msix
> >   = 0 - feature is not supported
> >   > 0 - total number of MSI-X vectors to consume by the VFs
> >
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> >  Documentation/ABI/testing/sysfs-bus-pci | 14 +++++++++++
> >  drivers/pci/iov.c                       | 31 +++++++++++++++++++++++++
> >  drivers/pci/pci.h                       |  3 +++
> >  include/linux/pci.h                     |  2 ++
> >  4 files changed, 50 insertions(+)
> >
> > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > index 05e26e5da54e..64e9b700acc9 100644
> > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > @@ -395,3 +395,17 @@ Description:
> >  		The file is writable if the PF is bound to a driver that
> >  		supports the ->sriov_set_msix_vec_count() callback and there
> >  		is no driver bound to the VF.
> > +
> > +What:		/sys/bus/pci/devices/.../sriov_vf_total_msix
> > +Date:		January 2021
> > +Contact:	Leon Romanovsky <leonro@nvidia.com>
> > +Description:
> > +		This file is associated with the SR-IOV PFs.
> > +		It returns a total number of possible to configure MSI-X
> > +		vectors on the enabled VFs.
> > +
> > +		The values returned are:
> > +		 * > 0 - this will be total number possible to consume by VFs,
> > +		 * = 0 - feature is not supported
> > +
> > +		If no SR-IOV VFs are enabled, this value will return 0.
> > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > index 42c0df4158d1..0a6ddf3230fd 100644
> > --- a/drivers/pci/iov.c
> > +++ b/drivers/pci/iov.c
> > @@ -394,12 +394,22 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
> >  	return count;
> >  }
> >
> > +static ssize_t sriov_vf_total_msix_show(struct device *dev,
> > +					struct device_attribute *attr,
> > +					char *buf)
> > +{
> > +	struct pci_dev *pdev = to_pci_dev(dev);
> > +
> > +	return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
>
> Nit, please use sysfs_emit() for new sysfs files.

I'll do, thanks.

>
> thanks,
>
> greg k-h
Leon Romanovsky Jan. 19, 2021, 5:43 a.m. UTC | #31
On Mon, Jan 18, 2021 at 10:21:03AM -0800, Alexander Duyck wrote:
> On Mon, Jan 18, 2021 at 5:28 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Mon, Jan 18, 2021 at 09:20:08AM +0200, Leon Romanovsky wrote:
> > > On Sun, Jan 17, 2021 at 07:16:30PM -0800, Alexander Duyck wrote:
> > > > On Sat, Jan 16, 2021 at 12:20 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > > >
> > > > > On Fri, Jan 15, 2021 at 05:48:59PM -0800, Alexander Duyck wrote:
> > > > > > On Fri, Jan 15, 2021 at 7:53 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > > > > >
> > > > > > > On Fri, Jan 15, 2021 at 10:06:19AM -0400, Jason Gunthorpe wrote:
> > > > > > > > On Thu, Jan 14, 2021 at 05:56:20PM -0800, Alexander Duyck wrote:
> >
> > <...>
> >
> > > > If you want yet another compromise I would be much happier with the PF
> > > > registering the sysfs interfaces on the VFs rather than the VFs
> > > > registering the interface and hoping the PF supports it. At least with
> > > > that you are guaranteed the PF will respond to the interface when it
> > > > is registered.
> > >
> > > Thanks a lot, I appreciate it, will take a look now.
> >
> > I found only two solutions to implement it in this way.
> > Option 1.
> > Allow multi entry write to some new sysfs knob that will receive BDF (or another VF
> > identification) and vector count. Something like this:
> >
> >  echo "0000:01:00.2 123" > sriov_vf_msix_count
> >
> > From one side, that solution is unlikely to be welcomed by Greg KH and from another,
> > it will require a lot of boilerplate code to make it safe and correct.
>
> You are overthinking this. I didn't say the sysfs had to be in the PF
> directory itself. My request was that the PF is what placed the sysfs
> file in the directory since indirectly it is responsible for spawning
> the VF anyway it shouldn't be too much of a lift to have the PF place
> sysfs files in the VF hierarchy.
>
> The main piece I am not a fan of is the fact that the VF is blindly
> registering an interface and presenting it without knowing if it even
> works.
>
> The secondary issue that I see as important, but I am willing to
> compromise on is that the interface makes it appear as though the VF
> configuration space is writable via this sysfs file. My preference
> would be to somehow make it transparent that the PF is providing this
> functionality. I thought it might be easier to do with devlink rather
> than with sysfs which is why I have been preferring devlink. However
> based on your pushback I am willing to give up on that, but I think we
> still need to restructure how the sysfs is being managed.
>
> > Option 2.
> > Create directory under PF device with files writable and organized by VF numbers.
> > It is doable, but will cause to code bloat with no gain at all. Cleaner than now,
> > it won't be.
> >
> > Why the current approach with one file per-proper VF device is not good enough?
>
> Because it is muddying the waters in terms of what is control taking
> place from the VF versus the PF. In my mind the ideal solution if you
> insist on going with the VF sysfs route would be to look at spawning a
> directory inside the VF sysfs specifically for all of the instances
> that will be PF management controls. At least that would give some
> hint that this is a backdoor control and not actually interacting with
> the VF PCI device directly. Then if in the future you have to add more
> to this you have a spot already laid out and the controls won't be
> mistaken for standard PCI controls as they are PF management controls.
>
> In addition you could probably even create a directory on the PF with
> the new control you had added for getting the master count as well as
> look at adding symlinks to the VF files so that you could manage all
> of the resources in one spot. That would result in the controls being
> nicely organized and easy to use.

Thanks, for you inputs.

I'll try offline different variants and will post v4 soon.
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 05e26e5da54e..64e9b700acc9 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -395,3 +395,17 @@  Description:
 		The file is writable if the PF is bound to a driver that
 		supports the ->sriov_set_msix_vec_count() callback and there
 		is no driver bound to the VF.
+
+What:		/sys/bus/pci/devices/.../sriov_vf_total_msix
+Date:		January 2021
+Contact:	Leon Romanovsky <leonro@nvidia.com>
+Description:
+		This file is associated with the SR-IOV PFs.
+		It returns a total number of possible to configure MSI-X
+		vectors on the enabled VFs.
+
+		The values returned are:
+		 * > 0 - this will be total number possible to consume by VFs,
+		 * = 0 - feature is not supported
+
+		If no SR-IOV VFs are enabled, this value will return 0.
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 42c0df4158d1..0a6ddf3230fd 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -394,12 +394,22 @@  static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
 	return count;
 }

+static ssize_t sriov_vf_total_msix_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	return sprintf(buf, "%d\n", pdev->sriov->vf_total_msix);
+}
+
 static DEVICE_ATTR_RO(sriov_totalvfs);
 static DEVICE_ATTR_RW(sriov_numvfs);
 static DEVICE_ATTR_RO(sriov_offset);
 static DEVICE_ATTR_RO(sriov_stride);
 static DEVICE_ATTR_RO(sriov_vf_device);
 static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
+static DEVICE_ATTR_RO(sriov_vf_total_msix);

 static struct attribute *sriov_dev_attrs[] = {
 	&dev_attr_sriov_totalvfs.attr,
@@ -408,6 +418,7 @@  static struct attribute *sriov_dev_attrs[] = {
 	&dev_attr_sriov_stride.attr,
 	&dev_attr_sriov_vf_device.attr,
 	&dev_attr_sriov_drivers_autoprobe.attr,
+	&dev_attr_sriov_vf_total_msix.attr,
 	NULL,
 };

@@ -658,6 +669,7 @@  static void sriov_disable(struct pci_dev *dev)
 		sysfs_remove_link(&dev->dev.kobj, "dep_link");

 	iov->num_VFs = 0;
+	iov->vf_total_msix = 0;
 	pci_iov_set_numvfs(dev, 0);
 }

@@ -1116,6 +1128,25 @@  int pci_sriov_get_totalvfs(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);

+/**
+ * pci_sriov_set_vf_total_msix - set total number of MSI-X vectors for the VFs
+ * @dev: the PCI PF device
+ * @numb: the total number of MSI-X vector to consume by the VFs
+ *
+ * Sets the number of MSI-X vectors that is possible to consume by the VFs.
+ * This interface is complimentary part of the pci_set_msix_vec_count()
+ * that will be used to configure the required number on the VF.
+ */
+void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb)
+{
+	if (!dev->is_physfn || !dev->driver ||
+	    !dev->driver->sriov_set_msix_vec_count)
+		return;
+
+	dev->sriov->vf_total_msix = numb;
+}
+EXPORT_SYMBOL_GPL(pci_sriov_set_vf_total_msix);
+
 /**
  * pci_sriov_configure_simple - helper to configure SR-IOV
  * @dev: the PCI device
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 1fd273077637..0fbe291eb0f2 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -327,6 +327,9 @@  struct pci_sriov {
 	u16		subsystem_device; /* VF subsystem device */
 	resource_size_t	barsz[PCI_SRIOV_NUM_BARS];	/* VF BAR size */
 	bool		drivers_autoprobe; /* Auto probing of VFs by driver */
+	int		vf_total_msix;  /* Total number of MSI-X vectors the VFs
+					 * can consume
+					 */
 };

 /**
diff --git a/include/linux/pci.h b/include/linux/pci.h
index a17cfc28eb66..fd9ff1f42a09 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2074,6 +2074,7 @@  int pci_sriov_get_totalvfs(struct pci_dev *dev);
 int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
 resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
 void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
+void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb);

 /* Arch may override these (weak) */
 int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
@@ -2114,6 +2115,7 @@  static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
 static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
 { return 0; }
 static inline void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe) { }
+static inline void pci_sriov_set_vf_total_msix(struct pci_dev *dev, int numb) {}
 #endif

 #if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)