diff mbox

[RFC,v2] PCI: sysfs per device SRIOV control and status

Message ID 1349388885-43938-1-git-send-email-ddutile@redhat.com
State Superseded
Headers show

Commit Message

Don Dutile Oct. 4, 2012, 10:14 p.m. UTC
Provide files under sysfs to determine the max number of vfs
an SRIOV-capable PCIe device supports, and methods to enable and
disable the vfs on a per device basis.

Currently, VF enablement by SRIOV-capable PCIe devices is done
in driver-specific module parameters.  If not set up in modprobe files,
it requires admin to unload & reload PF drivers with number of desired
VFs to enable.  Additionally, the enablement is system wide: all
devices controlled by the same driver have the same number of VFs
enabled.  Although the latter is probably desired, there are PCI
configurations setup by system BIOS that may not enable that to occur.

Three files are created if a PCIe device has SRIOV support:
sriov_max_vfs -- cat-ing this file returns the maximum number
                 of VFs a PCIe device supports.
sriov_enable_vfs -- echo'ing a number to this file enables this
                    number of VFs for this given PCIe device.
                 -- cat-ing this file will return the number of VFs
                    currently enabled on this PCIe device.
sriov_disable_vfs -- echo-ing any number other than 0 disables all
                     VFs associated with this PCIe device.

VF enable and disablement is invoked much like other PCIe
configuration functions -- via registered callbacks in the driver,
i.e., probe, release, etc.

v1->v2:
This patch is based on previous 2 patches by Yinghai Lu
that cleaned up the vga attributes for PCI devices under sysfs,
and uses visibility-checking group attributes as recommended by
Greg K-H.

Signed-off-by: Donald Dutile <ddutile@redhat.com>
---
 drivers/pci/pci-sysfs.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/pci.h     |   2 +
 2 files changed, 154 insertions(+), 1 deletion(-)

Comments

Yinghai Lu Oct. 4, 2012, 10:30 p.m. UTC | #1
On Thu, Oct 4, 2012 at 3:14 PM, Donald Dutile <ddutile@redhat.com> wrote:
> Provide files under sysfs to determine the max number of vfs
> an SRIOV-capable PCIe device supports, and methods to enable and
> disable the vfs on a per device basis.
>
> Currently, VF enablement by SRIOV-capable PCIe devices is done
> in driver-specific module parameters.  If not setup in modprobe files,
> it requires admin to unload & reload PF drivers with number of desired
> VFs to enable.  Additionally, the enablement is system wide: all
> devices controlled by the same driver have the same number of VFs
> enabled.  Although the latter is probably desired, there are PCI
> configurations setup by system BIOS that may not enable that to occur.
>
> Three files are created if a PCIe device has SRIOV support:
> sriov_max_vfs -- cat-ing this file returns the maximum number
>                  of VFs a PCIe device supports.
> sriov_enable_vfs -- echo'ing a number to this file enables this
>                     number of VFs for this given PCIe device.
>                  -- cat-ing this file will return the number of VFs
>                     currently enabled on this PCIe device.
> sriov_disable_vfs -- echo-ing any number other than 0 disables all
>                      VFs associated with this PCIe device.
>
> VF enable and disablement is invoked much like other PCIe
> configuration functions -- via registered callbacks in the driver,
> i.e., probe, release, etc.
>
> v1->v2:
> This patch is based on previous 2 patches by Yinghai Lu
> that cleaned up the vga attributes for PCI devices under sysfs,
> and uses visibility-checking group attributes as recommended by
> Greg K-H.
>
> Signed-off-by: Donald Dutile <ddutile@redhat.com>
> ---
>  drivers/pci/pci-sysfs.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/pci.h     |   2 +
>  2 files changed, 154 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index fbbb97f..e6522c2 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -404,6 +404,128 @@ static ssize_t d3cold_allowed_show(struct device *dev,
>  }
>  #endif
>
> +bool pci_dev_has_sriov(struct pci_dev *pdev)
> +{
> +        int ret = false;
> +        int pos;
> +
> +        if (!pci_is_pcie(pdev))
> +                goto out;
> +
> +        pos =  pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
> +        if (pos)
> +                ret = true;
> +out:
> +        return ret;
> +}

should use dev_is_pf?

cap does not mean you have sriov_init return successfully.

> +
> +#ifdef CONFIG_PCI_IOV
> +static ssize_t sriov_max_vfs_show(struct device *dev,
> +                                  struct device_attribute *attr,
> +                                  char *buf)
> +{
> +        struct pci_dev *pdev;
> +
> +        pdev = to_pci_dev(dev);
> +        return sprintf (buf, "%u\n", pdev->sriov->total);
> +}
> +
> +
> +static ssize_t sriov_enable_vfs_show(struct device *dev,
> +                                     struct device_attribute *attr,
> +                                     char *buf)
> +{
> +        struct pci_dev *pdev;
> +
> +        pdev = to_pci_dev(dev);
> +
> +        return sprintf (buf, "%u\n", pdev->sriov->nr_virtfn);
> +}
> +
> +static ssize_t sriov_enable_vfs_store(struct device *dev,
> +                                      struct device_attribute *attr,
> +                                      const char *buf, size_t count)
> +{
> +        struct pci_dev *pdev;
> +        int num_vf_enabled = 0;
> +        unsigned long num_vfs;
> +        pdev = to_pci_dev(dev);
> +
> +        /* Requested VFs to enable < max_vfs
> +         * and none enabled already
> +         */
> +        if (strict_strtoul(buf, 0, &num_vfs) < 0)
> +               return -EINVAL;

checkpatch.pl should ask you to change to kstrtoul instead.

> +
> +        if ((num_vfs > pdev->sriov->total) ||
> +            (pdev->sriov->nr_virtfn != 0))
> +                return -EINVAL;
> +
> +        /* Ready for lift-off! */
> +        if (pdev->driver && pdev->driver->sriov_enable) {
> +                num_vf_enabled = pdev->driver->sriov_enable(pdev, num_vfs);
> +        }
> +
> +        if (num_vf_enabled != num_vfs)
> +                printk(KERN_WARNING
> +                        "%s: %04x:%02x:%02x.%d: Only %d VFs enabled \n",
> +                        pci_name(pdev), pci_domain_nr(pdev->bus),
> +                        pdev->bus->number, PCI_SLOT(pdev->devfn),
> +                        PCI_FUNC(pdev->devfn), num_vf_enabled);
> +
> +        return count;

do you need pci_remove_rescan_mutex ?

also that enable could take a while ..., may need to use
device_schedule_callback to start one callback.


> +}
> +
> +static ssize_t sriov_disable_vfs_store(struct device *dev,
> +                                       struct device_attribute *attr,
> +                                       const char *buf, size_t count)
> +{
> +        struct pci_dev *pdev;
> +       int ret = 0;
> +
> +        pdev = to_pci_dev(dev);
> +
> +        /* make sure sriov device & at least 1 vf enabled */
> +        if (pdev->sriov->nr_virtfn == 0) {
> +               ret = -ENODEV;
> +                goto out;
> +       }
> +
> +        /* Ready for landing! */
> +       /* Note: Disabling VF can fail if VF assigned to virtualized guest */
> +        if (pdev->driver && pdev->driver->sriov_disable) {
> +                ret = pdev->driver->sriov_disable(pdev);
> +        }
> +out:
> +        return ret ? ret : count;
> +}
> +#else
> +static ssize_t sriov_max_vfs_show(struct device *dev,
> +                                  struct device_attribute *attr,
> +                                  char *buf)
> +{ return 0; }
> +static ssize_t sriov_enable_vfs_show(struct device *dev,
> +                                     struct device_attribute *attr,
> +                                     char *buf)
> +{ return 0; }
> +static ssize_t sriov_enable_vfs_store(struct device *dev,
> +                                      struct device_attribute *attr,
> +                                      const char *buf, size_t count)
> +{ return count; }
> +static ssize_t sriov_disable_vfs_store(struct device *dev,
> +                                       struct device_attribute *attr,
> +                                       const char *buf, size_t count)
> +{ return count; }
> +#endif /* CONFIG_PCI_IOV */
> +
> +static struct device_attribute sriov_max_vfs_attr = __ATTR_RO(sriov_max_vfs);
> +static struct device_attribute sriov_enable_vfs_attr =
> +               __ATTR(sriov_enable_vfs, (S_IRUGO|S_IWUSR|S_IWGRP),
> +                       sriov_enable_vfs_show, sriov_enable_vfs_store);
> +static struct device_attribute sriov_disable_vfs_attr =
> +               __ATTR(sriov_disable_vfs, (S_IWUSR|S_IWGRP),
> +                       NULL, sriov_disable_vfs_store);
> +
>  struct device_attribute pci_dev_attrs[] = {
>         __ATTR_RO(resource),
>         __ATTR_RO(vendor),
> @@ -1408,8 +1530,15 @@ static struct attribute *pci_dev_dev_attrs[] = {
>         NULL,
>  };
>
> +static struct attribute *sriov_dev_attrs[] = {
> +        &sriov_max_vfs_attr.attr,
> +        &sriov_enable_vfs_attr.attr,
> +        &sriov_disable_vfs_attr.attr,
> +       NULL,
> +};
> +
>  static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
> -                                               struct attribute *a, int n)
> +                                        struct attribute *a, int n)
>  {
>         struct device *dev = container_of(kobj, struct device, kobj);
>         struct pci_dev *pdev = to_pci_dev(dev);
> @@ -1421,13 +1550,35 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
>         return a->mode;
>  }
>
> +static umode_t sriov_attrs_are_visible(struct kobject *kobj,
> +                                        struct attribute *a, int n)
> +{
> +       struct device *dev = container_of(kobj, struct device, kobj);
> +       struct pci_dev *pdev = to_pci_dev(dev);
> +
> +       if ((a == &sriov_max_vfs_attr.attr) ||
> +           (a == &sriov_enable_vfs_attr.attr) ||
> +           (a == &sriov_disable_vfs_attr.attr)) {
> +               if (!pci_dev_has_sriov(pdev))
> +                       return 0;
> +       }
> +
> +       return a->mode;
> +}
> +
>  static struct attribute_group pci_dev_attr_group = {
>         .attrs = pci_dev_dev_attrs,
>         .is_visible = pci_dev_attrs_are_visible,
>  };
>
> +static struct attribute_group sriov_dev_attr_group = {
> +       .attrs = sriov_dev_attrs,
> +       .is_visible = sriov_attrs_are_visible,

you could fold contents in sriov_dev_attrs and sriov_attrs_are_visible into
pci_dev_dev_attrs and pci_dev_attrs_are_visible.

> +};
> +
>  static const struct attribute_group *pci_dev_attr_groups[] = {
>         &pci_dev_attr_group,
> +       &sriov_dev_attr_group,
>         NULL,
>  };
>
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index be1de01..b3e25a8 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -595,6 +595,8 @@ struct pci_driver {
>         int  (*resume_early) (struct pci_dev *dev);
>         int  (*resume) (struct pci_dev *dev);                   /* Device woken up */
>         void (*shutdown) (struct pci_dev *dev);
> +       int (*sriov_enable) (struct pci_dev *dev, int num_vfs);    /* PF pci dev */
> +       int (*sriov_disable) (struct pci_dev *dev);
>         const struct pci_error_handlers *err_handler;
>         struct device_driver    driver;
>         struct pci_dynids dynids;
> --
> 1.7.10.2.552.gaa3bb87
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Don Dutile Oct. 4, 2012, 10:50 p.m. UTC | #2
Thanks for adding Greg k-h to the thread...
(sorry about that omission on git send-email, Greg!)

On 10/04/2012 06:30 PM, Yinghai Lu wrote:
> On Thu, Oct 4, 2012 at 3:14 PM, Donald Dutile<ddutile@redhat.com>  wrote:
>> Provide files under sysfs to determine the max number of vfs
>> an SRIOV-capable PCIe device supports, and methods to enable and
>> disable the vfs on a per device basis.
>>
>> Currently, VF enablement by SRIOV-capable PCIe devices is done
>> in driver-specific module parameters.  If not setup in modprobe files,
>> it requires admin to unload&  reload PF drivers with number of desired
>> VFs to enable.  Additionally, the enablement is system wide: all
>> devices controlled by the same driver have the same number of VFs
>> enabled.  Although the latter is probably desired, there are PCI
>> configurations setup by system BIOS that may not enable that to occur.
>>
>> Three files are created if a PCIe device has SRIOV support:
>> sriov_max_vfs -- cat-ing this file returns the maximum number
>>                   of VFs a PCIe device supports.
>> sriov_enable_vfs -- echo'ing a number to this file enables this
>>                      number of VFs for this given PCIe device.
>>                   -- cat-ing this file will return the number of VFs
>>                      currently enabled on this PCIe device.
>> sriov_disable_vfs -- echo-ing any number other than 0 disables all
>>                       VFs associated with this PCIe device.
>>
>> VF enable and disablement is invoked much like other PCIe
>> configuration functions -- via registered callbacks in the driver,
>> i.e., probe, release, etc.
>>
>> v1->v2:
>> This patch is based on previous 2 patches by Yinghai Lu
>> that cleaned up the vga attributes for PCI devices under sysfs,
>> and uses visibility-checking group attributes as recommended by
>> Greg K-H.
>>
>> Signed-off-by: Donald Dutile<ddutile@redhat.com>
>> ---
>>   drivers/pci/pci-sysfs.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++-
>>   include/linux/pci.h     |   2 +
>>   2 files changed, 154 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
>> index fbbb97f..e6522c2 100644
>> --- a/drivers/pci/pci-sysfs.c
>> +++ b/drivers/pci/pci-sysfs.c
>> @@ -404,6 +404,128 @@ static ssize_t d3cold_allowed_show(struct device *dev,
>>   }
>>   #endif
>>
>> +bool pci_dev_has_sriov(struct pci_dev *pdev)
>> +{
>> +        int ret = false;
>> +        int pos;
>> +
>> +        if (!pci_is_pcie(pdev))
>> +                goto out;
>> +
>> +        pos =  pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
>> +        if (pos)
>> +                ret = true;
>> +out:
>> +        return ret;
>> +}
>
> should use dev_is_pf?
>
No.  Only want the files to show up on pf's with sriov cap.
dev_is_pf will have the files show up on non-sriov-capable devices.

> cap does not mean you have sriov_init return successfully.
>
will find out if sriov-init fails if driver's sriov-enable callback fails!
could add a flag & check for sriov-init success...

>> +
>> +#ifdef CONFIG_PCI_IOV
>> +static ssize_t sriov_max_vfs_show(struct device *dev,
>> +                                  struct device_attribute *attr,
>> +                                  char *buf)
>> +{
>> +        struct pci_dev *pdev;
>> +
>> +        pdev = to_pci_dev(dev);
>> +        return sprintf (buf, "%u\n", pdev->sriov->total);
>> +}
>> +
>> +
>> +static ssize_t sriov_enable_vfs_show(struct device *dev,
>> +                                     struct device_attribute *attr,
>> +                                     char *buf)
>> +{
>> +        struct pci_dev *pdev;
>> +
>> +        pdev = to_pci_dev(dev);
>> +
>> +        return sprintf (buf, "%u\n", pdev->sriov->nr_virtfn);
>> +}
>> +
>> +static ssize_t sriov_enable_vfs_store(struct device *dev,
>> +                                      struct device_attribute *attr,
>> +                                      const char *buf, size_t count)
>> +{
>> +        struct pci_dev *pdev;
>> +        int num_vf_enabled = 0;
>> +        unsigned long num_vfs;
>> +        pdev = to_pci_dev(dev);
>> +
>> +        /* Requested VFs to enable<  max_vfs
>> +         * and none enabled already
>> +         */
>> +        if (strict_strtoul(buf, 0,&num_vfs)<  0)
>> +               return -EINVAL;
>
> checkpatch.pl should ask you to change to kstrtoul instead.
>
ok, thanks.  i saw strict_strtoul() used elsewhere, but
glad to update.

>> +
>> +        if ((num_vfs>  pdev->sriov->total) ||
>> +            (pdev->sriov->nr_virtfn != 0))
>> +                return -EINVAL;
>> +
>> +        /* Ready for lift-off! */
>> +        if (pdev->driver&&  pdev->driver->sriov_enable) {
>> +                num_vf_enabled = pdev->driver->sriov_enable(pdev, num_vfs);
>> +        }
>> +
>> +        if (num_vf_enabled != num_vfs)
>> +                printk(KERN_WARNING
>> +                        "%s: %04x:%02x:%02x.%d: Only %d VFs enabled \n",
>> +                        pci_name(pdev), pci_domain_nr(pdev->bus),
>> +                        pdev->bus->number, PCI_SLOT(pdev->devfn),
>> +                        PCI_FUNC(pdev->devfn), num_vf_enabled);
>> +
>> +        return count;
>
> do you need pci_remove_rescan_mutex ?
>
I saw that in your patch set; wondering if i need to now that the
files are added in sysfs device attribute framework, which would have
same race for other attributes, I would think.  Thus, I didn't add it.
Greg?

> also that enable could take a while ..., may need to use
> device_schedule_callback to start one callback.
>
agreed, but device_schedule_callback doesn't allow a parameter
to be passed. only way I can think to get around it is to
put the requested-num-vf's in the pci-(pf)-dev structure,
and have the pf driver pluck it from there.
other way to pass num-vf-to-enable?

>
>> +}
>> +
>> +static ssize_t sriov_disable_vfs_store(struct device *dev,
>> +                                       struct device_attribute *attr,
>> +                                       const char *buf, size_t count)
>> +{
>> +        struct pci_dev *pdev;
>> +       int ret = 0;
>> +
>> +        pdev = to_pci_dev(dev);
>> +
>> +        /* make sure sriov device&  at least 1 vf enabled */
>> +        if (pdev->sriov->nr_virtfn == 0) {
>> +               ret = -ENODEV;
>> +                goto out;
>> +       }
>> +
>> +        /* Ready for landing! */
>> +       /* Note: Disabling VF can fail if VF assigned to virtualized guest */
>> +        if (pdev->driver&&  pdev->driver->sriov_disable) {
>> +                ret = pdev->driver->sriov_disable(pdev);
>> +        }
>> +out:
>> +        return ret ? ret : count;
>> +}
>> +#else
>> +static ssize_t sriov_max_vfs_show(struct device *dev,
>> +                                  struct device_attribute *attr,
>> +                                  char *buf)
>> +{ return 0; }
>> +static ssize_t sriov_enable_vfs_show(struct device *dev,
>> +                                     struct device_attribute *attr,
>> +                                     char *buf)
>> +{ return 0; }
>> +static ssize_t sriov_enable_vfs_store(struct device *dev,
>> +                                      struct device_attribute *attr,
>> +                                      const char *buf, size_t count)
>> +{ return count; }
>> +static ssize_t sriov_disable_vfs_store(struct device *dev,
>> +                                       struct device_attribute *attr,
>> +                                       const char *buf, size_t count)
>> +{ return count; }
>> +#endif /* CONFIG_PCI_IOV */
>> +
>> +static struct device_attribute sriov_max_vfs_attr = __ATTR_RO(sriov_max_vfs);
>> +static struct device_attribute sriov_enable_vfs_attr =
>> +               __ATTR(sriov_enable_vfs, (S_IRUGO|S_IWUSR|S_IWGRP),
>> +                       sriov_enable_vfs_show, sriov_enable_vfs_store);
>> +static struct device_attribute sriov_disable_vfs_attr =
>> +               __ATTR(sriov_disable_vfs, (S_IWUSR|S_IWGRP),
>> +                       NULL, sriov_disable_vfs_store);
>> +
>>   struct device_attribute pci_dev_attrs[] = {
>>          __ATTR_RO(resource),
>>          __ATTR_RO(vendor),
>> @@ -1408,8 +1530,15 @@ static struct attribute *pci_dev_dev_attrs[] = {
>>          NULL,
>>   };
>>
>> +static struct attribute *sriov_dev_attrs[] = {
>> +&sriov_max_vfs_attr.attr,
>> +&sriov_enable_vfs_attr.attr,
>> +&sriov_disable_vfs_attr.attr,
>> +       NULL,
>> +};
>> +
>>   static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
>> -                                               struct attribute *a, int n)
>> +                                        struct attribute *a, int n)
>>   {
>>          struct device *dev = container_of(kobj, struct device, kobj);
>>          struct pci_dev *pdev = to_pci_dev(dev);
>> @@ -1421,13 +1550,35 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
>>          return a->mode;
>>   }
>>
>> +static umode_t sriov_attrs_are_visible(struct kobject *kobj,
>> +                                        struct attribute *a, int n)
>> +{
>> +       struct device *dev = container_of(kobj, struct device, kobj);
>> +       struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +       if ((a ==&sriov_max_vfs_attr.attr) ||
>> +           (a ==&sriov_enable_vfs_attr.attr) ||
>> +           (a ==&sriov_disable_vfs_attr.attr)) {
>> +               if (!pci_dev_has_sriov(pdev))
>> +                       return 0;
>> +       }
>> +
>> +       return a->mode;
>> +}
>> +
>>   static struct attribute_group pci_dev_attr_group = {
>>          .attrs = pci_dev_dev_attrs,
>>          .is_visible = pci_dev_attrs_are_visible,
>>   };
>>
>> +static struct attribute_group sriov_dev_attr_group = {
>> +       .attrs = sriov_dev_attrs,
>> +       .is_visible = sriov_attrs_are_visible,
>
> you could fold contents in sriov_dev_attrs and sriov_attrs_are_visible into
> pci_dev_dev_attrs and pci_dev_attrs_are_visible.
>
Yes, I did that originally, but since you had
the attr_groups[] below, I figured you wanted it separate,
and it makes clear division for other cleanup (pm attributes?).

>> +};
>> +
>>   static const struct attribute_group *pci_dev_attr_groups[] = {
>>          &pci_dev_attr_group,
>> +&sriov_dev_attr_group,
>>          NULL,
>>   };
>>
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index be1de01..b3e25a8 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -595,6 +595,8 @@ struct pci_driver {
>>          int  (*resume_early) (struct pci_dev *dev);
>>          int  (*resume) (struct pci_dev *dev);                   /* Device woken up */
>>          void (*shutdown) (struct pci_dev *dev);
>> +       int (*sriov_enable) (struct pci_dev *dev, int num_vfs);    /* PF pci dev */
>> +       int (*sriov_disable) (struct pci_dev *dev);
>>          const struct pci_error_handlers *err_handler;
>>          struct device_driver    driver;
>>          struct pci_dynids dynids;
>> --
>> 1.7.10.2.552.gaa3bb87
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Thanks for prompt review & feedback.
I'll wait for other feedback before spinning a final PATCH.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yinghai Lu Oct. 4, 2012, 11:05 p.m. UTC | #3
On Thu, Oct 4, 2012 at 3:50 PM, Don Dutile <ddutile@redhat.com> wrote:
> Thanks for adding Greg k-h to the thread...
> (sorry about that omission on git send-email, Greg!)
>
>
> On 10/04/2012 06:30 PM, Yinghai Lu wrote:
>>
>> On Thu, Oct 4, 2012 at 3:14 PM, Donald Dutile<ddutile@redhat.com>  wrote:
>>>
>>> Provide files under sysfs to determine the max number of vfs
>>> an SRIOV-capable PCIe device supports, and methods to enable and
>>> disable the vfs on a per device basis.
>>>
>>> Currently, VF enablement by SRIOV-capable PCIe devices is done
>>> in driver-specific module parameters.  If not setup in modprobe files,
>>> it requires admin to unload&  reload PF drivers with number of desired
>>>
>>> VFs to enable.  Additionally, the enablement is system wide: all
>>> devices controlled by the same driver have the same number of VFs
>>> enabled.  Although the latter is probably desired, there are PCI
>>> configurations setup by system BIOS that may not enable that to occur.
>>>
>>> Three files are created if a PCIe device has SRIOV support:
>>> sriov_max_vfs -- cat-ing this file returns the maximum number
>>>                   of VFs a PCIe device supports.
>>> sriov_enable_vfs -- echo'ing a number to this file enables this
>>>                      number of VFs for this given PCIe device.
>>>                   -- cat-ing this file will return the number of VFs
>>>                      currently enabled on this PCIe device.
>>> sriov_disable_vfs -- echo-ing any number other than 0 disables all
>>>                       VFs associated with this PCIe device.
>>>
>>> VF enable and disablement is invoked much like other PCIe
>>> configuration functions -- via registered callbacks in the driver,
>>> i.e., probe, release, etc.
>>>
>>> v1->v2:
>>> This patch is based on previous 2 patches by Yinghai Lu
>>> that cleaned up the vga attributes for PCI devices under sysfs,
>>> and uses visibility-checking group attributes as recommended by
>>> Greg K-H.
>>>
>>> Signed-off-by: Donald Dutile<ddutile@redhat.com>
>>> ---
>>>   drivers/pci/pci-sysfs.c | 153
>>> +++++++++++++++++++++++++++++++++++++++++++++++-
>>>   include/linux/pci.h     |   2 +
>>>   2 files changed, 154 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
>>> index fbbb97f..e6522c2 100644
>>> --- a/drivers/pci/pci-sysfs.c
>>> +++ b/drivers/pci/pci-sysfs.c
>>> @@ -404,6 +404,128 @@ static ssize_t d3cold_allowed_show(struct device
>>> *dev,
>>>   }
>>>   #endif
>>>
>>> +bool pci_dev_has_sriov(struct pci_dev *pdev)
>>> +{
>>> +        int ret = false;
>>> +        int pos;
>>> +
>>> +        if (!pci_is_pcie(pdev))
>>> +                goto out;
>>> +
>>> +        pos =  pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
>>> +        if (pos)
>>> +                ret = true;
>>> +out:
>>> +        return ret;
>>> +}
>>
>>
>> should use dev_is_pf?
>>
> No.  Only want the files to show up on pf's with sriov cap.
> dev_is_pf will have the files show up on non-sriov-capable devices.

dev->is_physfn is set iff pci_iov_init/sriov_init completes successfully.

when we have pci_device_add called, pci_init_capabilities will call
pci_iov_init.

>
>
>> cap does not mean you have sriov_init return successfully.
>>
> will find out if sriov-init fails if driver's sriov-enable callback fails!
> could add a flag & check for sriov-init success...
>
>>> +
>>> +#ifdef CONFIG_PCI_IOV
>>> +static ssize_t sriov_max_vfs_show(struct device *dev,
>>> +                                  struct device_attribute *attr,
>>> +                                  char *buf)
>>> +{
>>> +        struct pci_dev *pdev;
>>> +
>>> +        pdev = to_pci_dev(dev);
>>> +        return sprintf (buf, "%u\n", pdev->sriov->total);
>>> +}
>>> +
>>> +
>>> +static ssize_t sriov_enable_vfs_show(struct device *dev,
>>> +                                     struct device_attribute *attr,
>>> +                                     char *buf)
>>> +{
>>> +        struct pci_dev *pdev;
>>> +
>>> +        pdev = to_pci_dev(dev);
>>> +
>>> +        return sprintf (buf, "%u\n", pdev->sriov->nr_virtfn);
>>> +}
>>> +
>>> +static ssize_t sriov_enable_vfs_store(struct device *dev,
>>> +                                      struct device_attribute *attr,
>>> +                                      const char *buf, size_t count)
>>> +{
>>> +        struct pci_dev *pdev;
>>> +        int num_vf_enabled = 0;
>>> +        unsigned long num_vfs;
>>> +        pdev = to_pci_dev(dev);
>>> +
>>> +        /* Requested VFs to enable<  max_vfs
>>> +         * and none enabled already
>>> +         */
>>> +        if (strict_strtoul(buf, 0,&num_vfs)<  0)
>>> +               return -EINVAL;
>>
>>
>> checkpatch.pl should ask you to change to kstrtoul instead.
>>
> ok, thanks.  i saw strict_strtoul() used elsewhere, but
> glad to update.
>
>>> +
>>> +        if ((num_vfs>  pdev->sriov->total) ||
>>> +            (pdev->sriov->nr_virtfn != 0))
>>> +                return -EINVAL;
>>> +
>>> +        /* Ready for lift-off! */
>>> +        if (pdev->driver&&  pdev->driver->sriov_enable) {
>>>
>>> +                num_vf_enabled = pdev->driver->sriov_enable(pdev,
>>> num_vfs);
>>> +        }
>>> +
>>> +        if (num_vf_enabled != num_vfs)
>>> +                printk(KERN_WARNING
>>> +                        "%s: %04x:%02x:%02x.%d: Only %d VFs enabled \n",
>>> +                        pci_name(pdev), pci_domain_nr(pdev->bus),
>>> +                        pdev->bus->number, PCI_SLOT(pdev->devfn),
>>> +                        PCI_FUNC(pdev->devfn), num_vf_enabled);
>>> +
>>> +        return count;
>>
>>
>> do you need pci_remove_rescan_mutex ?
>>
> I saw that in your patch set; wondering if i need to now that the
> files are added in sysfs device attribute framework, which would have
> same race for other attributes, I would think.  Thus, I didn't add it.
> Greg?
>
>
>> also that enable could take a while ..., may need to use
>> device_schedule_callback to start one callback.
>>
> agreed, but device_schedule_callback doesn't allow a parameter
> to be passed. only way I can think to get around it is to
> put the requested-num-vf's in the pci-(pf)-dev structure,

why not?

> and have the pf driver pluck it from there.
> other way to pass num-vf-to-enable?
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Don Dutile Oct. 5, 2012, 3:10 p.m. UTC | #4
On 10/04/2012 07:05 PM, Yinghai Lu wrote:
> On Thu, Oct 4, 2012 at 3:50 PM, Don Dutile<ddutile@redhat.com>  wrote:
>> Thanks for adding Greg k-h to the thread...
>> (sorry about that omission on git send-email, Greg!)
>>
>>
>> On 10/04/2012 06:30 PM, Yinghai Lu wrote:
>>>
>>> On Thu, Oct 4, 2012 at 3:14 PM, Donald Dutile<ddutile@redhat.com>   wrote:
>>>>
>>>> Provide files under sysfs to determine the max number of vfs
>>>> an SRIOV-capable PCIe device supports, and methods to enable and
>>>> disable the vfs on a per device basis.
>>>>
>>>> Currently, VF enablement by SRIOV-capable PCIe devices is done
>>>> in driver-specific module parameters.  If not setup in modprobe files,
>>>> it requires admin to unload&   reload PF drivers with number of desired
>>>>
>>>> VFs to enable.  Additionally, the enablement is system wide: all
>>>> devices controlled by the same driver have the same number of VFs
>>>> enabled.  Although the latter is probably desired, there are PCI
>>>> configurations setup by system BIOS that may not enable that to occur.
>>>>
>>>> Three files are created if a PCIe device has SRIOV support:
>>>> sriov_max_vfs -- cat-ing this file returns the maximum number
>>>>                    of VFs a PCIe device supports.
>>>> sriov_enable_vfs -- echo'ing a number to this file enables this
>>>>                       number of VFs for this given PCIe device.
>>>>                    -- cat-ing this file will return the number of VFs
>>>>                       currently enabled on this PCIe device.
>>>> sriov_disable_vfs -- echo-ing any number other than 0 disables all
>>>>                        VFs associated with this PCIe device.
>>>>
>>>> VF enable and disablement is invoked much like other PCIe
>>>> configuration functions -- via registered callbacks in the driver,
>>>> i.e., probe, release, etc.
>>>>
>>>> v1->v2:
>>>> This patch is based on previous 2 patches by Yinghai Lu
>>>> that cleaned up the vga attributes for PCI devices under sysfs,
>>>> and uses visibility-checking group attributes as recommended by
>>>> Greg K-H.
>>>>
>>>> Signed-off-by: Donald Dutile<ddutile@redhat.com>
>>>> ---
>>>>    drivers/pci/pci-sysfs.c | 153
>>>> +++++++++++++++++++++++++++++++++++++++++++++++-
>>>>    include/linux/pci.h     |   2 +
>>>>    2 files changed, 154 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
>>>> index fbbb97f..e6522c2 100644
>>>> --- a/drivers/pci/pci-sysfs.c
>>>> +++ b/drivers/pci/pci-sysfs.c
>>>> @@ -404,6 +404,128 @@ static ssize_t d3cold_allowed_show(struct device
>>>> *dev,
>>>>    }
>>>>    #endif
>>>>
>>>> +bool pci_dev_has_sriov(struct pci_dev *pdev)
>>>> +{
>>>> +        int ret = false;
>>>> +        int pos;
>>>> +
>>>> +        if (!pci_is_pcie(pdev))
>>>> +                goto out;
>>>> +
>>>> +        pos =  pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
>>>> +        if (pos)
>>>> +                ret = true;
>>>> +out:
>>>> +        return ret;
>>>> +}
>>>
>>>
>>> should use dev_is_pf?
>>>
>> No.  Only want the files to show up on pf's with sriov cap.
>> dev_is_pf will have the files show up on non-sriov-capable devices.
>
> dev->is_physfn is set iff pci_iov_init/sriov_init completes successfully.
>
I'm not fond of 'is_physfn' indicating sriov-init successful...
Now, if we want to put such a status flag in pdev (sriov_init_completed),
then, I'd go for the additional is_physfn check....
btw, which should be set by the pci probe code in general...

other reasons not to make it dependent on is_physfn...
what happens if sriov_init() is moved... to a later point...
say when sriov_enable() is called by a driver...
or when sriov_max_vfs is read/cat'd...
or when sriov_enable_vfs is written or read to?


> when we have pci_device_add called, pci_init_capabilities will call
> pci_iov_init.
>
that's what it does now.
Added Chris Wright to cc: to weigh in ...

>>
>>
>>> cap does not mean you have sriov_init return successfully.
>>>
>> will find out if sriov-init fails if driver's sriov-enable callback fails!
>> could add a flag & check for sriov-init success...
>>
>>>> +
>>>> +#ifdef CONFIG_PCI_IOV
>>>> +static ssize_t sriov_max_vfs_show(struct device *dev,
>>>> +                                  struct device_attribute *attr,
>>>> +                                  char *buf)
>>>> +{
>>>> +        struct pci_dev *pdev;
>>>> +
>>>> +        pdev = to_pci_dev(dev);
>>>> +        return sprintf (buf, "%u\n", pdev->sriov->total);
>>>> +}
>>>> +
>>>> +
>>>> +static ssize_t sriov_enable_vfs_show(struct device *dev,
>>>> +                                     struct device_attribute *attr,
>>>> +                                     char *buf)
>>>> +{
>>>> +        struct pci_dev *pdev;
>>>> +
>>>> +        pdev = to_pci_dev(dev);
>>>> +
>>>> +        return sprintf (buf, "%u\n", pdev->sriov->nr_virtfn);
>>>> +}
>>>> +
>>>> +static ssize_t sriov_enable_vfs_store(struct device *dev,
>>>> +                                      struct device_attribute *attr,
>>>> +                                      const char *buf, size_t count)
>>>> +{
>>>> +        struct pci_dev *pdev;
>>>> +        int num_vf_enabled = 0;
>>>> +        unsigned long num_vfs;
>>>> +        pdev = to_pci_dev(dev);
>>>> +
>>>> +        /* Requested VFs to enable < max_vfs
>>>> +         * and none enabled already
>>>> +         */
>>>> +        if (strict_strtoul(buf, 0, &num_vfs) < 0)
>>>> +               return -EINVAL;
>>>
>>>
>>> checkpatch.pl should ask you to change to kstrtoul instead.
>>>
>> ok, thanks.  i saw strict_strtoul() used elsewhere, but
>> glad to update.
>>
>>>> +
>>>> +        if ((num_vfs > pdev->sriov->total) ||
>>>> +            (pdev->sriov->nr_virtfn != 0))
>>>> +                return -EINVAL;
>>>> +
>>>> +        /* Ready for lift-off! */
>>>> +        if (pdev->driver && pdev->driver->sriov_enable) {
>>>>
>>>> +                num_vf_enabled = pdev->driver->sriov_enable(pdev,
>>>> num_vfs);
>>>> +        }
>>>> +
>>>> +        if (num_vf_enabled != num_vfs)
>>>> +                printk(KERN_WARNING
>>>> +                        "%s: %04x:%02x:%02x.%d: Only %d VFs enabled \n",
>>>> +                        pci_name(pdev), pci_domain_nr(pdev->bus),
>>>> +                        pdev->bus->number, PCI_SLOT(pdev->devfn),
>>>> +                        PCI_FUNC(pdev->devfn), num_vf_enabled);
>>>> +
>>>> +        return count;
>>>
>>>
>>> do you need pci_remove_rescan_mutex ?
>>>
>> I saw that in your patch set; wondering if i need to now that the
>> files are added in sysfs device attribute framework, which would have
>> same race for other attributes, I would think.  Thus, I didn't add it.
>> Greg?
>>
>>
>>> also that enable could take a while ..., may need to use
>>> device_schedule_callback to start one callback.
>>>
>> agreed, but device_schedule_callback doesn't allow a parameter
>> to be passed. only way I can think to get around it is to
>> put the requested-num-vf's in the pci-(pf)-dev structure,
>
> why not?
>
sorry, I don't know what you mean by 'why not?'...
why not put requested-num-vf's in pdev?
or why can't it be in device_schedule_callback() ?
other?

>> and have the pf driver pluck it from there.
>> other way to pass num-vf-to-enable?

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander H Duyck Oct. 6, 2012, 6:21 p.m. UTC | #5
On 10/4/2012 3:14 PM, Donald Dutile wrote:
> Provide files under sysfs to determine the max number of vfs
> an SRIOV-capable PCIe device supports, and methods to enable and
> disable the vfs on a per device basis.
>
> Currently, VF enablement by SRIOV-capable PCIe devices is done
> in driver-specific module parameters.  If not setup in modprobe files,
> it requires admin to unload & reload PF drivers with number of desired
> VFs to enable.  Additionally, the enablement is system wide: all
> devices controlled by the same driver have the same number of VFs
> enabled.  Although the latter is probably desired, there are PCI
> configurations setup by system BIOS that may not enable that to occur.
>
> Three files are created if a PCIe device has SRIOV support:
> sriov_max_vfs -- cat-ing this file returns the maximum number
>                   of VFs a PCIe device supports.
> sriov_enable_vfs -- echo'ing a number to this file enables this
>                      number of VFs for this given PCIe device.
>                   -- cat-ing this file will return the number of VFs
>                      currently enabled on this PCIe device.
> sriov_disable_vfs -- echo-ing any number other than 0 disables all
>                       VFs associated with this PCIe device.
>
> VF enable and disablement is invoked much like other PCIe
> configuration functions -- via registered callbacks in the driver,
> i.e., probe, release, etc.
>
> v1->v2:
> This patch is based on previous 2 patches by Yinghai Lu
> that cleaned up the vga attributes for PCI devices under sysfs,
> and uses visibility-checking group attributes as recommended by
> Greg K-H.
>
> Signed-off-by: Donald Dutile <ddutile@redhat.com>

I'm not certain if having separate values/functions for enable and 
disabling VFs will work out very well.  It seems like it would probably 
make more sense just to have one value that manipulates NumVFs.  As is, 
in the sriov_enable_vfs case we would end up having to disable SR-IOV 
should we be changing the number of VFs anyway so it would probably make 
sense to just merge both values into a single one and use 0 as the value 
to disable SR-IOV.

Also in the naming scheme for the values it may preferable to use the 
names of the values from the SR-IOV PCIe capability to avoid any 
confusion on what the values represent.  I believe the names for the two 
values you are representing are TotalVFs and NumVFs.

Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rose, Gregory V Oct. 8, 2012, 3:44 p.m. UTC | #6
On Sat, 6 Oct 2012 11:21:00 -0700
Alexander Duyck <alexander.duyck@gmail.com> wrote:

> On 10/4/2012 3:14 PM, Donald Dutile wrote:
> > Provide files under sysfs to determine the max number of vfs
> > an SRIOV-capable PCIe device supports, and methods to enable and
> > disable the vfs on a per device basis.
> >
> > Currently, VF enablement by SRIOV-capable PCIe devices is done
> > in driver-specific module parameters.  If not setup in modprobe
> > files, it requires admin to unload & reload PF drivers with number
> > of desired VFs to enable.  Additionally, the enablement is system
> > wide: all devices controlled by the same driver have the same
> > number of VFs enabled.  Although the latter is probably desired,
> > there are PCI configurations setup by system BIOS that may not
> > enable that to occur.
> >
> > Three files are created if a PCIe device has SRIOV support:
> > sriov_max_vfs -- cat-ing this file returns the maximum number
> >                   of VFs a PCIe device supports.
> > sriov_enable_vfs -- echo'ing a number to this file enables this
> >                      number of VFs for this given PCIe device.
> >                   -- cat-ing this file will return the number of VFs
> >                      currently enabled on this PCIe device.
> > sriov_disable_vfs -- echo-ing any number other than 0 disables all
> >                       VFs associated with this PCIe device.
> >
> > VF enable and disablement is invoked much like other PCIe
> > configuration functions -- via registered callbacks in the driver,
> > i.e., probe, release, etc.
> >
> > v1->v2:
> > This patch is based on previous 2 patches by Yinghai Lu
> > that cleaned up the vga attributes for PCI devices under sysfs,
> > and uses visibility-checking group attributes as recommended by
> > Greg K-H.
> >
> > Signed-off-by: Donald Dutile <ddutile@redhat.com>
> 
> I'm not certain if having separate values/functions for enable and 
> disabling VFs will work out very well.  It seems like it would
> probably make more sense just to have one value that manipulates
> NumVFs.  As is, in the sriov_enable_vfs case we would end up having
> to disable SR-IOV should we be changing the number of VFs anyway so
> it would probably make sense to just merge both values into a single
> one and use 0 as the value to disable SR-IOV.
> 
> Also in the naming scheme for the values it may preferable to use the 
> names of the values from the SR-IOV PCIe capability to avoid any 
> confusion on what the values represent.  I believe the names for the
> two values you are representing are TotalVFs and NumVFs.

Alex makes a good point here.  If you could change sriov_max_vfs to
total_vfs that would help.

And a single callback is all that is necessary, especially if you take
into account my further comments below.  After looking at the
implementation and thinking about it a bit over the weekend I see some
problems that we should probably address.

In my opinion we shouldn't be having the endpoint device driver call
the pci_enable_sriov() and pci_disable_sriov() functions as this
implementation requires.  The pci bus driver should do that itself and
then just notify the endpoint device driver of the action and, in the
case of enabling VFs, tell it how many were enabled, with the num_vfs
parameter.

There is a good reason for this too. I asked you to change the return
value of sriov_disable_vfs() to an int so that the driver could check
if VFs are assigned and return an error.  But the more I think of I
feel that is backwards.  This means that the system is at the mercy of
the endpoint device driver to do the check for assigned VFs.  Badly
written drivers might not do that check and then crash the system. Plus
that means every driver has to write the same code to check for
assigned VFs which could easily be put into the pci bus driver.

If the pci bus driver were to check for assigned VFs then it could take
steps to protect the system and not depend on dumb device driver
writers such as yours truly to do it.

I've pasted in the code below from the ixgbe driver that checks for
assigned VFs.  The pci bus driver has access to all the necessary
information for the check but it would need to get the VF device ID
from the PF SR-IOV capability structure.

- Greg

--------------

/* ixgbe_vfs_are_assigned - check whether any of this PF's VFs are in use
 * @adapter: PF private structure whose VFs should be checked
 *
 * Walks every PCI device matching this adapter's VF device ID and reports
 * whether any VF owned by this PF is currently assigned to a guest
 * (PCI_DEV_FLAGS_ASSIGNED).  Returns true if at least one owned VF is
 * assigned, false otherwise (including for MAC types with no VF part).
 */
static bool ixgbe_vfs_are_assigned(struct ixgbe_adapter *adapter)
{
        struct pci_dev *pdev = adapter->pdev;
        struct pci_dev *vfdev;
        int dev_id;

        /* map the PF MAC type to its matching VF device ID */
        switch (adapter->hw.mac.type) {
        case ixgbe_mac_82599EB:
                dev_id = IXGBE_DEV_ID_82599_VF;
                break;
        case ixgbe_mac_X540:
                dev_id = IXGBE_DEV_ID_X540_VF;
                break;
        default:
                return false;
        }

        /* loop through all the VFs to see if we own any that are assigned */
        vfdev = pci_get_device(PCI_VENDOR_ID_INTEL, dev_id, NULL);
        while (vfdev) {
                /* if we don't own it we don't care */
                if (vfdev->is_virtfn && vfdev->physfn == pdev) {
                        /* if it is assigned we cannot release it */
                        if (vfdev->dev_flags & PCI_DEV_FLAGS_ASSIGNED) {
                                /* pci_get_device() took a reference on
                                 * vfdev; drop it before bailing out to
                                 * avoid leaking the device refcount.
                                 */
                                pci_dev_put(vfdev);
                                return true;
                        }
                }

                /* pci_get_device() drops the reference on the previous
                 * vfdev and takes one on the next match, so the loop
                 * itself does not leak references.
                 */
                vfdev = pci_get_device(PCI_VENDOR_ID_INTEL, dev_id, vfdev);
        }

        return false;
}


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Don Dutile Oct. 9, 2012, 4:12 p.m. UTC | #7
On 10/06/2012 02:21 PM, Alexander Duyck wrote:
> On 10/4/2012 3:14 PM, Donald Dutile wrote:
>> Provide files under sysfs to determine the max number of vfs
>> an SRIOV-capable PCIe device supports, and methods to enable and
>> disable the vfs on a per device basis.
>>
>> Currently, VF enablement by SRIOV-capable PCIe devices is done
>> in driver-specific module parameters. If not setup in modprobe files,
>> it requires admin to unload & reload PF drivers with number of desired
>> VFs to enable. Additionally, the enablement is system wide: all
>> devices controlled by the same driver have the same number of VFs
>> enabled. Although the latter is probably desired, there are PCI
>> configurations setup by system BIOS that may not enable that to occur.
>>
>> Three files are created if a PCIe device has SRIOV support:
>> sriov_max_vfs -- cat-ing this file returns the maximum number
>> of VFs a PCIe device supports.
>> sriov_enable_vfs -- echo'ing a number to this file enables this
>> number of VFs for this given PCIe device.
>> -- cat-ing this file will return the number of VFs
>> currently enabled on this PCIe device.
>> sriov_disable_vfs -- echo-ing any number other than 0 disables all
>> VFs associated with this PCIe device.
>>
>> VF enable and disablement is invoked much like other PCIe
>> configuration functions -- via registered callbacks in the driver,
>> i.e., probe, release, etc.
>>
>> v1->v2:
>> This patch is based on previous 2 patches by Yinghai Lu
>> that cleaned up the vga attributes for PCI devices under sysfs,
>> and uses visibility-checking group attributes as recommended by
>> Greg K-H.
>>
>> Signed-off-by: Donald Dutile <ddutile@redhat.com>
>
> I'm not certain if having separate values/functions for enable and disabling VFs will work out very well. It seems like it would probably make more sense just to have one value that manipulates NumVFs. As is, in the sriov_enable_vfs case we would end up having to disable SR-IOV should we be changing the number of VFs anyway so it would probably make sense to just merge both values into a single one and use 0 as the value to disable SR-IOV.
>
> Also in the naming scheme for the values it may preferable to use the names of the values from the SR-IOV PCIe capability to avoid any confusion on what the values represent. I believe the names for the two values you are representing are TotalVFs and NumVFs.
>
> Thanks,
>
> Alex
Agreed!
The reason I changed the names from 'set_max_vfs' to 'sriov_enable_vfs' was to
not confuse terms.  I think the 'set_max_vfs' was taken from the context that the
drivers currently set the number of enabled vfs by a driver param named 'max_vfs' today;
maybe that should be changed as well to 'numvfs'.

If patches go as you & Greg Rose suggest, then 'sriov_max_vfs' becomes 'sriov_totalvfs'.
cat'ing the (renamed sriov_enable_vfs to ) 'sriov_numvfs' would work.
  

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Don Dutile Oct. 9, 2012, 6:39 p.m. UTC | #8
On 10/08/2012 11:44 AM, Greg Rose wrote:
> On Sat, 6 Oct 2012 11:21:00 -0700
> Alexander Duyck<alexander.duyck@gmail.com>  wrote:
>
>> On 10/4/2012 3:14 PM, Donald Dutile wrote:
>>> Provide files under sysfs to determine the max number of vfs
>>> an SRIOV-capable PCIe device supports, and methods to enable and
>>> disable the vfs on a per device basis.
>>>
>>> Currently, VF enablement by SRIOV-capable PCIe devices is done
>>> in driver-specific module parameters.  If not setup in modprobe
>>> files, it requires admin to unload & reload PF drivers with number
>>> of desired VFs to enable.  Additionally, the enablement is system
>>> wide: all devices controlled by the same driver have the same
>>> number of VFs enabled.  Although the latter is probably desired,
>>> there are PCI configurations setup by system BIOS that may not
>>> enable that to occur.
>>>
>>> Three files are created if a PCIe device has SRIOV support:
>>> sriov_max_vfs -- cat-ing this file returns the maximum number
>>>                    of VFs a PCIe device supports.
>>> sriov_enable_vfs -- echo'ing a number to this file enables this
>>>                       number of VFs for this given PCIe device.
>>>                    -- cat-ing this file will return the number of VFs
>>>                       currently enabled on this PCIe device.
>>> sriov_disable_vfs -- echo-ing any number other than 0 disables all
>>>                        VFs associated with this PCIe device.
>>>
>>> VF enable and disablement is invoked much like other PCIe
>>> configuration functions -- via registered callbacks in the driver,
>>> i.e., probe, release, etc.
>>>
>>> v1->v2:
>>> This patch is based on previous 2 patches by Yinghai Lu
>>> that cleaned up the vga attributes for PCI devices under sysfs,
>>> and uses visibility-checking group attributes as recommended by
>>> Greg K-H.
>>>
>>> Signed-off-by: Donald Dutile<ddutile@redhat.com>
>>
>> I'm not certain if having separate values/functions for enable and
>> disabling VFs will work out very well.  It seems like it would
>> probably make more sense just to have one value that manipulates
>> NumVFs.  As is, in the sriov_enable_vfs case we would end up having
>> to disable SR-IOV should we be changing the number of VFs anyway so
>> it would probably make sense to just merge both values into a single
>> one and use 0 as the value to disable SR-IOV.
>>
>> Also in the naming scheme for the values it may preferable to use the
>> names of the values from the SR-IOV PCIe capability to avoid any
>> confusion on what the values represent.  I believe the names for the
>> two values you are representing are TotalVFs and NumVFs.
>
> Alex makes a good point here.  If you could change sriov_max_vfs to
> total_vfs that would help.
>
consider it done; pls read my reply to Alex's suggestion(s).

> And a single callback is all that is necessary, especially if you take
> into account my further comments below.  After looking at the
> implementation and thinking about it a bit over the weekend I see some
> problems that we should probably address.
>
+1.

> In my opinion we shouldn't be having the endpoint device driver call
> the pci_enable_sriov() and pci_disable_sriov() functions as this
> implementation requires.  The pci bus driver should do that itself and
> then just notify the endpoint device driver of the action and, in the
> case of enabling VFs, tell it how many were enabled, with the num_vfs
> parameter.
>
I want to double-check this logic, but on first glance I agree for this API...
But, until the PF driver's, param-based, 'max_vfs' interface is removed,
the driver will still have to have a code path that does the enable/disable_sriov().
Again, I agree it is flawed, as a bad driver not calling disable_sriov()
will leak resources and quickly fail other PCI (hotplug) ops (be they VFs or PFs).
Q: what did you mean by 'and then just notify the endpoint device driver' ?
    -- another/new api for pf drivers w/sriov support/device ??

> There is a good reason for this too. I asked you to change the return
> value of sriov_disable_vfs() to an int so that the driver could check
> if VFs are assigned and return an error.  But the more I think of I
> feel that is backwards.  This means that the system is at the mercy of
> the endpoint device driver to do the check for assigned VFs.  Badly
> written drivers might not do that check and then crash the system. Plus
> that means every driver has to write the same code to check for
> assigned VFs which could easily be put into the pci bus driver.
>
Isn't this code still needed when user-level unbind done, i.e.,
echo 1 > /sys/bus/pci/drivers/ixgbe/unbind

remove_id is another one, but bind/unbind the common path for virtualization

> If the pci bus driver were to check for assigned VFs then it could take
> steps to protect the system and not depend on dumb device driver
> writers such as yours truly to do it.
>


> I've pasted in the code below from the ixgbe driver that checks for
> assigned VFs.  The pci bus driver has access to all the necessary
> information for the check but it would need to get the VF device ID
> from the PF SR-IOV capability structure.
>
easy to do; already done & in the vid/did of the vf's pci-dev structure.

> - Greg
>
> --------------
>
> static bool ixgbe_vfs_are_assigned(struct ixgbe_adapter *adapter)
> {
>          struct pci_dev *pdev = adapter->pdev;
>          struct pci_dev *vfdev;
>          int dev_id;
>
>          switch (adapter->hw.mac.type) {
>          case ixgbe_mac_82599EB:
>                  dev_id = IXGBE_DEV_ID_82599_VF;
>                  break;
>          case ixgbe_mac_X540:
>                  dev_id = IXGBE_DEV_ID_X540_VF;
>                  break;
>          default:
>                  return false;
>          }
>
>          /* loop through all the VFs to see if we own any that are assigned */
>          vfdev = pci_get_device(PCI_VENDOR_ID_INTEL, dev_id, NULL);
>          while (vfdev) {
>                  /* if we don't own it we don't care */
>                  if (vfdev->is_virtfn && vfdev->physfn == pdev) {
>                          /* if it is assigned we cannot release it */
>                          if (vfdev->dev_flags & PCI_DEV_FLAGS_ASSIGNED)
>                                  return true;
>                  }
>
>                  vfdev = pci_get_device(PCI_VENDOR_ID_INTEL, dev_id, vfdev);
>          }
>
>          return false;
> }
>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rose, Gregory V Oct. 9, 2012, 8:31 p.m. UTC | #9
> -----Original Message-----
> From: Don Dutile [mailto:ddutile@redhat.com]
> Sent: Tuesday, October 09, 2012 11:40 AM
> To: Rose, Gregory V
> Cc: Alexander Duyck; linux-pci@vger.kernel.org; bhelgaas@google.com;
> yuvalmin@broadcom.com; bhutchings@solarflare.com; davem@davemloft.net
> Subject: Re: [RFC v2] PCI: sysfs per device SRIOV control and status
> 
> On 10/08/2012 11:44 AM, Greg Rose wrote:
> > On Sat, 6 Oct 2012 11:21:00 -0700

[snip]

> > In my opinion we shouldn't be having the endpoint device driver call
> > the pci_enable_sriov() and pci_disable_sriov() functions as this
> > implementation requires.  The pci bus driver should do that itself and
> > then just notify the endpoint device driver of the action and, in the
> > case of enabling VFs, tell it how many were enabled, with the num_vfs
> > parameter.
> >
> I want to double-check this logic, but on first glance I agree for this
> API...
> But, until the PF driver's, param-based, 'max_vfs' interface is removed,
> the driver will still have to have a code path that does the
> enable/disable_sriov().

True, but those will be somewhat different code paths in our driver and I'm going to insert a message into the driver that lets the user know that using the 'max_vfs' module parameter is deprecated.  Eventually I'd like to get rid of it entirely when this new interface becomes common.

> Again, I agree it is flawed, as a bad driver not calling disable_sriov()
> will leak resources and quickly fail other PCI (hotplug) ops (be they VFs
> or PFs).
> Q: what did you mean by 'and then just notify the endpoint device driver'
> ?
>     -- another/new api for pf drivers w/sriov support/device ??

No, I was thinking of a sequence like this:

1) PCI bus driver notifies endpoint device driver that user requests <num_vfs>
2) The endpoint driver processes the request and configures the HW to be ready for VFs coming on-line.
3) The endpoint device driver returns the number of VFs it is able to create or else a negative value of some error code.
4) The PCI bus driver sees that the notification/request is successful so it goes ahead and calls pci_enable_sriov() to enable the VFs.

This has the advantage of minimizing the amount of disruption in service to a minimum.  Also, it is how I'm handling it now in my experimental patches.  Everything is checked, configured and gotten ready and then the last thing I do before returning from sriov_enable_vfs() is call pci_enable_sriov().

That said, I suppose it is six of one and half a dozen of another except I like the approach I mention a little better because it forces the endpoint device driver to configure everything first successfully before enabling the VFs.  In the PCI bus driver if the call to pci_enable_sriov() fails then it can just quickly turn around and call the endpoint device driver to disable SR-IOV so that it will reconfigure itself to non SR-IOV operational mode.

> 
> > There is a good reason for this too. I asked you to change the return
> > value of sriov_disable_vfs() to an int so that the driver could check
> > if VFs are assigned and return an error.  But the more I think of I
> > feel that is backwards.  This means that the system is at the mercy of
> > the endpoint device driver to do the check for assigned VFs.  Badly
> > written drivers might not do that check and then crash the system.
> > Plus that means every driver has to write the same code to check for
> > assigned VFs which could easily be put into the pci bus driver.
> >
> Isn't this code still needed when user-level unbind done, i.e., echo 1 >
> /sys/bus/pci/drivers/ixgbe/unbind

Oh yeah.  Since that's the case then the endpoint device drivers will still need to do the check, but it might be a good idea to go ahead and put the check into the PCI bus driver also.  System panics are bad and whatever we can do to avoid them is hence good.

> 
> remove_id is another one, but bind/unbind the common path for
> virtualization
> 
> > If the pci bus driver were to check for assigned VFs then it could
> > take steps to protect the system and not depend on dumb device driver
> > writers such as yours truly to do it.
> >
> 
> 
> > I've pasted in the code below from the ixgbe driver that checks for
> > assigned VFs.  The pci bus driver has access to all the necessary
> > information for the check but it would need to get the VF device ID
> > from the PF SR-IOV capability structure.
> >
> easy to do; already done & in the vid/did of the vf's pci-dev structure.

OK, good deal.

I've finished some patches for the ixgbe driver and they are under internal review.  Testing turned out pretty well.  Hopefully they'll be ready for posting tomorrow afternoon since I'm gone Thursday and Friday.  They'll be based on your RFC v2 patch.  I don't expect many changes will be required for the next version of your patch.

- Greg

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Don Dutile Oct. 9, 2012, 10:49 p.m. UTC | #10
On 10/09/2012 04:31 PM, Rose, Gregory V wrote:
>> -----Original Message-----
>> From: Don Dutile [mailto:ddutile@redhat.com]
>> Sent: Tuesday, October 09, 2012 11:40 AM
>> To: Rose, Gregory V
>> Cc: Alexander Duyck; linux-pci@vger.kernel.org; bhelgaas@google.com;
>> yuvalmin@broadcom.com; bhutchings@solarflare.com; davem@davemloft.net
>> Subject: Re: [RFC v2] PCI: sysfs per device SRIOV control and status
>>
>> On 10/08/2012 11:44 AM, Greg Rose wrote:
>>> On Sat, 6 Oct 2012 11:21:00 -0700
>
> [snip]
>
>>> In my opinion we shouldn't be having the endpoint device driver call
>>> the pci_enable_sriov() and pci_disable_sriov() functions as this
>>> implementation requires.  The pci bus driver should do that itself and
>>> then just notify the endpoint device driver of the action and, in the
>>> case of enabling VFs, tell it how many were enabled, with the num_vfs
>>> parameter.
>>>
>> I want to double-check this logic, but on first glance I agree for this
>> API...
>> But, until the PF driver's, param-based, 'max_vfs' interface is removed,
>> the driver will still have to have a code path that does the
>> enable/disable_sriov().
>
> True, but those will be somewhat different code paths in our driver and I'm going to insert a message into the driver that lets the user know that using the 'max_vfs' module parameter is deprecated.  Eventually I'd like to get rid of it entirely when this new interface becomes common.
>
Sounds like plan, i.e., removing stuff that may cause problems.. :)

>> Again, I agree it is flawed, as a bad driver not calling disable_sriov()
>> will leak resources and quickly fail other PCI (hotplug) ops (be they VFs
>> or PFs).
>> Q: what did you mean by 'and then just notify the endpoint device driver'
>> ?
>>      -- another/new api for pf drivers w/sriov support/device ??
>
> No, I was thinking of a sequence like this:
>
> 1) PCI bus driver notifies endpoint device driver that user requests <num_vfs>
> 2) The endpoint driver processes the request and configures the HW to be ready for VFs coming on-line.
> 3) The endpoint device driver returns the number of VFs it is able to create or else a negative value of some error code.
> 4) The PCI bus driver sees that the notification/request is successful so it goes ahead and calls pci_enable_sriov() to enable the VFs.
>
> This has the advantage of minimizing the amount of disruption in service to a minimum.  Also, it is how I'm handling it now in my experimental patches.  Everything is checked, configured and gotten ready and then the last thing I do before returning from sriov_enable_vfs() is call pci_enable_sriov().
>
> That said, I suppose it is six of one and half a dozen of another except I like the approach I mention a little better because it forces the endpoint device driver to configure everything first successfully before enabling the VFs.  In the PCI bus driver if the call to pci_enable_sriov() fails then it can just quickly turn around and call the endpoint device driver to disable SR-IOV so that it will reconfigure itself to non SR-IOV operational mode.
>
Well, there's all kind of fun that can still occur at pci_sriov_enable() time --
not enough mmio resources is one of them.
So, the above sounds like a reasonable change, and IMO, makes more sense
to have the core code doing enable/disable & appropriate checking.

>>
>>> There is a good reason for this too. I asked you to change the return
>>> value of sriov_disable_vfs() to an int so that the driver could check
>>> if VFs are assigned and return an error.  But the more I think of I
>>> feel that is backwards.  This means that the system is at the mercy of
>>> the endpoint device driver to do the check for assigned VFs.  Badly
>>> written drivers might not do that check and then crash the system.
>>> Plus that means every driver has to write the same code to check for
>>> assigned VFs which could easily be put into the pci bus driver.
>>>
>> Isn't this code still needed when a user-level unbind is done, i.e., echo 1 >
>> /sys/bus/pci/drivers/ixgbe/unbind
>
> Oh yeah.  Since that's the case then the endpoint device drivers will still need to do the check, but it might be a good idea to go ahead and put the check into the PCI bus driver also.  System panics are bad and whatever we can do to avoid them is hence good.
>
I get your type --- get it out of my driver... put it in the core!
winner! :)  The issue is that bind/unbind is under pci/drivers/<blah>
and not pci/devices/D:B:D.F/, so I have to find where to hook into bind/unbind
so the vf->is_assigned check can be done for PCI devices.

>>
>> remove_id is another one, but bind/unbind the common path for
>> virtualization
>>
>>> If the pci bus driver were to check for assigned VFs then it could
>>> take steps to protect the system and not depend on dumb device driver
>>> writers such as yours truly to do it.
>>>
>>
>>
>>> I've pasted in the code below from the ixgbe driver that checks for
>>> assigned VFs.  The pci bus driver has access to all the necessary
>>> information for the check but it would need to get the VF device ID
>>> from the PF SR-IOV capability structure.
>>>
>> easy to do; it's already done & available in the vid/did of the VF's pci-dev structure.
>
> OK, good deal.
>
> I've finished some patches for the ixgbe driver and they are under internal review.  Testing turned out pretty well.  Hopefully they'll be ready for posting tomorrow afternoon since I'm gone Thursday and Friday.  They'll be based on your RFC v2 patch.  I don't expect many changes will be required for the next version of your patch.
>
> - Greg
>
Great!
I'll work on re-doing the patch set to have just the two sysfs files
as discussed above.
We can make an incremental change to move the sriov_enable/disable since
your patch has it ready to do just that.

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Mintz Oct. 24, 2012, 5:47 a.m. UTC | #11
>> I've finished some patches for the ixgbe driver and they are under internal review.  Testing turned out pretty well.  Hopefully they'll be ready for posting tomorrow afternoon since I'm gone Thursday and Friday.  They'll be based on your RFC v2 patch.  I don't expect many changes will be required for the next version of your patch.
>>
>> - Greg
>>
> Great!
> I'll work on re-doing the patch set to have just the two sysfs files
> as discussed above.
> We can make an incremental change to move the sriov_enable/disable since
> your patch has it ready to do just that.

Hi,

I've failed to find [RFC v3]; Will such a series be sent in the imminent future?

Thanks,
Yuval


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Don Dutile Oct. 25, 2012, 1:19 p.m. UTC | #12
On 10/24/2012 01:47 AM, Yuval Mintz wrote:
>>> I've finished some patches for the ixgbe driver and they are under internal review.  Testing turned out pretty well.  Hopefully they'll be ready for posting tomorrow afternoon since I'm gone Thursday and Friday.  They'll be based on your RFC v2 patch.  I don't expect many changes will be required for the next version of your patch.
>>>
>>> - Greg
>>>
>> Great!
>> I'll work on re-doing the patch set to have just the two sysfs files
>> as discussed above.
>> We can make an incremental change to move the sriov_enable/disable since
>> your patch has it ready to do just that.
>
> Hi,
>
> I've failed to find [RFC v3]; Will such a series be sent in the imminent future?
>
> Thanks,
> Yuval
>
>
Sorry for delay... (many) customers getting in the way again! ;-)
I'll be posting a V3 set shortly.
Testing w/patches Greg Rose gave me for ixgbe to fully exercise the
good & failure paths, so the V3 set should be in good functional
shape; just finished the testing late last night.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index fbbb97f..e6522c2 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -404,6 +404,128 @@  static ssize_t d3cold_allowed_show(struct device *dev,
 }
 #endif
 
+bool pci_dev_has_sriov(struct pci_dev *pdev)
+{
+        int ret = false;
+        int pos;
+
+        if (!pci_is_pcie(pdev))
+                goto out;
+
+        pos =  pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+        if (pos)
+                ret = true;
+out:
+        return ret;
+}
+
+#ifdef CONFIG_PCI_IOV
+static ssize_t sriov_max_vfs_show(struct device *dev,
+                                  struct device_attribute *attr,
+                                  char *buf)
+{
+        struct pci_dev *pdev;
+
+        pdev = to_pci_dev(dev);
+        return sprintf (buf, "%u\n", pdev->sriov->total);
+}
+
+
+static ssize_t sriov_enable_vfs_show(struct device *dev,
+                                     struct device_attribute *attr,
+                                     char *buf)
+{
+        struct pci_dev *pdev;
+
+        pdev = to_pci_dev(dev);
+
+        return sprintf (buf, "%u\n", pdev->sriov->nr_virtfn);
+}
+
+static ssize_t sriov_enable_vfs_store(struct device *dev,
+                                      struct device_attribute *attr,
+                                      const char *buf, size_t count)
+{
+        struct pci_dev *pdev;
+        int num_vf_enabled = 0;
+        unsigned long num_vfs;
+        pdev = to_pci_dev(dev);
+
+        /* Requested VFs to enable < max_vfs
+         * and none enabled already
+         */
+        if (strict_strtoul(buf, 0, &num_vfs) < 0)
+		return -EINVAL;
+
+        if ((num_vfs > pdev->sriov->total) ||
+            (pdev->sriov->nr_virtfn != 0))
+                return -EINVAL;
+
+        /* Ready for lift-off! */
+        if (pdev->driver && pdev->driver->sriov_enable) {
+                num_vf_enabled = pdev->driver->sriov_enable(pdev, num_vfs);
+        }
+
+        if (num_vf_enabled != num_vfs)
+                printk(KERN_WARNING
+                        "%s: %04x:%02x:%02x.%d: Only %d VFs enabled \n",
+                        pci_name(pdev), pci_domain_nr(pdev->bus),
+                        pdev->bus->number, PCI_SLOT(pdev->devfn),
+                        PCI_FUNC(pdev->devfn), num_vf_enabled);
+
+        return count;
+}
+
+static ssize_t sriov_disable_vfs_store(struct device *dev,
+                                       struct device_attribute *attr,
+                                       const char *buf, size_t count)
+{
+        struct pci_dev *pdev;
+	int ret = 0;
+
+        pdev = to_pci_dev(dev);
+
+        /* make sure sriov device & at least 1 vf enabled */
+        if (pdev->sriov->nr_virtfn == 0) {
+		ret = -ENODEV;
+                goto out;
+	}
+
+        /* Ready for landing! */
+	/* Note: Disabling VF can fail if VF assigned to virtualized guest */
+        if (pdev->driver && pdev->driver->sriov_disable) {
+                ret = pdev->driver->sriov_disable(pdev);
+        }
+out:
+        return ret ? ret : count;
+}
+#else
+static ssize_t sriov_max_vfs_show(struct device *dev,
+                                  struct device_attribute *attr,
+                                  char *buf)
+{ return 0; }
+static ssize_t sriov_enable_vfs_show(struct device *dev,
+                                     struct device_attribute *attr,
+                                     char *buf)
+{ return 0; }
+static ssize_t sriov_enable_vfs_store(struct device *dev,
+                                      struct device_attribute *attr,
+                                      const char *buf, size_t count)
+{ return count; }
+static ssize_t sriov_disable_vfs_store(struct device *dev,
+                                       struct device_attribute *attr,
+                                       const char *buf, size_t count)
+{ return count; }
+#endif /* CONFIG_PCI_IOV */
+
+static struct device_attribute sriov_max_vfs_attr = __ATTR_RO(sriov_max_vfs);
+static struct device_attribute sriov_enable_vfs_attr = 
+		__ATTR(sriov_enable_vfs, (S_IRUGO|S_IWUSR|S_IWGRP),
+                	sriov_enable_vfs_show, sriov_enable_vfs_store);
+static struct device_attribute sriov_disable_vfs_attr = 
+        	__ATTR(sriov_disable_vfs, (S_IWUSR|S_IWGRP),
+                	NULL, sriov_disable_vfs_store);
+
 struct device_attribute pci_dev_attrs[] = {
 	__ATTR_RO(resource),
 	__ATTR_RO(vendor),
@@ -1408,8 +1530,15 @@  static struct attribute *pci_dev_dev_attrs[] = {
 	NULL,
 };
 
+static struct attribute *sriov_dev_attrs[] = {
+        &sriov_max_vfs_attr.attr,
+        &sriov_enable_vfs_attr.attr,
+        &sriov_disable_vfs_attr.attr,
+	NULL,
+};
+
 static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
-						struct attribute *a, int n)
+					 struct attribute *a, int n)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct pci_dev *pdev = to_pci_dev(dev);
@@ -1421,13 +1550,35 @@  static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
 	return a->mode;
 }
 
+static umode_t sriov_attrs_are_visible(struct kobject *kobj,
+					 struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if ((a == &sriov_max_vfs_attr.attr) ||
+	    (a == &sriov_enable_vfs_attr.attr) ||
+	    (a == &sriov_disable_vfs_attr.attr)) {
+		if (!pci_dev_has_sriov(pdev))
+			return 0;
+	}
+
+	return a->mode;
+}
+
 static struct attribute_group pci_dev_attr_group = {
 	.attrs = pci_dev_dev_attrs,
 	.is_visible = pci_dev_attrs_are_visible,
 };
 
+static struct attribute_group sriov_dev_attr_group = {
+	.attrs = sriov_dev_attrs,
+	.is_visible = sriov_attrs_are_visible,
+};
+
 static const struct attribute_group *pci_dev_attr_groups[] = {
 	&pci_dev_attr_group,
+	&sriov_dev_attr_group,
 	NULL,
 };
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index be1de01..b3e25a8 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -595,6 +595,8 @@  struct pci_driver {
 	int  (*resume_early) (struct pci_dev *dev);
 	int  (*resume) (struct pci_dev *dev);	                /* Device woken up */
 	void (*shutdown) (struct pci_dev *dev);
+	int (*sriov_enable) (struct pci_dev *dev, int num_vfs);    /* PF pci dev */
+	int (*sriov_disable) (struct pci_dev *dev);
 	const struct pci_error_handlers *err_handler;
 	struct device_driver	driver;
 	struct pci_dynids dynids;