Patchwork [3/8] PCI: sysfs per device SRIOV control and status

login
register
mail settings
Submitter Don Dutile
Date Oct. 25, 2012, 6:38 p.m.
Message ID <1351190310-5043-4-git-send-email-ddutile@redhat.com>
Download mbox | patch
Permalink /patch/194272/
State Superseded
Headers show

Comments

Don Dutile - Oct. 25, 2012, 6:38 p.m.
Provide files under sysfs to determine the max number of vfs
an SRIOV-capable PCIe device supports, and methods to enable and
disable the vfs on a per device basis.

Currently, VF enablement by SRIOV-capable PCIe devices is done
in driver-specific module parameters.  If not setup in modprobe files,
it requires admin to unload & reload PF drivers with number of desired
VFs to enable.  Additionally, the enablement is system wide: all
devices controlled by the same driver have the same number of VFs
enabled.  Although the latter is probably desired, there are PCI
configurations setup by system BIOS that may not enable that to occur.

Three files are created if a PCIe device has SRIOV support:
sriov_totalvfs -- cat-ing this file returns the maximum number
                  of VFs a PCIe device supports as reported by
                  the TotalVFs in the SRIOV ext cap structure.
sriov_numvfs -- echo'ing a positive number to this file enables this
                number of VFs for this given PCIe device.
             -- echo'ing a 0 to this file disables
                any previously enabled VFs for this PCIe device.
             -- cat-ing this file will return the number of VFs
                currently enabled on this PCIe device, i.e.,
                the NumVFs field in SRIOV ext. cap structure.

VF enable and disablement is invoked much like other PCIe
configuration functions -- via registered callbacks in the driver,
i.e., probe, release, etc.

Signed-off-by: Donald Dutile <ddutile@redhat.com>
---
 drivers/pci/pci-sysfs.c | 155 ++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/pci.h     |   1 +
 2 files changed, 146 insertions(+), 10 deletions(-)
Ben Hutchings - Oct. 25, 2012, 8:17 p.m.
On Thu, 2012-10-25 at 14:38 -0400, Donald Dutile wrote:
[...]
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -14,6 +14,7 @@
>   *
>   */
>  
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* has to precede printk.h */
>  
>  #include <linux/kernel.h>
>  #include <linux/sched.h>
> @@ -66,7 +67,7 @@ static ssize_t broken_parity_status_store(struct device *dev,
>  	struct pci_dev *pdev = to_pci_dev(dev);
>  	unsigned long val;
>  
> -	if (strict_strtoul(buf, 0, &val) < 0)
> +	if (kstrtoul(buf, 0, &val) < 0)
>  		return -EINVAL;
>  
>  	pdev->broken_parity_status = !!val;
> @@ -188,7 +189,7 @@ static ssize_t is_enabled_store(struct device *dev,
>  {
>  	struct pci_dev *pdev = to_pci_dev(dev);
>  	unsigned long val;
> -	ssize_t result = strict_strtoul(buf, 0, &val);
> +	ssize_t result = kstrtoul(buf, 0, &val);
>  
>  	if (result < 0)
>  		return result;
> @@ -259,7 +260,7 @@ msi_bus_store(struct device *dev, struct device_attribute *attr,
>  	struct pci_dev *pdev = to_pci_dev(dev);
>  	unsigned long val;
>  
> -	if (strict_strtoul(buf, 0, &val) < 0)
> +	if (kstrtoul(buf, 0, &val) < 0)
>  		return -EINVAL;
>  
>  	/* bad things may happen if the no_msi flag is changed
> @@ -292,7 +293,7 @@ static ssize_t bus_rescan_store(struct bus_type *bus, const char *buf,
>  	unsigned long val;
>  	struct pci_bus *b = NULL;
>  
> -	if (strict_strtoul(buf, 0, &val) < 0)
> +	if (kstrtoul(buf, 0, &val) < 0)
>  		return -EINVAL;
>  
>  	if (val) {
> @@ -316,7 +317,7 @@ dev_rescan_store(struct device *dev, struct device_attribute *attr,
>  	unsigned long val;
>  	struct pci_dev *pdev = to_pci_dev(dev);
>  
> -	if (strict_strtoul(buf, 0, &val) < 0)
> +	if (kstrtoul(buf, 0, &val) < 0)
>  		return -EINVAL;
>  
>  	if (val) {
> @@ -343,7 +344,7 @@ remove_store(struct device *dev, struct device_attribute *dummy,
>  	int ret = 0;
>  	unsigned long val;
>  
> -	if (strict_strtoul(buf, 0, &val) < 0)
> +	if (kstrtoul(buf, 0, &val) < 0)
>  		return -EINVAL;
>  
>  	/* An attribute cannot be unregistered by one of its own methods,
> @@ -363,7 +364,7 @@ dev_bus_rescan_store(struct device *dev, struct device_attribute *attr,
>  	unsigned long val;
>  	struct pci_bus *bus = to_pci_bus(dev);
>  
> -	if (strict_strtoul(buf, 0, &val) < 0)
> +	if (kstrtoul(buf, 0, &val) < 0)
>  		return -EINVAL;
>  
>  	if (val) {
> @@ -387,7 +388,7 @@ static ssize_t d3cold_allowed_store(struct device *dev,
>  	struct pci_dev *pdev = to_pci_dev(dev);
>  	unsigned long val;
>  
> -	if (strict_strtoul(buf, 0, &val) < 0)
> +	if (kstrtoul(buf, 0, &val) < 0)
>  		return -EINVAL;
>  
>  	pdev->d3cold_allowed = !!val;

All this cleanup belongs in a separate patch.

> @@ -404,6 +405,114 @@ static ssize_t d3cold_allowed_show(struct device *dev,
>  }
>  #endif
>  
> +#ifdef CONFIG_PCI_IOV
> +static ssize_t sriov_totalvfs_show(struct device *dev,
> +                                  struct device_attribute *attr,
> +                                  char *buf)
> +{
> +	struct pci_dev *pdev;
> +	u16 total;
> +
> +	pdev = to_pci_dev(dev);
> +	total = pdev->sriov->total;
> +	return sprintf (buf, "%u\n", total);

No space after sprintf, please.

> +}
> +
> +
> +static ssize_t sriov_numvfs_show(struct device *dev,
> +                                 struct device_attribute *attr,
> +                                 char *buf)
> +{
> +	struct pci_dev *pdev;
> +
> +	pdev = to_pci_dev(dev);
> +
> +	return sprintf (buf, "%u\n", pdev->sriov->nr_virtfn);
> +}
> +
> +/* 
> + * num_vfs > 0; number of vfs to enable
> + * num_vfs = 0; disable all vfs
> + *
> + * Note: SRIOV spec doesn't allow partial VF
> + *       disable, so its all or none.
> + */
> +static ssize_t sriov_numvfs_store(struct device *dev,
> +                                  struct device_attribute *attr,
> +                                  const char *buf, size_t count)
> +{
> +	struct pci_dev *pdev;
> +	int num_vfs_enabled = 0;
> +	int num_vfs;
> +	int ret = 0;
> +	u16 total;
> +
> +	pdev = to_pci_dev(dev);
> +
> +	/* Requested VFs to enable < totalvfs
> +	 * and none enabled already
> +	 */
> +	if (kstrtoint(buf, 0, &num_vfs) < 0)
> +		return -EINVAL;

The above comment belongs with
'if ((num_vfs > 0) && (num_vfs <= total))'.

> +	/* is PF driver loaded w/callback */
> +	if (!pdev->driver || !pdev->driver->sriov_configure) {
> +		pr_info("Driver doesn't support sriov configuration via sysfs \n");
> +		return -ENOSYS;

Use dev_err() to set the proper severity and provide context.  Not sure
whether this is the right error code.

> +	}
> +
> +	/* if enabling vf's ... */
> +	total = pdev->sriov->total;
> +	if ((num_vfs > 0) && (num_vfs <= total)) {
> +		if (pdev->sriov->nr_virtfn == 0) { /* if not already enabled */
> +		    num_vfs_enabled = pdev->driver->sriov_configure(pdev, num_vfs);
> +        	    if ((num_vfs_enabled >= 0) && (num_vfs_enabled != num_vfs))
> +			pr_warn("%s: %04x:%02x:%02x.%d: Only %d VFs enabled \n",
> +                        	pdev->dev.driver->name, pci_domain_nr(pdev->bus),
> +                        	pdev->bus->number, PCI_SLOT(pdev->devfn),
> +                        	PCI_FUNC(pdev->devfn), num_vfs_enabled);

Use dev_warn(), don't try to replicate it.

> +			return count;
> +		} else {
> +			pr_warn("VFs already enabled. Disable before re-enabling\n");
> +			return -EINVAL;

It would be helpful to make an 'enable' write succeed when
num_vfs == pdev->sriov->nr_virtfn.

> +		}
> +	} 
> +
> +	/* disable vfs */
> +	if (num_vfs == 0) {
> +		if (pdev->sriov->nr_virtfn != 0) {
> +			ret = pdev->driver->sriov_configure(pdev, 0);
> +			return ret ? ret: count;
> +		} else {
> +			pr_err("Disable failed since no VFs enabled\n");
> +			return -EINVAL;

This definitely should be treated as successful, not an error.

> +		}
> +	}
> +
> +	pr_err("Invalid value for number of VFs to enable: %d\n", num_vfs);
> +
> +	return -EINVAL;
> +}
> +#else
> +static ssize_t sriov_totalvfs_show(struct device *dev,
> +                                   struct device_attribute *attr,
> +                                   char *buf)
> +{ return 0; }
> +static ssize_t sriov_numvfs_show(struct device *dev,
> +			         struct device_attribute *attr,
> +			         char *buf)
> +{ return 0; }

Shouldn't these print "0\n"?

> +static ssize_t sriov_numvfs_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count)
> +{ return count; }
> +#endif /* CONFIG_PCI_IOV */
> +
> +static struct device_attribute sriov_totalvfs_attr = __ATTR_RO(sriov_totalvfs);
> +static struct device_attribute sriov_numvfs_attr = 
> +		__ATTR(sriov_numvfs, (S_IRUGO|S_IWUSR|S_IWGRP),
> +                	sriov_numvfs_show, sriov_numvfs_store);

sriov_numvfs should be read-only if !CONFIG_PCI_IOV, rather than
ignoring writes.

>  struct device_attribute pci_dev_attrs[] = {
>  	__ATTR_RO(resource),
>  	__ATTR_RO(vendor),
> @@ -1194,7 +1303,7 @@ static ssize_t reset_store(struct device *dev,
>  {
>  	struct pci_dev *pdev = to_pci_dev(dev);
>  	unsigned long val;
> -	ssize_t result = strict_strtoul(buf, 0, &val);
> +	ssize_t result = kstrtoul(buf, 0, &val);
>  
>  	if (result < 0)
>  		return result;
> @@ -1408,8 +1517,14 @@ static struct attribute *pci_dev_dev_attrs[] = {
>  	NULL,
>  };
>  
> +static struct attribute *sriov_dev_attrs[] = {
> +        &sriov_totalvfs_attr.attr,
> +        &sriov_numvfs_attr.attr,

These are wrongly indented with spaces.

> +	NULL,
> +};
> +
>  static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
> -						struct attribute *a, int n)
> +					 struct attribute *a, int n)
>  {
>  	struct device *dev = container_of(kobj, struct device, kobj);
>  	struct pci_dev *pdev = to_pci_dev(dev);
> @@ -1421,13 +1536,33 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
>  	return a->mode;
>  }
>  
> +static umode_t sriov_attrs_are_visible(struct kobject *kobj,
> +					 struct attribute *a, int n)
> +{
> +	struct device *dev = container_of(kobj, struct device, kobj);
> +
> +	if ((a == &sriov_totalvfs_attr.attr) ||
> +	    (a == &sriov_numvfs_attr.attr)) {
> +		if (!dev_is_pf(dev))
> +			return 0;
[...]

An odd thing about dev_is_pf() is that if !CONFIG_PCI_IOV then it is
false for all devices.  Which means that the dummy attribute reader
functions will actually be dead code.  I think that either dev_is_pf()
should be fixed to be accurate when !CONFIG_PCI_IOV, or the attributes
should be completely removed if !CONFIG_PCI_IOV.

Ben.
Don Dutile - Oct. 26, 2012, 3:07 p.m.
On 10/25/2012 04:17 PM, Ben Hutchings wrote:
> On Thu, 2012-10-25 at 14:38 -0400, Donald Dutile wrote:
> [...]
>> --- a/drivers/pci/pci-sysfs.c
>> +++ b/drivers/pci/pci-sysfs.c
>> @@ -14,6 +14,7 @@
>>    *
>>    */
>>
>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* has to precede printk.h */
>>
>>   #include<linux/kernel.h>
>>   #include<linux/sched.h>
>> @@ -66,7 +67,7 @@ static ssize_t broken_parity_status_store(struct device *dev,
>>   	struct pci_dev *pdev = to_pci_dev(dev);
>>   	unsigned long val;
>>
>> -	if (strict_strtoul(buf, 0,&val)<  0)
>> +	if (kstrtoul(buf, 0,&val)<  0)
>>   		return -EINVAL;
>>
>>   	pdev->broken_parity_status = !!val;
>> @@ -188,7 +189,7 @@ static ssize_t is_enabled_store(struct device *dev,
>>   {
>>   	struct pci_dev *pdev = to_pci_dev(dev);
>>   	unsigned long val;
>> -	ssize_t result = strict_strtoul(buf, 0,&val);
>> +	ssize_t result = kstrtoul(buf, 0,&val);
>>
>>   	if (result<  0)
>>   		return result;
>> @@ -259,7 +260,7 @@ msi_bus_store(struct device *dev, struct device_attribute *attr,
>>   	struct pci_dev *pdev = to_pci_dev(dev);
>>   	unsigned long val;
>>
>> -	if (strict_strtoul(buf, 0,&val)<  0)
>> +	if (kstrtoul(buf, 0,&val)<  0)
>>   		return -EINVAL;
>>
>>   	/* bad things may happen if the no_msi flag is changed
>> @@ -292,7 +293,7 @@ static ssize_t bus_rescan_store(struct bus_type *bus, const char *buf,
>>   	unsigned long val;
>>   	struct pci_bus *b = NULL;
>>
>> -	if (strict_strtoul(buf, 0,&val)<  0)
>> +	if (kstrtoul(buf, 0,&val)<  0)
>>   		return -EINVAL;
>>
>>   	if (val) {
>> @@ -316,7 +317,7 @@ dev_rescan_store(struct device *dev, struct device_attribute *attr,
>>   	unsigned long val;
>>   	struct pci_dev *pdev = to_pci_dev(dev);
>>
>> -	if (strict_strtoul(buf, 0,&val)<  0)
>> +	if (kstrtoul(buf, 0,&val)<  0)
>>   		return -EINVAL;
>>
>>   	if (val) {
>> @@ -343,7 +344,7 @@ remove_store(struct device *dev, struct device_attribute *dummy,
>>   	int ret = 0;
>>   	unsigned long val;
>>
>> -	if (strict_strtoul(buf, 0,&val)<  0)
>> +	if (kstrtoul(buf, 0,&val)<  0)
>>   		return -EINVAL;
>>
>>   	/* An attribute cannot be unregistered by one of its own methods,
>> @@ -363,7 +364,7 @@ dev_bus_rescan_store(struct device *dev, struct device_attribute *attr,
>>   	unsigned long val;
>>   	struct pci_bus *bus = to_pci_bus(dev);
>>
>> -	if (strict_strtoul(buf, 0,&val)<  0)
>> +	if (kstrtoul(buf, 0,&val)<  0)
>>   		return -EINVAL;
>>
>>   	if (val) {
>> @@ -387,7 +388,7 @@ static ssize_t d3cold_allowed_store(struct device *dev,
>>   	struct pci_dev *pdev = to_pci_dev(dev);
>>   	unsigned long val;
>>
>> -	if (strict_strtoul(buf, 0,&val)<  0)
>> +	if (kstrtoul(buf, 0,&val)<  0)
>>   		return -EINVAL;
>>
>>   	pdev->d3cold_allowed = !!val;
>
> All this cleanup belongs in a separate patch.
Yes.  Multiple people were getting anxious to see this V3, so I wanted
to get it out for review of its direction.
I did a checkpatch.pl run after I posted... ugh! what a mess!
you note a number of the issues below.

>
>> @@ -404,6 +405,114 @@ static ssize_t d3cold_allowed_show(struct device *dev,
>>   }
>>   #endif
>>
>> +#ifdef CONFIG_PCI_IOV
>> +static ssize_t sriov_totalvfs_show(struct device *dev,
>> +                                  struct device_attribute *attr,
>> +                                  char *buf)
>> +{
>> +	struct pci_dev *pdev;
>> +	u16 total;
>> +
>> +	pdev = to_pci_dev(dev);
>> +	total = pdev->sriov->total;
>> +	return sprintf (buf, "%u\n", total);
>
> No space after sprintf, please.
>
yep, checkpatch...

>> +}
>> +
>> +
>> +static ssize_t sriov_numvfs_show(struct device *dev,
>> +                                 struct device_attribute *attr,
>> +                                 char *buf)
>> +{
>> +	struct pci_dev *pdev;
>> +
>> +	pdev = to_pci_dev(dev);
>> +
>> +	return sprintf (buf, "%u\n", pdev->sriov->nr_virtfn);
>> +}
>> +
>> +/*
>> + * num_vfs>  0; number of vfs to enable
>> + * num_vfs = 0; disable all vfs
>> + *
>> + * Note: SRIOV spec doesn't allow partial VF
>> + *       disable, so its all or none.
>> + */
>> +static ssize_t sriov_numvfs_store(struct device *dev,
>> +                                  struct device_attribute *attr,
>> +                                  const char *buf, size_t count)
>> +{
>> +	struct pci_dev *pdev;
>> +	int num_vfs_enabled = 0;
>> +	int num_vfs;
>> +	int ret = 0;
>> +	u16 total;
>> +
>> +	pdev = to_pci_dev(dev);
>> +
>> +	/* Requested VFs to enable<  totalvfs
>> +	 * and none enabled already
>> +	 */
>> +	if (kstrtoint(buf, 0,&num_vfs)<  0)
>> +		return -EINVAL;
>
> The above comment belongs with
> 'if ((num_vfs>  0)&&  (num_vfs<= total))'.
>
yes, forgot to move it. thanks!

>> +	/* is PF driver loaded w/callback */
>> +	if (!pdev->driver || !pdev->driver->sriov_configure) {
>> +		pr_info("Driver doesn't support sriov configuration via sysfs \n");
>> +		return -ENOSYS;
>
> Use dev_err() to set the proper severity and provide context.  Not sure
> whether this is the right error code.
>
well, i used pr_info() b/c that was what I was requested to do
when I did similar cleanup in the dmar.c file recently.  I
can change to dev_*() formatting.

>> +	}
>> +
>> +	/* if enabling vf's ... */
>> +	total = pdev->sriov->total;
>> +	if ((num_vfs>  0)&&  (num_vfs<= total)) {
>> +		if (pdev->sriov->nr_virtfn == 0) { /* if not already enabled */
>> +		    num_vfs_enabled = pdev->driver->sriov_configure(pdev, num_vfs);
>> +        	    if ((num_vfs_enabled>= 0)&&  (num_vfs_enabled != num_vfs))
>> +			pr_warn("%s: %04x:%02x:%02x.%d: Only %d VFs enabled \n",
>> +                        	pdev->dev.driver->name, pci_domain_nr(pdev->bus),
>> +                        	pdev->bus->number, PCI_SLOT(pdev->devfn),
>> +                        	PCI_FUNC(pdev->devfn), num_vfs_enabled);
>
> Use dev_warn(), don't try to replicate it.
>
ditto.

>> +			return count;
>> +		} else {
>> +			pr_warn("VFs already enabled. Disable before re-enabling\n");
>> +			return -EINVAL;
>
> It would be helpful to make an 'enable' write succeed when
> num_vfs == pdev->sriov->nr_virtfn.
>
ah, functional feedback!
yes, setting numvfs == existing value could reply success,
but I had it this way since it could show a programming error
with a mgmt tool that is trying to do the enable when it
should not, or already had, and a successful return may be interpreted
as a correctness in the mgmt app's logic. .... I could go either way.
I'll wait for other feedback on this to see which way the
group wants to have it.

>> +		}
>> +	}
>> +
>> +	/* disable vfs */
>> +	if (num_vfs == 0) {
>> +		if (pdev->sriov->nr_virtfn != 0) {
>> +			ret = pdev->driver->sriov_configure(pdev, 0);
>> +			return ret ? ret: count;
>> +		} else {
>> +			pr_err("Disable failed since no VFs enabled\n");
>> +			return -EINVAL;
>
> This definitely should be treated as successful, not an error.
>
Same comment as above.  it could be considered successful,
or it could be considered a rtn for programming logic error
of a mgmt app. again, I could go either way. ditto above.

>> +		}
>> +	}
>> +
>> +	pr_err("Invalid value for number of VFs to enable: %d\n", num_vfs);
>> +
>> +	return -EINVAL;
>> +}
>> +#else
>> +static ssize_t sriov_totalvfs_show(struct device *dev,
>> +                                   struct device_attribute *attr,
>> +                                   char *buf)
>> +{ return 0; }
>> +static ssize_t sriov_numvfs_show(struct device *dev,
>> +			         struct device_attribute *attr,
>> +			         char *buf)
>> +{ return 0; }
>
> Shouldn't these print "0\n"?
>
agree.

>> +static ssize_t sriov_numvfs_store(struct device *dev,
>> +				  struct device_attribute *attr,
>> +				  const char *buf, size_t count)
>> +{ return count; }
>> +#endif /* CONFIG_PCI_IOV */
>> +
>> +static struct device_attribute sriov_totalvfs_attr = __ATTR_RO(sriov_totalvfs);
>> +static struct device_attribute sriov_numvfs_attr =
>> +		__ATTR(sriov_numvfs, (S_IRUGO|S_IWUSR|S_IWGRP),
>> +                	sriov_numvfs_show, sriov_numvfs_store);
>
> sriov_numvfs should be read-only if !CONFIG_PCI_IOV, rather than
> ignoring writes.
>
If !CONFIG_PCI_IOV, none of these attributes are visible,
so they won't exist.

>>   struct device_attribute pci_dev_attrs[] = {
>>   	__ATTR_RO(resource),
>>   	__ATTR_RO(vendor),
>> @@ -1194,7 +1303,7 @@ static ssize_t reset_store(struct device *dev,
>>   {
>>   	struct pci_dev *pdev = to_pci_dev(dev);
>>   	unsigned long val;
>> -	ssize_t result = strict_strtoul(buf, 0,&val);
>> +	ssize_t result = kstrtoul(buf, 0,&val);
>>
>>   	if (result<  0)
>>   		return result;
>> @@ -1408,8 +1517,14 @@ static struct attribute *pci_dev_dev_attrs[] = {
>>   	NULL,
>>   };
>>
>> +static struct attribute *sriov_dev_attrs[] = {
>> +&sriov_totalvfs_attr.attr,
>> +&sriov_numvfs_attr.attr,
>
> These are wrongly indented with spaces.
>
checkpatch!

>> +	NULL,
>> +};
>> +
>>   static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
>> -						struct attribute *a, int n)
>> +					 struct attribute *a, int n)
>>   {
>>   	struct device *dev = container_of(kobj, struct device, kobj);
>>   	struct pci_dev *pdev = to_pci_dev(dev);
>> @@ -1421,13 +1536,33 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
>>   	return a->mode;
>>   }
>>
>> +static umode_t sriov_attrs_are_visible(struct kobject *kobj,
>> +					 struct attribute *a, int n)
>> +{
>> +	struct device *dev = container_of(kobj, struct device, kobj);
>> +
>> +	if ((a ==&sriov_totalvfs_attr.attr) ||
>> +	    (a ==&sriov_numvfs_attr.attr)) {
>> +		if (!dev_is_pf(dev))
>> +			return 0;
> [...]
>
> An odd thing about dev_is_pf() is that if !CONFIG_PCI_IOV then it is
> false for all devices.  Which means that the dummy attribute reader
> functions will actually be dead code.  I think that either dev_is_pf()
> should be fixed to be accurate when !CONFIG_PCI_IOV, or the attributes
> should be completely removed if !CONFIG_PCI_IOV.
>
> Ben.
>
Since it's all in the same file, I could wrap the following code with
CONFIG_PCI_IOV:

static const struct attribute_group *pci_dev_attr_groups[] = {
         &pci_dev_attr_group,
#ifdef CONFIG_PCI_IOV
         &sriov_dev_attr_group,
#endif
         NULL,
};

and then the dummy attribute function could be removed.
another good cleanup, thanks!

Thanks for the thorough review of the above!


Ok, my turn:
Any feedback on having the sysfs configure call pci_sriov_[enable/disable](),
as well as do the don't-disable if VFs are assigned to guests?


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rose, Gregory V - Oct. 31, 2012, 5:01 p.m.
PiAtLS0tLU9yaWdpbmFsIE1lc3NhZ2UtLS0tLQ0KPiBGcm9tOiBEb24gRHV0aWxlIFttYWlsdG86
ZGR1dGlsZUByZWRoYXQuY29tXQ0KPiBTZW50OiBGcmlkYXksIE9jdG9iZXIgMjYsIDIwMTIgODow
NyBBTQ0KPiBUbzogQmVuIEh1dGNoaW5ncw0KPiBDYzogbGludXgtcGNpQHZnZXIua2VybmVsLm9y
ZzsgYmhlbGdhYXNAZ29vZ2xlLmNvbTsgeXV2YWxtaW5AYnJvYWRjb20uY29tOw0KPiBSb3NlLCBH
cmVnb3J5IFY7IHlpbmdoYWlAa2VybmVsLm9yZzsgZGF2ZW1AZGF2ZW1sb2Z0Lm5ldA0KPiBTdWJq
ZWN0OiBSZTogW1BBVENIIDMvOF0gUENJOiBzeXNmcyBwZXIgZGV2aWNlIFNSSU9WIGNvbnRyb2wg
YW5kIHN0YXR1cw0KPiANCiANCj4gDQo+IE9rLCBteSB0dXJuOg0KPiBBbnkgZmVlZGJhY2sgb24g
aGF2aW5nIHRoZSBzeXNmcyBjb25maWd1cmUgY2FsbA0KPiBwY2lfc3Jpb3ZfW2VuYWJsZS9kaXNh
YmxlXSgpLCBhcyB3ZWxsIGFzIGRvIHRoZSBkb24ndC1kaXNhYmxlIGlmIFZGcyBhcmUNCj4gYXNz
aWduZWQgdG8gZ3Vlc3RzPw0KPiANCg0KRG9uLA0KDQpBcyBJJ3ZlIG1lbnRpb25lZCBiZWZvcmUg
SSBzdGlsbCBwcmVmZXIgdG8gaGF2ZSB0aGUgc3lzZnMgaW50ZXJmYWNlIHlvdSd2ZSB3cml0dGVu
IHVwIG1ha2UgdGhlIGNhbGxzIHRvIHBjaV9zcmlvdl9lbmFibGUvZGlzYWJsZSgpIGFuZCBoYXZl
IHRoZSBjaGVja2luZyBmb3Igd2hldGhlciB0aGUgVkZzIGFyZSBhc3NpZ25lZCB0byBndWVzdHMg
ZG9uZSB0aGVyZSBhbHNvLCBidXQgcmVhbGx5IGl0IGlzbid0IGFueXRoaW5nIHdvcnRoIGdvaW5n
IHRvIHRoZSBtYXRzIGFib3V0LiAgQXMgaXQgc3RhbmRzIEkgdGhpbmsgaWYgeW91IGFkZHJlc3Mg
dGhlIGlzc3VlcyBicm91Z2h0IHVwIGJ5IEJlbiB0aGVuIEknbSBmaW5lIHdpdGggd2hhdCB5b3Un
dmUgd29ya2VkIHVwIHNvIGZhci4gIFNpbmNlIG5vIG9uZSBlbHNlIHNlZW1zIHRvIGhhdmUgYW4g
b3BpbmlvbiBhYm91dCBpdCAoYXMgZGVtb25zdHJhdGVkIGJ5IGEgbGFjayBvZiByZXNwb25zZSBv
dmVyIHRoZSBsYXN0IDUgZGF5cykgdGhlbiBJJ2Qgc3VnZ2VzdCB3ZSBnbyBmb3J3YXJkIHdpdGgg
dGhlIGN1cnJlbnQgaW1wbGVtZW50YXRpb24uICBJJ2QgcmVhbGx5IGxpa2UgdG8gc2VlIHRoaXMg
aW4gMy44IGlmIHBvc3NpYmxlLg0KDQpUaGFua3MgZm9yIGFsbCB5b3VyIHdvcmsuDQoNCi0gR3Jl
Zw0K
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ben Hutchings - Oct. 31, 2012, 5:36 p.m.
On Wed, 2012-10-31 at 17:01 +0000, Rose, Gregory V wrote:
> > -----Original Message-----
> > From: Don Dutile [mailto:ddutile@redhat.com]
> > Sent: Friday, October 26, 2012 8:07 AM
> > To: Ben Hutchings
> > Cc: linux-pci@vger.kernel.org; bhelgaas@google.com; yuvalmin@broadcom.com;
> > Rose, Gregory V; yinghai@kernel.org; davem@davemloft.net
> > Subject: Re: [PATCH 3/8] PCI: sysfs per device SRIOV control and status
> > 
>  
> > 
> > Ok, my turn:
> > Any feedback on having the sysfs configure call
> > pci_sriov_[enable/disable](), as well as do the don't-disable if VFs are
> > assigned to guests?
> > 
> 
> Don,
> 
> As I've mentioned before I still prefer to have the sysfs interface
> you've written up make the calls to pci_sriov_enable/disable()

I think that would work for sfc, assuming the driver function is to be
called before either of those.  I don't know whether it would work for
any of the other drivers with SR-IOV back-ends, though.

> and have the checking for whether the VFs are assigned to guests done
> there also,

I agree that this is should be centralised, though I think that could be
done as a later step without too much pain.

> but really it isn't anything worth going to the mats about.  As it
> stands I think if you address the issues brought up by Ben then I'm
> fine with what you've worked up so far.  Since no one else seems to
> have an opinion about it (as demonstrated by a lack of response over
> the last 5 days) then I'd suggest we go forward with the current
> implementation.  I'd really like to see this in 3.8 if possible.
> 
> Thanks for all your work.

Agreed, thanks Don.

Ben.
Don Dutile - Oct. 31, 2012, 6:18 p.m.
On 10/31/2012 01:36 PM, Ben Hutchings wrote:
> On Wed, 2012-10-31 at 17:01 +0000, Rose, Gregory V wrote:
>>> -----Original Message-----
>>> From: Don Dutile [mailto:ddutile@redhat.com]
>>> Sent: Friday, October 26, 2012 8:07 AM
>>> To: Ben Hutchings
>>> Cc: linux-pci@vger.kernel.org; bhelgaas@google.com; yuvalmin@broadcom.com;
>>> Rose, Gregory V; yinghai@kernel.org; davem@davemloft.net
>>> Subject: Re: [PATCH 3/8] PCI: sysfs per device SRIOV control and status
>>>
>>
>>>
>>> Ok, my turn:
>>> Any feedback on having the sysfs configure call
>>> pci_sriov_[enable/disable](), as well as do the don't-disable if VFs are
>>> assigned to guests?
>>>
>>
>> Don,
>>
>> As I've mentioned before I still prefer to have the sysfs interface
>> you've written up make the calls to pci_sriov_enable/disable()
>
> I think that would work for sfc, assuming the driver function is to be
> called before either of those.  I don't know whether it would work for
> any of the other drivers with SR-IOV back-ends, though.
>
>> and have the checking for whether the VFs are assigned to guests done
>> there also,
>
> I agree that this is should be centralised, though I think that could be
> done as a later step without too much pain.
>
>> but really it isn't anything worth going to the mats about.  As it
>> stands I think if you address the issues brought up by Ben then I'm
>> fine with what you've worked up so far.  Since no one else seems to
>> have an opinion about it (as demonstrated by a lack of response over
>> the last 5 days) then I'd suggest we go forward with the current
>> implementation.  I'd really like to see this in 3.8 if possible.
>>
>> Thanks for all your work.
>
> Agreed, thanks Don.
>
> Ben.
>
Greg & Ben,
Thanks for your feedback on the core doing the sriov_enable/disable.
I agree that let's get existing the core stuff into 3.8 & do the enable/disable as a follow up.
I'm just polishing the patch set now based on Ben's feedback....
I was delayed when my test machine was (improperly) taken from me (bad IT, bad! :-( ).
Just got the machine back today to test the various (error) cases,
and make checkpatch.pl happy.  I expect to post a PATCH (non-RFC) later today.
- Don
ps -- of course, if Greg wants to give me a set of igb patches like the ones he did
       for ixgbe, it'd get me out of the test machine battles -- I have private
       test machine w/igb's. :)  hint, hint...ok, more like 'beg, beg' ... again... ;)

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rose, Gregory V - Oct. 31, 2012, 6:25 p.m.
> -----Original Message-----

> From: Don Dutile [mailto:ddutile@redhat.com]

> Sent: Wednesday, October 31, 2012 11:19 AM

> To: Ben Hutchings

> Cc: Rose, Gregory V; linux-pci@vger.kernel.org; bhelgaas@google.com;

> yuvalmin@broadcom.com; yinghai@kernel.org; davem@davemloft.net

> Subject: Re: [PATCH 3/8] PCI: sysfs per device SRIOV control and status

> 

> On 10/31/2012 01:36 PM, Ben Hutchings wrote:

> > On Wed, 2012-10-31 at 17:01 +0000, Rose, Gregory V wrote:

> >>> -----Original Message-----

> >>> From: Don Dutile [mailto:ddutile@redhat.com]

> >>> Sent: Friday, October 26, 2012 8:07 AM

> >>> To: Ben Hutchings

> >>> Cc: linux-pci@vger.kernel.org; bhelgaas@google.com;

> >>> yuvalmin@broadcom.com; Rose, Gregory V; yinghai@kernel.org;

> >>> davem@davemloft.net

> >>> Subject: Re: [PATCH 3/8] PCI: sysfs per device SRIOV control and

> >>> status

> >>>

> >>

> >>>

> >>> Ok, my turn:

> >>> Any feedback on having the sysfs configure call

> >>> pci_sriov_[enable/disable](), as well as do the don't-disable if VFs

> >>> are assigned to guests?

> >>>

> >>

> >> Don,

> >>

> >> As I've mentioned before I still prefer to have the sysfs interface

> >> you've written up make the calls to pci_sriov_enable/disable()

> >

> > I think that would work for sfc, assuming the driver function is to be

> > called before either of those.  I don't know whether it would work for

> > any of the other drivers with SR-IOV back-ends, though.

> >

> >> and have the checking for whether the VFs are assigned to guests done

> >> there also,

> >

> > I agree that this is should be centralised, though I think that could

> > be done as a later step without too much pain.

> >

> >> but really it isn't anything worth going to the mats about.  As it

> >> stands I think if you address the issues brought up by Ben then I'm

> >> fine with what you've worked up so far.  Since no one else seems to

> >> have an opinion about it (as demonstrated by a lack of response over

> >> the last 5 days) then I'd suggest we go forward with the current

> >> implementation.  I'd really like to see this in 3.8 if possible.

> >>

> >> Thanks for all your work.

> >

> > Agreed, thanks Don.

> >

> > Ben.

> >

> Greg & Ben,

> Thanks for your feedback on the core doing the sriov_enable/disable.

> I agree that let's get existing the core stuff into 3.8 & do the

> enable/disable as a follow up.

> I'm just polishing the patch set now based on Ben's feedback....

> I was delayed when my test machine was (improperly) taken from me (bad IT,

> bad! :-( ).

> Just got the machine back today to test the various (error) cases, and

> make checkpatch.pl happy.  I expect to post a PATCH (non-RFC) later today.

> - Don

> ps -- of course, if Greg wants to give me a set of igb patches like the

> ones he did

>        for ixgbe, it'd get me out of the test machine battles -- I have

> private

>        test machine w/igb's. :)  hint, hint...ok, more like 'beg, beg' ...

> again... ;)


I'll see what I can do.

:)

- Greg

Patch

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index fbbb97f..c2894ca 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -14,6 +14,7 @@ 
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* has to precede printk.h */
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -66,7 +67,7 @@  static ssize_t broken_parity_status_store(struct device *dev,
 	struct pci_dev *pdev = to_pci_dev(dev);
 	unsigned long val;
 
-	if (strict_strtoul(buf, 0, &val) < 0)
+	if (kstrtoul(buf, 0, &val) < 0)
 		return -EINVAL;
 
 	pdev->broken_parity_status = !!val;
@@ -188,7 +189,7 @@  static ssize_t is_enabled_store(struct device *dev,
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	unsigned long val;
-	ssize_t result = strict_strtoul(buf, 0, &val);
+	ssize_t result = kstrtoul(buf, 0, &val);
 
 	if (result < 0)
 		return result;
@@ -259,7 +260,7 @@  msi_bus_store(struct device *dev, struct device_attribute *attr,
 	struct pci_dev *pdev = to_pci_dev(dev);
 	unsigned long val;
 
-	if (strict_strtoul(buf, 0, &val) < 0)
+	if (kstrtoul(buf, 0, &val) < 0)
 		return -EINVAL;
 
 	/* bad things may happen if the no_msi flag is changed
@@ -292,7 +293,7 @@  static ssize_t bus_rescan_store(struct bus_type *bus, const char *buf,
 	unsigned long val;
 	struct pci_bus *b = NULL;
 
-	if (strict_strtoul(buf, 0, &val) < 0)
+	if (kstrtoul(buf, 0, &val) < 0)
 		return -EINVAL;
 
 	if (val) {
@@ -316,7 +317,7 @@  dev_rescan_store(struct device *dev, struct device_attribute *attr,
 	unsigned long val;
 	struct pci_dev *pdev = to_pci_dev(dev);
 
-	if (strict_strtoul(buf, 0, &val) < 0)
+	if (kstrtoul(buf, 0, &val) < 0)
 		return -EINVAL;
 
 	if (val) {
@@ -343,7 +344,7 @@  remove_store(struct device *dev, struct device_attribute *dummy,
 	int ret = 0;
 	unsigned long val;
 
-	if (strict_strtoul(buf, 0, &val) < 0)
+	if (kstrtoul(buf, 0, &val) < 0)
 		return -EINVAL;
 
 	/* An attribute cannot be unregistered by one of its own methods,
@@ -363,7 +364,7 @@  dev_bus_rescan_store(struct device *dev, struct device_attribute *attr,
 	unsigned long val;
 	struct pci_bus *bus = to_pci_bus(dev);
 
-	if (strict_strtoul(buf, 0, &val) < 0)
+	if (kstrtoul(buf, 0, &val) < 0)
 		return -EINVAL;
 
 	if (val) {
@@ -387,7 +388,7 @@  static ssize_t d3cold_allowed_store(struct device *dev,
 	struct pci_dev *pdev = to_pci_dev(dev);
 	unsigned long val;
 
-	if (strict_strtoul(buf, 0, &val) < 0)
+	if (kstrtoul(buf, 0, &val) < 0)
 		return -EINVAL;
 
 	pdev->d3cold_allowed = !!val;
@@ -404,6 +405,114 @@  static ssize_t d3cold_allowed_show(struct device *dev,
 }
 #endif
 
+#ifdef CONFIG_PCI_IOV
+static ssize_t sriov_totalvfs_show(struct device *dev,
+                                  struct device_attribute *attr,
+                                  char *buf)
+{
+	struct pci_dev *pdev;
+	u16 total;
+
+	pdev = to_pci_dev(dev);
+	total = pdev->sriov->total;
+	return sprintf (buf, "%u\n", total);
+}
+
+
+static ssize_t sriov_numvfs_show(struct device *dev,
+                                 struct device_attribute *attr,
+                                 char *buf)
+{
+	struct pci_dev *pdev;
+
+	pdev = to_pci_dev(dev);
+
+	return sprintf (buf, "%u\n", pdev->sriov->nr_virtfn);
+}
+
+/* 
+ * num_vfs > 0; number of vfs to enable
+ * num_vfs = 0; disable all vfs
+ *
+ * Note: SRIOV spec doesn't allow partial VF
+ *       disable, so its all or none.
+ */
+static ssize_t sriov_numvfs_store(struct device *dev,
+                                  struct device_attribute *attr,
+                                  const char *buf, size_t count)
+{
+	struct pci_dev *pdev;
+	int num_vfs_enabled = 0;
+	int num_vfs;
+	int ret = 0;
+	u16 total;
+
+	pdev = to_pci_dev(dev);
+
+	/* Requested VFs to enable < totalvfs
+	 * and none enabled already
+	 */
+	if (kstrtoint(buf, 0, &num_vfs) < 0)
+		return -EINVAL;
+
+	/* is PF driver loaded w/callback */
+	if (!pdev->driver || !pdev->driver->sriov_configure) {
+		pr_info("Driver doesn't support sriov configuration via sysfs \n");
+		return -ENOSYS;
+	}
+
+	/* if enabling vf's ... */
+	total = pdev->sriov->total;
+	if ((num_vfs > 0) && (num_vfs <= total)) {
+		if (pdev->sriov->nr_virtfn == 0) { /* if not already enabled */
+		    num_vfs_enabled = pdev->driver->sriov_configure(pdev, num_vfs);
+        	    if ((num_vfs_enabled >= 0) && (num_vfs_enabled != num_vfs))
+			pr_warn("%s: %04x:%02x:%02x.%d: Only %d VFs enabled \n",
+                        	pdev->dev.driver->name, pci_domain_nr(pdev->bus),
+                        	pdev->bus->number, PCI_SLOT(pdev->devfn),
+                        	PCI_FUNC(pdev->devfn), num_vfs_enabled);
+			return count;
+		} else {
+			pr_warn("VFs already enabled. Disable before re-enabling\n");
+			return -EINVAL;
+		}
+	} 
+
+	/* disable vfs */
+	if (num_vfs == 0) {
+		if (pdev->sriov->nr_virtfn != 0) {
+			ret = pdev->driver->sriov_configure(pdev, 0);
+			return ret ? ret: count;
+		} else {
+			pr_err("Disable failed since no VFs enabled\n");
+			return -EINVAL;
+		}
+	}
+
+	pr_err("Invalid value for number of VFs to enable: %d\n", num_vfs);
+
+	return -EINVAL;
+}
+#else
+static ssize_t sriov_totalvfs_show(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *buf)
+{ return 0; }
+static ssize_t sriov_numvfs_show(struct device *dev,
+			         struct device_attribute *attr,
+			         char *buf)
+{ return 0; }
+static ssize_t sriov_numvfs_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{ return count; }
+#endif /* CONFIG_PCI_IOV */
+
+static struct device_attribute sriov_totalvfs_attr = __ATTR_RO(sriov_totalvfs);
+static struct device_attribute sriov_numvfs_attr = 
+		__ATTR(sriov_numvfs, (S_IRUGO|S_IWUSR|S_IWGRP),
+                	sriov_numvfs_show, sriov_numvfs_store);
+
 struct device_attribute pci_dev_attrs[] = {
 	__ATTR_RO(resource),
 	__ATTR_RO(vendor),
@@ -1194,7 +1303,7 @@  static ssize_t reset_store(struct device *dev,
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	unsigned long val;
-	ssize_t result = strict_strtoul(buf, 0, &val);
+	ssize_t result = kstrtoul(buf, 0, &val);
 
 	if (result < 0)
 		return result;
@@ -1408,8 +1517,14 @@  static struct attribute *pci_dev_dev_attrs[] = {
 	NULL,
 };
 
+static struct attribute *sriov_dev_attrs[] = {
+        &sriov_totalvfs_attr.attr,
+        &sriov_numvfs_attr.attr,
+	NULL,
+};
+
 static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
-						struct attribute *a, int n)
+					 struct attribute *a, int n)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct pci_dev *pdev = to_pci_dev(dev);
@@ -1421,13 +1536,33 @@  static umode_t pci_dev_attrs_are_visible(struct kobject *kobj,
 	return a->mode;
 }
 
+static umode_t sriov_attrs_are_visible(struct kobject *kobj,
+					 struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+
+	if ((a == &sriov_totalvfs_attr.attr) ||
+	    (a == &sriov_numvfs_attr.attr)) {
+		if (!dev_is_pf(dev))
+			return 0;
+	}
+
+	return a->mode;
+}
+
 static struct attribute_group pci_dev_attr_group = {
 	.attrs = pci_dev_dev_attrs,
 	.is_visible = pci_dev_attrs_are_visible,
 };
 
+static struct attribute_group sriov_dev_attr_group = {
+	.attrs = sriov_dev_attrs,
+	.is_visible = sriov_attrs_are_visible,
+};
+
 static const struct attribute_group *pci_dev_attr_groups[] = {
 	&pci_dev_attr_group,
+	&sriov_dev_attr_group,
 	NULL,
 };
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index be1de01..1d60a23 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -595,6 +595,7 @@  struct pci_driver {
 	int  (*resume_early) (struct pci_dev *dev);
 	int  (*resume) (struct pci_dev *dev);	                /* Device woken up */
 	void (*shutdown) (struct pci_dev *dev);
+	int (*sriov_configure) (struct pci_dev *dev, int num_vfs);    /* PF pci dev */
 	const struct pci_error_handlers *err_handler;
 	struct device_driver	driver;
 	struct pci_dynids dynids;