diff mbox series

[v2,5/7] vfio/pci: Add sriov_configure support

Message ID 158213846731.17090.37693075723046377.stgit@gimli.home
State New
Headers show
Series vfio/pci: SR-IOV support | expand

Commit Message

Alex Williamson Feb. 19, 2020, 6:54 p.m. UTC
With the VF Token interface we can now expect that a vfio userspace
driver must be in collaboration with the PF driver, an unwitting
userspace driver will not be able to get past the GET_DEVICE_FD step
in accessing the device.  We can now move on to actually allowing
SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
enabled by default in this commit, but it does provide a module option
for this to be enabled (enable_sriov=1).  Enabling VFs is rather
straightforward, except we don't want to risk that a VF might get
autoprobed and bound to other drivers, so a bus notifier is used to
"capture" VFs to vfio-pci using the driver_override support.  We
assume any later action to bind the device to other drivers is
condoned by the system admin and allow it with a log warning.

vfio-pci will disable SR-IOV on a PF before releasing the device,
allowing a VF driver to be assured other drivers cannot take over the
PF and that any other userspace driver must know the shared VF token.
This support also does not provide a mechanism for the PF userspace
driver itself to manipulate SR-IOV through the vfio API.  With this
patch SR-IOV can only be enabled via the host sysfs interface and the
PF driver user cannot create or remove VFs.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci.c         |  106 +++++++++++++++++++++++++++++++----
 drivers/vfio/pci/vfio_pci_private.h |    2 +
 2 files changed, 97 insertions(+), 11 deletions(-)

Comments

Tian, Kevin Feb. 25, 2020, 3:08 a.m. UTC | #1
> From: Alex Williamson
> Sent: Thursday, February 20, 2020 2:54 AM
> 
> With the VF Token interface we can now expect that a vfio userspace
> driver must be in collaboration with the PF driver, an unwitting
> userspace driver will not be able to get past the GET_DEVICE_FD step
> in accessing the device.  We can now move on to actually allowing
> SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> enabled by default in this commit, but it does provide a module option
> for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> straightforward, except we don't want to risk that a VF might get
> autoprobed and bound to other drivers, so a bus notifier is used to
> "capture" VFs to vfio-pci using the driver_override support.  We
> assume any later action to bind the device to other drivers is
> condoned by the system admin and allow it with a log warning.
> 
> vfio-pci will disable SR-IOV on a PF before releasing the device,
> allowing a VF driver to be assured other drivers cannot take over the
> PF and that any other userspace driver must know the shared VF token.
> This support also does not provide a mechanism for the PF userspace
> driver itself to manipulate SR-IOV through the vfio API.  With this
> patch SR-IOV can only be enabled via the host sysfs interface and the
> PF driver user cannot create or remove VFs.

I'm not sure how many devices can be properly configured simply
with pci_enable_sriov. It is not unusual to require the PF driver to
prepare something before turning on the PCI SR-IOV capability. If you
look at kernel PF drivers, there are only two using the generic
pci_sriov_configure_simple (a simple wrapper around pci_enable_sriov),
while most others implement their own callback. However vfio itself
has no idea, thus I'm not sure how a user knows whether using this
option can actually meet his purpose. I may be missing something here;
possibly using DPDK as an example will make it clearer.

> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>  drivers/vfio/pci/vfio_pci.c         |  106 +++++++++++++++++++++++++++++++--
> --
>  drivers/vfio/pci/vfio_pci_private.h |    2 +
>  2 files changed, 97 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index e4d5d26e5e71..b40ade48a844 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -54,6 +54,12 @@ module_param(disable_idle_d3, bool, S_IRUGO |
> S_IWUSR);
>  MODULE_PARM_DESC(disable_idle_d3,
>  		 "Disable using the PCI D3 low power state for idle, unused
> devices");
> 
> +static bool enable_sriov;
> +#ifdef CONFIG_PCI_IOV
> +module_param(enable_sriov, bool, 0644);
> +MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV
> configuration");
> +#endif
> +
>  static inline bool vfio_vga_disabled(void)
>  {
>  #ifdef CONFIG_VFIO_PCI_VGA
> @@ -1528,6 +1534,35 @@ static const struct vfio_device_ops vfio_pci_ops =
> {
> 
>  static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
>  static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
> +static struct pci_driver vfio_pci_driver;
> +
> +static int vfio_pci_bus_notifier(struct notifier_block *nb,
> +				 unsigned long action, void *data)
> +{
> +	struct vfio_pci_device *vdev = container_of(nb,
> +						    struct vfio_pci_device, nb);
> +	struct device *dev = data;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct pci_dev *physfn = pci_physfn(pdev);
> +
> +	if (action == BUS_NOTIFY_ADD_DEVICE &&
> +	    pdev->is_virtfn && physfn == vdev->pdev) {
> +		pci_info(vdev->pdev, "Captured SR-IOV VF %s
> driver_override\n",
> +			 pci_name(pdev));
> +		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
> +						  vfio_pci_ops.name);
> +	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
> +		   pdev->is_virtfn && physfn == vdev->pdev) {
> +		struct pci_driver *drv = pci_dev_driver(pdev);
> +
> +		if (drv && drv != &vfio_pci_driver)
> +			pci_warn(vdev->pdev,
> +				 "VF %s bound to driver %s while PF bound to
> vfio-pci\n",
> +				 pci_name(pdev), drv->name);
> +	}
> +
> +	return 0;
> +}
> 
>  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
> @@ -1539,12 +1574,12 @@ static int vfio_pci_probe(struct pci_dev *pdev,
> const struct pci_device_id *id)
>  		return -EINVAL;
> 
>  	/*
> -	 * Prevent binding to PFs with VFs enabled, this too easily allows
> -	 * userspace instance with VFs and PFs from the same device, which
> -	 * cannot work.  Disabling SR-IOV here would initiate removing the
> -	 * VFs, which would unbind the driver, which is prone to blocking
> -	 * if that VF is also in use by vfio-pci.  Just reject these PFs
> -	 * and let the user sort it out.
> +	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
> +	 * by the host or other users.  We cannot capture the VFs if they
> +	 * already exist, nor can we track VF users.  Disabling SR-IOV here
> +	 * would initiate removing the VFs, which would unbind the driver,
> +	 * which is prone to blocking if that VF is also in use by vfio-pci.
> +	 * Just reject these PFs and let the user sort it out.
>  	 */
>  	if (pci_num_vf(pdev)) {
>  		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
> @@ -1592,6 +1627,18 @@ static int vfio_pci_probe(struct pci_dev *pdev,
> const struct pci_device_id *id)
>  			kfree(vdev);
>  			return -ENOMEM;
>  		}
> +
> +		vdev->nb.notifier_call = vfio_pci_bus_notifier;
> +		ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
> +		if (ret) {
> +			kfree(vdev->vf_token);
> +			vfio_pci_reflck_put(vdev->reflck);
> +			vfio_del_group_dev(&pdev->dev);
> +			vfio_iommu_group_put(group, &pdev->dev);
> +			kfree(vdev);
> +			return ret;
> +		}
> +
>  		mutex_init(&vdev->vf_token->lock);
>  		uuid_gen(&vdev->vf_token->uuid);
>  	}
> @@ -1625,6 +1672,8 @@ static void vfio_pci_remove(struct pci_dev *pdev)
>  {
>  	struct vfio_pci_device *vdev;
> 
> +	pci_disable_sriov(pdev);
> +
>  	vdev = vfio_del_group_dev(&pdev->dev);
>  	if (!vdev)
>  		return;
> @@ -1635,6 +1684,9 @@ static void vfio_pci_remove(struct pci_dev *pdev)
>  		kfree(vdev->vf_token);
>  	}
> 
> +	if (vdev->nb.notifier_call)
> +		bus_unregister_notifier(&pci_bus_type, &vdev->nb);
> +
>  	vfio_pci_reflck_put(vdev->reflck);
> 
>  	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
> @@ -1683,16 +1735,48 @@ static pci_ers_result_t
> vfio_pci_aer_err_detected(struct pci_dev *pdev,
>  	return PCI_ERS_RESULT_CAN_RECOVER;
>  }
> 
> +static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
> +{
> +	struct vfio_pci_device *vdev;
> +	struct vfio_device *device;
> +	int ret = 0;
> +
> +	might_sleep();
> +
> +	if (!enable_sriov)
> +		return -ENOENT;
> +
> +	device = vfio_device_get_from_dev(&pdev->dev);
> +	if (!device)
> +		return -ENODEV;
> +
> +	vdev = vfio_device_data(device);
> +	if (!vdev) {
> +		vfio_device_put(device);
> +		return -ENODEV;
> +	}
> +
> +	if (nr_virtfn == 0)
> +		pci_disable_sriov(pdev);
> +	else
> +		ret = pci_enable_sriov(pdev, nr_virtfn);
> +
> +	vfio_device_put(device);
> +
> +	return ret < 0 ? ret : nr_virtfn;
> +}
> +
>  static const struct pci_error_handlers vfio_err_handlers = {
>  	.error_detected = vfio_pci_aer_err_detected,
>  };
> 
>  static struct pci_driver vfio_pci_driver = {
> -	.name		= "vfio-pci",
> -	.id_table	= NULL, /* only dynamic ids */
> -	.probe		= vfio_pci_probe,
> -	.remove		= vfio_pci_remove,
> -	.err_handler	= &vfio_err_handlers,
> +	.name			= "vfio-pci",
> +	.id_table		= NULL, /* only dynamic ids */
> +	.probe			= vfio_pci_probe,
> +	.remove			= vfio_pci_remove,
> +	.sriov_configure	= vfio_pci_sriov_configure,
> +	.err_handler		= &vfio_err_handlers,
>  };
> 
>  static DEFINE_MUTEX(reflck_lock);
> diff --git a/drivers/vfio/pci/vfio_pci_private.h
> b/drivers/vfio/pci/vfio_pci_private.h
> index 76c11c915949..36ec69081ecd 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -13,6 +13,7 @@
>  #include <linux/irqbypass.h>
>  #include <linux/types.h>
>  #include <linux/uuid.h>
> +#include <linux/notifier.h>
> 
>  #ifndef VFIO_PCI_PRIVATE_H
>  #define VFIO_PCI_PRIVATE_H
> @@ -130,6 +131,7 @@ struct vfio_pci_device {
>  	struct mutex		ioeventfds_lock;
>  	struct list_head	ioeventfds_list;
>  	struct vfio_pci_vf_token	*vf_token;
> +	struct notifier_block	nb;
>  };
> 
>  #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
Alex Williamson March 5, 2020, 6:22 p.m. UTC | #2
On Tue, 25 Feb 2020 03:08:00 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson
> > Sent: Thursday, February 20, 2020 2:54 AM
> > 
> > With the VF Token interface we can now expect that a vfio userspace
> > driver must be in collaboration with the PF driver, an unwitting
> > userspace driver will not be able to get past the GET_DEVICE_FD step
> > in accessing the device.  We can now move on to actually allowing
> > SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> > enabled by default in this commit, but it does provide a module option
> > for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> > straightforward, except we don't want to risk that a VF might get
> > autoprobed and bound to other drivers, so a bus notifier is used to
> > "capture" VFs to vfio-pci using the driver_override support.  We
> > assume any later action to bind the device to other drivers is
> > condoned by the system admin and allow it with a log warning.
> > 
> > vfio-pci will disable SR-IOV on a PF before releasing the device,
> > allowing a VF driver to be assured other drivers cannot take over the
> > PF and that any other userspace driver must know the shared VF token.
> > This support also does not provide a mechanism for the PF userspace
> > driver itself to manipulate SR-IOV through the vfio API.  With this
> > patch SR-IOV can only be enabled via the host sysfs interface and the
> > PF driver user cannot create or remove VFs.  
> 
> I'm not sure how many devices can be properly configured simply 
> with pci_enable_sriov. It is not unusual to require PF driver prepare
> something before turning PCI SR-IOV capability. If you look kernel
> PF drivers, there are only two using generic pci_sriov_configure_
> simple (simple wrapper like pci_enable_sriov), while most others
> implementing their own callback. However vfio itself has no idea
> thus I'm not sure how an user knows whether using this option can
> actually meet his purpose. I may miss something here, possibly 
> using DPDK as an example will make it clearer.

There is still the entire vfio userspace driver interface.  Imagine for
example that QEMU emulates the SR-IOV capability and makes a call out
to libvirt (or maybe runs with privs for the PF SR-IOV sysfs attribs)
when the guest enables SR-IOV.  Can't we assume that any PF specific
support can still be performed in the userspace/guest driver, leaving
us with a very simple and generic sriov_configure callback in vfio-pci?
Thanks,

Alex
Tian, Kevin March 6, 2020, 7:57 a.m. UTC | #3
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Friday, March 6, 2020 2:23 AM
> 
> On Tue, 25 Feb 2020 03:08:00 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson
> > > Sent: Thursday, February 20, 2020 2:54 AM
> > >
> > > With the VF Token interface we can now expect that a vfio userspace
> > > driver must be in collaboration with the PF driver, an unwitting
> > > userspace driver will not be able to get past the GET_DEVICE_FD step
> > > in accessing the device.  We can now move on to actually allowing
> > > SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> > > enabled by default in this commit, but it does provide a module option
> > > for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> > > straightforward, except we don't want to risk that a VF might get
> > > autoprobed and bound to other drivers, so a bus notifier is used to
> > > "capture" VFs to vfio-pci using the driver_override support.  We
> > > assume any later action to bind the device to other drivers is
> > > condoned by the system admin and allow it with a log warning.
> > >
> > > vfio-pci will disable SR-IOV on a PF before releasing the device,
> > > allowing a VF driver to be assured other drivers cannot take over the
> > > PF and that any other userspace driver must know the shared VF token.
> > > This support also does not provide a mechanism for the PF userspace
> > > driver itself to manipulate SR-IOV through the vfio API.  With this
> > > patch SR-IOV can only be enabled via the host sysfs interface and the
> > > PF driver user cannot create or remove VFs.
> >
> > I'm not sure how many devices can be properly configured simply
> > with pci_enable_sriov. It is not unusual to require PF driver prepare
> > something before turning PCI SR-IOV capability. If you look kernel
> > PF drivers, there are only two using generic pci_sriov_configure_
> > simple (simple wrapper like pci_enable_sriov), while most others
> > implementing their own callback. However vfio itself has no idea
> > thus I'm not sure how an user knows whether using this option can
> > actually meet his purpose. I may miss something here, possibly
> > using DPDK as an example will make it clearer.
> 
> There is still the entire vfio userspace driver interface.  Imagine for
> example that QEMU emulates the SR-IOV capability and makes a call out
> to libvirt (or maybe runs with privs for the PF SR-IOV sysfs attribs)
> when the guest enables SR-IOV.  Can't we assume that any PF specific
> support can still be performed in the userspace/guest driver, leaving
> us with a very simple and generic sriov_configure callback in vfio-pci?

Makes sense. One concern, though, is how a user could be warned
if he inadvertently uses sysfs to enable SR-IOV on a vfio device whose
userspace driver is incapable of handling it. Note that any VFIO device,
if SR-IOV capable, will allow the user to do so once the module option is
turned on and the callback is registered. I felt such uncertainty could be
contained by toggling SR-IOV through a vfio API, but from your description
that is obviously what you want to avoid. Is it due to a sequencing reason,
e.g. that SR-IOV must be enabled before the userspace PF driver sets the
token?

Thanks
Kevin
Tian, Kevin March 6, 2020, 9:45 a.m. UTC | #4
> From: Tian, Kevin
> Sent: Friday, March 6, 2020 3:57 PM
> 
> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Friday, March 6, 2020 2:23 AM
> >
> > On Tue, 25 Feb 2020 03:08:00 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >
> > > > From: Alex Williamson
> > > > Sent: Thursday, February 20, 2020 2:54 AM
> > > >
> > > > With the VF Token interface we can now expect that a vfio userspace
> > > > driver must be in collaboration with the PF driver, an unwitting
> > > > userspace driver will not be able to get past the GET_DEVICE_FD step
> > > > in accessing the device.  We can now move on to actually allowing
> > > > SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> > > > enabled by default in this commit, but it does provide a module option
> > > > for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> > > > straightforward, except we don't want to risk that a VF might get
> > > > autoprobed and bound to other drivers, so a bus notifier is used to
> > > > "capture" VFs to vfio-pci using the driver_override support.  We
> > > > assume any later action to bind the device to other drivers is
> > > > condoned by the system admin and allow it with a log warning.
> > > >
> > > > vfio-pci will disable SR-IOV on a PF before releasing the device,
> > > > allowing a VF driver to be assured other drivers cannot take over the
> > > > PF and that any other userspace driver must know the shared VF token.
> > > > This support also does not provide a mechanism for the PF userspace
> > > > driver itself to manipulate SR-IOV through the vfio API.  With this
> > > > patch SR-IOV can only be enabled via the host sysfs interface and the
> > > > PF driver user cannot create or remove VFs.
> > >
> > > I'm not sure how many devices can be properly configured simply
> > > with pci_enable_sriov. It is not unusual to require PF driver prepare
> > > something before turning PCI SR-IOV capability. If you look kernel
> > > PF drivers, there are only two using generic pci_sriov_configure_
> > > simple (simple wrapper like pci_enable_sriov), while most others
> > > implementing their own callback. However vfio itself has no idea
> > > thus I'm not sure how an user knows whether using this option can
> > > actually meet his purpose. I may miss something here, possibly
> > > using DPDK as an example will make it clearer.
> >
> > There is still the entire vfio userspace driver interface.  Imagine for
> > example that QEMU emulates the SR-IOV capability and makes a call out
> > to libvirt (or maybe runs with privs for the PF SR-IOV sysfs attribs)
> > when the guest enables SR-IOV.  Can't we assume that any PF specific
> > support can still be performed in the userspace/guest driver, leaving
> > us with a very simple and generic sriov_configure callback in vfio-pci?
> 
> Makes sense. One concern, though, is how an user could be warned
> if he inadvertently uses sysfs to enable SR-IOV on a vfio device whose
> userspace driver is incapable of handling it. Note any VFIO device,
> if SR-IOV capable, will allow user to do so once the module option is
> turned on and the callback is registered. I felt such uncertainty can be
> contained by toggling SR-IOV through a vfio api, but from your description
> obviously it is what you want to avoid. Is it due to the sequence reason,
> e.g. that SR-IOV must be enabled before userspace PF driver sets the
> token?
> 

Reading again, I found that you specifically mentioned "the PF driver user
cannot create or remove VFs.". However, I failed to get the rationale
behind it. If the VF drivers have built trust with the PF driver through
the token, what is the problem with allowing the PF driver to further manage
SR-IOV itself? Suppose any VF removal will be done in a cooperative way
to avoid surprise impact to the related VF drivers. Then possibly a new vfio
ioctl for setting the VF number plus a token from the userspace driver
could also serve the purpose of this patch series (GET_DEVICE_FD + sysfs)?

Thanks
Kevin
Alex Williamson March 6, 2020, 3:50 p.m. UTC | #5
On Fri, 6 Mar 2020 09:45:40 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Tian, Kevin
> > Sent: Friday, March 6, 2020 3:57 PM
> >   
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Friday, March 6, 2020 2:23 AM
> > >
> > > On Tue, 25 Feb 2020 03:08:00 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > >  
> > > > > From: Alex Williamson
> > > > > Sent: Thursday, February 20, 2020 2:54 AM
> > > > >
> > > > > With the VF Token interface we can now expect that a vfio userspace
> > > > > driver must be in collaboration with the PF driver, an unwitting
> > > > > userspace driver will not be able to get past the GET_DEVICE_FD step
> > > > > in accessing the device.  We can now move on to actually allowing
> > > > > SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> > > > > enabled by default in this commit, but it does provide a module option
> > > > > for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> > > > > straightforward, except we don't want to risk that a VF might get
> > > > > autoprobed and bound to other drivers, so a bus notifier is used to
> > > > > "capture" VFs to vfio-pci using the driver_override support.  We
> > > > > assume any later action to bind the device to other drivers is
> > > > > condoned by the system admin and allow it with a log warning.
> > > > >
> > > > > vfio-pci will disable SR-IOV on a PF before releasing the device,
> > > > > allowing a VF driver to be assured other drivers cannot take over the
> > > > > PF and that any other userspace driver must know the shared VF token.
> > > > > This support also does not provide a mechanism for the PF userspace
> > > > > driver itself to manipulate SR-IOV through the vfio API.  With this
> > > > > patch SR-IOV can only be enabled via the host sysfs interface and the
> > > > > PF driver user cannot create or remove VFs.  
> > > >
> > > > I'm not sure how many devices can be properly configured simply
> > > > with pci_enable_sriov. It is not unusual to require PF driver prepare
> > > > something before turning PCI SR-IOV capability. If you look kernel
> > > > PF drivers, there are only two using generic pci_sriov_configure_
> > > > simple (simple wrapper like pci_enable_sriov), while most others
> > > > implementing their own callback. However vfio itself has no idea
> > > > thus I'm not sure how an user knows whether using this option can
> > > > actually meet his purpose. I may miss something here, possibly
> > > > using DPDK as an example will make it clearer.  
> > >
> > > There is still the entire vfio userspace driver interface.  Imagine for
> > > example that QEMU emulates the SR-IOV capability and makes a call out
> > > to libvirt (or maybe runs with privs for the PF SR-IOV sysfs attribs)
> > > when the guest enables SR-IOV.  Can't we assume that any PF specific
> > > support can still be performed in the userspace/guest driver, leaving
> > > us with a very simple and generic sriov_configure callback in vfio-pci?  
> > 
> > Makes sense. One concern, though, is how an user could be warned
> > if he inadvertently uses sysfs to enable SR-IOV on a vfio device whose
> > userspace driver is incapable of handling it. Note any VFIO device,
> > if SR-IOV capable, will allow user to do so once the module option is
> > turned on and the callback is registered. I felt such uncertainty can be
> > contained by toggling SR-IOV through a vfio api, but from your description
> > obviously it is what you want to avoid. Is it due to the sequence reason,
> > e.g. that SR-IOV must be enabled before userspace PF driver sets the
> > token?
> >   
> 
> reading again I found that you specifically mentioned "the PF driver user 
> cannot create or remove VFs.". However I failed to get the rationale 
> behind. If the VF drivers have built the trust with the PF driver through
> the token, what is the problem of allowing the PF driver to further manage 
> > SR-IOV itself? suppose any VF removal will be done in a cooperative way
> to avoid surprise impact to related VF drivers. then possibly a new vfio
> ioctl for setting the VF numbers plus a token from the userspace driver
> could also serve the purpose of this patch series (GET_DEVICE_FD + sysfs)?

If a user is allowed to create VFs, does that user automatically get
ownership of those devices?  How is that accomplished?  What if we want
to make use of the VF via a separate process?  How do we coordinate
that with the PF driver?  All of these problems are resolved if we
assume the userspace PF driver needs to operate in collaboration with a
privileged entity to interact with sysfs to configure SR-IOV and manage
the resulting VFs.  I have no desire to take on that responsibility
within vfio-pci and I also feel that a user owning a PF device should
not inherently grant that user the ability to create and remove other
devices on the host, even if they are sourced from the PF.  Thanks,

Alex
Alex Williamson March 6, 2020, 10:17 p.m. UTC | #6
On Fri, 6 Mar 2020 07:57:19 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Friday, March 6, 2020 2:23 AM
> > 
> > On Tue, 25 Feb 2020 03:08:00 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Alex Williamson
> > > > Sent: Thursday, February 20, 2020 2:54 AM
> > > >
> > > > With the VF Token interface we can now expect that a vfio userspace
> > > > driver must be in collaboration with the PF driver, an unwitting
> > > > userspace driver will not be able to get past the GET_DEVICE_FD step
> > > > in accessing the device.  We can now move on to actually allowing
> > > > SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> > > > enabled by default in this commit, but it does provide a module option
> > > > for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> > > > straightforward, except we don't want to risk that a VF might get
> > > > autoprobed and bound to other drivers, so a bus notifier is used to
> > > > "capture" VFs to vfio-pci using the driver_override support.  We
> > > > assume any later action to bind the device to other drivers is
> > > > condoned by the system admin and allow it with a log warning.
> > > >
> > > > vfio-pci will disable SR-IOV on a PF before releasing the device,
> > > > allowing a VF driver to be assured other drivers cannot take over the
> > > > PF and that any other userspace driver must know the shared VF token.
> > > > This support also does not provide a mechanism for the PF userspace
> > > > driver itself to manipulate SR-IOV through the vfio API.  With this
> > > > patch SR-IOV can only be enabled via the host sysfs interface and the
> > > > PF driver user cannot create or remove VFs.  
> > >
> > > I'm not sure how many devices can be properly configured simply
> > > with pci_enable_sriov. It is not unusual to require PF driver prepare
> > > something before turning PCI SR-IOV capability. If you look kernel
> > > PF drivers, there are only two using generic pci_sriov_configure_
> > > simple (simple wrapper like pci_enable_sriov), while most others
> > > implementing their own callback. However vfio itself has no idea
> > > thus I'm not sure how an user knows whether using this option can
> > > actually meet his purpose. I may miss something here, possibly
> > > using DPDK as an example will make it clearer.  
> > 
> > There is still the entire vfio userspace driver interface.  Imagine for
> > example that QEMU emulates the SR-IOV capability and makes a call out
> > to libvirt (or maybe runs with privs for the PF SR-IOV sysfs attribs)
> > when the guest enables SR-IOV.  Can't we assume that any PF specific
> > support can still be performed in the userspace/guest driver, leaving
> > us with a very simple and generic sriov_configure callback in vfio-pci?  
> 
> Makes sense. One concern, though, is how an user could be warned
> if he inadvertently uses sysfs to enable SR-IOV on a vfio device whose
> userspace driver is incapable of handling it. Note any VFIO device, 
> if SR-IOV capable, will allow user to do so once the module option is 
> turned on and the callback is registered. I felt such uncertainty can be 
> contained by toggling SR-IOV through a vfio api, but from your description 
> obviously it is what you want to avoid. Is it due to the sequence reason,
> e.g. that SR-IOV must be enabled before userspace PF driver sets the 
> token? 

As in my other reply, enabling SR-IOV via a vfio API suggests that
we're not only granting the user owning the PF device access to the
device itself, but also the ability to create and remove subordinate
devices on the host.  That implies an extended degree of trust in the
user beyond the PF device itself and raises questions about whether a
user who is allowed to create VF devices should automatically be
granted access to those VF devices, what the mechanism would be for
that, and how we might re-assign those devices to other users,
potentially including host kernel usage.  What I'm proposing here
doesn't preclude some future extension in that direction, but instead
tries to simplify a first step towards enabling SR-IOV by leaving the
SR-IOV enablement and VF assignment in the realm of a privileged system
entity.

So, what I think you're suggesting here is that we should restrict
vfio_pci_sriov_configure() to reject enabling SR-IOV until a user
driver has configured a VF token.  That requires both that the
userspace driver has initialized to this point before SR-IOV can be
enabled and that we would be forced to define a termination point for
the user set VF token.  Logically, this would need to be when the
userspace driver exits or closes the PF device, which implies that we
need to disable SR-IOV on the PF at this point, or we're left in an
inconsistent state where VFs are enabled but cannot be disabled because
we don't have a valid VF token.  Now we're back to nearly a state where
the user has control of not creating devices on the host, but removing
them by closing the device, which will necessarily require that any VF
driver release the device, whether userspace or kernel.

I'm not sure what we're gaining by doing this though.  I agree that
there will be users that enable SR-IOV on a PF and then try to, for
example, assign the PF and all the VFs to a VM.  The VFs will fail due
to lacking VF token support, unless they've patch QEMU with my test
code, but depending on the PF driver in the guest, it may, or more
likely won't work.  But don't you think the VFs and probably PF not
working is a sufficient clue that the configuration is invalid?  OTOH,
from what I've heard of the device in the ID table of the pci-pf-stub
driver, they might very well be able to work with both PF and VFs in
QEMU using only my test code to set the VF token.

Therefore, I'm afraid what you're asking for here is to impose a usage
restriction as a sanity test, when we don't really know what might be
sane for this particular piece of hardware or use case.  There are
infinite ways that a vfio based userspace driver can fail to configure
their hardware and make it work correctly, many of them are device
specific.  Isn't this just one of those cases?  Thanks,

Alex
Tian, Kevin March 7, 2020, 1:35 a.m. UTC | #7
> From: Alex Williamson
> Sent: Saturday, March 7, 2020 6:18 AM
> 
> On Fri, 6 Mar 2020 07:57:19 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Friday, March 6, 2020 2:23 AM
> > >
> > > On Tue, 25 Feb 2020 03:08:00 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > >
> > > > > From: Alex Williamson
> > > > > Sent: Thursday, February 20, 2020 2:54 AM
> > > > >
> > > > > With the VF Token interface we can now expect that a vfio userspace
> > > > > driver must be in collaboration with the PF driver, an unwitting
> > > > > userspace driver will not be able to get past the GET_DEVICE_FD step
> > > > > in accessing the device.  We can now move on to actually allowing
> > > > > SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> > > > > enabled by default in this commit, but it does provide a module
> option
> > > > > for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> > > > > straightforward, except we don't want to risk that a VF might get
> > > > > autoprobed and bound to other drivers, so a bus notifier is used to
> > > > > "capture" VFs to vfio-pci using the driver_override support.  We
> > > > > assume any later action to bind the device to other drivers is
> > > > > condoned by the system admin and allow it with a log warning.
> > > > >
> > > > > vfio-pci will disable SR-IOV on a PF before releasing the device,
> > > > > allowing a VF driver to be assured other drivers cannot take over the
> > > > > PF and that any other userspace driver must know the shared VF
> token.
> > > > > This support also does not provide a mechanism for the PF userspace
> > > > > driver itself to manipulate SR-IOV through the vfio API.  With this
> > > > > patch SR-IOV can only be enabled via the host sysfs interface and the
> > > > > PF driver user cannot create or remove VFs.
> > > >
> > > > I'm not sure how many devices can be properly configured simply
> > > > with pci_enable_sriov. It is not unusual to require PF driver prepare
> > > > something before turning PCI SR-IOV capability. If you look kernel
> > > > PF drivers, there are only two using generic pci_sriov_configure_
> > > > simple (simple wrapper like pci_enable_sriov), while most others
> > > > implementing their own callback. However vfio itself has no idea
> > > > thus I'm not sure how an user knows whether using this option can
> > > > actually meet his purpose. I may miss something here, possibly
> > > > using DPDK as an example will make it clearer.
> > >
> > > There is still the entire vfio userspace driver interface.  Imagine for
> > > example that QEMU emulates the SR-IOV capability and makes a call out
> > > to libvirt (or maybe runs with privs for the PF SR-IOV sysfs attribs)
> > > when the guest enables SR-IOV.  Can't we assume that any PF specific
> > > support can still be performed in the userspace/guest driver, leaving
> > > us with a very simple and generic sriov_configure callback in vfio-pci?
> >
> > Makes sense. One concern, though, is how an user could be warned
> > if he inadvertently uses sysfs to enable SR-IOV on a vfio device whose
> > userspace driver is incapable of handling it. Note any VFIO device,
> > if SR-IOV capable, will allow user to do so once the module option is
> > turned on and the callback is registered. I felt such uncertainty can be
> > contained by toggling SR-IOV through a vfio api, but from your description
> > obviously it is what you want to avoid. Is it due to the sequence reason,
> > e.g. that SR-IOV must be enabled before userspace PF driver sets the
> > token?
> 
> As in my other reply, enabling SR-IOV via a vfio API suggests that
> we're not only granting the user owning the PF device access to the
> device itself, but also the ability to create and remove subordinate
> devices on the host.  That implies an extended degree of trust in the
> user beyond the PF device itself and raises questions about whether a
> user who is allowed to create VF devices should automatically be
> granted access to those VF devices, what the mechanism would be for
> that, and how we might re-assign those devices to other users,
> potentially including host kernel usage.  What I'm proposing here
> doesn't preclude some future extension in that direction, but instead
> tries to simplify a first step towards enabling SR-IOV by leaving the
> SR-IOV enablement and VF assignment in the realm of a privileged system
> entity.

the intention is clear to me now.

> 
> So, what I think you're suggesting here is that we should restrict
> vfio_pci_sriov_configure() to reject enabling SR-IOV until a user
> driver has configured a VF token.  That requires both that the
> userspace driver has initialized to this point before SR-IOV can be
> enabled and that we would be forced to define a termination point for
> the user set VF token.  Logically, this would need to be when the
> userspace driver exits or closes the PF device, which implies that we
> need to disable SR-IOV on the PF at this point, or we're left in an
> inconsistent state where VFs are enabled but cannot be disabled because
> we don't have a valid VF token.  Now we're back to nearly a state where
> the user has control of not creating devices on the host, but removing
> them by closing the device, which will necessarily require that any VF
> driver release the device, whether userspace or kernel.
> 
> I'm not sure what we're gaining by doing this though.  I agree that
> there will be users that enable SR-IOV on a PF and then try to, for
> example, assign the PF and all the VFs to a VM.  The VFs will fail due
> to lacking VF token support, unless they've patch QEMU with my test
> code, but depending on the PF driver in the guest, it may, or more
> likely won't work.  But don't you think the VFs and probably PF not
> working is a sufficient clue that the configuration is invalid?  OTOH,
> from what I've heard of the device in the ID table of the pci-pf-stub
> driver, they might very well be able to work with both PF and VFs in
> QEMU using only my test code to set the VF token.
> 
> Therefore, I'm afraid what you're asking for here is to impose a usage
> restriction as a sanity test, when we don't really know what might be
> sane for this particular piece of hardware or use case.  There are
> infinite ways that a vfio based userspace driver can fail to configure
> their hardware and make it work correctly, many of them are device
> specific.  Isn't this just one of those cases?  Thanks,
> 

what you said all makes sense. so I withdraw the idea of manipulating
SR-IOV through a vfio ioctl. However I still feel that simply registering
the sriov_configure callback in vfio-pci somehow violates the typical
expectation of the sysfs interface. Before this patch, the success return
of writing non-zero value to numvfs implies VFs are in sane state and
functionally ready for immediate use. However now the behavior of
success return becomes undefined for vfio devices, since even vfio-pci 
itself doesn't know whether VFs are functional for a random device 
(may know some if carrying the same device IDs from pci-pf-stub). It
simply relies on the privileged entity who knows exactly the implication
of such a write, while there is no way to warn inadvertent users, which
to me is not a good design from a kernel API p.o.v. Of course we may 
document such restriction and the driver_override may also be an 
indirect way to warn such user if he wants to use VFs for other purpose.
But it is still less elegant than reporting it in the first place. Maybe
what we really require is a new sysfs attribute purely for enabling
the PCI SR-IOV capability, which doesn't imply making VFs actually
functional as is done through the existing numvfs?

Thanks
Kevin
Alex Williamson March 9, 2020, 12:46 a.m. UTC | #8
On Sat, 7 Mar 2020 01:35:23 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson
> > Sent: Saturday, March 7, 2020 6:18 AM
> > 
> > On Fri, 6 Mar 2020 07:57:19 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > Sent: Friday, March 6, 2020 2:23 AM
> > > >
> > > > On Tue, 25 Feb 2020 03:08:00 +0000
> > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > >  
> > > > > > From: Alex Williamson
> > > > > > Sent: Thursday, February 20, 2020 2:54 AM
> > > > > >
> > > > > > With the VF Token interface we can now expect that a vfio userspace
> > > > > > driver must be in collaboration with the PF driver, an unwitting
> > > > > > userspace driver will not be able to get past the GET_DEVICE_FD step
> > > > > > in accessing the device.  We can now move on to actually allowing
> > > > > > SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> > > > > > enabled by default in this commit, but it does provide a module  
> > option  
> > > > > > for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> > > > > > straightforward, except we don't want to risk that a VF might get
> > > > > > autoprobed and bound to other drivers, so a bus notifier is used to
> > > > > > "capture" VFs to vfio-pci using the driver_override support.  We
> > > > > > assume any later action to bind the device to other drivers is
> > > > > > condoned by the system admin and allow it with a log warning.
> > > > > >
> > > > > > vfio-pci will disable SR-IOV on a PF before releasing the device,
> > > > > > allowing a VF driver to be assured other drivers cannot take over the
> > > > > > PF and that any other userspace driver must know the shared VF  
> > token.  
> > > > > > This support also does not provide a mechanism for the PF userspace
> > > > > > driver itself to manipulate SR-IOV through the vfio API.  With this
> > > > > > patch SR-IOV can only be enabled via the host sysfs interface and the
> > > > > > PF driver user cannot create or remove VFs.  
> > > > >
> > > > > I'm not sure how many devices can be properly configured simply
> > > > > with pci_enable_sriov. It is not unusual to require PF driver prepare
> > > > > something before turning PCI SR-IOV capability. If you look kernel
> > > > > PF drivers, there are only two using generic pci_sriov_configure_
> > > > > simple (simple wrapper like pci_enable_sriov), while most others
> > > > > implementing their own callback. However vfio itself has no idea
> > > > > thus I'm not sure how an user knows whether using this option can
> > > > > actually meet his purpose. I may miss something here, possibly
> > > > > using DPDK as an example will make it clearer.  
> > > >
> > > > There is still the entire vfio userspace driver interface.  Imagine for
> > > > example that QEMU emulates the SR-IOV capability and makes a call out
> > > > to libvirt (or maybe runs with privs for the PF SR-IOV sysfs attribs)
> > > > when the guest enables SR-IOV.  Can't we assume that any PF specific
> > > > support can still be performed in the userspace/guest driver, leaving
> > > > us with a very simple and generic sriov_configure callback in vfio-pci?  
> > >
> > > Makes sense. One concern, though, is how an user could be warned
> > > if he inadvertently uses sysfs to enable SR-IOV on a vfio device whose
> > > userspace driver is incapable of handling it. Note any VFIO device,
> > > if SR-IOV capable, will allow user to do so once the module option is
> > > turned on and the callback is registered. I felt such uncertainty can be
> > > contained by toggling SR-IOV through a vfio api, but from your description
> > > obviously it is what you want to avoid. Is it due to the sequence reason,
> > > e.g. that SR-IOV must be enabled before userspace PF driver sets the
> > > token?  
> > 
> > As in my other reply, enabling SR-IOV via a vfio API suggests that
> > we're not only granting the user owning the PF device access to the
> > device itself, but also the ability to create and remove subordinate
> > devices on the host.  That implies an extended degree of trust in the
> > user beyond the PF device itself and raises questions about whether a
> > user who is allowed to create VF devices should automatically be
> > granted access to those VF devices, what the mechanism would be for
> > that, and how we might re-assign those devices to other users,
> > potentially including host kernel usage.  What I'm proposing here
> > doesn't preclude some future extension in that direction, but instead
> > tries to simplify a first step towards enabling SR-IOV by leaving the
> > SR-IOV enablement and VF assignment in the realm of a privileged system
> > entity.  
> 
> the intention is clear to me now.
> 
> > 
> > So, what I think you're suggesting here is that we should restrict
> > vfio_pci_sriov_configure() to reject enabling SR-IOV until a user
> > driver has configured a VF token.  That requires both that the
> > userspace driver has initialized to this point before SR-IOV can be
> > enabled and that we would be forced to define a termination point for
> > the user set VF token.  Logically, this would need to be when the
> > userspace driver exits or closes the PF device, which implies that we
> > need to disable SR-IOV on the PF at this point, or we're left in an
> > inconsistent state where VFs are enabled but cannot be disabled because
> > we don't have a valid VF token.  Now we're back to nearly a state where
> > the user has control of not creating devices on the host, but removing
> > them by closing the device, which will necessarily require that any VF
> > driver release the device, whether userspace or kernel.
> > 
> > I'm not sure what we're gaining by doing this though.  I agree that
> > there will be users that enable SR-IOV on a PF and then try to, for
> > example, assign the PF and all the VFs to a VM.  The VFs will fail due
> > to lacking VF token support, unless they've patch QEMU with my test
> > code, but depending on the PF driver in the guest, it may, or more
> > likely won't work.  But don't you think the VFs and probably PF not
> > working is a sufficient clue that the configuration is invalid?  OTOH,
> > from what I've heard of the device in the ID table of the pci-pf-stub
> > driver, they might very well be able to work with both PF and VFs in
> > QEMU using only my test code to set the VF token.
> > 
> > Therefore, I'm afraid what you're asking for here is to impose a usage
> > restriction as a sanity test, when we don't really know what might be
> > sane for this particular piece of hardware or use case.  There are
> > infinite ways that a vfio based userspace driver can fail to configure
> > their hardware and make it work correctly, many of them are device
> > specific.  Isn't this just one of those cases?  Thanks,
> >   
> 
> what you said all makes sense. so I withdraw the idea of manipulating
> SR-IOV through vfio ioctl. However I still feel that simply registering 
> sriov_configuration callback by vfio-pci somehow violates the typical
> expectation of the sysfs interface. Before this patch, the success return
> of writing non-zero value to numvfs implies VFs are in sane state and
> functionally ready for immediate use. However now the behavior of
> success return becomes undefined for vfio devices, since even vfio-pci 
> itself doesn't know whether VFs are functional for a random device 
> (may know some if carrying the same device IDs from pci-pf-stub). It
> simply relies on the privileged entity who knows exactly the implication
> of such write, while there is no way to warn inadvertent users which
> to me is not a good design from kernel API p.o.v. Of course we may 
> document such restriction and the driver_override may also be an 
> indirect way to warn such user if he wants to use VFs for other purpose.
> But it is still less elegant than reporting it in the first place. Maybe
> what we really require is a new sysfs attribute purely for enabling
> PCI SR-IOV capability, which doesn't imply making VFs actually 
> functional as did through the existing numvfs?

I don't read the same guarantee into the sysfs SR-IOV interface.  If
such a guarantee exists, it's already broken by pci-pf-stub, which like
vfio-pci allows dynamic IDs and driver_override to bind to any PF device
allowing the ability to create (potentially) non-functional VFs.  I
think it would be a really bad decision to fork a new sysfs interface
for this.  I've already made SR-IOV support in vfio-pci an opt-in via a
module option, would it ease your concerns if I elaborate in the text
for the option that enabling SR-IOV may depend on support provided by a
vfio-pci userspace driver?

I think that without absolutely knowing that an operation is incorrect,
we're just generating noise and confusion by triggering warnings or
developing alternate interfaces.  Unfortunately, we have no generic
means of knowing that an operation is incorrect, so I assume the best.
Thanks,

Alex
Tian, Kevin March 9, 2020, 1:48 a.m. UTC | #9
> From: Alex Williamson
> Sent: Monday, March 9, 2020 8:46 AM
> 
> On Sat, 7 Mar 2020 01:35:23 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson
> > > Sent: Saturday, March 7, 2020 6:18 AM
> > >
> > > On Fri, 6 Mar 2020 07:57:19 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > >
> > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > Sent: Friday, March 6, 2020 2:23 AM
> > > > >
> > > > > On Tue, 25 Feb 2020 03:08:00 +0000
> > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > > >
> > > > > > > From: Alex Williamson
> > > > > > > Sent: Thursday, February 20, 2020 2:54 AM
> > > > > > >
> > > > > > > With the VF Token interface we can now expect that a vfio
> userspace
> > > > > > > driver must be in collaboration with the PF driver, an unwitting
> > > > > > > userspace driver will not be able to get past the GET_DEVICE_FD
> step
> > > > > > > in accessing the device.  We can now move on to actually allowing
> > > > > > > SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> > > > > > > enabled by default in this commit, but it does provide a module
> > > option
> > > > > > > for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> > > > > > > straightforward, except we don't want to risk that a VF might get
> > > > > > > autoprobed and bound to other drivers, so a bus notifier is used
> to
> > > > > > > "capture" VFs to vfio-pci using the driver_override support.  We
> > > > > > > assume any later action to bind the device to other drivers is
> > > > > > > condoned by the system admin and allow it with a log warning.
> > > > > > >
> > > > > > > vfio-pci will disable SR-IOV on a PF before releasing the device,
> > > > > > > allowing a VF driver to be assured other drivers cannot take over
> the
> > > > > > > PF and that any other userspace driver must know the shared VF
> > > token.
> > > > > > > This support also does not provide a mechanism for the PF
> userspace
> > > > > > > driver itself to manipulate SR-IOV through the vfio API.  With this
> > > > > > > patch SR-IOV can only be enabled via the host sysfs interface and
> the
> > > > > > > PF driver user cannot create or remove VFs.
> > > > > >
> > > > > > I'm not sure how many devices can be properly configured simply
> > > > > > with pci_enable_sriov. It is not unusual to require PF driver prepare
> > > > > > something before turning PCI SR-IOV capability. If you look kernel
> > > > > > PF drivers, there are only two using generic pci_sriov_configure_
> > > > > > simple (simple wrapper like pci_enable_sriov), while most others
> > > > > > implementing their own callback. However vfio itself has no idea
> > > > > > thus I'm not sure how an user knows whether using this option can
> > > > > > actually meet his purpose. I may miss something here, possibly
> > > > > > using DPDK as an example will make it clearer.
> > > > >
> > > > > There is still the entire vfio userspace driver interface.  Imagine for
> > > > > example that QEMU emulates the SR-IOV capability and makes a call
> out
> > > > > to libvirt (or maybe runs with privs for the PF SR-IOV sysfs attribs)
> > > > > when the guest enables SR-IOV.  Can't we assume that any PF specific
> > > > > support can still be performed in the userspace/guest driver, leaving
> > > > > us with a very simple and generic sriov_configure callback in vfio-pci?
> > > >
> > > > Makes sense. One concern, though, is how an user could be warned
> > > > if he inadvertently uses sysfs to enable SR-IOV on a vfio device whose
> > > > userspace driver is incapable of handling it. Note any VFIO device,
> > > > if SR-IOV capable, will allow user to do so once the module option is
> > > > turned on and the callback is registered. I felt such uncertainty can be
> > > > contained by toggling SR-IOV through a vfio api, but from your
> description
> > > > obviously it is what you want to avoid. Is it due to the sequence reason,
> > > > e.g. that SR-IOV must be enabled before userspace PF driver sets the
> > > > token?
> > >
> > > As in my other reply, enabling SR-IOV via a vfio API suggests that
> > > we're not only granting the user owning the PF device access to the
> > > device itself, but also the ability to create and remove subordinate
> > > devices on the host.  That implies an extended degree of trust in the
> > > user beyond the PF device itself and raises questions about whether a
> > > user who is allowed to create VF devices should automatically be
> > > granted access to those VF devices, what the mechanism would be for
> > > that, and how we might re-assign those devices to other users,
> > > potentially including host kernel usage.  What I'm proposing here
> > > doesn't preclude some future extension in that direction, but instead
> > > tries to simplify a first step towards enabling SR-IOV by leaving the
> > > SR-IOV enablement and VF assignment in the realm of a privileged system
> > > entity.
> >
> > the intention is clear to me now.
> >
> > >
> > > So, what I think you're suggesting here is that we should restrict
> > > vfio_pci_sriov_configure() to reject enabling SR-IOV until a user
> > > driver has configured a VF token.  That requires both that the
> > > userspace driver has initialized to this point before SR-IOV can be
> > > enabled and that we would be forced to define a termination point for
> > > the user set VF token.  Logically, this would need to be when the
> > > userspace driver exits or closes the PF device, which implies that we
> > > need to disable SR-IOV on the PF at this point, or we're left in an
> > > inconsistent state where VFs are enabled but cannot be disabled because
> > > we don't have a valid VF token.  Now we're back to nearly a state where
> > > the user has control of not creating devices on the host, but removing
> > > them by closing the device, which will necessarily require that any VF
> > > driver release the device, whether userspace or kernel.
> > >
> > > I'm not sure what we're gaining by doing this though.  I agree that
> > > there will be users that enable SR-IOV on a PF and then try to, for
> > > example, assign the PF and all the VFs to a VM.  The VFs will fail due
> > > to lacking VF token support, unless they've patch QEMU with my test
> > > code, but depending on the PF driver in the guest, it may, or more
> > > likely won't work.  But don't you think the VFs and probably PF not
> > > working is a sufficient clue that the configuration is invalid?  OTOH,
> > > from what I've heard of the device in the ID table of the pci-pf-stub
> > > driver, they might very well be able to work with both PF and VFs in
> > > QEMU using only my test code to set the VF token.
> > >
> > > Therefore, I'm afraid what you're asking for here is to impose a usage
> > > restriction as a sanity test, when we don't really know what might be
> > > sane for this particular piece of hardware or use case.  There are
> > > infinite ways that a vfio based userspace driver can fail to configure
> > > their hardware and make it work correctly, many of them are device
> > > specific.  Isn't this just one of those cases?  Thanks,
> > >
> >
> > what you said all makes sense. so I withdraw the idea of manipulating
> > SR-IOV through vfio ioctl. However I still feel that simply registering
> > sriov_configuration callback by vfio-pci somehow violates the typical
> > expectation of the sysfs interface. Before this patch, the success return
> > of writing non-zero value to numvfs implies VFs are in sane state and
> > functionally ready for immediate use. However now the behavior of
> > success return becomes undefined for vfio devices, since even vfio-pci
> > itself doesn't know whether VFs are functional for a random device
> > (may know some if carrying the same device IDs from pci-pf-stub). It
> > simply relies on the privileged entity who knows exactly the implication
> > of such write, while there is no way to warn inadvertent users which
> > to me is not a good design from kernel API p.o.v. Of course we may
> > document such restriction and the driver_override may also be an
> > indirect way to warn such user if he wants to use VFs for other purpose.
> > But it is still less elegant than reporting it in the first place. Maybe
> > what we really require is a new sysfs attribute purely for enabling
> > PCI SR-IOV capability, which doesn't imply making VFs actually
> > functional as did through the existing numvfs?
> 
> I don't read the same guarantee into the sysfs SR-IOV interface.  If
> such a guarantee exists, it's already broken by pci-pf-stub, which like
> vfio-pci allows dynamic IDs and driver_override to bind to any PF device
> allowing the ability to create (potentially) non-functional VFs.  I

I don't know whether others raised a similar concern and how
it was addressed for pci-pf-stub before. Many places describe
numvfs as the preferred interface to enable/disable VFs, while
'enable' just reads as 'functional' to me.

> think it would be a really bad decision to fork a new sysfs interface
> for this.  I've already made SR-IOV support in vfio-pci an opt-in via a
> module option, would it ease your concerns if I elaborate in the text
> for the option that enabling SR-IOV may depend on support provided by a
> vfio-pci userspace driver?

Sure.

> 
> I think that without absolutely knowing that an operation is incorrect,
> we're just generating noise and confusion by triggering warnings or
> developing alternate interfaces.  Unfortunately, we have no generic
> means of knowing that an operation is incorrect, so I assume the best.
> Thanks,
> 
> Alex
Alex Williamson March 9, 2020, 2:56 p.m. UTC | #10
On Mon, 9 Mar 2020 01:48:11 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson
> > Sent: Monday, March 9, 2020 8:46 AM
> > 
> > On Sat, 7 Mar 2020 01:35:23 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Alex Williamson
> > > > Sent: Saturday, March 7, 2020 6:18 AM
> > > >
> > > > On Fri, 6 Mar 2020 07:57:19 +0000
> > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > >  
> > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > Sent: Friday, March 6, 2020 2:23 AM
> > > > > >
> > > > > > On Tue, 25 Feb 2020 03:08:00 +0000
> > > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > > > >  
> > > > > > > > From: Alex Williamson
> > > > > > > > Sent: Thursday, February 20, 2020 2:54 AM
> > > > > > > >
> > > > > > > > With the VF Token interface we can now expect that a vfio  
> > userspace  
> > > > > > > > driver must be in collaboration with the PF driver, an unwitting
> > > > > > > > userspace driver will not be able to get past the GET_DEVICE_FD  
> > step  
> > > > > > > > in accessing the device.  We can now move on to actually allowing
> > > > > > > > SR-IOV to be enabled by vfio-pci on the PF.  Support for this is not
> > > > > > > > enabled by default in this commit, but it does provide a module  
> > > > option  
> > > > > > > > for this to be enabled (enable_sriov=1).  Enabling VFs is rather
> > > > > > > > straightforward, except we don't want to risk that a VF might get
> > > > > > > > autoprobed and bound to other drivers, so a bus notifier is used  
> > to  
> > > > > > > > "capture" VFs to vfio-pci using the driver_override support.  We
> > > > > > > > assume any later action to bind the device to other drivers is
> > > > > > > > condoned by the system admin and allow it with a log warning.
> > > > > > > >
> > > > > > > > vfio-pci will disable SR-IOV on a PF before releasing the device,
> > > > > > > > allowing a VF driver to be assured other drivers cannot take over  
> > the  
> > > > > > > > PF and that any other userspace driver must know the shared VF  
> > > > token.  
> > > > > > > > This support also does not provide a mechanism for the PF  
> > userspace  
> > > > > > > > driver itself to manipulate SR-IOV through the vfio API.  With this
> > > > > > > > patch SR-IOV can only be enabled via the host sysfs interface and  
> > the  
> > > > > > > > PF driver user cannot create or remove VFs.  
> > > > > > >
> > > > > > > I'm not sure how many devices can be properly configured simply
> > > > > > > with pci_enable_sriov. It is not unusual to require PF driver prepare
> > > > > > > something before turning PCI SR-IOV capability. If you look kernel
> > > > > > > PF drivers, there are only two using generic pci_sriov_configure_
> > > > > > > simple (simple wrapper like pci_enable_sriov), while most others
> > > > > > > implementing their own callback. However vfio itself has no idea
> > > > > > > thus I'm not sure how an user knows whether using this option can
> > > > > > > actually meet his purpose. I may miss something here, possibly
> > > > > > > using DPDK as an example will make it clearer.  
> > > > > >
> > > > > > There is still the entire vfio userspace driver interface.  Imagine for
> > > > > > example that QEMU emulates the SR-IOV capability and makes a call  
> > out  
> > > > > > to libvirt (or maybe runs with privs for the PF SR-IOV sysfs attribs)
> > > > > > when the guest enables SR-IOV.  Can't we assume that any PF specific
> > > > > > support can still be performed in the userspace/guest driver, leaving
> > > > > > us with a very simple and generic sriov_configure callback in vfio-pci?  
> > > > >
> > > > > Makes sense. One concern, though, is how an user could be warned
> > > > > if he inadvertently uses sysfs to enable SR-IOV on a vfio device whose
> > > > > userspace driver is incapable of handling it. Note any VFIO device,
> > > > > if SR-IOV capable, will allow user to do so once the module option is
> > > > > turned on and the callback is registered. I felt such uncertainty can be
> > > > > contained by toggling SR-IOV through a vfio api, but from your  
> > description  
> > > > > obviously it is what you want to avoid. Is it due to the sequence reason,
> > > > > e.g. that SR-IOV must be enabled before userspace PF driver sets the
> > > > > token?  
> > > >
> > > > As in my other reply, enabling SR-IOV via a vfio API suggests that
> > > > we're not only granting the user owning the PF device access to the
> > > > device itself, but also the ability to create and remove subordinate
> > > > devices on the host.  That implies an extended degree of trust in the
> > > > user beyond the PF device itself and raises questions about whether a
> > > > user who is allowed to create VF devices should automatically be
> > > > granted access to those VF devices, what the mechanism would be for
> > > > that, and how we might re-assign those devices to other users,
> > > > potentially including host kernel usage.  What I'm proposing here
> > > > doesn't preclude some future extension in that direction, but instead
> > > > tries to simplify a first step towards enabling SR-IOV by leaving the
> > > > SR-IOV enablement and VF assignment in the realm of a privileged system
> > > > entity.  
> > >
> > > the intention is clear to me now.
> > >  
> > > >
> > > > So, what I think you're suggesting here is that we should restrict
> > > > vfio_pci_sriov_configure() to reject enabling SR-IOV until a user
> > > > driver has configured a VF token.  That requires both that the
> > > > userspace driver has initialized to this point before SR-IOV can be
> > > > enabled and that we would be forced to define a termination point for
> > > > the user set VF token.  Logically, this would need to be when the
> > > > userspace driver exits or closes the PF device, which implies that we
> > > > need to disable SR-IOV on the PF at this point, or we're left in an
> > > > inconsistent state where VFs are enabled but cannot be disabled because
> > > > we don't have a valid VF token.  Now we're back to nearly a state where
> > > > the user has control of not creating devices on the host, but removing
> > > > them by closing the device, which will necessarily require that any VF
> > > > driver release the device, whether userspace or kernel.
> > > >
> > > > I'm not sure what we're gaining by doing this though.  I agree that
> > > > there will be users that enable SR-IOV on a PF and then try to, for
> > > > example, assign the PF and all the VFs to a VM.  The VFs will fail due
> > > > to lacking VF token support, unless they've patch QEMU with my test
> > > > code, but depending on the PF driver in the guest, it may, or more
> > > > likely won't work.  But don't you think the VFs and probably PF not
> > > > working is a sufficient clue that the configuration is invalid?  OTOH,
> > > > from what I've heard of the device in the ID table of the pci-pf-stub
> > > > driver, they might very well be able to work with both PF and VFs in
> > > > QEMU using only my test code to set the VF token.
> > > >
> > > > Therefore, I'm afraid what you're asking for here is to impose a usage
> > > > restriction as a sanity test, when we don't really know what might be
> > > > sane for this particular piece of hardware or use case.  There are
> > > > infinite ways that a vfio based userspace driver can fail to configure
> > > > their hardware and make it work correctly, many of them are device
> > > > specific.  Isn't this just one of those cases?  Thanks,
> > > >  
> > >
> > > what you said all makes sense. so I withdraw the idea of manipulating
> > > SR-IOV through vfio ioctl. However I still feel that simply registering
> > > sriov_configuration callback by vfio-pci somehow violates the typical
> > > expectation of the sysfs interface. Before this patch, the success return
> > > of writing a non-zero value to numvfs implies VFs are in a sane state and
> > > functionally ready for immediate use. However now the behavior of
> > > success return becomes undefined for vfio devices, since even vfio-pci
> > > itself doesn't know whether VFs are functional for a random device
> > > (may know some if carrying the same device IDs from pci-pf-stub). It
> > > simply relies on the privileged entity who knows exactly the implication
> > > of such write, while there is no way to warn inadvertent users which
> > > to me is not a good design from kernel API p.o.v. Of course we may
> > > document such restriction and the driver_override may also be an
> > > indirect way to warn such user if he wants to use VFs for other purpose.
> > > But it is still less elegant than reporting it in the first place. Maybe
> > > what we really require is a new sysfs attribute purely for enabling
> > > PCI SR-IOV capability, which doesn't imply making VFs actually
> > > functional as did through the existing numvfs?  
> > 
> > I don't read the same guarantee into the sysfs SR-IOV interface.  If
> > such a guarantee exists, it's already broken by pci-pf-stub, which like
> > vfio-pci allows dynamic IDs and driver_override to bind to any PF device
> > allowing the ability to create (potentially) non-functional VFs.  I  
> 
> I don't know whether others raised a similar concern and how 
> it was addressed for pci-pf-stub before. Many places describe 
> numvfs as the preferred interface to enable/disable VFs, while 
> 'enable' just reads as functional to me.

From a PCI perspective, they are functional.  We've enabled them in the
sense that they appear on the bus.  Whether they are functional or not
depends on device specific configuration.  If I take your definition to
an extreme, it seems that we might for example not allow SR-IOV to be
enabled unless an 82576 PF has the network link up because the VF
wouldn't be able to route packets until that point.  Do we require that
the igb PF driver generates a warning that VFs might not be functional
if the link is down when SR-IOV is enabled?  I'm absolutely not
recommending we do this, I'm just pointing out that I think a different
standard is being suggested here than actually exists.  Thanks,

Alex
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index e4d5d26e5e71..b40ade48a844 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -54,6 +54,12 @@  module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(disable_idle_d3,
 		 "Disable using the PCI D3 low power state for idle, unused devices");
 
+static bool enable_sriov;
+#ifdef CONFIG_PCI_IOV
+module_param(enable_sriov, bool, 0644);
+MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration");
+#endif
+
 static inline bool vfio_vga_disabled(void)
 {
 #ifdef CONFIG_VFIO_PCI_VGA
@@ -1528,6 +1534,35 @@  static const struct vfio_device_ops vfio_pci_ops = {
 
 static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
 static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
+static struct pci_driver vfio_pci_driver;
+
+static int vfio_pci_bus_notifier(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct vfio_pci_device *vdev = container_of(nb,
+						    struct vfio_pci_device, nb);
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_dev *physfn = pci_physfn(pdev);
+
+	if (action == BUS_NOTIFY_ADD_DEVICE &&
+	    pdev->is_virtfn && physfn == vdev->pdev) {
+		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
+			 pci_name(pdev));
+		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
+						  vfio_pci_ops.name);
+	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
+		   pdev->is_virtfn && physfn == vdev->pdev) {
+		struct pci_driver *drv = pci_dev_driver(pdev);
+
+		if (drv && drv != &vfio_pci_driver)
+			pci_warn(vdev->pdev,
+				 "VF %s bound to driver %s while PF bound to vfio-pci\n",
+				 pci_name(pdev), drv->name);
+	}
+
+	return 0;
+}
 
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
@@ -1539,12 +1574,12 @@  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return -EINVAL;
 
 	/*
-	 * Prevent binding to PFs with VFs enabled, this too easily allows
-	 * userspace instance with VFs and PFs from the same device, which
-	 * cannot work.  Disabling SR-IOV here would initiate removing the
-	 * VFs, which would unbind the driver, which is prone to blocking
-	 * if that VF is also in use by vfio-pci.  Just reject these PFs
-	 * and let the user sort it out.
+	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
+	 * by the host or other users.  We cannot capture the VFs if they
+	 * already exist, nor can we track VF users.  Disabling SR-IOV here
+	 * would initiate removing the VFs, which would unbind the driver,
+	 * which is prone to blocking if that VF is also in use by vfio-pci.
+	 * Just reject these PFs and let the user sort it out.
 	 */
 	if (pci_num_vf(pdev)) {
 		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
@@ -1592,6 +1627,18 @@  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 			kfree(vdev);
 			return -ENOMEM;
 		}
+
+		vdev->nb.notifier_call = vfio_pci_bus_notifier;
+		ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
+		if (ret) {
+			kfree(vdev->vf_token);
+			vfio_pci_reflck_put(vdev->reflck);
+			vfio_del_group_dev(&pdev->dev);
+			vfio_iommu_group_put(group, &pdev->dev);
+			kfree(vdev);
+			return ret;
+		}
+
 		mutex_init(&vdev->vf_token->lock);
 		uuid_gen(&vdev->vf_token->uuid);
 	}
@@ -1625,6 +1672,8 @@  static void vfio_pci_remove(struct pci_dev *pdev)
 {
 	struct vfio_pci_device *vdev;
 
+	pci_disable_sriov(pdev);
+
 	vdev = vfio_del_group_dev(&pdev->dev);
 	if (!vdev)
 		return;
@@ -1635,6 +1684,9 @@  static void vfio_pci_remove(struct pci_dev *pdev)
 		kfree(vdev->vf_token);
 	}
 
+	if (vdev->nb.notifier_call)
+		bus_unregister_notifier(&pci_bus_type, &vdev->nb);
+
 	vfio_pci_reflck_put(vdev->reflck);
 
 	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
@@ -1683,16 +1735,48 @@  static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
 	return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
+static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
+{
+	struct vfio_pci_device *vdev;
+	struct vfio_device *device;
+	int ret = 0;
+
+	might_sleep();
+
+	if (!enable_sriov)
+		return -ENOENT;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (!device)
+		return -ENODEV;
+
+	vdev = vfio_device_data(device);
+	if (!vdev) {
+		vfio_device_put(device);
+		return -ENODEV;
+	}
+
+	if (nr_virtfn == 0)
+		pci_disable_sriov(pdev);
+	else
+		ret = pci_enable_sriov(pdev, nr_virtfn);
+
+	vfio_device_put(device);
+
+	return ret < 0 ? ret : nr_virtfn;
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
 	.error_detected = vfio_pci_aer_err_detected,
 };
 
 static struct pci_driver vfio_pci_driver = {
-	.name		= "vfio-pci",
-	.id_table	= NULL, /* only dynamic ids */
-	.probe		= vfio_pci_probe,
-	.remove		= vfio_pci_remove,
-	.err_handler	= &vfio_err_handlers,
+	.name			= "vfio-pci",
+	.id_table		= NULL, /* only dynamic ids */
+	.probe			= vfio_pci_probe,
+	.remove			= vfio_pci_remove,
+	.sriov_configure	= vfio_pci_sriov_configure,
+	.err_handler		= &vfio_err_handlers,
 };
 
 static DEFINE_MUTEX(reflck_lock);
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 76c11c915949..36ec69081ecd 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -13,6 +13,7 @@ 
 #include <linux/irqbypass.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
+#include <linux/notifier.h>
 
 #ifndef VFIO_PCI_PRIVATE_H
 #define VFIO_PCI_PRIVATE_H
@@ -130,6 +131,7 @@  struct vfio_pci_device {
 	struct mutex		ioeventfds_lock;
 	struct list_head	ioeventfds_list;
 	struct vfio_pci_vf_token	*vf_token;
+	struct notifier_block	nb;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)