diff mbox series

[bionic/master-next,1/1] UBUNTU: SAUCE: vfio -- release device lock before userspace requests

Message ID 20180912085046.3401-2-apw@canonical.com
State New
Headers show
Series [bionic/master-next,1/1] UBUNTU: SAUCE: vfio -- release device lock before userspace requests | expand

Commit Message

Andy Whitcroft Sept. 12, 2018, 8:50 a.m. UTC
During a hotplug event vfio_pci_remove() will call
vfio_del_group_dev() to release the device group.  This may trigger
a userspace request.  Currently this userspace request is performed
while holding the device lock.  This leads userspace to deadlock
against it while trying to perform the requested cleanup.

Drop the device lock while the userspace request is in flight.
After it completes reaquire the lock and revalidate the device as
it may have been successfully removed by a concurrent operation.
As the remove callback may now drop the lock also check and
revalidation at the end of that operation.

BugLink: http://bugs.launchpad.net/bugs/1792099
Signed-off-by: Andy Whitcroft <apw@canonical.com>
---
 drivers/base/dd.c   |  7 +++++++
 drivers/vfio/vfio.c | 23 ++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

Comments

Stefan Bader Sept. 12, 2018, 9:08 a.m. UTC | #1
On 12.09.2018 10:50, Andy Whitcroft wrote:
> During a hotplug event vfio_pci_remove() will call
> vfio_del_group_dev() to release the device group.  This may trigger
> a userspace request.  Currently this userspace request is performed
> while holding the device lock.  This leads userspace to deadlock
> against it while trying to perform the requested cleanup.
> 
> Drop the device lock while the userspace request is in flight.
> After it completes reaquire the lock and revalidate the device as
> it may have been successfully removed by a concurrent operation.
> As the remove callback may now drop the lock also check and
> revalidation at the end of that operation.
> 
> BugLink: http://bugs.launchpad.net/bugs/1792099
> Signed-off-by: Andy Whitcroft <apw@canonical.com>
Acked-by: Stefan Bader <stefan.bader@canonical.com>
> ---

Based on successful testing. From the looks and description this sounds like it
should go upstream, too. And at least also into Cosmic (added some cc's).

-Stefan

>  drivers/base/dd.c   |  7 +++++++
>  drivers/vfio/vfio.c | 23 ++++++++++++++++++++++-
>  2 files changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 2c964f56dafe..37c01054521b 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -868,6 +868,13 @@ static void __device_release_driver(struct device *dev, struct device *parent)
>  			dev->bus->remove(dev);
>  		else if (drv->remove)
>  			drv->remove(dev);
> +		/*
> +		 * A concurrent invocation of the same function might
> +		 * have released the driver successfully while this one
> +		 * was waiting, so check for that.
> +		 */
> +		if (dev->driver != drv)
> +			return;
>  
>  		device_links_driver_cleanup(dev);
>  		dma_deconfigure(dev);
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 2bc3705a99bd..6ae4d3f432fe 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -934,6 +934,10 @@ void *vfio_del_group_dev(struct device *dev)
>  	unsigned int i = 0;
>  	long ret;
>  	bool interrupted = false;
> +	bool locked = true;
> +	struct device_driver *drv;
> +
> +	drv = dev->driver;
>  
>  	/*
>  	 * The group exists so long as we have a device reference.  Get
> @@ -974,8 +978,11 @@ void *vfio_del_group_dev(struct device *dev)
>  		if (!device)
>  			break;
>  
> -		if (device->ops->request)
> +		if (device->ops->request) {
> +			device_unlock(dev);
> +			locked = false;
>  			device->ops->request(device_data, i++);
> +		}
>  
>  		vfio_device_put(device);
>  
> @@ -994,6 +1001,20 @@ void *vfio_del_group_dev(struct device *dev)
>  					 current->comm, task_pid_nr(current));
>  			}
>  		}
> +
> +		if (!locked) {
> +			device_lock(dev);
> +			locked = true;
> +			/*
> +			 * A concurrent operation may have released the driver
> +			 * successfully while we had dropped the lock,
> +			 * check for that.
> +			 */
> +			if (dev->driver != drv) {
> +				vfio_group_put(group);
> +				return NULL;
> +			}
> +		}
>  	} while (ret <= 0);
>  
>  	/*
>
Colin Ian King Sept. 12, 2018, 9:17 a.m. UTC | #2
On 12/09/18 09:50, Andy Whitcroft wrote:
> During a hotplug event vfio_pci_remove() will call
> vfio_del_group_dev() to release the device group.  This may trigger
> a userspace request.  Currently this userspace request is performed
> while holding the device lock.  This leads userspace to deadlock
> against it while trying to perform the requested cleanup.
> 
> Drop the device lock while the userspace request is in flight.
> After it completes reaquire the lock and revalidate the device as
> it may have been successfully removed by a concurrent operation.
> As the remove callback may now drop the lock also check and
> revalidation at the end of that operation.
> 
> BugLink: http://bugs.launchpad.net/bugs/1792099
> Signed-off-by: Andy Whitcroft <apw@canonical.com>
> ---
>  drivers/base/dd.c   |  7 +++++++
>  drivers/vfio/vfio.c | 23 ++++++++++++++++++++++-
>  2 files changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 2c964f56dafe..37c01054521b 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -868,6 +868,13 @@ static void __device_release_driver(struct device *dev, struct device *parent)
>  			dev->bus->remove(dev);
>  		else if (drv->remove)
>  			drv->remove(dev);
> +		/*
> +		 * A concurrent invocation of the same function might
> +		 * have released the driver successfully while this one
> +		 * was waiting, so check for that.
> +		 */
> +		if (dev->driver != drv)
> +			return;
>  
>  		device_links_driver_cleanup(dev);
>  		dma_deconfigure(dev);
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 2bc3705a99bd..6ae4d3f432fe 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -934,6 +934,10 @@ void *vfio_del_group_dev(struct device *dev)
>  	unsigned int i = 0;
>  	long ret;
>  	bool interrupted = false;
> +	bool locked = true;
> +	struct device_driver *drv;
> +
> +	drv = dev->driver;
>  
>  	/*
>  	 * The group exists so long as we have a device reference.  Get
> @@ -974,8 +978,11 @@ void *vfio_del_group_dev(struct device *dev)
>  		if (!device)
>  			break;
>  
> -		if (device->ops->request)
> +		if (device->ops->request) {
> +			device_unlock(dev);
> +			locked = false;
>  			device->ops->request(device_data, i++);
> +		}
>  
>  		vfio_device_put(device);
>  
> @@ -994,6 +1001,20 @@ void *vfio_del_group_dev(struct device *dev)
>  					 current->comm, task_pid_nr(current));
>  			}
>  		}
> +
> +		if (!locked) {
> +			device_lock(dev);
> +			locked = true;
> +			/*
> +			 * A concurrent operation may have released the driver
> +			 * successfully while we had dropped the lock,
> +			 * check for that.
> +			 */
> +			if (dev->driver != drv) {
> +				vfio_group_put(group);
> +				return NULL;
> +			}
> +		}
>  	} while (ret <= 0);
>  
>  	/*
> 

Has positive test results, reduced regression risk as this change is to
an uncommonly used driver, so this seems acceptable for a SRU.

+1 on Stefan's comments, e.g. Can this go upstream, does it need
applying to Cosmic and/or backporting to Xenial, etc.

Acked-by: Colin Ian King <colin.king@canonical.com>
Kleber Sacilotto de Souza Sept. 12, 2018, 9:22 a.m. UTC | #3
On 09/12/18 10:50, Andy Whitcroft wrote:
> During a hotplug event vfio_pci_remove() will call
> vfio_del_group_dev() to release the device group.  This may trigger
> a userspace request.  Currently this userspace request is performed
> while holding the device lock.  This leads userspace to deadlock
> against it while trying to perform the requested cleanup.
> 
> Drop the device lock while the userspace request is in flight.
> After it completes reaquire the lock and revalidate the device as
> it may have been successfully removed by a concurrent operation.
> As the remove callback may now drop the lock also check and
> revalidation at the end of that operation.
> 
> BugLink: http://bugs.launchpad.net/bugs/1792099
> Signed-off-by: Andy Whitcroft <apw@canonical.com>
> ---
>  drivers/base/dd.c   |  7 +++++++
>  drivers/vfio/vfio.c | 23 ++++++++++++++++++++++-
>  2 files changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 2c964f56dafe..37c01054521b 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -868,6 +868,13 @@ static void __device_release_driver(struct device *dev, struct device *parent)
>  			dev->bus->remove(dev);
>  		else if (drv->remove)
>  			drv->remove(dev);
> +		/*
> +		 * A concurrent invocation of the same function might
> +		 * have released the driver successfully while this one
> +		 * was waiting, so check for that.
> +		 */
> +		if (dev->driver != drv)
> +			return;
>  
>  		device_links_driver_cleanup(dev);
>  		dma_deconfigure(dev);
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 2bc3705a99bd..6ae4d3f432fe 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -934,6 +934,10 @@ void *vfio_del_group_dev(struct device *dev)
>  	unsigned int i = 0;
>  	long ret;
>  	bool interrupted = false;
> +	bool locked = true;
> +	struct device_driver *drv;
> +
> +	drv = dev->driver;
>  
>  	/*
>  	 * The group exists so long as we have a device reference.  Get
> @@ -974,8 +978,11 @@ void *vfio_del_group_dev(struct device *dev)
>  		if (!device)
>  			break;
>  
> -		if (device->ops->request)
> +		if (device->ops->request) {
> +			device_unlock(dev);
> +			locked = false;
>  			device->ops->request(device_data, i++);
> +		}
>  
>  		vfio_device_put(device);
>  
> @@ -994,6 +1001,20 @@ void *vfio_del_group_dev(struct device *dev)
>  					 current->comm, task_pid_nr(current));
>  			}
>  		}
> +
> +		if (!locked) {
> +			device_lock(dev);
> +			locked = true;
> +			/*
> +			 * A concurrent operation may have released the driver
> +			 * successfully while we had dropped the lock,
> +			 * check for that.
> +			 */
> +			if (dev->driver != drv) {
> +				vfio_group_put(group);
> +				return NULL;
> +			}
> +		}
>  	} while (ret <= 0);
>  
>  	/*
> 

Applied to bionic/master-next branch.

Thanks,
Kleber
Seth Forshee Sept. 12, 2018, 12:12 p.m. UTC | #4
On Wed, Sep 12, 2018 at 09:50:46AM +0100, Andy Whitcroft wrote:
> During a hotplug event vfio_pci_remove() will call
> vfio_del_group_dev() to release the device group.  This may trigger
> a userspace request.  Currently this userspace request is performed
> while holding the device lock.  This leads userspace to deadlock
> against it while trying to perform the requested cleanup.
> 
> Drop the device lock while the userspace request is in flight.
> After it completes reaquire the lock and revalidate the device as
> it may have been successfully removed by a concurrent operation.
> As the remove callback may now drop the lock also check and
> revalidation at the end of that operation.
> 
> BugLink: http://bugs.launchpad.net/bugs/1792099
> Signed-off-by: Andy Whitcroft <apw@canonical.com>

Applied to cosmic/master-next, thanks!
diff mbox series

Patch

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 2c964f56dafe..37c01054521b 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -868,6 +868,13 @@  static void __device_release_driver(struct device *dev, struct device *parent)
 			dev->bus->remove(dev);
 		else if (drv->remove)
 			drv->remove(dev);
+		/*
+		 * A concurrent invocation of the same function might
+		 * have released the driver successfully while this one
+		 * was waiting, so check for that.
+		 */
+		if (dev->driver != drv)
+			return;
 
 		device_links_driver_cleanup(dev);
 		dma_deconfigure(dev);
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 2bc3705a99bd..6ae4d3f432fe 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -934,6 +934,10 @@  void *vfio_del_group_dev(struct device *dev)
 	unsigned int i = 0;
 	long ret;
 	bool interrupted = false;
+	bool locked = true;
+	struct device_driver *drv;
+
+	drv = dev->driver;
 
 	/*
 	 * The group exists so long as we have a device reference.  Get
@@ -974,8 +978,11 @@  void *vfio_del_group_dev(struct device *dev)
 		if (!device)
 			break;
 
-		if (device->ops->request)
+		if (device->ops->request) {
+			device_unlock(dev);
+			locked = false;
 			device->ops->request(device_data, i++);
+		}
 
 		vfio_device_put(device);
 
@@ -994,6 +1001,20 @@  void *vfio_del_group_dev(struct device *dev)
 					 current->comm, task_pid_nr(current));
 			}
 		}
+
+		if (!locked) {
+			device_lock(dev);
+			locked = true;
+			/*
+			 * A concurrent operation may have released the driver
+			 * successfully while we had dropped the lock,
+			 * check for that.
+			 */
+			if (dev->driver != drv) {
+				vfio_group_put(group);
+				return NULL;
+			}
+		}
 	} while (ret <= 0);
 
 	/*