Patchwork [v3,04/11] PCI/MSI/pSeries: Make quota traversing and requesting race-safe

login
register
mail settings
Submitter Alexander Gordeev
Date Nov. 26, 2013, 9:09 a.m.
Message ID <97749db3e9562eb05fb3ee22a0be0ed7a93a0615.1385399393.git.agordeev@redhat.com>
Download mbox | patch
Permalink /patch/294409/
State Changes Requested
Headers show

Comments

Alexander Gordeev - Nov. 26, 2013, 9:09 a.m.
The current MSI quota handling is not race-safe and might lead
to an inconsistent number of MSIs allocated between the firmware and
the Linux MSI data structures. I.e., if the following chain is called
from concurrently loading drivers: rtas_setup_msi_irqs() ->
msi_quota_for_device() -> traverse_pci_devices(), a driver might
get a stale value of the MSI limit for its device or possibly even
crash.

This update introduces "rtas_quota_mutex" and serializes all
accesses to the msi_quota_for_device() function. As a result, no driver
can eat into another device's MSI limit.

Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
---
 arch/powerpc/platforms/pseries/msi.c |   24 ++++++++++++++++++++++--
 1 files changed, 22 insertions(+), 2 deletions(-)
Bjorn Helgaas - Dec. 10, 2013, 10:30 p.m.
On Tue, Nov 26, 2013 at 10:09:53AM +0100, Alexander Gordeev wrote:
> The current MSI quota handling is not race-safe and might lead
> to incoherent number of MSIs allocated between the firmware and
> Linux MSI data structures. I.e. if the following chain is called
> from concurrently loading drivers: rtas_setup_msi_irqs() ->
> msi_quota_for_device() -> traverse_pci_devices() a driver might
> get a stalled value of MSI limit for its device or possibly even
> crash.

Can you outline the race and the scenario that leads to incorrect results
or a crash?  I looked through rtas_setup_msi_irqs() (briefly) and I didn't
see the way that concurrent calls for different devices could interfere
with each other.

I was looking for some place that modifies state, where concurrent calls
might trample on each other, but it looks like msi_quota_for_device() is
pretty safe: it traverses a tree, but everything it computes is on the
stack and it doesn't seem to save results anywhere.  Maybe I'm barking up
the wrong tree?

Bjorn

> This update introduces "rtas_quota_mutex" and serializes all
> accesses to msi_quota_for_device() function. As result, no driver
> could eat into other device's MSI limit.
> 
> Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
> ---
>  arch/powerpc/platforms/pseries/msi.c |   24 ++++++++++++++++++++++--
>  1 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
> index 009ec73..0e1d288 100644
> --- a/arch/powerpc/platforms/pseries/msi.c
> +++ b/arch/powerpc/platforms/pseries/msi.c
> @@ -26,6 +26,8 @@ static int query_token, change_token;
>  #define RTAS_CHANGE_MSIX_FN	4
>  #define RTAS_CHANGE_32MSI_FN	5
>  
> +static DEFINE_MUTEX(rtas_quota_mutex);
> +
>  /* RTAS Helpers */
>  
>  static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs)
> @@ -345,7 +347,9 @@ static int rtas_msi_check_device(struct pci_dev *pdev, int nvec, int type)
>  	if (rc)
>  		return rc;
>  
> +	mutex_lock(&rtas_quota_mutex);
>  	quota = msi_quota_for_device(pdev, nvec);
> +	mutex_unlock(&rtas_quota_mutex);
>  
>  	if (quota && quota < nvec)
>  		return quota;
> @@ -399,6 +403,7 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
>  	struct msi_msg msg;
>  	int nvec = nvec_in;
>  	int use_32bit_msi_hack = 0;
> +	int quota;
>  
>  	pdn = pci_get_pdn(pdev);
>  	if (!pdn)
> @@ -407,13 +412,21 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
>  	if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev))
>  		return -EINVAL;
>  
> +	mutex_lock(&rtas_quota_mutex);
> +
> +	quota = msi_quota_for_device(pdev, nvec);
> +	if (quota && quota < nvec) {
> +		mutex_unlock(&rtas_quota_mutex);
> +		return quota;
> +	}
> +
>  	/*
>  	 * Firmware currently refuse any non power of two allocation
>  	 * so we round up if the quota will allow it.
>  	 */
>  	if (type == PCI_CAP_ID_MSIX) {
>  		int m = roundup_pow_of_two(nvec);
> -		int quota = msi_quota_for_device(pdev, m);
> +		quota = msi_quota_for_device(pdev, m);
>  
>  		if (quota >= m)
>  			nvec = m;
> @@ -433,8 +446,11 @@ again:
>  				 * We only want to run the 32 bit MSI hack below if
>  				 * the max bus speed is Gen2 speed
>  				 */
> -				if (pdev->bus->max_bus_speed != PCIE_SPEED_5_0GT)
> +				if (pdev->bus->max_bus_speed !=
> +				    PCIE_SPEED_5_0GT) {
> +					mutex_unlock(&rtas_quota_mutex);
>  					return rc;
> +				}
>  
>  				use_32bit_msi_hack = 1;
>  			}
> @@ -459,6 +475,7 @@ again:
>  			nvec = nvec_in;
>  			goto again;
>  		}
> +		mutex_unlock(&rtas_quota_mutex);
>  		pr_debug("rtas_msi: rtas_change_msi() failed\n");
>  		return rc;
>  	}
> @@ -467,6 +484,7 @@ again:
>  	list_for_each_entry(entry, &pdev->msi_list, list) {
>  		hwirq = rtas_query_irq_number(pdn, i++);
>  		if (hwirq < 0) {
> +			mutex_unlock(&rtas_quota_mutex);
>  			pr_debug("rtas_msi: error (%d) getting hwirq\n", hwirq);
>  			return hwirq;
>  		}
> @@ -474,6 +492,7 @@ again:
>  		virq = irq_create_mapping(NULL, hwirq);
>  
>  		if (virq == NO_IRQ) {
> +			mutex_unlock(&rtas_quota_mutex);
>  			pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
>  			return -ENOSPC;
>  		}
> @@ -486,6 +505,7 @@ again:
>  		entry->msg = msg;
>  	}
>  
> +	mutex_unlock(&rtas_quota_mutex);
>  	return 0;
>  }
>  
> -- 
> 1.7.7.6
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Gordeev - Dec. 13, 2013, 10:29 a.m.
On Tue, Dec 10, 2013 at 03:30:20PM -0700, Bjorn Helgaas wrote:
> Can you outline the race and the scenario that leads to incorrect results
> or a crash?  I looked through rtas_setup_msi_irqs() (briefly) and I didn't
> see the way that concurrent calls for different devices could interfere
> with each other.
> 
> I was looking for some place that modifies state, where concurrent calls
> might trample on each other, but it looks like msi_quota_for_device() is
> pretty safe: it traverses a tree, but everything it computes is on the
> stack and it doesn't seem to save results anywhere.  Maybe I'm barking up
> the wrong tree?

Hmm. I had assumed the number of MSIs for a device is cached, and therefore
concurrent calls to msi_quota_for_device() and rtas_change_msi() could race.
But it seems msi_quota_for_device() indeed computes a quota while reading
only the device's properties and yields a constant result (well, assuming the
device tree is not updated, but that is a different story). Which makes me
confused about this note from an earlier thread:

[quote]
On Sat, 2013-10-05 at 16:20 +0200, Alexander Gordeev wrote:
> So my point is - drivers should first obtain a number of MSIs they *can*
> get, then *derive* a number of MSIs the device is fine with and only then
> request that number. Not terribly different from memory or any other type
> of resource allocation ;)

What if the limit is for a group of devices ? Your interface is racy in
that case, another driver could have eaten into the limit in between the
calls.

Ben.
[/quote]

Some comment from Ben would be nice, but I think the patch could be dropped
for now.

Thanks, Bjorn!

> Bjorn

Patch

diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 009ec73..0e1d288 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -26,6 +26,8 @@  static int query_token, change_token;
 #define RTAS_CHANGE_MSIX_FN	4
 #define RTAS_CHANGE_32MSI_FN	5
 
+static DEFINE_MUTEX(rtas_quota_mutex);
+
 /* RTAS Helpers */
 
 static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs)
@@ -345,7 +347,9 @@  static int rtas_msi_check_device(struct pci_dev *pdev, int nvec, int type)
 	if (rc)
 		return rc;
 
+	mutex_lock(&rtas_quota_mutex);
 	quota = msi_quota_for_device(pdev, nvec);
+	mutex_unlock(&rtas_quota_mutex);
 
 	if (quota && quota < nvec)
 		return quota;
@@ -399,6 +403,7 @@  static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
 	struct msi_msg msg;
 	int nvec = nvec_in;
 	int use_32bit_msi_hack = 0;
+	int quota;
 
 	pdn = pci_get_pdn(pdev);
 	if (!pdn)
@@ -407,13 +412,21 @@  static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
 	if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev))
 		return -EINVAL;
 
+	mutex_lock(&rtas_quota_mutex);
+
+	quota = msi_quota_for_device(pdev, nvec);
+	if (quota && quota < nvec) {
+		mutex_unlock(&rtas_quota_mutex);
+		return quota;
+	}
+
 	/*
 	 * Firmware currently refuse any non power of two allocation
 	 * so we round up if the quota will allow it.
 	 */
 	if (type == PCI_CAP_ID_MSIX) {
 		int m = roundup_pow_of_two(nvec);
-		int quota = msi_quota_for_device(pdev, m);
+		quota = msi_quota_for_device(pdev, m);
 
 		if (quota >= m)
 			nvec = m;
@@ -433,8 +446,11 @@  again:
 				 * We only want to run the 32 bit MSI hack below if
 				 * the max bus speed is Gen2 speed
 				 */
-				if (pdev->bus->max_bus_speed != PCIE_SPEED_5_0GT)
+				if (pdev->bus->max_bus_speed !=
+				    PCIE_SPEED_5_0GT) {
+					mutex_unlock(&rtas_quota_mutex);
 					return rc;
+				}
 
 				use_32bit_msi_hack = 1;
 			}
@@ -459,6 +475,7 @@  again:
 			nvec = nvec_in;
 			goto again;
 		}
+		mutex_unlock(&rtas_quota_mutex);
 		pr_debug("rtas_msi: rtas_change_msi() failed\n");
 		return rc;
 	}
@@ -467,6 +484,7 @@  again:
 	list_for_each_entry(entry, &pdev->msi_list, list) {
 		hwirq = rtas_query_irq_number(pdn, i++);
 		if (hwirq < 0) {
+			mutex_unlock(&rtas_quota_mutex);
 			pr_debug("rtas_msi: error (%d) getting hwirq\n", hwirq);
 			return hwirq;
 		}
@@ -474,6 +492,7 @@  again:
 		virq = irq_create_mapping(NULL, hwirq);
 
 		if (virq == NO_IRQ) {
+			mutex_unlock(&rtas_quota_mutex);
 			pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
 			return -ENOSPC;
 		}
@@ -486,6 +505,7 @@  again:
 		entry->msg = msg;
 	}
 
+	mutex_unlock(&rtas_quota_mutex);
 	return 0;
 }