diff mbox series

[kernel,v3] powerpc/pci: Remove LSI mappings on device teardown

Message ID 20201202005222.5477-1-aik@ozlabs.ru (mailing list archive)
State Accepted
Commit 450be4960a0fb89b931a6bb3c3f0bb538ac4c03c
Headers show
Series [kernel,v3] powerpc/pci: Remove LSI mappings on device teardown | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (78c312324391ee996944e1196123b0060888e189)
snowpatch_ozlabs/build-ppc64le success Build succeeded
snowpatch_ozlabs/build-ppc64be success Build succeeded
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/checkpatch warning total: 0 errors, 0 warnings, 2 checks, 116 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Alexey Kardashevskiy Dec. 2, 2020, 12:52 a.m. UTC
From: Oliver O'Halloran <oohall@gmail.com>

When a passthrough IO adapter is removed from a pseries machine using hash
MMU and the XIVE interrupt mode, the POWER hypervisor expects the guest OS
to clear all page table entries related to the adapter. If some are still
present, the RTAS call which isolates the PCI slot returns error 9001
"valid outstanding translations" and the removal of the IO adapter fails.
This is because when the PHBs are scanned, Linux maps automatically the
INTx interrupts in the Linux interrupt number space but these are never
removed.

This problem can be fixed by adding the corresponding unmap operation when
the device is removed. There's no pcibios_* hook for the remove case, but
the same effect can be achieved using a bus notifier.

Because INTx are shared among PHBs (and potentially across the system),
this adds tracking of virq to unmap them only when the last user is gone.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
[aik: added refcounter]
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v3:
* free @vi on error path

v2:
* added refcounter
---
 arch/powerpc/kernel/pci-common.c | 82 ++++++++++++++++++++++++++++++--
 1 file changed, 78 insertions(+), 4 deletions(-)

Comments

Frederic Barrat Dec. 2, 2020, 2:17 p.m. UTC | #1
On 02/12/2020 01:52, Alexey Kardashevskiy wrote:
> From: Oliver O'Halloran <oohall@gmail.com>
> 
> When a passthrough IO adapter is removed from a pseries machine using hash
> MMU and the XIVE interrupt mode, the POWER hypervisor expects the guest OS
> to clear all page table entries related to the adapter. If some are still
> present, the RTAS call which isolates the PCI slot returns error 9001
> "valid outstanding translations" and the removal of the IO adapter fails.
> This is because when the PHBs are scanned, Linux maps automatically the
> INTx interrupts in the Linux interrupt number space but these are never
> removed.
> 
> This problem can be fixed by adding the corresponding unmap operation when
> the device is removed. There's no pcibios_* hook for the remove case, but
> the same effect can be achieved using a bus notifier.
> 
> Because INTx are shared among PHBs (and potentially across the system),
> this adds tracking of virq to unmap them only when the last user is gone.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> [aik: added refcounter]
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---


Looks ok to me.
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>


> Changes:
> v3:
> * free @vi on error path
> 
> v2:
> * added refcounter
> ---
>   arch/powerpc/kernel/pci-common.c | 82 ++++++++++++++++++++++++++++++--
>   1 file changed, 78 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> index be108616a721..2b555997b295 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -353,6 +353,55 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr)
>   	return NULL;
>   }
>   
> +struct pci_intx_virq {
> +	int virq;
> +	struct kref kref;
> +	struct list_head list_node;
> +};
> +
> +static LIST_HEAD(intx_list);
> +static DEFINE_MUTEX(intx_mutex);
> +
> +static void ppc_pci_intx_release(struct kref *kref)
> +{
> +	struct pci_intx_virq *vi = container_of(kref, struct pci_intx_virq, kref);
> +
> +	list_del(&vi->list_node);
> +	irq_dispose_mapping(vi->virq);
> +	kfree(vi);
> +}
> +
> +static int ppc_pci_unmap_irq_line(struct notifier_block *nb,
> +			       unsigned long action, void *data)
> +{
> +	struct pci_dev *pdev = to_pci_dev(data);
> +
> +	if (action == BUS_NOTIFY_DEL_DEVICE) {
> +		struct pci_intx_virq *vi;
> +
> +		mutex_lock(&intx_mutex);
> +		list_for_each_entry(vi, &intx_list, list_node) {
> +			if (vi->virq == pdev->irq) {
> +				kref_put(&vi->kref, ppc_pci_intx_release);
> +				break;
> +			}
> +		}
> +		mutex_unlock(&intx_mutex);
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block ppc_pci_unmap_irq_notifier = {
> +	.notifier_call = ppc_pci_unmap_irq_line,
> +};
> +
> +static int ppc_pci_register_irq_notifier(void)
> +{
> +	return bus_register_notifier(&pci_bus_type, &ppc_pci_unmap_irq_notifier);
> +}
> +arch_initcall(ppc_pci_register_irq_notifier);
> +
>   /*
>    * Reads the interrupt pin to determine if interrupt is use by card.
>    * If the interrupt is used, then gets the interrupt line from the
> @@ -361,6 +410,12 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr)
>   static int pci_read_irq_line(struct pci_dev *pci_dev)
>   {
>   	int virq;
> +	struct pci_intx_virq *vi, *vitmp;
> +
> +	/* Preallocate vi as rewind is complex if this fails after mapping */
> +	vi = kzalloc(sizeof(struct pci_intx_virq), GFP_KERNEL);
> +	if (!vi)
> +		return -1;
>   
>   	pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev));
>   
> @@ -377,12 +432,12 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
>   		 * function.
>   		 */
>   		if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &pin))
> -			return -1;
> +			goto error_exit;
>   		if (pin == 0)
> -			return -1;
> +			goto error_exit;
>   		if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_LINE, &line) ||
>   		    line == 0xff || line == 0) {
> -			return -1;
> +			goto error_exit;
>   		}
>   		pr_debug(" No map ! Using line %d (pin %d) from PCI config\n",
>   			 line, pin);
> @@ -394,14 +449,33 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
>   
>   	if (!virq) {
>   		pr_debug(" Failed to map !\n");
> -		return -1;
> +		goto error_exit;
>   	}
>   
>   	pr_debug(" Mapped to linux irq %d\n", virq);
>   
>   	pci_dev->irq = virq;
>   
> +	mutex_lock(&intx_mutex);
> +	list_for_each_entry(vitmp, &intx_list, list_node) {
> +		if (vitmp->virq == virq) {
> +			kref_get(&vitmp->kref);
> +			kfree(vi);
> +			vi = NULL;
> +			break;
> +		}
> +	}
> +	if (vi) {
> +		vi->virq = virq;
> +		kref_init(&vi->kref);
> +		list_add_tail(&vi->list_node, &intx_list);
> +	}
> +	mutex_unlock(&intx_mutex);
> +
>   	return 0;
> +error_exit:
> +	kfree(vi);
> +	return -1;
>   }
>   
>   /*
>
Cédric Le Goater Dec. 2, 2020, 2:47 p.m. UTC | #2
On 12/2/20 1:52 AM, Alexey Kardashevskiy wrote:
> From: Oliver O'Halloran <oohall@gmail.com>
> 
> When a passthrough IO adapter is removed from a pseries machine using hash
> MMU and the XIVE interrupt mode, the POWER hypervisor expects the guest OS
> to clear all page table entries related to the adapter. If some are still
> present, the RTAS call which isolates the PCI slot returns error 9001
> "valid outstanding translations" and the removal of the IO adapter fails.
> This is because when the PHBs are scanned, Linux maps automatically the
> INTx interrupts in the Linux interrupt number space but these are never
> removed.
> 
> This problem can be fixed by adding the corresponding unmap operation when
> the device is removed. There's no pcibios_* hook for the remove case, but
> the same effect can be achieved using a bus notifier.
> 
> Because INTx are shared among PHBs (and potentially across the system),
> this adds tracking of virq to unmap them only when the last user is gone.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> [aik: added refcounter]
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>


Reviewed-by: Cédric Le Goater <clg@kaod.org>

I did some PHB hotplug tests on a KVM guest and a LPAR using only LSIs.

Tested-by: Cédric Le Goater <clg@kaod.org>

Thanks Alexey,

C.

> ---
> Changes:
> v3:
> * free @vi on error path
> 
> v2:
> * added refcounter
> ---
>  arch/powerpc/kernel/pci-common.c | 82 ++++++++++++++++++++++++++++++--
>  1 file changed, 78 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> index be108616a721..2b555997b295 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -353,6 +353,55 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr)
>  	return NULL;
>  }
>  
> +struct pci_intx_virq {
> +	int virq;
> +	struct kref kref;
> +	struct list_head list_node;
> +};
> +
> +static LIST_HEAD(intx_list);
> +static DEFINE_MUTEX(intx_mutex);
> +
> +static void ppc_pci_intx_release(struct kref *kref)
> +{
> +	struct pci_intx_virq *vi = container_of(kref, struct pci_intx_virq, kref);
> +
> +	list_del(&vi->list_node);
> +	irq_dispose_mapping(vi->virq);
> +	kfree(vi);
> +}
> +
> +static int ppc_pci_unmap_irq_line(struct notifier_block *nb,
> +			       unsigned long action, void *data)
> +{
> +	struct pci_dev *pdev = to_pci_dev(data);
> +
> +	if (action == BUS_NOTIFY_DEL_DEVICE) {
> +		struct pci_intx_virq *vi;
> +
> +		mutex_lock(&intx_mutex);
> +		list_for_each_entry(vi, &intx_list, list_node) {
> +			if (vi->virq == pdev->irq) {
> +				kref_put(&vi->kref, ppc_pci_intx_release);
> +				break;
> +			}
> +		}
> +		mutex_unlock(&intx_mutex);
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block ppc_pci_unmap_irq_notifier = {
> +	.notifier_call = ppc_pci_unmap_irq_line,
> +};
> +
> +static int ppc_pci_register_irq_notifier(void)
> +{
> +	return bus_register_notifier(&pci_bus_type, &ppc_pci_unmap_irq_notifier);
> +}
> +arch_initcall(ppc_pci_register_irq_notifier);
> +
>  /*
>   * Reads the interrupt pin to determine if interrupt is use by card.
>   * If the interrupt is used, then gets the interrupt line from the
> @@ -361,6 +410,12 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr)
>  static int pci_read_irq_line(struct pci_dev *pci_dev)
>  {
>  	int virq;
> +	struct pci_intx_virq *vi, *vitmp;
> +
> +	/* Preallocate vi as rewind is complex if this fails after mapping */
> +	vi = kzalloc(sizeof(struct pci_intx_virq), GFP_KERNEL);
> +	if (!vi)
> +		return -1;
>  
>  	pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev));
>  
> @@ -377,12 +432,12 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
>  		 * function.
>  		 */
>  		if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &pin))
> -			return -1;
> +			goto error_exit;
>  		if (pin == 0)
> -			return -1;
> +			goto error_exit;
>  		if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_LINE, &line) ||
>  		    line == 0xff || line == 0) {
> -			return -1;
> +			goto error_exit;
>  		}
>  		pr_debug(" No map ! Using line %d (pin %d) from PCI config\n",
>  			 line, pin);
> @@ -394,14 +449,33 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
>  
>  	if (!virq) {
>  		pr_debug(" Failed to map !\n");
> -		return -1;
> +		goto error_exit;
>  	}
>  
>  	pr_debug(" Mapped to linux irq %d\n", virq);
>  
>  	pci_dev->irq = virq;
>  
> +	mutex_lock(&intx_mutex);
> +	list_for_each_entry(vitmp, &intx_list, list_node) {
> +		if (vitmp->virq == virq) {
> +			kref_get(&vitmp->kref);
> +			kfree(vi);
> +			vi = NULL;
> +			break;
> +		}
> +	}
> +	if (vi) {
> +		vi->virq = virq;
> +		kref_init(&vi->kref);
> +		list_add_tail(&vi->list_node, &intx_list);
> +	}
> +	mutex_unlock(&intx_mutex);
> +
>  	return 0;
> +error_exit:
> +	kfree(vi);
> +	return -1;
>  }
>  
>  /*
>
Michael Ellerman Dec. 10, 2020, 11:29 a.m. UTC | #3
On Wed, 2 Dec 2020 11:52:22 +1100, Alexey Kardashevskiy wrote:
> When a passthrough IO adapter is removed from a pseries machine using hash
> MMU and the XIVE interrupt mode, the POWER hypervisor expects the guest OS
> to clear all page table entries related to the adapter. If some are still
> present, the RTAS call which isolates the PCI slot returns error 9001
> "valid outstanding translations" and the removal of the IO adapter fails.
> This is because when the PHBs are scanned, Linux maps automatically the
> INTx interrupts in the Linux interrupt number space but these are never
> removed.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/pci: Remove LSI mappings on device teardown
      https://git.kernel.org/powerpc/c/450be4960a0fb89b931a6bb3c3f0bb538ac4c03c

cheers
diff mbox series

Patch

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index be108616a721..2b555997b295 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -353,6 +353,55 @@  struct pci_controller *pci_find_controller_for_domain(int domain_nr)
 	return NULL;
 }
 
+struct pci_intx_virq {
+	int virq;
+	struct kref kref;
+	struct list_head list_node;
+};
+
+static LIST_HEAD(intx_list);
+static DEFINE_MUTEX(intx_mutex);
+
+static void ppc_pci_intx_release(struct kref *kref)
+{
+	struct pci_intx_virq *vi = container_of(kref, struct pci_intx_virq, kref);
+
+	list_del(&vi->list_node);
+	irq_dispose_mapping(vi->virq);
+	kfree(vi);
+}
+
+static int ppc_pci_unmap_irq_line(struct notifier_block *nb,
+			       unsigned long action, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(data);
+
+	if (action == BUS_NOTIFY_DEL_DEVICE) {
+		struct pci_intx_virq *vi;
+
+		mutex_lock(&intx_mutex);
+		list_for_each_entry(vi, &intx_list, list_node) {
+			if (vi->virq == pdev->irq) {
+				kref_put(&vi->kref, ppc_pci_intx_release);
+				break;
+			}
+		}
+		mutex_unlock(&intx_mutex);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ppc_pci_unmap_irq_notifier = {
+	.notifier_call = ppc_pci_unmap_irq_line,
+};
+
+static int ppc_pci_register_irq_notifier(void)
+{
+	return bus_register_notifier(&pci_bus_type, &ppc_pci_unmap_irq_notifier);
+}
+arch_initcall(ppc_pci_register_irq_notifier);
+
 /*
  * Reads the interrupt pin to determine if interrupt is use by card.
  * If the interrupt is used, then gets the interrupt line from the
@@ -361,6 +410,12 @@  struct pci_controller *pci_find_controller_for_domain(int domain_nr)
 static int pci_read_irq_line(struct pci_dev *pci_dev)
 {
 	int virq;
+	struct pci_intx_virq *vi, *vitmp;
+
+	/* Preallocate vi as rewind is complex if this fails after mapping */
+	vi = kzalloc(sizeof(struct pci_intx_virq), GFP_KERNEL);
+	if (!vi)
+		return -1;
 
 	pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev));
 
@@ -377,12 +432,12 @@  static int pci_read_irq_line(struct pci_dev *pci_dev)
 		 * function.
 		 */
 		if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &pin))
-			return -1;
+			goto error_exit;
 		if (pin == 0)
-			return -1;
+			goto error_exit;
 		if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_LINE, &line) ||
 		    line == 0xff || line == 0) {
-			return -1;
+			goto error_exit;
 		}
 		pr_debug(" No map ! Using line %d (pin %d) from PCI config\n",
 			 line, pin);
@@ -394,14 +449,33 @@  static int pci_read_irq_line(struct pci_dev *pci_dev)
 
 	if (!virq) {
 		pr_debug(" Failed to map !\n");
-		return -1;
+		goto error_exit;
 	}
 
 	pr_debug(" Mapped to linux irq %d\n", virq);
 
 	pci_dev->irq = virq;
 
+	mutex_lock(&intx_mutex);
+	list_for_each_entry(vitmp, &intx_list, list_node) {
+		if (vitmp->virq == virq) {
+			kref_get(&vitmp->kref);
+			kfree(vi);
+			vi = NULL;
+			break;
+		}
+	}
+	if (vi) {
+		vi->virq = virq;
+		kref_init(&vi->kref);
+		list_add_tail(&vi->list_node, &intx_list);
+	}
+	mutex_unlock(&intx_mutex);
+
 	return 0;
+error_exit:
+	kfree(vi);
+	return -1;
 }
 
 /*