Message ID | 20201201073919.5600-1-aik@ozlabs.ru (mailing list archive) |
---|---|
State | Rejected |
Headers | show |
Series | [kernel,v2] powerpc/pci: Remove LSI mappings on device teardown | expand |
Context | Check | Description |
---|---|---|
snowpatch_ozlabs/apply_patch | success | Successfully applied on branch powerpc/merge (78c312324391ee996944e1196123b0060888e189) |
snowpatch_ozlabs/build-ppc64le | success | Build succeeded |
snowpatch_ozlabs/build-ppc64be | success | Build succeeded |
snowpatch_ozlabs/build-ppc64e | success | Build succeeded |
snowpatch_ozlabs/build-pmac32 | success | Build succeeded |
snowpatch_ozlabs/checkpatch | warning | total: 0 errors, 0 warnings, 2 checks, 89 lines checked |
snowpatch_ozlabs/needsstable | success | Patch has no Fixes tags |
On 12/1/20 8:39 AM, Alexey Kardashevskiy wrote: > From: Oliver O'Halloran <oohall@gmail.com> > > When a passthrough IO adapter is removed from a pseries machine using hash > MMU and the XIVE interrupt mode, the POWER hypervisor expects the guest OS > to clear all page table entries related to the adapter. If some are still > present, the RTAS call which isolates the PCI slot returns error 9001 > "valid outstanding translations" and the removal of the IO adapter fails. > This is because when the PHBs are scanned, Linux maps automatically the > INTx interrupts in the Linux interrupt number space but these are never > removed. > > This problem can be fixed by adding the corresponding unmap operation when > the device is removed. There's no pcibios_* hook for the remove case, but > the same effect can be achieved using a bus notifier. > > Because INTx are shared among PHBs (and potentially across the system), > this adds tracking of virq to unmap them only when the last user is gone. > > Signed-off-by: Oliver O'Halloran <oohall@gmail.com> > [aik: added refcounter] > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Looks good to me and the system survives all the PCI hotplug tests I used to do on my first attempts to fix this issue. One comment below, > --- > > > Doing this in the generic irq code is just too much for my small brain :-/ may be more cleanups are required in the PCI/MSI/IRQ PPC layers before considering your first approach. You think too much in advance ! > > --- > arch/powerpc/kernel/pci-common.c | 71 ++++++++++++++++++++++++++++++++ > 1 file changed, 71 insertions(+) > > diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c > index be108616a721..0acf17f17253 100644 > --- a/arch/powerpc/kernel/pci-common.c > +++ b/arch/powerpc/kernel/pci-common.c > @@ -353,6 +353,55 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) > return NULL; > } > > +struct pci_intx_virq { > + int virq; > + struct kref kref; > + struct list_head list_node; > +}; > + > +static LIST_HEAD(intx_list); > +static DEFINE_MUTEX(intx_mutex); > + > +static void ppc_pci_intx_release(struct kref *kref) > +{ > + struct pci_intx_virq *vi = container_of(kref, struct pci_intx_virq, kref); > + > + list_del(&vi->list_node); > + irq_dispose_mapping(vi->virq); > + kfree(vi); > +} > + > +static int ppc_pci_unmap_irq_line(struct notifier_block *nb, > + unsigned long action, void *data) > +{ > + struct pci_dev *pdev = to_pci_dev(data); > + > + if (action == BUS_NOTIFY_DEL_DEVICE) { > + struct pci_intx_virq *vi; > + > + mutex_lock(&intx_mutex); > + list_for_each_entry(vi, &intx_list, list_node) { > + if (vi->virq == pdev->irq) { > + kref_put(&vi->kref, ppc_pci_intx_release); > + break; > + } > + } > + mutex_unlock(&intx_mutex); > + } > + > + return NOTIFY_DONE; > +} > + > +static struct notifier_block ppc_pci_unmap_irq_notifier = { > + .notifier_call = ppc_pci_unmap_irq_line, > +}; > + > +static int ppc_pci_register_irq_notifier(void) > +{ > + return bus_register_notifier(&pci_bus_type, &ppc_pci_unmap_irq_notifier); > +} > +arch_initcall(ppc_pci_register_irq_notifier); > + > /* > * Reads the interrupt pin to determine if interrupt is use by card. > * If the interrupt is used, then gets the interrupt line from the > @@ -361,6 +410,12 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) > static int pci_read_irq_line(struct pci_dev *pci_dev) > { > int virq; > + struct pci_intx_virq *vi, *vitmp; > + > + /* Preallocate vi as rewind is complex if this fails after mapping */ AFAICT, we only need to call irq_dispose_mapping() if allocation fails. If so, it would be simpler to isolate the code in a pci_intx_register(virq) helper and call it from pci_read_irq_line(). > + vi = kzalloc(sizeof(struct pci_intx_virq), GFP_KERNEL); > + if (!vi) > + return -1; > > pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev)); > > @@ -401,6 +456,22 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) > > pci_dev->irq = virq; > > + mutex_lock(&intx_mutex); > + list_for_each_entry(vitmp, &intx_list, list_node) { > + if (vitmp->virq == virq) { > + kref_get(&vitmp->kref); > + kfree(vi); > + vi = NULL; > + break; > + } > + } > + if (vi) { > + vi->virq = virq; > + kref_init(&vi->kref); > + list_add_tail(&vi->list_node, &intx_list); > + } > + mutex_unlock(&intx_mutex); > + > return 0; > } > >
On 01/12/2020 08:39, Alexey Kardashevskiy wrote: > From: Oliver O'Halloran <oohall@gmail.com> > > When a passthrough IO adapter is removed from a pseries machine using hash > MMU and the XIVE interrupt mode, the POWER hypervisor expects the guest OS > to clear all page table entries related to the adapter. If some are still > present, the RTAS call which isolates the PCI slot returns error 9001 > "valid outstanding translations" and the removal of the IO adapter fails. > This is because when the PHBs are scanned, Linux maps automatically the > INTx interrupts in the Linux interrupt number space but these are never > removed. > > This problem can be fixed by adding the corresponding unmap operation when > the device is removed. There's no pcibios_* hook for the remove case, but > the same effect can be achieved using a bus notifier. > > Because INTx are shared among PHBs (and potentially across the system), > this adds tracking of virq to unmap them only when the last user is gone. > > Signed-off-by: Oliver O'Halloran <oohall@gmail.com> > [aik: added refcounter] > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > --- > > > Doing this in the generic irq code is just too much for my small brain :-/ > > > --- > arch/powerpc/kernel/pci-common.c | 71 ++++++++++++++++++++++++++++++++ > 1 file changed, 71 insertions(+) > > diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c > index be108616a721..0acf17f17253 100644 > --- a/arch/powerpc/kernel/pci-common.c > +++ b/arch/powerpc/kernel/pci-common.c > @@ -353,6 +353,55 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) > return NULL; > } > > +struct pci_intx_virq { > + int virq; > + struct kref kref; > + struct list_head list_node; > +}; > + > +static LIST_HEAD(intx_list); > +static DEFINE_MUTEX(intx_mutex); > + > +static void ppc_pci_intx_release(struct kref *kref) > +{ > + struct pci_intx_virq *vi = container_of(kref, struct pci_intx_virq, kref); > + > + list_del(&vi->list_node); > + irq_dispose_mapping(vi->virq); > + kfree(vi); > +} > + > +static int ppc_pci_unmap_irq_line(struct notifier_block *nb, > + unsigned long action, void *data) > +{ > + struct pci_dev *pdev = to_pci_dev(data); > + > + if (action == BUS_NOTIFY_DEL_DEVICE) { > + struct pci_intx_virq *vi; > + > + mutex_lock(&intx_mutex); > + list_for_each_entry(vi, &intx_list, list_node) { > + if (vi->virq == pdev->irq) { > + kref_put(&vi->kref, ppc_pci_intx_release); > + break; > + } > + } > + mutex_unlock(&intx_mutex); > + } > + > + return NOTIFY_DONE; > +} > + > +static struct notifier_block ppc_pci_unmap_irq_notifier = { > + .notifier_call = ppc_pci_unmap_irq_line, > +}; > + > +static int ppc_pci_register_irq_notifier(void) > +{ > + return bus_register_notifier(&pci_bus_type, &ppc_pci_unmap_irq_notifier); > +} > +arch_initcall(ppc_pci_register_irq_notifier); > + > /* > * Reads the interrupt pin to determine if interrupt is use by card. > * If the interrupt is used, then gets the interrupt line from the > @@ -361,6 +410,12 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) > static int pci_read_irq_line(struct pci_dev *pci_dev) > { > int virq; > + struct pci_intx_virq *vi, *vitmp; > + > + /* Preallocate vi as rewind is complex if this fails after mapping */ Seems ok to me as the failure is unexpected. But then we need to free that memory on all the error paths below. Fred > + vi = kzalloc(sizeof(struct pci_intx_virq), GFP_KERNEL); > + if (!vi) > + return -1; > > pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev)); > > @@ -401,6 +456,22 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) > > pci_dev->irq = virq; > > + mutex_lock(&intx_mutex); > + list_for_each_entry(vitmp, &intx_list, list_node) { > + if (vitmp->virq == virq) { > + kref_get(&vitmp->kref); > + kfree(vi); > + vi = NULL; > + break; > + } > + } > + if (vi) { > + vi->virq = virq; > + kref_init(&vi->kref); > + list_add_tail(&vi->list_node, &intx_list); > + } > + mutex_unlock(&intx_mutex); > + > return 0; > } > >
On 01/12/2020 20:31, Cédric Le Goater wrote: > On 12/1/20 8:39 AM, Alexey Kardashevskiy wrote: >> From: Oliver O'Halloran <oohall@gmail.com> >> >> When a passthrough IO adapter is removed from a pseries machine using hash >> MMU and the XIVE interrupt mode, the POWER hypervisor expects the guest OS >> to clear all page table entries related to the adapter. If some are still >> present, the RTAS call which isolates the PCI slot returns error 9001 >> "valid outstanding translations" and the removal of the IO adapter fails. >> This is because when the PHBs are scanned, Linux maps automatically the >> INTx interrupts in the Linux interrupt number space but these are never >> removed. >> >> This problem can be fixed by adding the corresponding unmap operation when >> the device is removed. There's no pcibios_* hook for the remove case, but >> the same effect can be achieved using a bus notifier. >> >> Because INTx are shared among PHBs (and potentially across the system), >> this adds tracking of virq to unmap them only when the last user is gone. >> >> Signed-off-by: Oliver O'Halloran <oohall@gmail.com> >> [aik: added refcounter] >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > Looks good to me and the system survives all the PCI hotplug tests I used > to do on my first attempts to fix this issue. > > One comment below, > >> --- >> >> >> Doing this in the generic irq code is just too much for my small brain :-/ > > may be more cleanups are required in the PCI/MSI/IRQ PPC layers before > considering your first approach. You think too much in advance ! > >> >> --- >> arch/powerpc/kernel/pci-common.c | 71 ++++++++++++++++++++++++++++++++ >> 1 file changed, 71 insertions(+) >> >> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c >> index be108616a721..0acf17f17253 100644 >> --- a/arch/powerpc/kernel/pci-common.c >> +++ b/arch/powerpc/kernel/pci-common.c >> @@ -353,6 +353,55 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) >> return NULL; >> } >> >> +struct pci_intx_virq { >> + int virq; >> + struct kref kref; >> + struct list_head list_node; >> +}; >> + >> +static LIST_HEAD(intx_list); >> +static DEFINE_MUTEX(intx_mutex); >> + >> +static void ppc_pci_intx_release(struct kref *kref) >> +{ >> + struct pci_intx_virq *vi = container_of(kref, struct pci_intx_virq, kref); >> + >> + list_del(&vi->list_node); >> + irq_dispose_mapping(vi->virq); >> + kfree(vi); >> +} >> + >> +static int ppc_pci_unmap_irq_line(struct notifier_block *nb, >> + unsigned long action, void *data) >> +{ >> + struct pci_dev *pdev = to_pci_dev(data); >> + >> + if (action == BUS_NOTIFY_DEL_DEVICE) { >> + struct pci_intx_virq *vi; >> + >> + mutex_lock(&intx_mutex); >> + list_for_each_entry(vi, &intx_list, list_node) { >> + if (vi->virq == pdev->irq) { >> + kref_put(&vi->kref, ppc_pci_intx_release); >> + break; >> + } >> + } >> + mutex_unlock(&intx_mutex); >> + } >> + >> + return NOTIFY_DONE; >> +} >> + >> +static struct notifier_block ppc_pci_unmap_irq_notifier = { >> + .notifier_call = ppc_pci_unmap_irq_line, >> +}; >> + >> +static int ppc_pci_register_irq_notifier(void) >> +{ >> + return bus_register_notifier(&pci_bus_type, &ppc_pci_unmap_irq_notifier); >> +} >> +arch_initcall(ppc_pci_register_irq_notifier); >> + >> /* >> * Reads the interrupt pin to determine if interrupt is use by card. >> * If the interrupt is used, then gets the interrupt line from the >> @@ -361,6 +410,12 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) >> static int pci_read_irq_line(struct pci_dev *pci_dev) >> { >> int virq; >> + struct pci_intx_virq *vi, *vitmp; >> + >> + /* Preallocate vi as rewind is complex if this fails after mapping */ > > AFAICT, we only need to call irq_dispose_mapping() if allocation fails. Today - yes but in the future (hierarchical domains or whatever other awesome thing we'll use from there) - not necessarily. Too much is hidden under irq_create_fwspec_mapping(). Thanks, > If so, it would be simpler to isolate the code in a pci_intx_register(virq) > helper and call it from pci_read_irq_line(). > >> + vi = kzalloc(sizeof(struct pci_intx_virq), GFP_KERNEL); >> + if (!vi) >> + return -1; >> >> pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev)); >> >> @@ -401,6 +456,22 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) >> >> pci_dev->irq = virq; >> >> + mutex_lock(&intx_mutex); >> + list_for_each_entry(vitmp, &intx_list, list_node) { >> + if (vitmp->virq == virq) { >> + kref_get(&vitmp->kref); >> + kfree(vi); >> + vi = NULL; >> + break; >> + } >> + } >> + if (vi) { >> + vi->virq = virq; >> + kref_init(&vi->kref); >> + list_add_tail(&vi->list_node, &intx_list); >> + } >> + mutex_unlock(&intx_mutex); >> + >> return 0; >> } >> >> >
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index be108616a721..0acf17f17253 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -353,6 +353,55 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) return NULL; } +struct pci_intx_virq { + int virq; + struct kref kref; + struct list_head list_node; +}; + +static LIST_HEAD(intx_list); +static DEFINE_MUTEX(intx_mutex); + +static void ppc_pci_intx_release(struct kref *kref) +{ + struct pci_intx_virq *vi = container_of(kref, struct pci_intx_virq, kref); + + list_del(&vi->list_node); + irq_dispose_mapping(vi->virq); + kfree(vi); +} + +static int ppc_pci_unmap_irq_line(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct pci_dev *pdev = to_pci_dev(data); + + if (action == BUS_NOTIFY_DEL_DEVICE) { + struct pci_intx_virq *vi; + + mutex_lock(&intx_mutex); + list_for_each_entry(vi, &intx_list, list_node) { + if (vi->virq == pdev->irq) { + kref_put(&vi->kref, ppc_pci_intx_release); + break; + } + } + mutex_unlock(&intx_mutex); + } + + return NOTIFY_DONE; +} + +static struct notifier_block ppc_pci_unmap_irq_notifier = { + .notifier_call = ppc_pci_unmap_irq_line, +}; + +static int ppc_pci_register_irq_notifier(void) +{ + return bus_register_notifier(&pci_bus_type, &ppc_pci_unmap_irq_notifier); +} +arch_initcall(ppc_pci_register_irq_notifier); + /* * Reads the interrupt pin to determine if interrupt is use by card. * If the interrupt is used, then gets the interrupt line from the @@ -361,6 +410,12 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) static int pci_read_irq_line(struct pci_dev *pci_dev) { int virq; + struct pci_intx_virq *vi, *vitmp; + + /* Preallocate vi as rewind is complex if this fails after mapping */ + vi = kzalloc(sizeof(struct pci_intx_virq), GFP_KERNEL); + if (!vi) + return -1; pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev)); @@ -401,6 +456,22 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) pci_dev->irq = virq; + mutex_lock(&intx_mutex); + list_for_each_entry(vitmp, &intx_list, list_node) { + if (vitmp->virq == virq) { + kref_get(&vitmp->kref); + kfree(vi); + vi = NULL; + break; + } + } + if (vi) { + vi->virq = virq; + kref_init(&vi->kref); + list_add_tail(&vi->list_node, &intx_list); + } + mutex_unlock(&intx_mutex); + return 0; }