Message ID | 20180228003814.32124-1-alistair@popple.id.au (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | [v2] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate | expand |
On Wed, 28 Feb 2018 11:38:14 +1100 Alistair Popple <alistair@popple.id.au> wrote: > When sending TLB invalidates to the NPU we need to send extra flushes due > to a hardware issue. The original implementation would lock the all the > ATSD MMIO registers sequentially before unlocking and relocking each of > them sequentially to do the extra flush. > > This introduced a deadlock as it is possible for one thread to hold one > ATSD register whilst waiting for another register to be freed while the > other thread is holding that register waiting for the one in the first > thread to be freed. > > For example if there are two threads and two ATSD registers: > > Thread A Thread B > Acquire 1 > Acquire 2 > Release 1 Acquire 1 > Wait 1 Wait 2 > > Both threads will be stuck waiting to acquire a register resulting in an > RCU stall warning or soft lockup. > > This patch solves the deadlock by refactoring the code to ensure registers > are not released between flushes and to ensure all registers are either > acquired or released together and in order. > > Fixes: bbd5ff50afff ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD") > Signed-off-by: Alistair Popple <alistair@popple.id.au> > > --- > > v2: Added memory barriers around ->npdev[] and ATSD register aquisition/release > > arch/powerpc/platforms/powernv/npu-dma.c | 203 +++++++++++++++++-------------- > 1 file changed, 113 insertions(+), 90 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c > index 0a253b64ac5f..2fed4b116e19 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -410,6 +410,11 @@ struct npu_context { > void *priv; > }; > > +struct mmio_atsd_reg { > + struct npu *npu; > + int reg; > +}; > + > /* > * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC > * if none are available. > @@ -419,7 +424,7 @@ static int get_mmio_atsd_reg(struct npu *npu) > int i; > > for (i = 0; i < npu->mmio_atsd_count; i++) { > - if (!test_and_set_bit(i, &npu->mmio_atsd_usage)) > + if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage)) > return i; > } > > @@ -428,86 +433,90 @@ static int get_mmio_atsd_reg(struct npu *npu) > > static void put_mmio_atsd_reg(struct npu *npu, int reg) > { > - clear_bit(reg, &npu->mmio_atsd_usage); > + clear_bit_unlock(reg, &npu->mmio_atsd_usage); > } I think we need to document in the code that we have a hard reliance on the order of locks being incremental sequential and that any optimizations otherwise will result in probable deadlocks. > > /* MMIO ATSD register offsets */ > #define XTS_ATSD_AVA 1 > #define XTS_ATSD_STAT 2 > > -static int mmio_launch_invalidate(struct npu *npu, unsigned long launch, > - unsigned long va) > +static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg, > + unsigned long launch, unsigned long va) > { > - int mmio_atsd_reg; > - > - do { > - mmio_atsd_reg = get_mmio_atsd_reg(npu); > - cpu_relax(); > - } while (mmio_atsd_reg < 0); > + struct npu *npu = mmio_atsd_reg->npu; > + int reg = mmio_atsd_reg->reg; > > __raw_writeq(cpu_to_be64(va), > - npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA); > + npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA); > eieio(); > - __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]); > - > - return mmio_atsd_reg; > + __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[reg]); > } > > -static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush) > +static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], > + unsigned long pid, bool flush) > { > + int i; > unsigned long launch; > > - /* IS set to invalidate matching PID */ > - launch = PPC_BIT(12); > + for (i = 0; i <= max_npu2_index; i++) { > + if (mmio_atsd_reg[i].reg < 0) > + continue; > + > + /* IS set to invalidate matching PID */ > + launch = PPC_BIT(12); > > - /* PRS set to process-scoped */ > - launch |= PPC_BIT(13); > + /* PRS set to process-scoped */ > + launch |= PPC_BIT(13); > > - /* AP */ > - launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); > + /* AP */ > + launch |= (u64) > + mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); > > - /* PID */ > - launch |= pid << PPC_BITLSHIFT(38); > + /* PID */ > + launch |= pid << PPC_BITLSHIFT(38); > > - /* No flush */ > - launch |= !flush << PPC_BITLSHIFT(39); > + /* No flush */ > + launch |= !flush << PPC_BITLSHIFT(39); > > - /* Invalidating the entire process doesn't use a va */ > - return mmio_launch_invalidate(npu, launch, 0); > + /* Invalidating the entire process doesn't use a va */ > + mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0); > + } > } > > -static int mmio_invalidate_va(struct npu *npu, unsigned long va, > - unsigned long pid, bool flush) > +static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], > + unsigned long va, unsigned long pid, bool flush) > { > + int i; > unsigned long launch; > > - /* IS set to invalidate target VA */ > - launch = 0; > + for (i = 0; i <= max_npu2_index; i++) { > + if (mmio_atsd_reg[i].reg < 0) > + continue; > + > + /* IS set to invalidate target VA */ > + launch = 0; > > - /* PRS set to process scoped */ > - launch |= PPC_BIT(13); > + /* PRS set to process scoped */ > + launch |= PPC_BIT(13); > > - /* AP */ > - launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); > + /* AP */ > + launch |= (u64) > + mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); > > - /* PID */ > - launch |= pid << PPC_BITLSHIFT(38); > + /* PID */ > + launch |= pid << PPC_BITLSHIFT(38); > > - /* No flush */ > - launch |= !flush << PPC_BITLSHIFT(39); > + /* No flush */ > + launch |= !flush << PPC_BITLSHIFT(39); > > - return mmio_launch_invalidate(npu, launch, va); > + mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va); > + } > } > > #define mn_to_npu_context(x) container_of(x, struct npu_context, mn) > > -struct mmio_atsd_reg { > - struct npu *npu; > - int reg; > -}; > - > static void mmio_invalidate_wait( > - struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush) > + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) > { > struct npu *npu; > int i, reg; > @@ -522,16 +531,46 @@ static void mmio_invalidate_wait( > reg = mmio_atsd_reg[i].reg; > while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) > cpu_relax(); > + } > +} > > - put_mmio_atsd_reg(npu, reg); > +static void acquire_atsd_reg(struct npu_context *npu_context, > + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) > +{ > + int i, j; > + struct npu *npu; > + struct pci_dev *npdev; > + struct pnv_phb *nphb; > > - /* > - * The GPU requires two flush ATSDs to ensure all entries have > - * been flushed. We use PID 0 as it will never be used for a > - * process on the GPU. > - */ > - if (flush) > - mmio_invalidate_pid(npu, 0, true); > + for (i = 0; i <= max_npu2_index; i++) { > + mmio_atsd_reg[i].reg = -1; > + for (j = 0; j < NV_MAX_LINKS; j++) { > + npdev = READ_ONCE(npu_context->npdev[i][j]); > + if (!npdev) > + continue; > + > + nphb = pci_bus_to_host(npdev->bus)->private_data; > + npu = &nphb->npu; > + mmio_atsd_reg[i].npu = npu; > + mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); > + while (mmio_atsd_reg[i].reg < 0) { > + mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); > + cpu_relax(); I guess a softlockup will occur if we don't get the atsd resource in time and that might aid debugging. Looks good to me otherwise Acked-by: Balbir Singh <bsingharora@gmail.com>
Alistair Popple <alistair@popple.id.au> writes: > When sending TLB invalidates to the NPU we need to send extra flushes due > to a hardware issue. The original implementation would lock the all the > ATSD MMIO registers sequentially before unlocking and relocking each of > them sequentially to do the extra flush. > > This introduced a deadlock as it is possible for one thread to hold one > ATSD register whilst waiting for another register to be freed while the > other thread is holding that register waiting for the one in the first > thread to be freed. > > For example if there are two threads and two ATSD registers: > > Thread A Thread B > Acquire 1 > Acquire 2 > Release 1 Acquire 1 > Wait 1 Wait 2 > > Both threads will be stuck waiting to acquire a register resulting in an > RCU stall warning or soft lockup. > > This patch solves the deadlock by refactoring the code to ensure registers > are not released between flushes and to ensure all registers are either > acquired or released together and in order. > > Fixes: bbd5ff50afff ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD") > Signed-off-by: Alistair Popple <alistair@popple.id.au> > > --- > > v2: Added memory barriers around ->npdev[] and ATSD register aquisition/release Apologies to Balbir who was standing nearby when I read this patch this afternoon and copped a bit of a rant as a result ... But READ_ONCE/WRITE_ONCE are not memory barriers, they are only compiler barriers. So the READ_ONCE/WRITE_ONCE usage in here may be necessary, but it's probably not sufficient, we probably *also* need actual memory barriers. But I could be wrong, my main gripe is that the locking/ordering in here is not very obvious. > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c > index 0a253b64ac5f..2fed4b116e19 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -726,7 +749,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, > if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", > &nvlink_index))) > return ERR_PTR(-ENODEV); > - npu_context->npdev[npu->index][nvlink_index] = npdev; > + WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev); When you're publishing a struct via a pointer you would typically have a barrier between the stores that set the fields of the struct, and the store that publishes the struct. Otherwise the reader can see a partially setup struct. I think here the npdev was setup somewhere else, and maybe there has been an intervening barrier, but it's not clear. A comment at least would be good. In general I feel like a spinlock or two could significantly simply the locking/ordering in this code, and given we're doing MMIOs anyway would not affect performance. </rant> cheers
> > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c > > index 0a253b64ac5f..2fed4b116e19 100644 > > --- a/arch/powerpc/platforms/powernv/npu-dma.c > > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > > @@ -726,7 +749,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, > > if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", > > &nvlink_index))) > > return ERR_PTR(-ENODEV); > > - npu_context->npdev[npu->index][nvlink_index] = npdev; > > + WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev); > > When you're publishing a struct via a pointer you would typically have a > barrier between the stores that set the fields of the struct, and the > store that publishes the struct. Otherwise the reader can see a > partially setup struct. npdev is just a pointer to a pci_dev setup by the PCI code. We assign it to npdev[][] to indicate to the mmu notifiers that an invalidation should also be sent to this nvlink. However I don't think there is any ordering required wrt to the rest of the npu_context setup - so long as when the notifiers deference npu_context->npdev[i][j] it either sees a valid non-NULL pointer or NULL assigned to npdev[][] everything should be ok. > I think here the npdev was setup somewhere else, and maybe there has > been an intervening barrier, but it's not clear. A comment at least > would be good. Yes it has been. I will submit a v3 with some more comments incorporating the above. Unless you think my argument is wrong? > In general I feel like a spinlock or two could significantly simply the > locking/ordering in this code, and given we're doing MMIOs anyway would > not affect performance. Indeed, I am working on a patch to add a spinlock around the allocation of the npu_context as pnv_npu2_destroy_context() needs to be serialised with respect to pnv_npu2_init_context() on the same mm_struct. I'd incorrectly assumed the driver would do this serialisation but apparently it can't so we need to ensure kref_get(npu_context) can't race with it's destruction (concurrent init_context() is protected by mmap_sem). I could fold that (unposted) patch into this one if you think that would be better? Thanks! - Alistair > </rant> > > cheers >
Alistair Popple <alistair@popple.id.au> writes: >> > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c >> > index 0a253b64ac5f..2fed4b116e19 100644 >> > --- a/arch/powerpc/platforms/powernv/npu-dma.c >> > +++ b/arch/powerpc/platforms/powernv/npu-dma.c >> > @@ -726,7 +749,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, >> > if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", >> > &nvlink_index))) >> > return ERR_PTR(-ENODEV); >> > - npu_context->npdev[npu->index][nvlink_index] = npdev; >> > + WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev); >> >> When you're publishing a struct via a pointer you would typically have a >> barrier between the stores that set the fields of the struct, and the >> store that publishes the struct. Otherwise the reader can see a >> partially setup struct. > > npdev is just a pointer to a pci_dev setup by the PCI code. We assign it to > npdev[][] to indicate to the mmu notifiers that an invalidation should also > be sent to this nvlink. However I don't think there is any ordering > required wrt to the rest of the npu_context setup - so long as when the > notifiers deference npu_context->npdev[i][j] it either sees a valid > non-NULL pointer or NULL assigned to npdev[][] everything should be ok. Yeah OK. You do then dereference npdev->bus, so you depend on that being setup prior to the store that makes the npdev visible. But I guess we're saying that happened eons ago somewhere in the PCI code and will have happened by now. >> I think here the npdev was setup somewhere else, and maybe there has >> been an intervening barrier, but it's not clear. A comment at least >> would be good. > > Yes it has been. I will submit a v3 with some more comments incorporating > the above. Unless you think my argument is wrong? If you can comment it then I think I'm happy with it. >> In general I feel like a spinlock or two could significantly simply the >> locking/ordering in this code, and given we're doing MMIOs anyway would >> not affect performance. > > Indeed, I am working on a patch to add a spinlock around the allocation of > the npu_context as pnv_npu2_destroy_context() needs to be serialised with > respect to pnv_npu2_init_context() on the same mm_struct. I'd incorrectly > assumed the driver would do this serialisation but apparently it can't so > we need to ensure kref_get(npu_context) can't race with it's destruction > (concurrent init_context() is protected by mmap_sem). I could fold that > (unposted) patch into this one if you think that would be better? No this is big enough already, I almost asked you to split it to make the diff more readable :) So just send the spinlock patch when it's ready as a separate patch. cheers
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 0a253b64ac5f..2fed4b116e19 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -410,6 +410,11 @@ struct npu_context { void *priv; }; +struct mmio_atsd_reg { + struct npu *npu; + int reg; +}; + /* * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC * if none are available. @@ -419,7 +424,7 @@ static int get_mmio_atsd_reg(struct npu *npu) int i; for (i = 0; i < npu->mmio_atsd_count; i++) { - if (!test_and_set_bit(i, &npu->mmio_atsd_usage)) + if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage)) return i; } @@ -428,86 +433,90 @@ static int get_mmio_atsd_reg(struct npu *npu) static void put_mmio_atsd_reg(struct npu *npu, int reg) { - clear_bit(reg, &npu->mmio_atsd_usage); + clear_bit_unlock(reg, &npu->mmio_atsd_usage); } /* MMIO ATSD register offsets */ #define XTS_ATSD_AVA 1 #define XTS_ATSD_STAT 2 -static int mmio_launch_invalidate(struct npu *npu, unsigned long launch, - unsigned long va) +static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg, + unsigned long launch, unsigned long va) { - int mmio_atsd_reg; - - do { - mmio_atsd_reg = get_mmio_atsd_reg(npu); - cpu_relax(); - } while (mmio_atsd_reg < 0); + struct npu *npu = mmio_atsd_reg->npu; + int reg = mmio_atsd_reg->reg; __raw_writeq(cpu_to_be64(va), - npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA); + npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA); eieio(); - __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]); - - return mmio_atsd_reg; + __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[reg]); } -static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush) +static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], + unsigned long pid, bool flush) { + int i; unsigned long launch; - /* IS set to invalidate matching PID */ - launch = PPC_BIT(12); + for (i = 0; i <= max_npu2_index; i++) { + if (mmio_atsd_reg[i].reg < 0) + continue; + + /* IS set to invalidate matching PID */ + launch = PPC_BIT(12); - /* PRS set to process-scoped */ - launch |= PPC_BIT(13); + /* PRS set to process-scoped */ + launch |= PPC_BIT(13); - /* AP */ - launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + /* AP */ + launch |= (u64) + mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); - /* PID */ - launch |= pid << PPC_BITLSHIFT(38); + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); - /* No flush */ - launch |= !flush << PPC_BITLSHIFT(39); + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); - /* Invalidating the entire process doesn't use a va */ - return mmio_launch_invalidate(npu, launch, 0); + /* Invalidating the entire process doesn't use a va */ + mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0); + } } -static int mmio_invalidate_va(struct npu *npu, unsigned long va, - unsigned long pid, bool flush) +static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], + unsigned long va, unsigned long pid, bool flush) { + int i; unsigned long launch; - /* IS set to invalidate target VA */ - launch = 0; + for (i = 0; i <= max_npu2_index; i++) { + if (mmio_atsd_reg[i].reg < 0) + continue; + + /* IS set to invalidate target VA */ + launch = 0; - /* PRS set to process scoped */ - launch |= PPC_BIT(13); + /* PRS set to process scoped */ + launch |= PPC_BIT(13); - /* AP */ - launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + /* AP */ + launch |= (u64) + mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); - /* PID */ - launch |= pid << PPC_BITLSHIFT(38); + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); - /* No flush */ - launch |= !flush << PPC_BITLSHIFT(39); + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); - return mmio_launch_invalidate(npu, launch, va); + mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va); + } } #define mn_to_npu_context(x) container_of(x, struct npu_context, mn) -struct mmio_atsd_reg { - struct npu *npu; - int reg; -}; - static void mmio_invalidate_wait( - struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush) + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) { struct npu *npu; int i, reg; @@ -522,16 +531,46 @@ static void mmio_invalidate_wait( reg = mmio_atsd_reg[i].reg; while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) cpu_relax(); + } +} - put_mmio_atsd_reg(npu, reg); +static void acquire_atsd_reg(struct npu_context *npu_context, + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) +{ + int i, j; + struct npu *npu; + struct pci_dev *npdev; + struct pnv_phb *nphb; - /* - * The GPU requires two flush ATSDs to ensure all entries have - * been flushed. We use PID 0 as it will never be used for a - * process on the GPU. - */ - if (flush) - mmio_invalidate_pid(npu, 0, true); + for (i = 0; i <= max_npu2_index; i++) { + mmio_atsd_reg[i].reg = -1; + for (j = 0; j < NV_MAX_LINKS; j++) { + npdev = READ_ONCE(npu_context->npdev[i][j]); + if (!npdev) + continue; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + mmio_atsd_reg[i].npu = npu; + mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); + while (mmio_atsd_reg[i].reg < 0) { + mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); + cpu_relax(); + } + break; + } + } +} + +static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) +{ + int i; + + for (i = 0; i <= max_npu2_index; i++) { + if (mmio_atsd_reg[i].reg < 0) + continue; + + put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg); } } @@ -542,10 +581,6 @@ static void mmio_invalidate_wait( static void mmio_invalidate(struct npu_context *npu_context, int va, unsigned long address, bool flush) { - int i, j; - struct npu *npu; - struct pnv_phb *nphb; - struct pci_dev *npdev; struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; @@ -561,37 +596,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, * Loop over all the NPUs this process is active on and launch * an invalidate. */ - for (i = 0; i <= max_npu2_index; i++) { - mmio_atsd_reg[i].reg = -1; - for (j = 0; j < NV_MAX_LINKS; j++) { - npdev = npu_context->npdev[i][j]; - if (!npdev) - continue; - - nphb = pci_bus_to_host(npdev->bus)->private_data; - npu = &nphb->npu; - mmio_atsd_reg[i].npu = npu; - - if (va) - mmio_atsd_reg[i].reg = - mmio_invalidate_va(npu, address, pid, - flush); - else - mmio_atsd_reg[i].reg = - mmio_invalidate_pid(npu, pid, flush); - - /* - * The NPU hardware forwards the shootdown to all GPUs - * so we only have to launch one shootdown per NPU. - */ - break; - } + acquire_atsd_reg(npu_context, mmio_atsd_reg); + if (va) + mmio_invalidate_va(mmio_atsd_reg, address, pid, flush); + else + mmio_invalidate_pid(mmio_atsd_reg, pid, flush); + + mmio_invalidate_wait(mmio_atsd_reg); + if (flush) { + /* + * The GPU requires two flush ATSDs to ensure all entries have + * been flushed. We use PID 0 as it will never be used for a + * process on the GPU. + */ + mmio_invalidate_pid(mmio_atsd_reg, 0, true); + mmio_invalidate_wait(mmio_atsd_reg); + mmio_invalidate_pid(mmio_atsd_reg, 0, true); + mmio_invalidate_wait(mmio_atsd_reg); } - - mmio_invalidate_wait(mmio_atsd_reg, flush); - if (flush) - /* Wait for the flush to complete */ - mmio_invalidate_wait(mmio_atsd_reg, false); + release_atsd_reg(mmio_atsd_reg); } static void pnv_npu2_mn_release(struct mmu_notifier *mn, @@ -726,7 +749,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", &nvlink_index))) return ERR_PTR(-ENODEV); - npu_context->npdev[npu->index][nvlink_index] = npdev; + WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev); if (!nphb->npu.nmmu_flush) { /* @@ -778,7 +801,7 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context, if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", &nvlink_index))) return; - npu_context->npdev[npu->index][nvlink_index] = NULL; + WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL); opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id, PCI_DEVID(gpdev->bus->number, gpdev->devfn)); kref_put(&npu_context->kref, pnv_npu2_release_context);
When sending TLB invalidates to the NPU we need to send extra flushes due to a hardware issue. The original implementation would lock the all the ATSD MMIO registers sequentially before unlocking and relocking each of them sequentially to do the extra flush. This introduced a deadlock as it is possible for one thread to hold one ATSD register whilst waiting for another register to be freed while the other thread is holding that register waiting for the one in the first thread to be freed. For example if there are two threads and two ATSD registers: Thread A Thread B Acquire 1 Acquire 2 Release 1 Acquire 1 Wait 1 Wait 2 Both threads will be stuck waiting to acquire a register resulting in an RCU stall warning or soft lockup. This patch solves the deadlock by refactoring the code to ensure registers are not released between flushes and to ensure all registers are either acquired or released together and in order. Fixes: bbd5ff50afff ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD") Signed-off-by: Alistair Popple <alistair@popple.id.au> --- v2: Added memory barriers around ->npdev[] and ATSD register aquisition/release arch/powerpc/platforms/powernv/npu-dma.c | 203 +++++++++++++++++-------------- 1 file changed, 113 insertions(+), 90 deletions(-)