diff mbox series

[kernel,v3,18/22] powerpc/powernv/npu: Add compound IOMMU groups

Message ID 20181113082823.2440-19-aik@ozlabs.ru
State Superseded
Headers show
Series powerpc/powernv/npu, vfio: NVIDIA V100 + P9 passthrough | expand

Commit Message

Alexey Kardashevskiy Nov. 13, 2018, 8:28 a.m. UTC
At the moment powernv registers an IOMMU group for each PE. There is
an exception though - NPU (an emulated PCI bridge representing an NVLink);
powernv attaches these bridges to the GPU IOMMU group which becomes
a master.

Now we have POWER9 systems with GPUs connected to each other directly,
bypassing PCI. At the moment powernv does not control these links so
it has to put such interconnected GPUs into the same IOMMU group, which
means that the old scheme with a GPU as a master won't work - there will
be up to 3 GPUs in such a group.

This introduces a npu_comp struct which represents a compound IOMMU
group made of multiple PEs. This converts the existing NVLink1 code to
use the new scheme. From now on, each PE must have a valid
iommu_table_group_ops which will either be called directly (a single PE
group) or indirectly from a compound group.

This moves IOMMU group registration for NPU-connected GPUs to npu-dma.c.
For POWER8, this stores a new compound group pointer in a PE (so a GPU
is still a master); for POWER9 the new group pointer is stored in an NPU.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/pci.h            |   1 +
 arch/powerpc/platforms/powernv/pci.h      |   7 +
 arch/powerpc/platforms/powernv/npu-dma.c  | 286 ++++++++++++++++++++--
 arch/powerpc/platforms/powernv/pci-ioda.c | 173 +++----------
 4 files changed, 308 insertions(+), 159 deletions(-)

Comments

David Gibson Nov. 19, 2018, 1:12 a.m. UTC | #1
On Tue, Nov 13, 2018 at 07:28:19PM +1100, Alexey Kardashevskiy wrote:
> At the moment powernv registers an IOMMU group for each PE. There is
> an exception though - NPU (an emulated PCI bridge representing an NVLink);
> powernv attaches these bridges to the GPU IOMMU group which becomes
> a master.
> 
> Now we have POWER9 systems with GPUs connected to each other directly,
> bypassing PCI. At the moment powernv does not control these links so
> it has to put such interconnected GPUs to the same IOMMU group which
> means that the old scheme with a GPU as a master won't work - there will
> be up to 3 GPUs in such group.
> 
> This introduces a npu_comp struct which represents a compound IOMMU
> group made of multiple PEs. This converts the existing NVLink1 code to
> use the new scheme. From now on, each PE must have a valid
> iommu_table_group_ops which will either be called directly (a single PE
> group) or indirectly from a compound group.
> 
> This moves IOMMU group registration for NPU-connected GPUs to npu-dma.c.
> For POWER8, this stores a new compound group pointer in a PE (so a GPU
> is still a master); for POWER9 the new group pointer is stored in an NPU.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/pci.h            |   1 +
>  arch/powerpc/platforms/powernv/pci.h      |   7 +
>  arch/powerpc/platforms/powernv/npu-dma.c  | 286 ++++++++++++++++++++--
>  arch/powerpc/platforms/powernv/pci-ioda.c | 173 +++----------
>  4 files changed, 308 insertions(+), 159 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
> index baf2886..0c72f18 100644
> --- a/arch/powerpc/include/asm/pci.h
> +++ b/arch/powerpc/include/asm/pci.h
> @@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
>  extern int pnv_npu2_init(struct pci_controller *hose);
>  extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
>  		unsigned long msr);
> +extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
>  
>  #endif /* __ASM_POWERPC_PCI_H */
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index cf9f748..aef4bb5 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -62,6 +62,7 @@ struct pnv_ioda_pe {
>  
>  	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
>  	struct iommu_table_group table_group;
> +	struct npu_comp		*npucomp;
>  
>  	/* 64-bit TCE bypass region */
>  	bool			tce_bypass_enabled;
> @@ -201,6 +202,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
>  extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
>  extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
>  extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> +extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
> +		__u64 window_size, __u32 levels);
>  extern int pnv_eeh_post_init(void);
>  
>  extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
> @@ -216,6 +219,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>  extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
>  extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
>  extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
> +extern struct iommu_table_group *pnv_try_setup_npu_table_group(
> +		struct pnv_ioda_pe *pe);
> +extern struct iommu_table_group *pnv_npu_compound_attach(
> +		struct pnv_ioda_pe *pe);
>  
>  /* pci-ioda-tce.c */
>  #define POWERNV_IOMMU_DEFAULT_LEVELS	1
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index 1792c7e..2231f4c 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -317,31 +317,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = {
>  	.unset_window = pnv_npu_unset_window,
>  	.take_ownership = pnv_npu_take_ownership,
>  };
> -
> -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
> -{
> -	struct pnv_phb *phb = npe->phb;
> -	struct pci_bus *pbus = phb->hose->bus;
> -	struct pci_dev *npdev, *gpdev = NULL, *gptmp;
> -	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
> -
> -	if (!gpe || !gpdev)
> -		return NULL;
> -
> -	npe->table_group.ops = &pnv_pci_npu_ops;
> -
> -	list_for_each_entry(npdev, &pbus->devices, bus_list) {
> -		gptmp = pnv_pci_get_gpu_dev(npdev);
> -
> -		if (gptmp != gpdev)
> -			continue;
> -
> -		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
> -		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
> -	}
> -
> -	return gpe;
> -}
>  #endif /* !CONFIG_IOMMU_API */
>  
>  /*
> @@ -349,6 +324,17 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
>   */
>  /* Maximum possible number of ATSD MMIO registers per NPU */
>  #define NV_NMMU_ATSD_REGS 8
> +#define NV_NPU_MAX_PE_NUM	16
> +
> +/*
> + * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
> + * up to 3 x (GPU + 2xNPUs) (POWER9).
> + */
> +struct npu_comp {
> +	struct iommu_table_group table_group;
> +	int pe_num;
> +	struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
> +};
>  
>  /* An NPU descriptor, valid for POWER9 only */
>  struct npu {
> @@ -365,6 +351,8 @@ struct npu {
>  	struct list_head next;
>  
>  	struct pci_controller *hose;
> +
> +	struct npu_comp npucomp;
>  };

I'm confused by this.  The comment implies there are multiple NPUs in a
single compound group, but the npu_comp structure is embedded in the
npu structure, implying there's a copy per-NPU.


>  static LIST_HEAD(npu2_devices);
> @@ -382,6 +370,254 @@ static struct npu *npdev_to_npu(struct pci_dev *npdev)
>  	return NULL;
>  }
>  
> +#ifdef CONFIG_IOMMU_API
> +static long pnv_npu_peers_create_table_userspace(
> +		struct iommu_table_group *table_group,
> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> +		struct iommu_table **ptbl)
> +{
> +	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
> +			table_group);
> +
> +	if (!npucomp->pe_num || !npucomp->pe[0] ||
> +			!npucomp->pe[0]->table_group.ops ||
> +			!npucomp->pe[0]->table_group.ops->create_table)
> +		return -EFAULT;
> +
> +	return npucomp->pe[0]->table_group.ops->create_table(
> +			&npucomp->pe[0]->table_group, num, page_shift,
> +			window_size, levels, ptbl);
> +}
> +
> +static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
> +		int num, struct iommu_table *tbl)
> +{
> +	int i, j;
> +	long ret = 0;
> +	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
> +			table_group);
> +
> +	for (i = 0; i < npucomp->pe_num; ++i) {
> +		struct pnv_ioda_pe *pe = npucomp->pe[i];
> +
> +		if (!pe->table_group.ops->set_window)
> +			continue;
> +
> +		ret = pe->table_group.ops->set_window(&pe->table_group,
> +				num, tbl);
> +		if (ret)
> +			break;
> +	}
> +
> +	if (ret) {
> +		for (j = 0; j < i; ++j) {
> +			struct pnv_ioda_pe *pe = npucomp->pe[j];
> +
> +			if (!pe->table_group.ops->unset_window)
> +				continue;
> +
> +			ret = pe->table_group.ops->unset_window(
> +					&pe->table_group, num);
> +			if (ret)
> +				break;
> +		}
> +	} else {
> +		table_group->tables[num] = iommu_tce_table_get(tbl);
> +	}
> +
> +	return ret;
> +}
> +
> +static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
> +		int num)
> +{
> +	int i, j;
> +	long ret = 0;
> +	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
> +			table_group);
> +
> +	for (i = 0; i < npucomp->pe_num; ++i) {
> +		struct pnv_ioda_pe *pe = npucomp->pe[i];
> +
> +		WARN_ON(npucomp->table_group.tables[num] !=
> +				table_group->tables[num]);
> +		if (!npucomp->table_group.tables[num])
> +			continue;
> +
> +		if (!pe->table_group.ops->unset_window)
> +			continue;
> +
> +		ret = pe->table_group.ops->unset_window(&pe->table_group, num);
> +		if (ret)
> +			break;
> +	}
> +
> +	if (ret) {
> +		for (j = 0; j < i; ++j) {
> +			struct pnv_ioda_pe *pe = npucomp->pe[j];
> +
> +			if (!npucomp->table_group.tables[num])
> +				continue;
> +
> +			if (!pe->table_group.ops->set_window)
> +				continue;
> +
> +			ret = pe->table_group.ops->set_window(&pe->table_group,
> +					num, table_group->tables[num]);
> +			if (ret)
> +				break;
> +		}
> +	} else if (table_group->tables[num]) {
> +		iommu_tce_table_put(table_group->tables[num]);
> +		table_group->tables[num] = NULL;
> +	}
> +
> +	return ret;
> +}
> +
> +static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
> +{
> +	int i;
> +	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
> +			table_group);
> +
> +	for (i = 0; i < npucomp->pe_num; ++i) {
> +		struct pnv_ioda_pe *pe = npucomp->pe[i];
> +
> +		if (!pe->table_group.ops->take_ownership)
> +			continue;
> +		pe->table_group.ops->take_ownership(&pe->table_group);
> +	}
> +}
> +
> +static void pnv_npu_peers_release_ownership(
> +		struct iommu_table_group *table_group)
> +{
> +	int i;
> +	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
> +			table_group);
> +
> +	for (i = 0; i < npucomp->pe_num; ++i) {
> +		struct pnv_ioda_pe *pe = npucomp->pe[i];
> +
> +		if (!pe->table_group.ops->release_ownership)
> +			continue;
> +		pe->table_group.ops->release_ownership(&pe->table_group);
> +	}
> +}
> +
> +static struct iommu_table_group_ops pnv_npu_peers_ops = {
> +	.get_table_size = pnv_pci_ioda2_get_table_size,
> +	.create_table = pnv_npu_peers_create_table_userspace,
> +	.set_window = pnv_npu_peers_set_window,
> +	.unset_window = pnv_npu_peers_unset_window,
> +	.take_ownership = pnv_npu_peers_take_ownership,
> +	.release_ownership = pnv_npu_peers_release_ownership,
> +};
> +
> +static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
> +		struct pnv_ioda_pe *pe)
> +{
> +	if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
> +		return;
> +
> +	npucomp->pe[npucomp->pe_num] = pe;
> +	++npucomp->pe_num;
> +}
> +
> +struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
> +{
> +	struct iommu_table_group *table_group;
> +	struct npu *npu;
> +	struct npu_comp *npucomp;
> +	struct pci_dev *gpdev = NULL;
> +	struct pci_controller *hose;
> +	struct pci_dev *npdev;
> +
> +	list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
> +		npdev = pnv_pci_get_npu_dev(gpdev, 0);
> +		if (npdev)
> +			break;
> +	}
> +
> +	if (!npdev)
> +		/* It is not an NPU attached device, skip */
> +		return NULL;
> +
> +	hose = pci_bus_to_host(gpdev->bus);
> +	npu = npdev_to_npu(npdev);
> +	if (npu) {
> +		table_group = &npu->npucomp.table_group;
> +
> +		if (!table_group->group) {
> +			table_group->ops = &pnv_npu_peers_ops;
> +			iommu_register_group(table_group,
> +					hose->global_number,
> +					pe->pe_number);
> +		}
> +	} else {
> +		/* Create a group for 1 GPU and attached NPUs */
> +		pe->npucomp = kzalloc(sizeof(pe->npucomp), GFP_KERNEL);
> +		table_group = &pe->npucomp->table_group;
> +		table_group->ops = &pnv_npu_peers_ops;
> +		iommu_register_group(table_group, hose->global_number,
> +				pe->pe_number);
> +	}
> +
> +	/* Steal capabilities from a GPU PE */
> +	table_group->max_dynamic_windows_supported =
> +		pe->table_group.max_dynamic_windows_supported;
> +	table_group->tce32_start = pe->table_group.tce32_start;
> +	table_group->tce32_size = pe->table_group.tce32_size;
> +	table_group->max_levels = pe->table_group.max_levels;
> +	table_group->pgsizes = pe->table_group.pgsizes;
> +
> +	npucomp = container_of(table_group, struct npu_comp, table_group);
> +	pnv_comp_attach_table_group(npucomp, pe);
> +
> +	return table_group;
> +}
> +
> +struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
> +{
> +	struct iommu_table_group *table_group;
> +	struct npu_comp *npucomp;
> +	struct pci_dev *gpdev = NULL;
> +	struct pci_dev *npdev;
> +	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
> +
> +	WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
> +	if (!gpe)
> +		return NULL;
> +
> +	/*
> +	 * IODA2 bridges get this set up from
> +	 * pci_controller_ops::setup_bridge but NPU bridges do not
> +	 * have this hook defined so we do it here.
> +	 */
> +	pe->table_group.max_dynamic_windows_supported =
> +		IOMMU_TABLE_GROUP_MAX_TABLES;
> +	pe->table_group.ops = &pnv_pci_npu_ops;
> +
> +	table_group = iommu_group_get_iommudata(
> +			iommu_group_get(&gpdev->dev));
> +
> +	npucomp = container_of(table_group, struct npu_comp, table_group);
> +	pnv_comp_attach_table_group(npucomp, pe);
> +
> +	list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
> +		struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
> +
> +		if (gpdevtmp != gpdev)
> +			continue;
> +
> +		iommu_add_device(table_group, &npdev->dev);
> +	}
> +
> +	return table_group;
> +}
> +#endif /* CONFIG_IOMMU_API */
> +
>  /* Maximum number of nvlinks per npu */
>  #define NV_MAX_LINKS 6
>  
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 04639ae..0e8ada5 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -190,7 +190,8 @@ static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
>  	unsigned int pe_num = pe->pe_number;
>  
>  	WARN_ON(pe->pdev);
> -
> +	WARN_ON(pe->npucomp);
> +	kfree(pe->npucomp);
>  	memset(pe, 0, sizeof(struct pnv_ioda_pe));
>  	clear_bit(pe_num, phb->ioda.pe_alloc);
>  }
> @@ -1269,7 +1270,8 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
>  		pnv_ioda_setup_npu_PE(pdev);
>  }
>  
> -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe);
> +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
> +		struct iommu_table_group *table_group, struct pci_bus *bus);
>  
>  static void pnv_pci_ioda_setup_PEs(void)
>  {
> @@ -1593,7 +1595,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>  		mutex_unlock(&phb->ioda.pe_list_mutex);
>  
>  		pnv_pci_ioda2_setup_dma_pe(phb, pe);
> -		pnv_ioda_setup_bus_iommu_group(pe);
> +		pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
>  	}
>  }
>  
> @@ -2554,7 +2556,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
>  #endif
>  
>  #ifdef CONFIG_IOMMU_API
> -static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
> +unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
>  		__u64 window_size, __u32 levels)
>  {
>  	unsigned long bytes = 0;
> @@ -2628,147 +2630,38 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
>  	.release_ownership = pnv_ioda2_release_ownership,
>  };
>  
> -static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
> -{
> -	struct pci_controller *hose;
> -	struct pnv_phb *phb;
> -	struct pnv_ioda_pe **ptmppe = opaque;
> -	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
> -	struct pci_dn *pdn = pci_get_pdn(pdev);
> -
> -	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
> -		return 0;
> -
> -	hose = pci_bus_to_host(pdev->bus);
> -	phb = hose->private_data;
> -	if (phb->type != PNV_PHB_NPU_NVLINK)
> -		return 0;
> -
> -	*ptmppe = &phb->ioda.pe_array[pdn->pe_number];
> -
> -	return 1;
> -}
> -
> -/*
> - * This returns PE of associated NPU.
> - * This assumes that NPU is in the same IOMMU group with GPU and there is
> - * no other PEs.
> - */
> -static struct pnv_ioda_pe *gpe_table_group_to_npe(
> -		struct iommu_table_group *table_group)
> -{
> -	struct pnv_ioda_pe *npe = NULL;
> -	int ret = iommu_group_for_each_dev(table_group->group, &npe,
> -			gpe_table_group_to_npe_cb);
> -
> -	BUG_ON(!ret || !npe);
> -
> -	return npe;
> -}
> -
> -static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
> -		int num, struct iommu_table *tbl)
> -{
> -	struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
> -	int num2 = (num == 0) ? 1 : 0;
> -	long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
> -
> -	if (ret)
> -		return ret;
> -
> -	if (table_group->tables[num2])
> -		npe->table_group.ops->unset_window(&npe->table_group, num2);
> -
> -	ret = npe->table_group.ops->set_window(&npe->table_group, num, tbl);
> -	if (ret) {
> -		pnv_pci_ioda2_unset_window(table_group, num);
> -		if (table_group->tables[num2])
> -			npe->table_group.ops->set_window(&npe->table_group,
> -					num2, table_group->tables[num2]);
> -	}
> -
> -	return ret;
> -}
> -
> -static long pnv_pci_ioda2_npu_unset_window(
> -		struct iommu_table_group *table_group,
> -		int num)
> -{
> -	struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
> -	int num2 = (num == 0) ? 1 : 0;
> -	long ret = pnv_pci_ioda2_unset_window(table_group, num);
> -
> -	if (ret)
> -		return ret;
> -
> -	if (!npe->table_group.tables[num])
> -		return 0;
> -
> -	ret = npe->table_group.ops->unset_window(&npe->table_group, num);
> -	if (ret)
> -		return ret;
> -
> -	if (table_group->tables[num2])
> -		ret = npe->table_group.ops->set_window(&npe->table_group, num2,
> -				table_group->tables[num2]);
> -
> -	return ret;
> -}
> -
> -static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
> -{
> -	struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
> -
> -	npe->table_group.ops->take_ownership(&npe->table_group);
> -	pnv_ioda2_take_ownership(table_group);
> -}
> -
> -static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
> -	.get_table_size = pnv_pci_ioda2_get_table_size,
> -	.create_table = pnv_pci_ioda2_create_table_userspace,
> -	.set_window = pnv_pci_ioda2_npu_set_window,
> -	.unset_window = pnv_pci_ioda2_npu_unset_window,
> -	.take_ownership = pnv_ioda2_npu_take_ownership,
> -	.release_ownership = pnv_ioda2_release_ownership,
> -};
> -
>  static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
> +		struct iommu_table_group *table_group,
>  		struct pci_bus *bus)
>  {
>  	struct pci_dev *dev;
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
> -		iommu_add_device(&pe->table_group, &dev->dev);
> +		iommu_add_device(table_group, &dev->dev);
>  
>  		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>  			pnv_ioda_setup_bus_iommu_group_add_devices(pe,
> -					dev->subordinate);
> +					table_group, dev->subordinate);
>  	}
>  }
>  
> -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe)
> +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
> +		struct iommu_table_group *table_group, struct pci_bus *bus)
>  {
> -	if (!pnv_pci_ioda_pe_dma_weight(pe))
> -		return;
>  
> -	iommu_register_group(&pe->table_group, pe->phb->hose->global_number,
> -			pe->pe_number);
> -
> -	/*
> -	 * set_iommu_table_base(&pe->pdev->dev, tbl) should have been called
> -	 * by now
> -	 */
>  	if (pe->flags & PNV_IODA_PE_DEV)
> -		iommu_add_device(&pe->table_group, &pe->pdev->dev);
> -	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> -		pnv_ioda_setup_bus_iommu_group_add_devices(pe, pe->pbus);
> +		iommu_add_device(table_group, &pe->pdev->dev);
> +
> +	if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
> +		pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
> +				bus);
>  }
>  
>  static void pnv_pci_ioda_setup_iommu_api(void)
>  {
> -	struct pci_controller *hose, *tmp;
> +	struct pci_controller *hose;
>  	struct pnv_phb *phb;
> -	struct pnv_ioda_pe *pe, *gpe;
> +	struct pnv_ioda_pe *pe;
>  
>  	/*
>  	 * There are 4 types of PEs:
> @@ -2790,29 +2683,41 @@ static void pnv_pci_ioda_setup_iommu_api(void)
>  		if (phb->type == PNV_PHB_NPU_NVLINK)
>  			continue;
>  
> -		list_for_each_entry(pe, &phb->ioda.pe_list, list)
> -			pnv_ioda_setup_bus_iommu_group(pe);
> +		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
> +			struct iommu_table_group *table_group;
> +
> +			table_group = pnv_try_setup_npu_table_group(pe);
> +			if (!table_group) {
> +				if (!pnv_pci_ioda_pe_dma_weight(pe))
> +					continue;
> +
> +				table_group = &pe->table_group;
> +				iommu_register_group(&pe->table_group,
> +						pe->phb->hose->global_number,
> +						pe->pe_number);
> +			}
> +			pnv_ioda_setup_bus_iommu_group(pe, table_group,
> +					pe->pbus);
> +		}
>  	}
>  
>  	/*
>  	 * Now we have all PHBs discovered, time to add NPU devices to
>  	 * the corresponding IOMMU groups.
>  	 */
> -	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
> +	list_for_each_entry(hose, &hose_list, list_node) {
>  		phb = hose->private_data;
>  
>  		if (phb->type != PNV_PHB_NPU_NVLINK)
>  			continue;
>  
> -		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
> -			gpe = pnv_pci_npu_setup_iommu(pe);
> -			if (gpe)
> -				gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
> -		}
> +		list_for_each_entry(pe, &phb->ioda.pe_list, list)
> +			pnv_npu_compound_attach(pe);
>  	}
>  }
>  #else /* !CONFIG_IOMMU_API */
> -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe) { }
> +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
> +		struct iommu_table_group *table_group, struct pci_bus *bus){}
>  static void pnv_pci_ioda_setup_iommu_api(void) { };
>  #endif
>
Alexey Kardashevskiy Nov. 19, 2018, 2:29 a.m. UTC | #2
On 19/11/2018 12:12, David Gibson wrote:
> On Tue, Nov 13, 2018 at 07:28:19PM +1100, Alexey Kardashevskiy wrote:
>> At the moment powernv registers an IOMMU group for each PE. There is
>> an exception though - NPU (an emulated PCI bridge representing an NVLink);
>> powernv attaches these bridges to the GPU IOMMU group which becomes
>> a master.
>>
>> Now we have POWER9 systems with GPUs connected to each other directly,
>> bypassing PCI. At the moment powernv does not control these links so
>> it has to put such interconnected GPUs to the same IOMMU group which
>> means that the old scheme with a GPU as a master won't work - there will
>> be up to 3 GPUs in such group.
>>
>> This introduces a npu_comp struct which represents a compound IOMMU
>> group made of multiple PEs. This converts the existing NVLink1 code to
>> use the new scheme. From now on, each PE must have a valid
>> iommu_table_group_ops which will either be called directly (a single PE
>> group) or indirectly from a compound group.
>>
>> This moves IOMMU group registration for NPU-connected GPUs to npu-dma.c.
>> For POWER8, this stores a new compound group pointer in a PE (so a GPU
>> is still a master); for POWER9 the new group pointer is stored in an NPU.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  arch/powerpc/include/asm/pci.h            |   1 +
>>  arch/powerpc/platforms/powernv/pci.h      |   7 +
>>  arch/powerpc/platforms/powernv/npu-dma.c  | 286 ++++++++++++++++++++--
>>  arch/powerpc/platforms/powernv/pci-ioda.c | 173 +++----------
>>  4 files changed, 308 insertions(+), 159 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
>> index baf2886..0c72f18 100644
>> --- a/arch/powerpc/include/asm/pci.h
>> +++ b/arch/powerpc/include/asm/pci.h
>> @@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
>>  extern int pnv_npu2_init(struct pci_controller *hose);
>>  extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
>>  		unsigned long msr);
>> +extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
>>  
>>  #endif /* __ASM_POWERPC_PCI_H */
>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>> index cf9f748..aef4bb5 100644
>> --- a/arch/powerpc/platforms/powernv/pci.h
>> +++ b/arch/powerpc/platforms/powernv/pci.h
>> @@ -62,6 +62,7 @@ struct pnv_ioda_pe {
>>  
>>  	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
>>  	struct iommu_table_group table_group;
>> +	struct npu_comp		*npucomp;
>>  
>>  	/* 64-bit TCE bypass region */
>>  	bool			tce_bypass_enabled;
>> @@ -201,6 +202,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
>>  extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
>>  extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
>>  extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>> +extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
>> +		__u64 window_size, __u32 levels);
>>  extern int pnv_eeh_post_init(void);
>>  
>>  extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>> @@ -216,6 +219,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>>  extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
>>  extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
>>  extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
>> +extern struct iommu_table_group *pnv_try_setup_npu_table_group(
>> +		struct pnv_ioda_pe *pe);
>> +extern struct iommu_table_group *pnv_npu_compound_attach(
>> +		struct pnv_ioda_pe *pe);
>>  
>>  /* pci-ioda-tce.c */
>>  #define POWERNV_IOMMU_DEFAULT_LEVELS	1
>> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
>> index 1792c7e..2231f4c 100644
>> --- a/arch/powerpc/platforms/powernv/npu-dma.c
>> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
>> @@ -317,31 +317,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = {
>>  	.unset_window = pnv_npu_unset_window,
>>  	.take_ownership = pnv_npu_take_ownership,
>>  };
>> -
>> -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
>> -{
>> -	struct pnv_phb *phb = npe->phb;
>> -	struct pci_bus *pbus = phb->hose->bus;
>> -	struct pci_dev *npdev, *gpdev = NULL, *gptmp;
>> -	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
>> -
>> -	if (!gpe || !gpdev)
>> -		return NULL;
>> -
>> -	npe->table_group.ops = &pnv_pci_npu_ops;
>> -
>> -	list_for_each_entry(npdev, &pbus->devices, bus_list) {
>> -		gptmp = pnv_pci_get_gpu_dev(npdev);
>> -
>> -		if (gptmp != gpdev)
>> -			continue;
>> -
>> -		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
>> -		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
>> -	}
>> -
>> -	return gpe;
>> -}
>>  #endif /* !CONFIG_IOMMU_API */
>>  
>>  /*
>> @@ -349,6 +324,17 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
>>   */
>>  /* Maximum possible number of ATSD MMIO registers per NPU */
>>  #define NV_NMMU_ATSD_REGS 8
>> +#define NV_NPU_MAX_PE_NUM	16
>> +
>> +/*
>> + * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
>> + * up to 3 x (GPU + 2xNPUs) (POWER9).
>> + */
>> +struct npu_comp {
>> +	struct iommu_table_group table_group;
>> +	int pe_num;
>> +	struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
>> +};
>>  
>>  /* An NPU descriptor, valid for POWER9 only */
>>  struct npu {
>> @@ -365,6 +351,8 @@ struct npu {
>>  	struct list_head next;
>>  
>>  	struct pci_controller *hose;
>> +
>> +	struct npu_comp npucomp;
>>  };
> 
> I'm confused by this.  The comment implies there are multiple NPUs in a
> single compound group, but the npu_comp structure is embedded in the
> npu structure, implying there's a copy per-NPU.


Yeah, there is a naming confusion. NPU is a big chunk in the CPU with 6
links, and this is what the "struct npu" above describes.

And there are 6 NPU emulated bridge devices which you can see in lspci
with the "ibmnpu" driver bound to them.

I guess from now on I will refer to the big NPU as "NPU" and to the
emulated bridge device as "NVLink2" or "NVLink2 emulated device" unless
you have a better suggestion (Alistair does not have one, though).
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index baf2886..0c72f18 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -132,5 +132,6 @@  extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
 extern int pnv_npu2_init(struct pci_controller *hose);
 extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
 		unsigned long msr);
+extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
 
 #endif /* __ASM_POWERPC_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index cf9f748..aef4bb5 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -62,6 +62,7 @@  struct pnv_ioda_pe {
 
 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
 	struct iommu_table_group table_group;
+	struct npu_comp		*npucomp;
 
 	/* 64-bit TCE bypass region */
 	bool			tce_bypass_enabled;
@@ -201,6 +202,8 @@  extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
 extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+		__u64 window_size, __u32 levels);
 extern int pnv_eeh_post_init(void);
 
 extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -216,6 +219,10 @@  extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
 extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
+extern struct iommu_table_group *pnv_try_setup_npu_table_group(
+		struct pnv_ioda_pe *pe);
+extern struct iommu_table_group *pnv_npu_compound_attach(
+		struct pnv_ioda_pe *pe);
 
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS	1
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 1792c7e..2231f4c 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -317,31 +317,6 @@  static struct iommu_table_group_ops pnv_pci_npu_ops = {
 	.unset_window = pnv_npu_unset_window,
 	.take_ownership = pnv_npu_take_ownership,
 };
-
-struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
-{
-	struct pnv_phb *phb = npe->phb;
-	struct pci_bus *pbus = phb->hose->bus;
-	struct pci_dev *npdev, *gpdev = NULL, *gptmp;
-	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
-
-	if (!gpe || !gpdev)
-		return NULL;
-
-	npe->table_group.ops = &pnv_pci_npu_ops;
-
-	list_for_each_entry(npdev, &pbus->devices, bus_list) {
-		gptmp = pnv_pci_get_gpu_dev(npdev);
-
-		if (gptmp != gpdev)
-			continue;
-
-		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
-		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
-	}
-
-	return gpe;
-}
 #endif /* !CONFIG_IOMMU_API */
 
 /*
@@ -349,6 +324,17 @@  struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
  */
 /* Maximum possible number of ATSD MMIO registers per NPU */
 #define NV_NMMU_ATSD_REGS 8
+#define NV_NPU_MAX_PE_NUM	16
+
+/*
+ * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
+ * up to 3 x (GPU + 2xNPUs) (POWER9).
+ */
+struct npu_comp {
+	struct iommu_table_group table_group;
+	int pe_num;
+	struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
+};
 
 /* An NPU descriptor, valid for POWER9 only */
 struct npu {
@@ -365,6 +351,8 @@  struct npu {
 	struct list_head next;
 
 	struct pci_controller *hose;
+
+	struct npu_comp npucomp;
 };
 
 static LIST_HEAD(npu2_devices);
@@ -382,6 +370,254 @@  static struct npu *npdev_to_npu(struct pci_dev *npdev)
 	return NULL;
 }
 
+#ifdef CONFIG_IOMMU_API
+static long pnv_npu_peers_create_table_userspace(
+		struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table **ptbl)
+{
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	if (!npucomp->pe_num || !npucomp->pe[0] ||
+			!npucomp->pe[0]->table_group.ops ||
+			!npucomp->pe[0]->table_group.ops->create_table)
+		return -EFAULT;
+
+	return npucomp->pe[0]->table_group.ops->create_table(
+			&npucomp->pe[0]->table_group, num, page_shift,
+			window_size, levels, ptbl);
+}
+
+static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
+		int num, struct iommu_table *tbl)
+{
+	int i, j;
+	long ret = 0;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops->set_window)
+			continue;
+
+		ret = pe->table_group.ops->set_window(&pe->table_group,
+				num, tbl);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		for (j = 0; j < i; ++j) {
+			struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+			if (!pe->table_group.ops->unset_window)
+				continue;
+
+			ret = pe->table_group.ops->unset_window(
+					&pe->table_group, num);
+			if (ret)
+				break;
+		}
+	} else {
+		table_group->tables[num] = iommu_tce_table_get(tbl);
+	}
+
+	return ret;
+}
+
+static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
+		int num)
+{
+	int i, j;
+	long ret = 0;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		WARN_ON(npucomp->table_group.tables[num] !=
+				table_group->tables[num]);
+		if (!npucomp->table_group.tables[num])
+			continue;
+
+		if (!pe->table_group.ops->unset_window)
+			continue;
+
+		ret = pe->table_group.ops->unset_window(&pe->table_group, num);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		for (j = 0; j < i; ++j) {
+			struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+			if (!npucomp->table_group.tables[num])
+				continue;
+
+			if (!pe->table_group.ops->set_window)
+				continue;
+
+			ret = pe->table_group.ops->set_window(&pe->table_group,
+					num, table_group->tables[num]);
+			if (ret)
+				break;
+		}
+	} else if (table_group->tables[num]) {
+		iommu_tce_table_put(table_group->tables[num]);
+		table_group->tables[num] = NULL;
+	}
+
+	return ret;
+}
+
+static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
+{
+	int i;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops->take_ownership)
+			continue;
+		pe->table_group.ops->take_ownership(&pe->table_group);
+	}
+}
+
+static void pnv_npu_peers_release_ownership(
+		struct iommu_table_group *table_group)
+{
+	int i;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops->release_ownership)
+			continue;
+		pe->table_group.ops->release_ownership(&pe->table_group);
+	}
+}
+
+static struct iommu_table_group_ops pnv_npu_peers_ops = {
+	.get_table_size = pnv_pci_ioda2_get_table_size,
+	.create_table = pnv_npu_peers_create_table_userspace,
+	.set_window = pnv_npu_peers_set_window,
+	.unset_window = pnv_npu_peers_unset_window,
+	.take_ownership = pnv_npu_peers_take_ownership,
+	.release_ownership = pnv_npu_peers_release_ownership,
+};
+
+static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
+		struct pnv_ioda_pe *pe)
+{
+	if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
+		return;
+
+	npucomp->pe[npucomp->pe_num] = pe;
+	++npucomp->pe_num;
+}
+
+struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table_group *table_group;
+	struct npu *npu;
+	struct npu_comp *npucomp;
+	struct pci_dev *gpdev = NULL;
+	struct pci_controller *hose;
+	struct pci_dev *npdev = NULL;
+
+	list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
+		npdev = pnv_pci_get_npu_dev(gpdev, 0);
+		if (npdev)
+			break;
+	}
+
+	if (!npdev)
+		/* It is not an NPU attached device, skip */
+		return NULL;
+
+	hose = pci_bus_to_host(gpdev->bus);
+	npu = npdev_to_npu(npdev);
+	if (npu) {
+		table_group = &npu->npucomp.table_group;
+
+		if (!table_group->group) {
+			table_group->ops = &pnv_npu_peers_ops;
+			iommu_register_group(table_group,
+					hose->global_number,
+					pe->pe_number);
+		}
+	} else {
+		/* Create a group for 1 GPU and attached NPUs */
+		pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL);
+		table_group = &pe->npucomp->table_group;
+		table_group->ops = &pnv_npu_peers_ops;
+		iommu_register_group(table_group, hose->global_number,
+				pe->pe_number);
+	}
+
+	/* Steal capabilities from a GPU PE */
+	table_group->max_dynamic_windows_supported =
+		pe->table_group.max_dynamic_windows_supported;
+	table_group->tce32_start = pe->table_group.tce32_start;
+	table_group->tce32_size = pe->table_group.tce32_size;
+	table_group->max_levels = pe->table_group.max_levels;
+	table_group->pgsizes = pe->table_group.pgsizes;
+
+	npucomp = container_of(table_group, struct npu_comp, table_group);
+	pnv_comp_attach_table_group(npucomp, pe);
+
+	return table_group;
+}
+
+struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table_group *table_group;
+	struct npu_comp *npucomp;
+	struct pci_dev *gpdev = NULL;
+	struct pci_dev *npdev;
+	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
+
+	WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
+	if (!gpe)
+		return NULL;
+
+	/*
+	 * IODA2 bridges get this set up from
+	 * pci_controller_ops::setup_bridge but NPU bridges do not
+	 * have this hook defined so we do it here.
+	 */
+	pe->table_group.max_dynamic_windows_supported =
+		IOMMU_TABLE_GROUP_MAX_TABLES;
+	pe->table_group.ops = &pnv_pci_npu_ops;
+
+	table_group = iommu_group_get_iommudata(
+			iommu_group_get(&gpdev->dev));
+
+	npucomp = container_of(table_group, struct npu_comp, table_group);
+	pnv_comp_attach_table_group(npucomp, pe);
+
+	list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
+		struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
+
+		if (gpdevtmp != gpdev)
+			continue;
+
+		iommu_add_device(table_group, &npdev->dev);
+	}
+
+	return table_group;
+}
+#endif /* CONFIG_IOMMU_API */
+
 /* Maximum number of nvlinks per npu */
 #define NV_MAX_LINKS 6
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 04639ae..0e8ada5 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -190,7 +190,8 @@  static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
 	unsigned int pe_num = pe->pe_number;
 
 	WARN_ON(pe->pdev);
-
+	WARN_ON(pe->npucomp);
+	kfree(pe->npucomp);
 	memset(pe, 0, sizeof(struct pnv_ioda_pe));
 	clear_bit(pe_num, phb->ioda.pe_alloc);
 }
@@ -1269,7 +1270,8 @@  static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
 		pnv_ioda_setup_npu_PE(pdev);
 }
 
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe);
+static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
+		struct iommu_table_group *table_group, struct pci_bus *bus);
 
 static void pnv_pci_ioda_setup_PEs(void)
 {
@@ -1593,7 +1595,7 @@  static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		mutex_unlock(&phb->ioda.pe_list_mutex);
 
 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
-		pnv_ioda_setup_bus_iommu_group(pe);
+		pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
 	}
 }
 
@@ -2554,7 +2556,7 @@  static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
 #endif
 
 #ifdef CONFIG_IOMMU_API
-static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 		__u64 window_size, __u32 levels)
 {
 	unsigned long bytes = 0;
@@ -2628,147 +2630,38 @@  static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 	.release_ownership = pnv_ioda2_release_ownership,
 };
 
-static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
-{
-	struct pci_controller *hose;
-	struct pnv_phb *phb;
-	struct pnv_ioda_pe **ptmppe = opaque;
-	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
-	struct pci_dn *pdn = pci_get_pdn(pdev);
-
-	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
-		return 0;
-
-	hose = pci_bus_to_host(pdev->bus);
-	phb = hose->private_data;
-	if (phb->type != PNV_PHB_NPU_NVLINK)
-		return 0;
-
-	*ptmppe = &phb->ioda.pe_array[pdn->pe_number];
-
-	return 1;
-}
-
-/*
- * This returns PE of associated NPU.
- * This assumes that NPU is in the same IOMMU group with GPU and there is
- * no other PEs.
- */
-static struct pnv_ioda_pe *gpe_table_group_to_npe(
-		struct iommu_table_group *table_group)
-{
-	struct pnv_ioda_pe *npe = NULL;
-	int ret = iommu_group_for_each_dev(table_group->group, &npe,
-			gpe_table_group_to_npe_cb);
-
-	BUG_ON(!ret || !npe);
-
-	return npe;
-}
-
-static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
-		int num, struct iommu_table *tbl)
-{
-	struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
-	int num2 = (num == 0) ? 1 : 0;
-	long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
-
-	if (ret)
-		return ret;
-
-	if (table_group->tables[num2])
-		npe->table_group.ops->unset_window(&npe->table_group, num2);
-
-	ret = npe->table_group.ops->set_window(&npe->table_group, num, tbl);
-	if (ret) {
-		pnv_pci_ioda2_unset_window(table_group, num);
-		if (table_group->tables[num2])
-			npe->table_group.ops->set_window(&npe->table_group,
-					num2, table_group->tables[num2]);
-	}
-
-	return ret;
-}
-
-static long pnv_pci_ioda2_npu_unset_window(
-		struct iommu_table_group *table_group,
-		int num)
-{
-	struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
-	int num2 = (num == 0) ? 1 : 0;
-	long ret = pnv_pci_ioda2_unset_window(table_group, num);
-
-	if (ret)
-		return ret;
-
-	if (!npe->table_group.tables[num])
-		return 0;
-
-	ret = npe->table_group.ops->unset_window(&npe->table_group, num);
-	if (ret)
-		return ret;
-
-	if (table_group->tables[num2])
-		ret = npe->table_group.ops->set_window(&npe->table_group, num2,
-				table_group->tables[num2]);
-
-	return ret;
-}
-
-static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
-{
-	struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
-
-	npe->table_group.ops->take_ownership(&npe->table_group);
-	pnv_ioda2_take_ownership(table_group);
-}
-
-static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
-	.get_table_size = pnv_pci_ioda2_get_table_size,
-	.create_table = pnv_pci_ioda2_create_table_userspace,
-	.set_window = pnv_pci_ioda2_npu_set_window,
-	.unset_window = pnv_pci_ioda2_npu_unset_window,
-	.take_ownership = pnv_ioda2_npu_take_ownership,
-	.release_ownership = pnv_ioda2_release_ownership,
-};
-
 static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
+		struct iommu_table_group *table_group,
 		struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		iommu_add_device(&pe->table_group, &dev->dev);
+		iommu_add_device(table_group, &dev->dev);
 
 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
 			pnv_ioda_setup_bus_iommu_group_add_devices(pe,
-					dev->subordinate);
+					table_group, dev->subordinate);
 	}
 }
 
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe)
+static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
+		struct iommu_table_group *table_group, struct pci_bus *bus)
 {
-	if (!pnv_pci_ioda_pe_dma_weight(pe))
-		return;
 
-	iommu_register_group(&pe->table_group, pe->phb->hose->global_number,
-			pe->pe_number);
-
-	/*
-	 * set_iommu_table_base(&pe->pdev->dev, tbl) should have been called
-	 * by now
-	 */
 	if (pe->flags & PNV_IODA_PE_DEV)
-		iommu_add_device(&pe->table_group, &pe->pdev->dev);
-	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
-		pnv_ioda_setup_bus_iommu_group_add_devices(pe, pe->pbus);
+		iommu_add_device(table_group, &pe->pdev->dev);
+
+	if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
+		pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
+				bus);
 }
 
 static void pnv_pci_ioda_setup_iommu_api(void)
 {
-	struct pci_controller *hose, *tmp;
+	struct pci_controller *hose;
 	struct pnv_phb *phb;
-	struct pnv_ioda_pe *pe, *gpe;
+	struct pnv_ioda_pe *pe;
 
 	/*
 	 * There are 4 types of PEs:
@@ -2790,29 +2683,41 @@  static void pnv_pci_ioda_setup_iommu_api(void)
 		if (phb->type == PNV_PHB_NPU_NVLINK)
 			continue;
 
-		list_for_each_entry(pe, &phb->ioda.pe_list, list)
-			pnv_ioda_setup_bus_iommu_group(pe);
+		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+			struct iommu_table_group *table_group;
+
+			table_group = pnv_try_setup_npu_table_group(pe);
+			if (!table_group) {
+				if (!pnv_pci_ioda_pe_dma_weight(pe))
+					continue;
+
+				table_group = &pe->table_group;
+				iommu_register_group(&pe->table_group,
+						pe->phb->hose->global_number,
+						pe->pe_number);
+			}
+			pnv_ioda_setup_bus_iommu_group(pe, table_group,
+					pe->pbus);
+		}
 	}
 
 	/*
 	 * Now we have all PHBs discovered, time to add NPU devices to
 	 * the corresponding IOMMU groups.
 	 */
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+	list_for_each_entry(hose, &hose_list, list_node) {
 		phb = hose->private_data;
 
 		if (phb->type != PNV_PHB_NPU_NVLINK)
 			continue;
 
-		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-			gpe = pnv_pci_npu_setup_iommu(pe);
-			if (gpe)
-				gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
-		}
+		list_for_each_entry(pe, &phb->ioda.pe_list, list)
+			pnv_npu_compound_attach(pe);
 	}
 }
 #else /* !CONFIG_IOMMU_API */
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe) { }
+static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
+		struct iommu_table_group *table_group, struct pci_bus *bus){}
 static void pnv_pci_ioda_setup_iommu_api(void) { };
 #endif