diff mbox series

[RFC,3/3] pseries/iommu: Enable DDW for VFIO TCE create

Message ID 171026728072.8367.13581504605624115205.stgit@linux.ibm.com (mailing list archive)
State Changes Requested
Headers show
Series powerpc: pSeries: vfio: iommu: Re-enable support for SPAPR TCE VFIO | expand

Checks

Context Check Description
snowpatch_ozlabs/github-powerpc_ppctests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_selftests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_kernel_qemu fail boot (ppc64le_guest_defconfig, powernv+p8+tcg, powernv+p9+tcg, qemu-system-ppc64, ppc64le-rootfs.... failed at step Run qemu-powernv+p8+tcg with fedora-38 build kernel.
snowpatch_ozlabs/github-powerpc_sparse success Successfully ran 4 jobs.
snowpatch_ozlabs/github-powerpc_clang success Successfully ran 6 jobs.

Commit Message

Shivaprasad G Bhat March 12, 2024, 6:14 p.m. UTC
Commit 9d67c9433509 ("powerpc/iommu: Add \"borrowing\"
iommu_table_group_ops") implemented the "borrow" mechanism for
the pSeries SPAPR TCE. That support was only partial: it left out
creating the DDW when one is not already present.

This patch attempts to fill those gaps:
 - Expose the DDW info to user by collecting it during probe.
 - Create the window and the iommu table if not present during
   VFIO_SPAPR_TCE_CREATE.
 - Remove and recreate the window if the pageshift and window sizes
   do not match.
 - Restore the original window in enable_ddw() if the user had
   created/modified the DDW. As there is preference for DIRECT mapping
   on the host driver side, the user created window is removed.

The changes work only for the non-SRIOV-VF scenarios for PEs having
2 DMA windows.

Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
---
 arch/powerpc/include/asm/iommu.h       |    3 
 arch/powerpc/kernel/iommu.c            |    7 -
 arch/powerpc/platforms/pseries/iommu.c |  362 +++++++++++++++++++++++++++++++-
 3 files changed, 360 insertions(+), 12 deletions(-)

Comments

Michael Ellerman March 13, 2024, 12:53 p.m. UTC | #1
Hi Shivaprasad,

Shivaprasad G Bhat <sbhat@linux.ibm.com> writes:
> The commit 9d67c9433509 ("powerpc/iommu: Add \"borrowing\"
> iommu_table_group_ops") implemented the "borrow" mechanism for
> the pSeries SPAPR TCE. It did implement this support partially
> that it left out creating the DDW if not present already.
>
> The patch here attempts to fix the missing gaps.
>  - Expose the DDW info to user by collecting it during probe.
>  - Create the window and the iommu table if not present during
>    VFIO_SPAPR_TCE_CREATE.
>  - Remove and recreate the window if the pageshift and window sizes
>    do not match.
>  - Restore the original window in enable_ddw() if the user had
>    created/modified the DDW. As there is preference for DIRECT mapping
>    on the host driver side, the user created window is removed.
>
> The changes work only for the non-SRIOV-VF scenarios for PEs having
> 2 DMA windows.

This crashes on powernv.

Full log at https://github.com/linuxppc/linux-snowpatch/actions/runs/8253875566/job/22577897225.

[    0.958561][    T1] pci_bus 0002:01: Configuring PE for bus
[    0.959699][    T1] pci 0002:01     : [PE# fd] Secondary bus 0x0000000000000001 associated with PE#fd
[    0.961692][    T1] pci 0002:01:00.0: Configured PE#fd
[    0.962424][    T1] pci 0002:01     : [PE# fd] Setting up 32-bit TCE table at 0..80000000
[    0.966424][    T1] IOMMU table initialized, virtual merging enabled
[    0.967544][    T1] pci 0002:01     : [PE# fd] Setting up window#0 0..ffffffff pg=10000
[    0.969362][    T1] pci 0002:01     : [PE# fd] Enabling 64-bit DMA bypass
[    0.971386][    T1] pci 0002:01:00.0: Adding to iommu group 0
[    0.973481][    T1] BUG: Unable to handle kernel instruction fetch (NULL pointer?)
[    0.974388][    T1] Faulting instruction address: 0x00000000
[    0.975578][    T1] Oops: Kernel access of bad area, sig: 11 [#1]
[    0.976476][    T1] LE PAGE_SIZE=64K MMU=Hash SMP ERROR: Error: saw oops/warning etc. while expecting NR_CPUS=2048 NUMA PowerNV
[    0.977777][    T1] Modules linked in:
[    0.978570][    T1] CPU: 1 PID: 1 Comm: swapper/1 Not tainted 6.8.0-rc6-g80dcb4e6d0aa #1
[    0.979766][    T1] Hardware name: IBM PowerNV (emulated by qemu) POWER8 0x4d0200 opal:v6.8-104-g820d43c0 PowerNV
[    0.981197][    T1] NIP:  0000000000000000 LR: c00000000005653c CTR: 0000000000000000
[    0.982221][    T1] REGS: c000000003687420 TRAP: 0480   Not tainted  (6.8.0-rc6-g80dcb4e6d0aa)
[    0.983400][    T1] MSR:  9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 44004422  XER: 00000000
[    0.984742][    T1] CFAR: c000000000056538 IRQMASK: 0 
[    0.984742][    T1] GPR00: c000000000056520 c0000000036876c0 c0000000015b9800 c00000000363ae58 
[    0.984742][    T1] GPR04: c00000000352f0a0 c0000000026d4748 0000000000000001 0000000000000000 
[    0.984742][    T1] GPR08: 0000000000000000 c000000002716668 0000000000000003 0000000000008000 
[    0.984742][    T1] GPR12: 0000000000000000 c000000002be0000 c0000000000110cc 0000000000000000 
[    0.984742][    T1] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 
[    0.984742][    T1] GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000001 
[    0.984742][    T1] GPR24: c0000000014681d8 0000000000000000 c000000003068a00 0000000000000001 
[    0.984742][    T1] GPR28: c000000003068a00 0000000000000000 c00000000363ae58 c00000000352f0a0 
[    0.994647][    T1] NIP [0000000000000000] 0x0
[    0.995699][    T1] LR [c00000000005653c] spapr_tce_platform_iommu_attach_dev+0x74/0xc8
[    0.997399][    T1] Call Trace:
[    0.997897][    T1] [c0000000036876c0] [c000000000056514] spapr_tce_platform_iommu_attach_dev+0x4c/0xc8 (unreliable)
[    0.999383][    T1] [c000000003687700] [c000000000b383dc] __iommu_attach_device+0x44/0xfc
[    1.000476][    T1] [c000000003687730] [c000000000b38574] __iommu_device_set_domain+0xe0/0x170
[    1.001728][    T1] [c0000000036877c0] [c000000000b3869c] __iommu_group_set_domain_internal+0x98/0x1c0
[    1.003014][    T1] [c000000003687820] [c000000000b3bb10] iommu_setup_default_domain+0x544/0x650
[    1.004306][    T1] [c0000000036878e0] [c000000000b3d3b4] __iommu_probe_device+0x5b0/0x604
[    1.005500][    T1] [c000000003687950] [c000000000b3d454] iommu_probe_device+0x4c/0xb0
[    1.006563][    T1] [c000000003687990] [c00000000005648c] iommu_add_device+0x3c/0x78
[    1.007590][    T1] [c0000000036879b0] [c0000000000db920] pnv_pci_ioda_dma_dev_setup+0x168/0x73c
[    1.008918][    T1] [c000000003687a60] [c0000000000729f4] pcibios_bus_add_device+0x80/0x328
[    1.010077][    T1] [c000000003687ac0] [c000000000a49fa0] pci_bus_add_device+0x30/0x11c
[    1.011169][    T1] [c000000003687b30] [c000000000a4a0e4] pci_bus_add_devices+0x58/0xb4
[    1.012230][    T1] [c000000003687b70] [c000000000a4a118] pci_bus_add_devices+0x8c/0xb4
[    1.013301][    T1] [c000000003687bb0] [c00000000201a3c8] pcibios_init+0xd8/0x140
[    1.014314][    T1] [c000000003687c30] [c000000000010d58] do_one_initcall+0x80/0x2f8
[    1.015349][    T1] [c000000003687d00] [c000000002005b0c] kernel_init_freeable+0x31c/0x510
[    1.016470][    T1] [c000000003687de0] [c0000000000110f8] kernel_init+0x34/0x25c
[    1.017527][    T1] [c000000003687e50] [c00000000000debc] ret_from_kernel_user_thread+0x14/0x1c
[    1.018778][    T1] --- interrupt: 0 at 0x0
[    1.019525][    T1] Code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX 
[    1.022234][    T1] ---[ end trace 0000000000000000 ]---
[    1.022983][    T1] 
[    2.023819][    T1] note: swapper/1[1] exited with irqs disabled
[    2.025051][    T1] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
[    2.027371][    T1] Rebooting in 10 seconds.


cheers
Shivaprasad G Bhat March 26, 2024, 4:56 a.m. UTC | #2
Hi Michael,

On 3/13/24 18:23, Michael Ellerman wrote:
> Hi Shivaprasad,
>
> Shivaprasad G Bhat <sbhat@linux.ibm.com> writes:
>> The commit 9d67c9433509 ("powerpc/iommu: Add \"borrowing\"
>> iommu_table_group_ops") implemented the "borrow" mechanism for
>> the pSeries SPAPR TCE. It did implement this support partially
>> that it left out creating the DDW if not present already.
>>
>> The patch here attempts to fix the missing gaps.
>>   - Expose the DDW info to user by collecting it during probe.
>>   - Create the window and the iommu table if not present during
>>     VFIO_SPAPR_TCE_CREATE.
>>   - Remove and recreate the window if the pageshift and window sizes
>>     do not match.
>>   - Restore the original window in enable_ddw() if the user had
>>     created/modified the DDW. As there is preference for DIRECT mapping
>>     on the host driver side, the user created window is removed.
>>
>> The changes work only for the non-SRIOV-VF scenarios for PEs having
>> 2 DMA windows.
> This crashes on powernv.


Thanks for pointing this out.  I will take care of this in v2 of this RFC.


Regards,

Shivaprasad
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 744cc5fc22d3..fde174122844 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -110,6 +110,7 @@  struct iommu_table {
 	unsigned long  it_page_shift;/* table iommu page size */
 	struct list_head it_group_list;/* List of iommu_table_group_link */
 	__be64 *it_userspace; /* userspace view of the table */
+	bool reset_ddw;
 	struct iommu_table_ops *it_ops;
 	struct kref    it_kref;
 	int it_nid;
@@ -169,6 +170,8 @@  struct iommu_table_group_ops {
 			__u32 page_shift,
 			__u64 window_size,
 			__u32 levels);
+	void (*init_group)(struct iommu_table_group *table_group,
+			struct device *dev);
 	long (*create_table)(struct iommu_table_group *table_group,
 			int num,
 			__u32 page_shift,
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index aa11b2acf24f..1cce2b8b8f2c 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -740,6 +740,7 @@  struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
 		return NULL;
 	}
 
+	tbl->it_nid = nid;
 	iommu_table_reserve_pages(tbl, res_start, res_end);
 
 	/* We only split the IOMMU table if we have 1GB or more of space */
@@ -1141,7 +1142,10 @@  spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain,
 {
 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct iommu_group *grp = iommu_group_get(dev);
-	struct iommu_table_group *table_group;
+	struct iommu_table_group *table_group = iommu_group_get_iommudata(grp);
+
+	/* This should have been in spapr_tce_iommu_probe_device() ?*/
+	table_group->ops->init_group(table_group, dev);
 
 	/* At first attach the ownership is already set */
 	if (!domain) {
@@ -1149,7 +1153,6 @@  spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain,
 		return 0;
 	}
 
-	table_group = iommu_group_get_iommudata(grp);
 	/*
 	 * The domain being set to PLATFORM from earlier
 	 * BLOCKED. The table_group ownership has to be released.
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 3d9865dadf73..7224107a0f60 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -630,6 +630,62 @@  static void iommu_table_setparms(struct pci_controller *phb,
 	phb->dma_window_base_cur += phb->dma_window_size;
 }
 
+/* Re-point an existing table at a new DMA window. If the entry count changes,
+ * the bitmap is reallocated for the NEW count before the old one is freed.
+ * Returns 0, or -ENOMEM with @tbl left unmodified.
+ */
+static int iommu_table_reset(struct iommu_table *tbl, unsigned long busno,
+				   unsigned long liobn, unsigned long win_addr,
+				   unsigned long window_size, unsigned long page_shift,
+				   void *base, struct iommu_table_ops *table_ops)
+{
+	unsigned long newsize = window_size >> page_shift, oldsize = tbl->it_size;
+	struct iommu_pool *p;
+	unsigned int i;
+	void *map;
+
+	WARN_ON(!tbl->it_ops);
+
+	if (oldsize != newsize) {
+		map = vzalloc_node(BITS_TO_LONGS(newsize) * sizeof(unsigned long),
+				   tbl->it_nid);
+		if (!map)
+			return -ENOMEM;
+		vfree(tbl->it_map);
+		tbl->it_map = map;
+		tbl->it_size = newsize;
+		if (oldsize < newsize)
+			iommu_table_clear(tbl);
+	}
+	tbl->it_busno = busno;
+	tbl->it_index = liobn;
+	tbl->it_offset = win_addr >> page_shift;
+	tbl->it_blocksize = 16;
+	tbl->it_type = TCE_PCI;
+	tbl->it_ops = table_ops;
+	tbl->it_page_shift = page_shift;
+	tbl->it_base = (unsigned long)base;
+
+	tbl->nr_pools = (tbl->it_size << page_shift) >= (1UL << 30) ? IOMMU_NR_POOLS : 1;
+	tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
+	for (i = 0; i < tbl->nr_pools; i++) {
+		p = &tbl->pools[i];
+		spin_lock_init(&(p->lock));
+		p->start = tbl->poolsize * i;
+		p->hint = p->start;
+		p->end = p->start + tbl->poolsize;
+	}
+
+	p = &tbl->large_pool;
+	spin_lock_init(&(p->lock));
+	p->start = tbl->poolsize * i;
+	p->hint = p->start;
+	p->end = tbl->it_size;
+	return 0;
+}
+
+
+
 struct iommu_table_ops iommu_table_lpar_multi_ops;
 
 struct iommu_table_ops iommu_table_pseries_ops = {
@@ -1016,8 +1072,8 @@  static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_
 	return 0;
 }
 
-static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift,
-			      bool *direct_mapping)
+static bool find_existing_ddw(struct device_node *pdn, u32 *liobn, u64 *dma_addr,
+			      int *window_shift, bool *direct_mapping)
 {
 	struct dma_win *window;
 	const struct dynamic_dma_window_prop *dma64;
@@ -1031,6 +1087,7 @@  static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *windo
 			*dma_addr = be64_to_cpu(dma64->dma_base);
 			*window_shift = be32_to_cpu(dma64->window_shift);
 			*direct_mapping = window->direct;
+			*liobn = be32_to_cpu(dma64->liobn);
 			found = true;
 			break;
 		}
@@ -1315,6 +1372,23 @@  static int iommu_get_page_shift(u32 query_page_size)
 	return ret;
 }
 
+/* Translate the ddw-query page-size bit field into a mask of page sizes in
+ * bytes; bit i of @query_page_size selects sizes[i].
+ */
+static __u64 query_page_size_to_mask(u32 query_page_size)
+{
+	static const __u64 sizes[] = {
+		SZ_4K, SZ_64K, SZ_16M, SZ_32M, SZ_64M, SZ_128M, SZ_256M, SZ_16G, SZ_2M
+	};
+	__u64 mask = 0;	/* 64-bit: SZ_16G does not fit in an int */
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sizes); i++)
+		if (query_page_size & (1U << i))
+			mask |= sizes[i];
+	return mask;
+}
+
 static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
 					    u32 page_shift, u32 window_shift)
 {
@@ -1344,6 +1418,9 @@  static struct property *ddw_property_create(const char *propname, u32 liobn, u64
 	return win64;
 }
 
+static long remove_dynamic_dma_windows_locked(struct iommu_table_group *table_group,
+					      struct pci_dev *pdev);
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1373,6 +1450,7 @@  static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	bool pmem_present;
 	struct pci_dn *pci = PCI_DN(pdn);
 	struct property *default_win = NULL;
+	u32 liobn;
 
 	dn = of_find_node_by_type(NULL, "ibm,pmemory");
 	pmem_present = dn != NULL;
@@ -1380,8 +1458,19 @@  static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 	mutex_lock(&dma_win_init_mutex);
 
-	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len, &direct_mapping))
-		goto out_unlock;
+	if (find_existing_ddw(pdn, &liobn, &dev->dev.archdata.dma_offset, &len, &direct_mapping)) {
+		struct iommu_table *tbl = pci->table_group->tables[1];
+
+		if (direct_mapping || (tbl && !tbl->reset_ddw))
+			goto out_unlock;
+		/* VFIO user created window has custom size/pageshift */
+		if (remove_dynamic_dma_windows_locked(pci->table_group, dev))
+			goto out_failed;
+
+		iommu_tce_table_put(tbl);
+		pci->table_group->tables[1] = NULL;
+		set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
+	}
 
 	/*
 	 * If we already went through this for a previous function of
@@ -1726,20 +1815,272 @@  static unsigned long spapr_tce_get_table_size(__u32 page_shift,
 	return size;
 }
 
+/* Collect the DDW capabilities of @dev's PE into @table_group.
+ *
+ * @table_group: group whose DDW fields are filled in
+ * @dev: device to probe; converted with to_pci_dev()
+ *
+ * Does nothing if max_dynamic_windows_supported is already positive or in a
+ * kdump kernel. Otherwise records the window count (1 or 2), a single table
+ * level and the supported page-size mask. Any lookup or query failure stores
+ * -1 as the window count.
+ */
+static void spapr_tce_init_group(struct iommu_table_group *table_group, struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	u32 ddw_avail[DDW_APPLICABLE_SIZE];
+	struct ddw_query_response query;
+	struct device_node *pdn;
+
+	/* Already initialized, or no need to initialize for kdump kernel. */
+	if (table_group->max_dynamic_windows_supported > 0 || is_kdump_kernel())
+		return;
+
+	pdn = pci_dma_find(pci_device_to_OF_node(pdev), NULL);
+	if (!pdn || !PCI_DN(pdn))
+		goto no_ddw;
+
+	/* TODO: Phyp sets VF default window base at 512PiB offset. Need
+	 * tce32_base set to the global offset and use the start as 0?
+	 */
+	if (of_property_read_u32_array(pdn, "ibm,ddw-applicable",
+				       &ddw_avail[0], DDW_APPLICABLE_SIZE))
+		goto no_ddw;
+
+	/* query_ddw() returns non-zero on failure and fills @query otherwise */
+	if (query_ddw(pdev, ddw_avail, &query, pdn)) {
+		dev_err(&pdev->dev, "%s: query_ddw failed\n", __func__);
+		goto no_ddw;
+	}
+
+	/* The SRIOV VFs have only 1 window, the default is removed
+	 * before creating the 64-bit window
+	 */
+	table_group->max_dynamic_windows_supported = query.windows_available ? 2 : 1;
+	table_group->max_levels = 1;
+	table_group->pgsizes |= query_page_size_to_mask(query.page_size);
+	return;
+
+no_ddw:
+	/* Single exit for every failure: record that probing did not succeed */
+	table_group->max_dynamic_windows_supported = -1;
+}
+
+
+/* Remove @pdev's existing DDW and its dma_win_list entry, if any. Returns 0,
+ * or -ENODEV when pci_dma_find() yields no usable node. @table_group is unused.
+ */
+static long remove_dynamic_dma_windows_locked(struct iommu_table_group *table_group,
+					      struct pci_dev *pdev)
+{
+	struct device_node *pdn = pci_dma_find(pci_device_to_OF_node(pdev), NULL);
+	struct dma_win *win;
+	bool direct;
+	u32 liobn;
+	int shift;
+
+	if (!pdn || !PCI_DN(pdn)) /* Neither 32-bit nor 64-bit window exists */
+		return -ENODEV;
+	if (!find_existing_ddw(pdn, &liobn, &pdev->dev.archdata.dma_offset, &shift, &direct))
+		return 0;
+
+	remove_ddw(pdn, true, direct ? DIRECT64_PROPNAME : DMA64_PROPNAME);
+	spin_lock(&dma_win_list_lock);
+	list_for_each_entry(win, &dma_win_list, list) {
+		if (win->device != pdn)
+			continue;
+		list_del(&win->list);
+		kfree(win);
+		break;
+	}
+	spin_unlock(&dma_win_list_lock);
+	return 0;
+}
+
+/* iommu_group_for_each_dev() callback used to pick a device from a group.
+ * @data is a struct pci_dev ** that receives the first device for which
+ * device_iommu_mapped() is true. Returns 1 on a match (a non-zero return
+ * ends the iteration), 0 otherwise.
+ */
+static int dev_has_iommu_table(struct device *dev, void *data)
+{
+	struct pci_dev **found = data;
+
+	if (!dev || !device_iommu_mapped(dev))
+		return 0;
+
+	*found = to_pci_dev(dev);
+	return 1;
+}
+
+
+/* Return the first device in @group that dev_has_iommu_table() accepts,
+ * or NULL when @group is NULL or no device matched.
+ */
+static struct pci_dev *iommu_group_get_first_pci_dev(struct iommu_group *group)
+{
+	struct pci_dev *first = NULL;
+
+	if (!group) /* No IOMMU group ? */
+		return NULL;
+
+	if (!iommu_group_for_each_dev(group, &first, dev_has_iommu_table))
+		return NULL;
+	return first;
+}
+
 static long spapr_tce_create_table(struct iommu_table_group *table_group, int num,
 				   __u32 page_shift, __u64 window_size, __u32 levels,
 				   struct iommu_table **ptbl)
 {
-	struct iommu_table *tbl = table_group->tables[0];
+	struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group);
+	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
+	struct iommu_table *tbl = table_group->tables[num];
+	u32 window_shift = order_base_2(window_size);
+	u32 ddw_avail[DDW_APPLICABLE_SIZE];
+	struct ddw_create_response create;
+	struct ddw_query_response query;
+	unsigned long start = 0, end = 0;
+	struct failed_ddw_pdn *fpdn;
+	struct dma_win *window;
+	struct property *win64;
+	struct pci_dn *pci;
+	int len, ret = 0;
+	u64 win_addr;
 
-	if (num > 0)
+	if (num > 1)
 		return -EPERM;
 
-	if (tbl->it_page_shift != page_shift ||
-	    tbl->it_size != (window_size >> page_shift) ||
-	    tbl->it_indirect_levels != levels - 1)
-		return -EINVAL;
+	if (tbl && (tbl->it_page_shift == page_shift) &&
+		(tbl->it_size == (window_size >> page_shift)) &&
+		(tbl->it_indirect_levels == levels - 1))
+		goto exit;
+
+	if (num == 0)
+		return -EINVAL; /* Can't modify the default window. */
+
+	/* TODO: The SRIO-VFs have only 1 window. */
+	if (table_group->max_dynamic_windows_supported == 1)
+		return -EPERM;
+
+	mutex_lock(&dma_win_init_mutex);
+
+	ret = -ENODEV;
+	/* If the enable DDW failed for the pdn, dont retry! */
+	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
+		if (fpdn->pdn == pdn) {
+			pr_err("%s: %pOF in failed DDW device list\n", __func__, pdn);
+			goto out_unlock;
+		}
+	}
+
+	pdn = pci_dma_find(dn, NULL);
+	if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */
+		pr_err("%s: No dma-windows exist for the node %pOF\n", __func__, pdn);
+		goto out_failed;
+	}
+
+	/* The existing ddw didn't match the size/shift */
+	if (remove_dynamic_dma_windows_locked(table_group, pdev)) {
+		pr_err("%s: The existing DDW remova failed for node %pOF\n", __func__, pdn);
+		goto out_failed; /* Could not remove it either! */
+	}
+
+	pci = PCI_DN(pdn);
+	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
+				&ddw_avail[0], DDW_APPLICABLE_SIZE);
+	if (ret) {
+		pr_err("%s: ibm,ddw-applicable not found\n", __func__);
+		goto out_failed;
+	}
+
+	ret = query_ddw(pdev, ddw_avail, &query, pdn);
+	if (ret)
+		goto out_failed;
+	ret = -ENODEV;
 
+	len = window_shift;
+	if (query.largest_available_block < (1ULL << (len - page_shift))) {
+		dev_dbg(&pdev->dev, "can't map window 0x%llx with %llu %llu-sized pages\n",
+				1ULL << len, query.largest_available_block,
+				1ULL << page_shift);
+		ret = -EINVAL; /* Retry with smaller window size */
+		goto out_unlock;
+	}
+
+	if (create_ddw(pdev, ddw_avail, &create, page_shift, len))
+		goto out_failed;
+
+	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
+	win64 = ddw_property_create(DMA64_PROPNAME, create.liobn, win_addr, page_shift, len);
+	if (!win64)
+		goto remove_window;
+
+	ret = of_add_property(pdn, win64);
+	if (ret) {
+		dev_err(&pdev->dev, "unable to add DMA window property for %pOF: %d",
+			pdn, ret);
+		goto free_property;
+	}
+	ret = -ENODEV;
+
+	window = ddw_list_new_entry(pdn, win64->value);
+	if (!window)
+		goto remove_property;
+
+	window->direct = false;
+
+	if (tbl) {
+		iommu_table_reset(tbl, pci->phb->bus->number, create.liobn, win_addr,
+				  1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
+	} else {
+		tbl = iommu_pseries_alloc_table(pci->phb->node);
+		if (!tbl) {
+			dev_err(&pdev->dev, "couldn't create new IOMMU table\n");
+			goto free_window;
+		}
+		iommu_table_setparms_common(tbl, pci->phb->bus->number, create.liobn, win_addr,
+					    1UL << len, page_shift, NULL,
+					    &iommu_table_lpar_multi_ops);
+		iommu_init_table(tbl, pci->phb->node, start, end);
+	}
+
+	tbl->reset_ddw = true;
+	pci->table_group->tables[1] = tbl;
+	set_iommu_table_base(&pdev->dev, tbl);
+	pdev->dev.archdata.dma_offset = win_addr;
+
+	spin_lock(&dma_win_list_lock);
+	list_add(&window->list, &dma_win_list);
+	spin_unlock(&dma_win_list_lock);
+
+	mutex_unlock(&dma_win_init_mutex);
+
+	goto exit;
+
+free_window:
+	kfree(window);
+remove_property:
+	of_remove_property(pdn, win64);
+free_property:
+	kfree(win64->name);
+	kfree(win64->value);
+	kfree(win64);
+remove_window:
+	__remove_dma_window(pdn, ddw_avail, create.liobn);
+
+out_failed:
+	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
+	if (!fpdn)
+		goto out_unlock;
+	fpdn->pdn = pdn;
+	list_add(&fpdn->list, &failed_ddw_pdn_list);
+
+out_unlock:
+	mutex_unlock(&dma_win_init_mutex);
+
+	return ret;
+exit:
 	*ptbl = iommu_tce_table_get(tbl);
 	return 0;
 }
@@ -1795,6 +2136,7 @@  static void spapr_tce_release_ownership(struct iommu_table_group *table_group)
 struct iommu_table_group_ops spapr_tce_table_group_ops = {
 	.get_table_size = spapr_tce_get_table_size,
 	.create_table = spapr_tce_create_table,
+	.init_group = spapr_tce_init_group,
 	.set_window = spapr_tce_set_window,
 	.unset_window = spapr_tce_unset_window,
 	.take_ownership = spapr_tce_take_ownership,