diff mbox series

powerpc/pseries/iommu: IOMMU table is not initialized for kdump over SR-IOV

Message ID 20231213195233.10329-1-gbatra@linux.vnet.ibm.com (mailing list archive)
State Superseded
Headers show
Series powerpc/pseries/iommu: IOMMU table is not initialized for kdump over SR-IOV | expand

Checks

Context Check Description
snowpatch_ozlabs/github-powerpc_ppctests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_selftests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_kernel_qemu success Successfully ran 23 jobs.
snowpatch_ozlabs/github-powerpc_sparse success Successfully ran 4 jobs.
snowpatch_ozlabs/github-powerpc_clang success Successfully ran 6 jobs.

Commit Message

Gaurav Batra Dec. 13, 2023, 7:52 p.m. UTC
When kdump kernel tries to copy dump data over SR-IOV, LPAR panics due to
NULL pointer execption.

Here is the complete stack

[   19.944378] Kernel attempted to read user page (0) - exploit attempt? (uid: 0)^M
[   19.944388] BUG: Kernel NULL pointer dereference on read at 0x00000000^M
[   19.944394] Faulting instruction address: 0xc000000020847ad4^M
[   19.944400] Oops: Kernel access of bad area, sig: 11 [#1]^M
[   19.944406] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries^M
[   19.944413] Modules linked in: mlx5_core(+) vmx_crypto pseries_wdt papr_scm libnvdimm mlxfw tls psample sunrpc fuse overlay squashfs loop^M
[   19.944435] CPU: 12 PID: 315 Comm: systemd-udevd Not tainted 6.4.0-Test102+ #12^M
[   19.944442] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_008) hv:phyp pSeries^M
[   19.944450] NIP:  c000000020847ad4 LR: c00000002083b2dc CTR: 00000000006cd18c^M
[   19.944457] REGS: c000000029162ca0 TRAP: 0300   Not tainted  (6.4.0-Test102+)^M
[   19.944463] MSR:  800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 48288244  XER: 00000008^M
[   19.944480] CFAR: c00000002083b2d8 DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 1 ^M
[   19.944480] GPR00: c00000002088ecd4 c000000029162f40 c000000021542900 ffffffffffffffff ^M
[   19.944480] GPR04: ffffffffffffffff 0000000000000000 0000000000000000 000000000000ffff ^M
[   19.944480] GPR08: 0000000000000000 ffffffffffffffff 0000000000000000 0000000000000001 ^M
[   19.944480] GPR12: 0000000000000000 c000000022ec1000 0000000040000000 0000000000000000 ^M
[   19.944480] GPR16: c00000002b8b3900 c00000002b8b3918 c00000002b8b3800 ffffffffffffffff ^M
[   19.944480] GPR20: 0000000000000000 000000000000ffff ffffffff00000000 00000000ffffffff ^M
[   19.944480] GPR24: 000000000000ffff 0000000000010000 0000000000000000 ffffffffffff0000 ^M
[   19.944480] GPR28: 0000000000000000 ffffffffffffffff ffffffffffffffff 0000000000000000 ^M
[   19.944553] NIP [c000000020847ad4] _find_next_zero_bit+0x24/0x110^M
[   19.944564] LR [c00000002083b2dc] bitmap_find_next_zero_area_off+0x5c/0xe0^M
[   19.944572] Call Trace:^M
[   19.944576] [c000000029162f40] [c0000000209fec70] dev_printk_emit+0x38/0x48 (unreliable)^M
[   19.944587] [c000000029162fa0] [c00000002088ecd4] iommu_area_alloc+0xc4/0x180^M
[   19.944596] [c000000029163020] [c00000002005a3d8] iommu_range_alloc+0x1e8/0x580^M
[   19.944606] [c000000029163150] [c00000002005a7d0] iommu_alloc+0x60/0x130^M
[   19.944613] [c0000000291631a0] [c00000002005b9f8] iommu_alloc_coherent+0x158/0x2b0^M
[   19.944621] [c000000029163260] [c000000020058fdc] dma_iommu_alloc_coherent+0x3c/0x50^M
[   19.944629] [c000000029163280] [c000000020235260] dma_alloc_attrs+0x170/0x1f0^M
[   19.944637] [c0000000291632f0] [c00800000140c058] mlx5_cmd_init+0xc0/0x760 [mlx5_core]^M
[   19.944745] [c0000000291633c0] [c008000001403448] mlx5_function_setup+0xf0/0x510 [mlx5_core]^M
[   19.944845] [c000000029163490] [c0080000014039bc] mlx5_init_one+0x84/0x210 [mlx5_core]^M
[   19.944943] [c000000029163520] [c008000001404c00] probe_one+0x118/0x2c0 [mlx5_core]^M
[   19.945041] [c0000000291635b0] [c0000000208ddce8] local_pci_probe+0x68/0x110^M
[   19.945049] [c000000029163630] [c0000000208deb98] pci_call_probe+0x68/0x200^M
[   19.945057] [c000000029163790] [c0000000208dfecc] pci_device_probe+0xbc/0x1a0^M
[   19.945065] [c0000000291637d0] [c000000020a032f4] really_probe+0x104/0x540^M
[   19.945072] [c000000029163850] [c000000020a037e4] __driver_probe_device+0xb4/0x230^M
[   19.945080] [c0000000291638d0] [c000000020a039b4] driver_probe_device+0x54/0x130^M
[   19.945088] [c000000029163910] [c000000020a03da8] __driver_attach+0x158/0x2b0^M
[   19.945096] [c000000029163990] [c0000000209ffa78] bus_for_each_dev+0xa8/0x130^M
[   19.945103] [c0000000291639f0] [c000000020a026c4] driver_attach+0x34/0x50^M
[   19.945110] [c000000029163a10] [c000000020a01aac] bus_add_driver+0x16c/0x300^M
[   19.945118] [c000000029163aa0] [c000000020a05944] driver_register+0xa4/0x1b0^M
[   19.945126] [c000000029163b10] [c0000000208dd428] __pci_register_driver+0x68/0x80^M
[   19.945133] [c000000029163b30] [c0080000015414cc] mlx5_init+0xb8/0x100 [mlx5_core]^M
[   19.945247] [c000000029163ba0] [c000000020012f60] do_one_initcall+0x60/0x300^M
[   19.945255] [c000000029163c80] [c000000020241cbc] do_init_module+0x7c/0x2b0^M

At the time of LPAR dump, before kexec hands over control to kdump kernel,
DDWs are scanned and added in the FDT. For the SR-IOV case, default DMA
window "ibm,dma-window" is removed from the FDT and DDW added, for the
device.

Now, kexec hands over control to the kdump kernel.

When the kdump kernel initializes, PCI busses are scanned and IOMMU
group/tables created, in pci_dma_bus_setup_pSeriesLP(). For the SR-IOV
case, there is no "ibm,dma-window". As a result, due to the code bug, the
newly created IOMMU table is not initialized. Later, when the device driver
tries to enter TCEs for the SR-IOV device, NULL pointer execption is thrown
from iommu_area_alloc().

The fix would be to initialize IOMMU table with DDW property stored in
the FDT. There are 2 points to remember -

	1. For the dedicated adapter, kdump kernel would encounter both
	   default and DDW in FDT. In this case, DDW property is used to
	   initialize the IOMMU table.

	2. A DDW could be direct or dynamic mapped. kdump kernel would
	   initialize IOMMU table and mark the existing DDW as
	   "dynamic". This works fine since, at the time of table
	   initialization, iommu_table_clear() makes some space in the
	   DDW, for some predefined number of TCEs which are needed for
	   kdump to succeed.

Fixes: b1fc44eaa9ba ("pseries/iommu/ddw: Fix kdump to work in absence of ibm,dma-window")
Signed-off-by: Gaurav Batra <gbatra@linux.vnet.ibm.com>
Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 150 ++++++++++++++++---------
 1 file changed, 99 insertions(+), 51 deletions(-)

Comments

Aneesh Kumar K.V Dec. 14, 2023, 10:18 a.m. UTC | #1
Gaurav Batra <gbatra@linux.vnet.ibm.com> writes:

> When kdump kernel tries to copy dump data over SR-IOV, LPAR panics due to
> NULL pointer execption.
>
> Here is the complete stack
>
> [   19.944378] Kernel attempted to read user page (0) - exploit attempt? (uid: 0)^M
> [   19.944388] BUG: Kernel NULL pointer dereference on read at 0x00000000^M
> [   19.944394] Faulting instruction address: 0xc000000020847ad4^M
> [   19.944400] Oops: Kernel access of bad area, sig: 11 [#1]^M
> [   19.944406] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries^M
> [   19.944413] Modules linked in: mlx5_core(+) vmx_crypto pseries_wdt papr_scm libnvdimm mlxfw tls psample sunrpc fuse overlay squashfs loop^M
> [   19.944435] CPU: 12 PID: 315 Comm: systemd-udevd Not tainted 6.4.0-Test102+ #12^M
> [   19.944442] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_008) hv:phyp pSeries^M
> [   19.944450] NIP:  c000000020847ad4 LR: c00000002083b2dc CTR: 00000000006cd18c^M
> [   19.944457] REGS: c000000029162ca0 TRAP: 0300   Not tainted  (6.4.0-Test102+)^M
> [   19.944463] MSR:  800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 48288244  XER: 00000008^M
> [   19.944480] CFAR: c00000002083b2d8 DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 1 ^M
> [   19.944480] GPR00: c00000002088ecd4 c000000029162f40 c000000021542900 ffffffffffffffff ^M
> [   19.944480] GPR04: ffffffffffffffff 0000000000000000 0000000000000000 000000000000ffff ^M
> [   19.944480] GPR08: 0000000000000000 ffffffffffffffff 0000000000000000 0000000000000001 ^M
> [   19.944480] GPR12: 0000000000000000 c000000022ec1000 0000000040000000 0000000000000000 ^M
> [   19.944480] GPR16: c00000002b8b3900 c00000002b8b3918 c00000002b8b3800 ffffffffffffffff ^M
> [   19.944480] GPR20: 0000000000000000 000000000000ffff ffffffff00000000 00000000ffffffff ^M
> [   19.944480] GPR24: 000000000000ffff 0000000000010000 0000000000000000 ffffffffffff0000 ^M
> [   19.944480] GPR28: 0000000000000000 ffffffffffffffff ffffffffffffffff 0000000000000000 ^M
> [   19.944553] NIP [c000000020847ad4] _find_next_zero_bit+0x24/0x110^M
> [   19.944564] LR [c00000002083b2dc] bitmap_find_next_zero_area_off+0x5c/0xe0^M
> [   19.944572] Call Trace:^M
> [   19.944576] [c000000029162f40] [c0000000209fec70] dev_printk_emit+0x38/0x48 (unreliable)^M
> [   19.944587] [c000000029162fa0] [c00000002088ecd4] iommu_area_alloc+0xc4/0x180^M
> [   19.944596] [c000000029163020] [c00000002005a3d8] iommu_range_alloc+0x1e8/0x580^M
> [   19.944606] [c000000029163150] [c00000002005a7d0] iommu_alloc+0x60/0x130^M
> [   19.944613] [c0000000291631a0] [c00000002005b9f8] iommu_alloc_coherent+0x158/0x2b0^M
> [   19.944621] [c000000029163260] [c000000020058fdc] dma_iommu_alloc_coherent+0x3c/0x50^M
> [   19.944629] [c000000029163280] [c000000020235260] dma_alloc_attrs+0x170/0x1f0^M
> [   19.944637] [c0000000291632f0] [c00800000140c058] mlx5_cmd_init+0xc0/0x760 [mlx5_core]^M
> [   19.944745] [c0000000291633c0] [c008000001403448] mlx5_function_setup+0xf0/0x510 [mlx5_core]^M
> [   19.944845] [c000000029163490] [c0080000014039bc] mlx5_init_one+0x84/0x210 [mlx5_core]^M
> [   19.944943] [c000000029163520] [c008000001404c00] probe_one+0x118/0x2c0 [mlx5_core]^M
> [   19.945041] [c0000000291635b0] [c0000000208ddce8] local_pci_probe+0x68/0x110^M
> [   19.945049] [c000000029163630] [c0000000208deb98] pci_call_probe+0x68/0x200^M
> [   19.945057] [c000000029163790] [c0000000208dfecc] pci_device_probe+0xbc/0x1a0^M
> [   19.945065] [c0000000291637d0] [c000000020a032f4] really_probe+0x104/0x540^M
> [   19.945072] [c000000029163850] [c000000020a037e4] __driver_probe_device+0xb4/0x230^M
> [   19.945080] [c0000000291638d0] [c000000020a039b4] driver_probe_device+0x54/0x130^M
> [   19.945088] [c000000029163910] [c000000020a03da8] __driver_attach+0x158/0x2b0^M
> [   19.945096] [c000000029163990] [c0000000209ffa78] bus_for_each_dev+0xa8/0x130^M
> [   19.945103] [c0000000291639f0] [c000000020a026c4] driver_attach+0x34/0x50^M
> [   19.945110] [c000000029163a10] [c000000020a01aac] bus_add_driver+0x16c/0x300^M
> [   19.945118] [c000000029163aa0] [c000000020a05944] driver_register+0xa4/0x1b0^M
> [   19.945126] [c000000029163b10] [c0000000208dd428] __pci_register_driver+0x68/0x80^M
> [   19.945133] [c000000029163b30] [c0080000015414cc] mlx5_init+0xb8/0x100 [mlx5_core]^M
> [   19.945247] [c000000029163ba0] [c000000020012f60] do_one_initcall+0x60/0x300^M
> [   19.945255] [c000000029163c80] [c000000020241cbc] do_init_module+0x7c/0x2b0^M
>
> At the time of LPAR dump, before kexec hands over control to kdump kernel,
> DDWs are scanned and added in the FDT. For the SR-IOV case, default DMA
> window "ibm,dma-window" is removed from the FDT and DDW added, for the
> device.
>
> Now, kexec hands over control to the kdump kernel.
>
> When the kdump kernel initializes, PCI busses are scanned and IOMMU
> group/tables created, in pci_dma_bus_setup_pSeriesLP(). For the SR-IOV
> case, there is no "ibm,dma-window". As a result, due to the code bug, the
> newly created IOMMU table is not initialized. Later, when the device driver
> tries to enter TCEs for the SR-IOV device, NULL pointer execption is thrown
> from iommu_area_alloc().
>
> The fix would be to initialize IOMMU table with DDW property stored in
> the FDT. There are 2 points to remember -
>
> 	1. For the dedicated adapter, kdump kernel would encounter both
> 	   default and DDW in FDT. In this case, DDW property is used to
> 	   initialize the IOMMU table.
>
> 	2. A DDW could be direct or dynamic mapped. kdump kernel would
> 	   initialize IOMMU table and mark the existing DDW as
> 	   "dynamic". This works fine since, at the time of table
> 	   initialization, iommu_table_clear() makes some space in the
> 	   DDW, for some predefined number of TCEs which are needed for
> 	   kdump to succeed.
>


While looking at this change, I'm wondering if it would be helpful to
note that the iommu_table_setparms_lpar function currently only examines
the ibm,dma-window device node, which can cause issues with kexec kernel
in certain scenarios in place where you mention "code bug".

Additionally, I'm curious how this worked before
considering that commit b1fc44eaa9ba was supposed to address the same
issue that you are fixing here.

>
> Fixes: b1fc44eaa9ba ("pseries/iommu/ddw: Fix kdump to work in absence of ibm,dma-window")
> Signed-off-by: Gaurav Batra <gbatra@linux.vnet.ibm.com>
> Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 150 ++++++++++++++++---------
>  1 file changed, 99 insertions(+), 51 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 496e16c588aa..6f022c51bd57 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -574,29 +574,6 @@ static void iommu_table_setparms(struct pci_controller *phb,
>  
>  struct iommu_table_ops iommu_table_lpar_multi_ops;
>  
> -/*
> - * iommu_table_setparms_lpar
> - *
> - * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
> - */
> -static void iommu_table_setparms_lpar(struct pci_controller *phb,
> -				      struct device_node *dn,
> -				      struct iommu_table *tbl,
> -				      struct iommu_table_group *table_group,
> -				      const __be32 *dma_window)
> -{
> -	unsigned long offset, size, liobn;
> -
> -	of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
> -
> -	iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL,
> -				    &iommu_table_lpar_multi_ops);
> -
> -
> -	table_group->tce32_start = offset;
> -	table_group->tce32_size = size;
> -}
> -
>  struct iommu_table_ops iommu_table_pseries_ops = {
>  	.set = tce_build_pSeries,
>  	.clear = tce_free_pSeries,
> @@ -724,26 +701,71 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = {
>   * dynamic 64bit DMA window, walking up the device tree.
>   */
>  static struct device_node *pci_dma_find(struct device_node *dn,
> -					const __be32 **dma_window)
> +					struct dynamic_dma_window_prop *prop)
>  {
> -	const __be32 *dw = NULL;
> +	const __be32 *default_prop = NULL;
> +	const __be32 *ddw_prop = NULL;
> +	struct device_node *rdn = NULL;
> +	bool default_win = false, ddw_win = false;
>  
>  	for ( ; dn && PCI_DN(dn); dn = dn->parent) {
> -		dw = of_get_property(dn, "ibm,dma-window", NULL);
> -		if (dw) {
> -			if (dma_window)
> -				*dma_window = dw;
> -			return dn;
> +		default_prop = of_get_property(dn, "ibm,dma-window", NULL);
> +		if (default_prop) {
> +			rdn = dn;
> +			default_win = true;
> +		}
> +		ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
> +		if (ddw_prop) {
> +			rdn = dn;
> +			ddw_win = true;
> +			break;
> +		}
> +		ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);
> +		if (ddw_prop) {
> +			rdn = dn;
> +			ddw_win = true;
> +			break;
>  		}
> -		dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
> -		if (dw)
> -			return dn;
> -		dw = of_get_property(dn, DMA64_PROPNAME, NULL);
> -		if (dw)
> -			return dn;
> +
> +		/* At least found default window, which is the case for normal boot */
> +		if (default_win)
> +			break;
>  	}
>  
> -	return NULL;
> +	/* For PCI devices there will always be a DMA window, either on the device
> +	 * or parent bus
> +	 */
> +	WARN_ON(!(default_win | ddw_win));
> +
> +	/* caller doesn't want to get DMA window property */
> +	if (!prop)
> +		return rdn;
> +
> +	/* parse DMA window property. During normal system boot, only default
> +	 * DMA window is passed in OF. But, for kdump, a dedicated adapter might
> +	 * have both default and DDW in FDT. In this scenario, DDW takes precedence
> +	 * over default window.
> +	 */
> +	if (ddw_win) {
> +		struct dynamic_dma_window_prop *p;
> +
> +		p = (struct dynamic_dma_window_prop *)ddw_prop;
> +		prop->liobn = be32_to_cpu(p->liobn);
> +		prop->dma_base = be64_to_cpu(p->dma_base);
> +		prop->tce_shift = be32_to_cpu(p->tce_shift);
> +		prop->window_shift = be32_to_cpu(p->window_shift);
> +	} else if (default_win) {
> +		unsigned long offset, size, liobn;
> +
> +		of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);
> +
> +		prop->liobn = liobn;
> +		prop->dma_base = offset;
> +		prop->tce_shift = IOMMU_PAGE_SHIFT_4K;
> +		prop->window_shift = order_base_2(size);
> +	}
> +
> +	return rdn;
>  }
>  
>  static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
> @@ -751,17 +773,20 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>  	struct iommu_table *tbl;
>  	struct device_node *dn, *pdn;
>  	struct pci_dn *ppci;
> -	const __be32 *dma_window = NULL;
> +	struct dynamic_dma_window_prop prop;
>  
>  	dn = pci_bus_to_OF_node(bus);
>  
>  	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
>  		 dn);
>  
> -	pdn = pci_dma_find(dn, &dma_window);
> +	pdn = pci_dma_find(dn, &prop);
>  
> -	if (dma_window == NULL)
> -		pr_debug("  no ibm,dma-window property !\n");
> +	/* In PPC architecture, there will always be DMA window on bus or one of the
> +	 * parent bus. During reboot, there will be ibm,dma-window property to
> +	 * define DMA window. For kdump, there will at least be default window or DDW
> +	 * or both.
> +	 */
>  
>  	ppci = PCI_DN(pdn);
>  
> @@ -771,13 +796,21 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>  	if (!ppci->table_group) {
>  		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
>  		tbl = ppci->table_group->tables[0];
> -		if (dma_window) {
> -			iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
> -						  ppci->table_group, dma_window);
>  
> -			if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
> -				panic("Failed to initialize iommu table");
> -		}
> +		iommu_table_setparms_common(tbl, ppci->phb->bus->number, prop.liobn,
> +				prop.dma_base, 1ULL << prop.window_shift,
> +				prop.tce_shift, NULL, &iommu_table_lpar_multi_ops);
> +
> +		/* Only for normal boot with default window. Doesn't matter even
> +		 * if we set these with DDW which is 64bit during kdump, since
> +		 * these will not be used during kdump.
> +		 */
> +		ppci->table_group->tce32_start = prop.dma_base;
> +		ppci->table_group->tce32_size = 1ULL << prop.window_shift;
> +
> +		if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
> +			panic("Failed to initialize iommu table");
> +
>  		iommu_register_group(ppci->table_group,
>  				pci_domain_nr(bus), 0);
>  		pr_debug("  created table: %p\n", ppci->table_group);
> @@ -968,6 +1001,12 @@ static void find_existing_ddw_windows_named(const char *name)
>  			continue;
>  		}
>  
> +		/* If at the time of system initialization, there are DDWs in OF,
> +		 * it means this is during kexec. DDW could be direct or dynamic.
> +		 * We will just mark DDWs as "dynamic" since this is kdump path,
> +		 * no need to worry about perforance. ddw_list_new_entry() will
> +		 * set window->direct = false.
> +		 */
>  		window = ddw_list_new_entry(pdn, dma64);
>  		if (!window) {
>  			of_node_put(pdn);
> @@ -1524,8 +1563,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>  {
>  	struct device_node *pdn, *dn;
>  	struct iommu_table *tbl;
> -	const __be32 *dma_window = NULL;
>  	struct pci_dn *pci;
> +	struct dynamic_dma_window_prop prop;
>  
>  	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
>  
> @@ -1538,7 +1577,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>  	dn = pci_device_to_OF_node(dev);
>  	pr_debug("  node is %pOF\n", dn);
>  
> -	pdn = pci_dma_find(dn, &dma_window);
> +	pdn = pci_dma_find(dn, &prop);
>  	if (!pdn || !PCI_DN(pdn)) {
>  		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
>  		       "no DMA window found for pci dev=%s dn=%pOF\n",
> @@ -1551,8 +1590,17 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>  	if (!pci->table_group) {
>  		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
>  		tbl = pci->table_group->tables[0];
> -		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
> -				pci->table_group, dma_window);
> +
> +		iommu_table_setparms_common(tbl, pci->phb->bus->number, prop.liobn,
> +				prop.dma_base, 1ULL << prop.window_shift,
> +				prop.tce_shift, NULL, &iommu_table_lpar_multi_ops);
> +
> +		/* Only for normal boot with default window. Doesn't matter even
> +		 * if we set these with DDW which is 64bit during kdump, since
> +		 * these will not be used during kdump.
> +		 */
> +		pci->table_group->tce32_start = prop.dma_base;
> +		pci->table_group->tce32_size = 1ULL << prop.window_shift;
>  
>  		iommu_init_table(tbl, pci->phb->node, 0, 0);
>  		iommu_register_group(pci->table_group,
> -- 
> 2.39.3 (Apple Git-145)
kernel test robot Dec. 14, 2023, 11:56 a.m. UTC | #2
Hi Gaurav,

kernel test robot noticed the following build warnings:

[auto build test WARNING on powerpc/next]
[also build test WARNING on powerpc/fixes linus/master v6.7-rc5 next-20231214]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Gaurav-Batra/powerpc-pseries-iommu-IOMMU-table-is-not-initialized-for-kdump-over-SR-IOV/20231214-035436
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
patch link:    https://lore.kernel.org/r/20231213195233.10329-1-gbatra%40linux.vnet.ibm.com
patch subject: [PATCH] powerpc/pseries/iommu: IOMMU table is not initialized for kdump over SR-IOV
config: powerpc64-randconfig-r113-20231214 (https://download.01.org/0day-ci/archive/20231214/202312141941.Qraw6P8l-lkp@intel.com/config)
compiler: powerpc64-linux-gcc (GCC) 13.2.0
reproduce: (https://download.01.org/0day-ci/archive/20231214/202312141941.Qraw6P8l-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202312141941.Qraw6P8l-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> arch/powerpc/platforms/pseries/iommu.c:753:29: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted __be32 [usertype] liobn @@     got unsigned int [usertype] @@
   arch/powerpc/platforms/pseries/iommu.c:753:29: sparse:     expected restricted __be32 [usertype] liobn
   arch/powerpc/platforms/pseries/iommu.c:753:29: sparse:     got unsigned int [usertype]
>> arch/powerpc/platforms/pseries/iommu.c:754:32: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted __be64 [usertype] dma_base @@     got unsigned long long [usertype] @@
   arch/powerpc/platforms/pseries/iommu.c:754:32: sparse:     expected restricted __be64 [usertype] dma_base
   arch/powerpc/platforms/pseries/iommu.c:754:32: sparse:     got unsigned long long [usertype]
>> arch/powerpc/platforms/pseries/iommu.c:755:33: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted __be32 [usertype] tce_shift @@     got unsigned int [usertype] @@
   arch/powerpc/platforms/pseries/iommu.c:755:33: sparse:     expected restricted __be32 [usertype] tce_shift
   arch/powerpc/platforms/pseries/iommu.c:755:33: sparse:     got unsigned int [usertype]
>> arch/powerpc/platforms/pseries/iommu.c:756:36: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted __be32 [usertype] window_shift @@     got unsigned int [usertype] @@
   arch/powerpc/platforms/pseries/iommu.c:756:36: sparse:     expected restricted __be32 [usertype] window_shift
   arch/powerpc/platforms/pseries/iommu.c:756:36: sparse:     got unsigned int [usertype]
>> arch/powerpc/platforms/pseries/iommu.c:762:29: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted __be32 [usertype] liobn @@     got unsigned long [addressable] liobn @@
   arch/powerpc/platforms/pseries/iommu.c:762:29: sparse:     expected restricted __be32 [usertype] liobn
   arch/powerpc/platforms/pseries/iommu.c:762:29: sparse:     got unsigned long [addressable] liobn
>> arch/powerpc/platforms/pseries/iommu.c:763:32: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted __be64 [usertype] dma_base @@     got unsigned long [addressable] offset @@
   arch/powerpc/platforms/pseries/iommu.c:763:32: sparse:     expected restricted __be64 [usertype] dma_base
   arch/powerpc/platforms/pseries/iommu.c:763:32: sparse:     got unsigned long [addressable] offset
>> arch/powerpc/platforms/pseries/iommu.c:764:33: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted __be32 [usertype] tce_shift @@     got int @@
   arch/powerpc/platforms/pseries/iommu.c:764:33: sparse:     expected restricted __be32 [usertype] tce_shift
   arch/powerpc/platforms/pseries/iommu.c:764:33: sparse:     got int
>> arch/powerpc/platforms/pseries/iommu.c:765:36: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted __be32 [usertype] window_shift @@     got int @@
   arch/powerpc/platforms/pseries/iommu.c:765:36: sparse:     expected restricted __be32 [usertype] window_shift
   arch/powerpc/platforms/pseries/iommu.c:765:36: sparse:     got int
>> arch/powerpc/platforms/pseries/iommu.c:800:78: sparse: sparse: incorrect type in argument 3 (different base types) @@     expected unsigned long liobn @@     got restricted __be32 [addressable] [usertype] liobn @@
   arch/powerpc/platforms/pseries/iommu.c:800:78: sparse:     expected unsigned long liobn
   arch/powerpc/platforms/pseries/iommu.c:800:78: sparse:     got restricted __be32 [addressable] [usertype] liobn
>> arch/powerpc/platforms/pseries/iommu.c:801:37: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected unsigned long win_addr @@     got restricted __be64 [addressable] [usertype] dma_base @@
   arch/powerpc/platforms/pseries/iommu.c:801:37: sparse:     expected unsigned long win_addr
   arch/powerpc/platforms/pseries/iommu.c:801:37: sparse:     got restricted __be64 [addressable] [usertype] dma_base
>> arch/powerpc/platforms/pseries/iommu.c:801:60: sparse: sparse: restricted __be32 degrades to integer
>> arch/powerpc/platforms/pseries/iommu.c:802:37: sparse: sparse: incorrect type in argument 6 (different base types) @@     expected unsigned long page_shift @@     got restricted __be32 [addressable] [usertype] tce_shift @@
   arch/powerpc/platforms/pseries/iommu.c:802:37: sparse:     expected unsigned long page_shift
   arch/powerpc/platforms/pseries/iommu.c:802:37: sparse:     got restricted __be32 [addressable] [usertype] tce_shift
>> arch/powerpc/platforms/pseries/iommu.c:808:48: sparse: sparse: incorrect type in assignment (different base types) @@     expected unsigned int [usertype] tce32_start @@     got restricted __be64 [addressable] [usertype] dma_base @@
   arch/powerpc/platforms/pseries/iommu.c:808:48: sparse:     expected unsigned int [usertype] tce32_start
   arch/powerpc/platforms/pseries/iommu.c:808:48: sparse:     got restricted __be64 [addressable] [usertype] dma_base
   arch/powerpc/platforms/pseries/iommu.c:809:61: sparse: sparse: restricted __be32 degrades to integer
   arch/powerpc/platforms/pseries/iommu.c:1594:77: sparse: sparse: incorrect type in argument 3 (different base types) @@     expected unsigned long liobn @@     got restricted __be32 [addressable] [usertype] liobn @@
   arch/powerpc/platforms/pseries/iommu.c:1594:77: sparse:     expected unsigned long liobn
   arch/powerpc/platforms/pseries/iommu.c:1594:77: sparse:     got restricted __be32 [addressable] [usertype] liobn
   arch/powerpc/platforms/pseries/iommu.c:1595:37: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected unsigned long win_addr @@     got restricted __be64 [addressable] [usertype] dma_base @@
   arch/powerpc/platforms/pseries/iommu.c:1595:37: sparse:     expected unsigned long win_addr
   arch/powerpc/platforms/pseries/iommu.c:1595:37: sparse:     got restricted __be64 [addressable] [usertype] dma_base
   arch/powerpc/platforms/pseries/iommu.c:1595:60: sparse: sparse: restricted __be32 degrades to integer
   arch/powerpc/platforms/pseries/iommu.c:1596:37: sparse: sparse: incorrect type in argument 6 (different base types) @@     expected unsigned long page_shift @@     got restricted __be32 [addressable] [usertype] tce_shift @@
   arch/powerpc/platforms/pseries/iommu.c:1596:37: sparse:     expected unsigned long page_shift
   arch/powerpc/platforms/pseries/iommu.c:1596:37: sparse:     got restricted __be32 [addressable] [usertype] tce_shift
   arch/powerpc/platforms/pseries/iommu.c:1602:47: sparse: sparse: incorrect type in assignment (different base types) @@     expected unsigned int [usertype] tce32_start @@     got restricted __be64 [addressable] [usertype] dma_base @@
   arch/powerpc/platforms/pseries/iommu.c:1602:47: sparse:     expected unsigned int [usertype] tce32_start
   arch/powerpc/platforms/pseries/iommu.c:1602:47: sparse:     got restricted __be64 [addressable] [usertype] dma_base
   arch/powerpc/platforms/pseries/iommu.c:1603:60: sparse: sparse: restricted __be32 degrades to integer
   arch/powerpc/platforms/pseries/iommu.c: note: in included file (through include/linux/preempt.h, include/linux/spinlock.h, include/linux/mmzone.h, ...):
   include/linux/list.h:83:21: sparse: sparse: self-comparison always evaluates to true
   include/linux/list.h:83:21: sparse: sparse: self-comparison always evaluates to true
   include/linux/list.h:83:21: sparse: sparse: self-comparison always evaluates to true

vim +753 arch/powerpc/platforms/pseries/iommu.c

   698	
   699	/*
   700	 * Find nearest ibm,dma-window (default DMA window) or direct DMA window or
   701	 * dynamic 64bit DMA window, walking up the device tree.
   702	 */
   703	static struct device_node *pci_dma_find(struct device_node *dn,
   704						struct dynamic_dma_window_prop *prop)
   705	{
   706		const __be32 *default_prop = NULL;
   707		const __be32 *ddw_prop = NULL;
   708		struct device_node *rdn = NULL;
   709		bool default_win = false, ddw_win = false;
   710	
   711		for ( ; dn && PCI_DN(dn); dn = dn->parent) {
   712			default_prop = of_get_property(dn, "ibm,dma-window", NULL);
   713			if (default_prop) {
   714				rdn = dn;
   715				default_win = true;
   716			}
   717			ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
   718			if (ddw_prop) {
   719				rdn = dn;
   720				ddw_win = true;
   721				break;
   722			}
   723			ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);
   724			if (ddw_prop) {
   725				rdn = dn;
   726				ddw_win = true;
   727				break;
   728			}
   729	
   730			/* At least found default window, which is the case for normal boot */
   731			if (default_win)
   732				break;
   733		}
   734	
   735		/* For PCI devices there will always be a DMA window, either on the device
   736		 * or parent bus
   737		 */
   738		WARN_ON(!(default_win | ddw_win));
   739	
   740		/* caller doesn't want to get DMA window property */
   741		if (!prop)
   742			return rdn;
   743	
   744		/* parse DMA window property. During normal system boot, only default
   745		 * DMA window is passed in OF. But, for kdump, a dedicated adapter might
   746		 * have both default and DDW in FDT. In this scenario, DDW takes precedence
   747		 * over default window.
   748		 */
   749		if (ddw_win) {
   750			struct dynamic_dma_window_prop *p;
   751	
   752			p = (struct dynamic_dma_window_prop *)ddw_prop;
 > 753			prop->liobn = be32_to_cpu(p->liobn);
 > 754			prop->dma_base = be64_to_cpu(p->dma_base);
 > 755			prop->tce_shift = be32_to_cpu(p->tce_shift);
 > 756			prop->window_shift = be32_to_cpu(p->window_shift);
   757		} else if (default_win) {
   758			unsigned long offset, size, liobn;
   759	
   760			of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);
   761	
 > 762			prop->liobn = liobn;
 > 763			prop->dma_base = offset;
 > 764			prop->tce_shift = IOMMU_PAGE_SHIFT_4K;
 > 765			prop->window_shift = order_base_2(size);
   766		}
   767	
   768		return rdn;
   769	}
   770	
   771	static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
   772	{
   773		struct iommu_table *tbl;
   774		struct device_node *dn, *pdn;
   775		struct pci_dn *ppci;
   776		struct dynamic_dma_window_prop prop;
   777	
   778		dn = pci_bus_to_OF_node(bus);
   779	
   780		pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
   781			 dn);
   782	
   783		pdn = pci_dma_find(dn, &prop);
   784	
   785		/* In PPC architecture, there will always be DMA window on bus or one of the
   786		 * parent bus. During reboot, there will be ibm,dma-window property to
   787		 * define DMA window. For kdump, there will at least be default window or DDW
   788		 * or both.
   789		 */
   790	
   791		ppci = PCI_DN(pdn);
   792	
   793		pr_debug("  parent is %pOF, iommu_table: 0x%p\n",
   794			 pdn, ppci->table_group);
   795	
   796		if (!ppci->table_group) {
   797			ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
   798			tbl = ppci->table_group->tables[0];
   799	
 > 800			iommu_table_setparms_common(tbl, ppci->phb->bus->number, prop.liobn,
 > 801					prop.dma_base, 1ULL << prop.window_shift,
 > 802					prop.tce_shift, NULL, &iommu_table_lpar_multi_ops);
   803	
   804			/* Only for normal boot with default window. Doesn't matter even
   805			 * if we set these with DDW which is 64bit during kdump, since
   806			 * these will not be used during kdump.
   807			 */
 > 808			ppci->table_group->tce32_start = prop.dma_base;
   809			ppci->table_group->tce32_size = 1ULL << prop.window_shift;
   810	
   811			if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
   812				panic("Failed to initialize iommu table");
   813	
   814			iommu_register_group(ppci->table_group,
   815					pci_domain_nr(bus), 0);
   816			pr_debug("  created table: %p\n", ppci->table_group);
   817		}
   818	}
   819
Gaurav Batra Dec. 14, 2023, 8:53 p.m. UTC | #3
On 12/14/23 4:18 AM, Aneesh Kumar K.V wrote:
> Gaurav Batra <gbatra@linux.vnet.ibm.com> writes:
>
>> When kdump kernel tries to copy dump data over SR-IOV, LPAR panics due to
>> NULL pointer execption.
>>
>> Here is the complete stack
>>
>> [   19.944378] Kernel attempted to read user page (0) - exploit attempt? (uid: 0)^M
>> [   19.944388] BUG: Kernel NULL pointer dereference on read at 0x00000000^M
>> [   19.944394] Faulting instruction address: 0xc000000020847ad4^M
>> [   19.944400] Oops: Kernel access of bad area, sig: 11 [#1]^M
>> [   19.944406] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries^M
>> [   19.944413] Modules linked in: mlx5_core(+) vmx_crypto pseries_wdt papr_scm libnvdimm mlxfw tls psample sunrpc fuse overlay squashfs loop^M
>> [   19.944435] CPU: 12 PID: 315 Comm: systemd-udevd Not tainted 6.4.0-Test102+ #12^M
>> [   19.944442] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_008) hv:phyp pSeries^M
>> [   19.944450] NIP:  c000000020847ad4 LR: c00000002083b2dc CTR: 00000000006cd18c^M
>> [   19.944457] REGS: c000000029162ca0 TRAP: 0300   Not tainted  (6.4.0-Test102+)^M
>> [   19.944463] MSR:  800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 48288244  XER: 00000008^M
>> [   19.944480] CFAR: c00000002083b2d8 DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 1 ^M
>> [   19.944480] GPR00: c00000002088ecd4 c000000029162f40 c000000021542900 ffffffffffffffff ^M
>> [   19.944480] GPR04: ffffffffffffffff 0000000000000000 0000000000000000 000000000000ffff ^M
>> [   19.944480] GPR08: 0000000000000000 ffffffffffffffff 0000000000000000 0000000000000001 ^M
>> [   19.944480] GPR12: 0000000000000000 c000000022ec1000 0000000040000000 0000000000000000 ^M
>> [   19.944480] GPR16: c00000002b8b3900 c00000002b8b3918 c00000002b8b3800 ffffffffffffffff ^M
>> [   19.944480] GPR20: 0000000000000000 000000000000ffff ffffffff00000000 00000000ffffffff ^M
>> [   19.944480] GPR24: 000000000000ffff 0000000000010000 0000000000000000 ffffffffffff0000 ^M
>> [   19.944480] GPR28: 0000000000000000 ffffffffffffffff ffffffffffffffff 0000000000000000 ^M
>> [   19.944553] NIP [c000000020847ad4] _find_next_zero_bit+0x24/0x110^M
>> [   19.944564] LR [c00000002083b2dc] bitmap_find_next_zero_area_off+0x5c/0xe0^M
>> [   19.944572] Call Trace:^M
>> [   19.944576] [c000000029162f40] [c0000000209fec70] dev_printk_emit+0x38/0x48 (unreliable)^M
>> [   19.944587] [c000000029162fa0] [c00000002088ecd4] iommu_area_alloc+0xc4/0x180^M
>> [   19.944596] [c000000029163020] [c00000002005a3d8] iommu_range_alloc+0x1e8/0x580^M
>> [   19.944606] [c000000029163150] [c00000002005a7d0] iommu_alloc+0x60/0x130^M
>> [   19.944613] [c0000000291631a0] [c00000002005b9f8] iommu_alloc_coherent+0x158/0x2b0^M
>> [   19.944621] [c000000029163260] [c000000020058fdc] dma_iommu_alloc_coherent+0x3c/0x50^M
>> [   19.944629] [c000000029163280] [c000000020235260] dma_alloc_attrs+0x170/0x1f0^M
>> [   19.944637] [c0000000291632f0] [c00800000140c058] mlx5_cmd_init+0xc0/0x760 [mlx5_core]^M
>> [   19.944745] [c0000000291633c0] [c008000001403448] mlx5_function_setup+0xf0/0x510 [mlx5_core]^M
>> [   19.944845] [c000000029163490] [c0080000014039bc] mlx5_init_one+0x84/0x210 [mlx5_core]^M
>> [   19.944943] [c000000029163520] [c008000001404c00] probe_one+0x118/0x2c0 [mlx5_core]^M
>> [   19.945041] [c0000000291635b0] [c0000000208ddce8] local_pci_probe+0x68/0x110^M
>> [   19.945049] [c000000029163630] [c0000000208deb98] pci_call_probe+0x68/0x200^M
>> [   19.945057] [c000000029163790] [c0000000208dfecc] pci_device_probe+0xbc/0x1a0^M
>> [   19.945065] [c0000000291637d0] [c000000020a032f4] really_probe+0x104/0x540^M
>> [   19.945072] [c000000029163850] [c000000020a037e4] __driver_probe_device+0xb4/0x230^M
>> [   19.945080] [c0000000291638d0] [c000000020a039b4] driver_probe_device+0x54/0x130^M
>> [   19.945088] [c000000029163910] [c000000020a03da8] __driver_attach+0x158/0x2b0^M
>> [   19.945096] [c000000029163990] [c0000000209ffa78] bus_for_each_dev+0xa8/0x130^M
>> [   19.945103] [c0000000291639f0] [c000000020a026c4] driver_attach+0x34/0x50^M
>> [   19.945110] [c000000029163a10] [c000000020a01aac] bus_add_driver+0x16c/0x300^M
>> [   19.945118] [c000000029163aa0] [c000000020a05944] driver_register+0xa4/0x1b0^M
>> [   19.945126] [c000000029163b10] [c0000000208dd428] __pci_register_driver+0x68/0x80^M
>> [   19.945133] [c000000029163b30] [c0080000015414cc] mlx5_init+0xb8/0x100 [mlx5_core]^M
>> [   19.945247] [c000000029163ba0] [c000000020012f60] do_one_initcall+0x60/0x300^M
>> [   19.945255] [c000000029163c80] [c000000020241cbc] do_init_module+0x7c/0x2b0^M
>>
>> At the time of LPAR dump, before kexec hands over control to kdump kernel,
>> DDWs are scanned and added in the FDT. For the SR-IOV case, default DMA
>> window "ibm,dma-window" is removed from the FDT and DDW added, for the
>> device.
>>
>> Now, kexec hands over control to the kdump kernel.
>>
>> When the kdump kernel initializes, PCI busses are scanned and IOMMU
>> group/tables created, in pci_dma_bus_setup_pSeriesLP(). For the SR-IOV
>> case, there is no "ibm,dma-window". As a result, due to the code bug, the
>> newly created IOMMU table is not initialized. Later, when the device driver
>> tries to enter TCEs for the SR-IOV device, NULL pointer execption is thrown
>> from iommu_area_alloc().
>>
>> The fix would be to initialize IOMMU table with DDW property stored in
>> the FDT. There are 2 points to remember -
>>
>> 	1. For the dedicated adapter, kdump kernel would encounter both
>> 	   default and DDW in FDT. In this case, DDW property is used to
>> 	   initialize the IOMMU table.
>>
>> 	2. A DDW could be direct or dynamic mapped. kdump kernel would
>> 	   initialize IOMMU table and mark the existing DDW as
>> 	   "dynamic". This works fine since, at the time of table
>> 	   initialization, iommu_table_clear() makes some space in the
>> 	   DDW, for some predefined number of TCEs which are needed for
>> 	   kdump to succeed.
>>
>
> While looking at this change, I'm wondering if it would be helpful to
> note that the iommu_table_setparms_lpar function currently only examines
> the ibm,dma-window device node, which can cause issues with kexec kernel
> in certain scenarios in place where you mention "code bug".
Yes, this will be good to add. I will update commit message in the next 
version
of the patch.
>
> Additionally, I'm curious how this worked before
> considering that commit b1fc44eaa9ba was supposed to address the same
> issue that you are fixing here.
For an SR-IOV device, default DMA window is deleted and DDW created. Now,
If DDW is large enough, all the memory is pre-mapped (direct map), otherwise
TCEs are created dynamically and mapped in the DDW. If there is vPMEM 
assigned
to the LPAR, DDW is dynamically mapped since both RAM and vPMEM share 
the same DDW.

The original patch is incomplete. It does take care of direct mapped DDW 
but
not the dynamic case. For Dynamic case, IOMMU table needs to be created 
and initialized.

This patch fixes this dynamic DDW SR-IOV + vPMEM scenario.

I should mention this in the commit message as well that the patch is 
fixing "dynamic"
DDW scenario.
>
>> Fixes: b1fc44eaa9ba ("pseries/iommu/ddw: Fix kdump to work in absence of ibm,dma-window")
>> Signed-off-by: Gaurav Batra <gbatra@linux.vnet.ibm.com>
>> Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
>> ---
>>   arch/powerpc/platforms/pseries/iommu.c | 150 ++++++++++++++++---------
>>   1 file changed, 99 insertions(+), 51 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 496e16c588aa..6f022c51bd57 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -574,29 +574,6 @@ static void iommu_table_setparms(struct pci_controller *phb,
>>   
>>   struct iommu_table_ops iommu_table_lpar_multi_ops;
>>   
>> -/*
>> - * iommu_table_setparms_lpar
>> - *
>> - * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
>> - */
>> -static void iommu_table_setparms_lpar(struct pci_controller *phb,
>> -				      struct device_node *dn,
>> -				      struct iommu_table *tbl,
>> -				      struct iommu_table_group *table_group,
>> -				      const __be32 *dma_window)
>> -{
>> -	unsigned long offset, size, liobn;
>> -
>> -	of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
>> -
>> -	iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL,
>> -				    &iommu_table_lpar_multi_ops);
>> -
>> -
>> -	table_group->tce32_start = offset;
>> -	table_group->tce32_size = size;
>> -}
>> -
>>   struct iommu_table_ops iommu_table_pseries_ops = {
>>   	.set = tce_build_pSeries,
>>   	.clear = tce_free_pSeries,
>> @@ -724,26 +701,71 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = {
>>    * dynamic 64bit DMA window, walking up the device tree.
>>    */
>>   static struct device_node *pci_dma_find(struct device_node *dn,
>> -					const __be32 **dma_window)
>> +					struct dynamic_dma_window_prop *prop)
>>   {
>> -	const __be32 *dw = NULL;
>> +	const __be32 *default_prop = NULL;
>> +	const __be32 *ddw_prop = NULL;
>> +	struct device_node *rdn = NULL;
>> +	bool default_win = false, ddw_win = false;
>>   
>>   	for ( ; dn && PCI_DN(dn); dn = dn->parent) {
>> -		dw = of_get_property(dn, "ibm,dma-window", NULL);
>> -		if (dw) {
>> -			if (dma_window)
>> -				*dma_window = dw;
>> -			return dn;
>> +		default_prop = of_get_property(dn, "ibm,dma-window", NULL);
>> +		if (default_prop) {
>> +			rdn = dn;
>> +			default_win = true;
>> +		}
>> +		ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
>> +		if (ddw_prop) {
>> +			rdn = dn;
>> +			ddw_win = true;
>> +			break;
>> +		}
>> +		ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);
>> +		if (ddw_prop) {
>> +			rdn = dn;
>> +			ddw_win = true;
>> +			break;
>>   		}
>> -		dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
>> -		if (dw)
>> -			return dn;
>> -		dw = of_get_property(dn, DMA64_PROPNAME, NULL);
>> -		if (dw)
>> -			return dn;
>> +
>> +		/* At least found default window, which is the case for normal boot */
>> +		if (default_win)
>> +			break;
>>   	}
>>   
>> -	return NULL;
>> +	/* For PCI devices there will always be a DMA window, either on the device
>> +	 * or parent bus
>> +	 */
>> +	WARN_ON(!(default_win | ddw_win));
>> +
>> +	/* caller doesn't want to get DMA window property */
>> +	if (!prop)
>> +		return rdn;
>> +
>> +	/* parse DMA window property. During normal system boot, only default
>> +	 * DMA window is passed in OF. But, for kdump, a dedicated adapter might
>> +	 * have both default and DDW in FDT. In this scenario, DDW takes precedence
>> +	 * over default window.
>> +	 */
>> +	if (ddw_win) {
>> +		struct dynamic_dma_window_prop *p;
>> +
>> +		p = (struct dynamic_dma_window_prop *)ddw_prop;
>> +		prop->liobn = be32_to_cpu(p->liobn);
>> +		prop->dma_base = be64_to_cpu(p->dma_base);
>> +		prop->tce_shift = be32_to_cpu(p->tce_shift);
>> +		prop->window_shift = be32_to_cpu(p->window_shift);
>> +	} else if (default_win) {
>> +		unsigned long offset, size, liobn;
>> +
>> +		of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);
>> +
>> +		prop->liobn = liobn;
>> +		prop->dma_base = offset;
>> +		prop->tce_shift = IOMMU_PAGE_SHIFT_4K;
>> +		prop->window_shift = order_base_2(size);
>> +	}
>> +
>> +	return rdn;
>>   }
>>   
>>   static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>> @@ -751,17 +773,20 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>>   	struct iommu_table *tbl;
>>   	struct device_node *dn, *pdn;
>>   	struct pci_dn *ppci;
>> -	const __be32 *dma_window = NULL;
>> +	struct dynamic_dma_window_prop prop;
>>   
>>   	dn = pci_bus_to_OF_node(bus);
>>   
>>   	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
>>   		 dn);
>>   
>> -	pdn = pci_dma_find(dn, &dma_window);
>> +	pdn = pci_dma_find(dn, &prop);
>>   
>> -	if (dma_window == NULL)
>> -		pr_debug("  no ibm,dma-window property !\n");
>> +	/* In PPC architecture, there will always be DMA window on bus or one of the
>> +	 * parent bus. During reboot, there will be ibm,dma-window property to
>> +	 * define DMA window. For kdump, there will at least be default window or DDW
>> +	 * or both.
>> +	 */
>>   
>>   	ppci = PCI_DN(pdn);
>>   
>> @@ -771,13 +796,21 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>>   	if (!ppci->table_group) {
>>   		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
>>   		tbl = ppci->table_group->tables[0];
>> -		if (dma_window) {
>> -			iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
>> -						  ppci->table_group, dma_window);
>>   
>> -			if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
>> -				panic("Failed to initialize iommu table");
>> -		}
>> +		iommu_table_setparms_common(tbl, ppci->phb->bus->number, prop.liobn,
>> +				prop.dma_base, 1ULL << prop.window_shift,
>> +				prop.tce_shift, NULL, &iommu_table_lpar_multi_ops);
>> +
>> +		/* Only for normal boot with default window. Doesn't matter even
>> +		 * if we set these with DDW which is 64bit during kdump, since
>> +		 * these will not be used during kdump.
>> +		 */
>> +		ppci->table_group->tce32_start = prop.dma_base;
>> +		ppci->table_group->tce32_size = 1ULL << prop.window_shift;
>> +
>> +		if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
>> +			panic("Failed to initialize iommu table");
>> +
>>   		iommu_register_group(ppci->table_group,
>>   				pci_domain_nr(bus), 0);
>>   		pr_debug("  created table: %p\n", ppci->table_group);
>> @@ -968,6 +1001,12 @@ static void find_existing_ddw_windows_named(const char *name)
>>   			continue;
>>   		}
>>   
>> +		/* If at the time of system initialization, there are DDWs in OF,
>> +		 * it means this is during kexec. DDW could be direct or dynamic.
>> +		 * We will just mark DDWs as "dynamic" since this is kdump path,
>> +		 * no need to worry about perforance. ddw_list_new_entry() will
>> +		 * set window->direct = false.
>> +		 */
>>   		window = ddw_list_new_entry(pdn, dma64);
>>   		if (!window) {
>>   			of_node_put(pdn);
>> @@ -1524,8 +1563,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>>   {
>>   	struct device_node *pdn, *dn;
>>   	struct iommu_table *tbl;
>> -	const __be32 *dma_window = NULL;
>>   	struct pci_dn *pci;
>> +	struct dynamic_dma_window_prop prop;
>>   
>>   	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
>>   
>> @@ -1538,7 +1577,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>>   	dn = pci_device_to_OF_node(dev);
>>   	pr_debug("  node is %pOF\n", dn);
>>   
>> -	pdn = pci_dma_find(dn, &dma_window);
>> +	pdn = pci_dma_find(dn, &prop);
>>   	if (!pdn || !PCI_DN(pdn)) {
>>   		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
>>   		       "no DMA window found for pci dev=%s dn=%pOF\n",
>> @@ -1551,8 +1590,17 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>>   	if (!pci->table_group) {
>>   		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
>>   		tbl = pci->table_group->tables[0];
>> -		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
>> -				pci->table_group, dma_window);
>> +
>> +		iommu_table_setparms_common(tbl, pci->phb->bus->number, prop.liobn,
>> +				prop.dma_base, 1ULL << prop.window_shift,
>> +				prop.tce_shift, NULL, &iommu_table_lpar_multi_ops);
>> +
>> +		/* Only for normal boot with default window. Doesn't matter even
>> +		 * if we set these with DDW which is 64bit during kdump, since
>> +		 * these will not be used during kdump.
>> +		 */
>> +		pci->table_group->tce32_start = prop.dma_base;
>> +		pci->table_group->tce32_size = 1ULL << prop.window_shift;
>>   
>>   		iommu_init_table(tbl, pci->phb->node, 0, 0);
>>   		iommu_register_group(pci->table_group,
>> -- 
>> 2.39.3 (Apple Git-145)
diff mbox series

Patch

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 496e16c588aa..6f022c51bd57 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -574,29 +574,6 @@  static void iommu_table_setparms(struct pci_controller *phb,
 
 struct iommu_table_ops iommu_table_lpar_multi_ops;
 
-/*
- * iommu_table_setparms_lpar
- *
- * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
- */
-static void iommu_table_setparms_lpar(struct pci_controller *phb,
-				      struct device_node *dn,
-				      struct iommu_table *tbl,
-				      struct iommu_table_group *table_group,
-				      const __be32 *dma_window)
-{
-	unsigned long offset, size, liobn;
-
-	of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
-
-	iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL,
-				    &iommu_table_lpar_multi_ops);
-
-
-	table_group->tce32_start = offset;
-	table_group->tce32_size = size;
-}
-
 struct iommu_table_ops iommu_table_pseries_ops = {
 	.set = tce_build_pSeries,
 	.clear = tce_free_pSeries,
@@ -724,26 +701,71 @@  struct iommu_table_ops iommu_table_lpar_multi_ops = {
  * dynamic 64bit DMA window, walking up the device tree.
  */
 static struct device_node *pci_dma_find(struct device_node *dn,
-					const __be32 **dma_window)
+					struct dynamic_dma_window_prop *prop)
 {
-	const __be32 *dw = NULL;
+	const __be32 *default_prop = NULL;
+	const __be32 *ddw_prop = NULL;
+	struct device_node *rdn = NULL;
+	bool default_win = false, ddw_win = false;
 
 	for ( ; dn && PCI_DN(dn); dn = dn->parent) {
-		dw = of_get_property(dn, "ibm,dma-window", NULL);
-		if (dw) {
-			if (dma_window)
-				*dma_window = dw;
-			return dn;
+		default_prop = of_get_property(dn, "ibm,dma-window", NULL);
+		if (default_prop) {
+			rdn = dn;
+			default_win = true;
+		}
+		ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
+		if (ddw_prop) {
+			rdn = dn;
+			ddw_win = true;
+			break;
+		}
+		ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);
+		if (ddw_prop) {
+			rdn = dn;
+			ddw_win = true;
+			break;
 		}
-		dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
-		if (dw)
-			return dn;
-		dw = of_get_property(dn, DMA64_PROPNAME, NULL);
-		if (dw)
-			return dn;
+
+		/* At least found default window, which is the case for normal boot */
+		if (default_win)
+			break;
 	}
 
-	return NULL;
+	/* For PCI devices there will always be a DMA window, either on the device
+	 * or parent bus
+	 */
+	WARN_ON(!(default_win | ddw_win));
+
+	/* caller doesn't want to get DMA window property */
+	if (!prop)
+		return rdn;
+
+	/* parse DMA window property. During normal system boot, only default
+	 * DMA window is passed in OF. But, for kdump, a dedicated adapter might
+	 * have both default and DDW in FDT. In this scenario, DDW takes precedence
+	 * over default window.
+	 */
+	if (ddw_win) {
+		struct dynamic_dma_window_prop *p;
+
+		p = (struct dynamic_dma_window_prop *)ddw_prop;
+		prop->liobn = be32_to_cpu(p->liobn);
+		prop->dma_base = be64_to_cpu(p->dma_base);
+		prop->tce_shift = be32_to_cpu(p->tce_shift);
+		prop->window_shift = be32_to_cpu(p->window_shift);
+	} else if (default_win) {
+		unsigned long offset, size, liobn;
+
+		of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);
+
+		prop->liobn = liobn;
+		prop->dma_base = offset;
+		prop->tce_shift = IOMMU_PAGE_SHIFT_4K;
+		prop->window_shift = order_base_2(size);
+	}
+
+	return rdn;
 }
 
 static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
@@ -751,17 +773,20 @@  static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	struct iommu_table *tbl;
 	struct device_node *dn, *pdn;
 	struct pci_dn *ppci;
-	const __be32 *dma_window = NULL;
+	struct dynamic_dma_window_prop prop;
 
 	dn = pci_bus_to_OF_node(bus);
 
 	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 		 dn);
 
-	pdn = pci_dma_find(dn, &dma_window);
+	pdn = pci_dma_find(dn, &prop);
 
-	if (dma_window == NULL)
-		pr_debug("  no ibm,dma-window property !\n");
+	/* In PPC architecture, there will always be DMA window on bus or one of the
+	 * parent bus. During reboot, there will be ibm,dma-window property to
+	 * define DMA window. For kdump, there will at least be default window or DDW
+	 * or both.
+	 */
 
 	ppci = PCI_DN(pdn);
 
@@ -771,13 +796,21 @@  static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	if (!ppci->table_group) {
 		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
 		tbl = ppci->table_group->tables[0];
-		if (dma_window) {
-			iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
-						  ppci->table_group, dma_window);
 
-			if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
-				panic("Failed to initialize iommu table");
-		}
+		iommu_table_setparms_common(tbl, ppci->phb->bus->number, prop.liobn,
+				prop.dma_base, 1ULL << prop.window_shift,
+				prop.tce_shift, NULL, &iommu_table_lpar_multi_ops);
+
+		/* Only for normal boot with default window. Doesn't matter even
+		 * if we set these with DDW which is 64bit during kdump, since
+		 * these will not be used during kdump.
+		 */
+		ppci->table_group->tce32_start = prop.dma_base;
+		ppci->table_group->tce32_size = 1ULL << prop.window_shift;
+
+		if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
+			panic("Failed to initialize iommu table");
+
 		iommu_register_group(ppci->table_group,
 				pci_domain_nr(bus), 0);
 		pr_debug("  created table: %p\n", ppci->table_group);
@@ -968,6 +1001,12 @@  static void find_existing_ddw_windows_named(const char *name)
 			continue;
 		}
 
+		/* If at the time of system initialization, there are DDWs in OF,
+		 * it means this is during kexec. DDW could be direct or dynamic.
+		 * We will just mark DDWs as "dynamic" since this is kdump path,
+		 * no need to worry about perforance. ddw_list_new_entry() will
+		 * set window->direct = false.
+		 */
 		window = ddw_list_new_entry(pdn, dma64);
 		if (!window) {
 			of_node_put(pdn);
@@ -1524,8 +1563,8 @@  static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 {
 	struct device_node *pdn, *dn;
 	struct iommu_table *tbl;
-	const __be32 *dma_window = NULL;
 	struct pci_dn *pci;
+	struct dynamic_dma_window_prop prop;
 
 	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
 
@@ -1538,7 +1577,7 @@  static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	dn = pci_device_to_OF_node(dev);
 	pr_debug("  node is %pOF\n", dn);
 
-	pdn = pci_dma_find(dn, &dma_window);
+	pdn = pci_dma_find(dn, &prop);
 	if (!pdn || !PCI_DN(pdn)) {
 		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
 		       "no DMA window found for pci dev=%s dn=%pOF\n",
@@ -1551,8 +1590,17 @@  static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	if (!pci->table_group) {
 		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
 		tbl = pci->table_group->tables[0];
-		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
-				pci->table_group, dma_window);
+
+		iommu_table_setparms_common(tbl, pci->phb->bus->number, prop.liobn,
+				prop.dma_base, 1ULL << prop.window_shift,
+				prop.tce_shift, NULL, &iommu_table_lpar_multi_ops);
+
+		/* Only for normal boot with default window. Doesn't matter even
+		 * if we set these with DDW which is 64bit during kdump, since
+		 * these will not be used during kdump.
+		 */
+		pci->table_group->tce32_start = prop.dma_base;
+		pci->table_group->tce32_size = 1ULL << prop.window_shift;
 
 		iommu_init_table(tbl, pci->phb->node, 0, 0);
 		iommu_register_group(pci->table_group,