diff mbox

[v4,07/21] powerpc/powernv: Release PEs dynamically

Message ID 1430460188-31343-8-git-send-email-gwshan@linux.vnet.ibm.com (mailing list archive)
State Superseded
Delegated to: Benjamin Herrenschmidt
Headers show

Commit Message

Gavin Shan May 1, 2015, 6:02 a.m. UTC
The original code doesn't support releasing PEs dynamically, meaning
that PE and the associated resources (IO, M32, M64 and DMA) can't
be released when unplugging a PCI adapter from one hotpluggable slot.

The patch takes object oriented methodology, introducs reference
count to PE, which is initialized to 1 and increased with 1 when a
new PCI device joins the PE. Once the last PCI device leaves the
PE, the PE is going to be release together with its associated
(IO, M32, M64, DMA) resources.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pci-bridge.h     |   3 +
 arch/powerpc/kernel/pci-hotplug.c         |   5 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
 arch/powerpc/platforms/powernv/pci.h      |   4 +-
 4 files changed, 432 insertions(+), 238 deletions(-)

Comments

Alexey Kardashevskiy May 9, 2015, 12:43 p.m. UTC | #1
On 05/01/2015 04:02 PM, Gavin Shan wrote:
> The original code doesn't support releasing PEs dynamically, meaning
> that PE and the associated resources (IO, M32, M64 and DMA) can't
> be released when unplugging a PCI adapter from one hotpluggable slot.
>
> The patch takes object oriented methodology, introducs reference
> count to PE, which is initialized to 1 and increased with 1 when a
> new PCI device joins the PE. Once the last PCI device leaves the
> PE, the PE is going to be release together with its associated
> (IO, M32, M64, DMA) resources.


Too little commit log for non-trivial non-cut-n-paste 30KB patch...


>
> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> ---
>   arch/powerpc/include/asm/pci-bridge.h     |   3 +
>   arch/powerpc/kernel/pci-hotplug.c         |   5 +
>   arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
>   arch/powerpc/platforms/powernv/pci.h      |   4 +-
>   4 files changed, 432 insertions(+), 238 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index 5367eb3..a6ad4b1 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -31,6 +31,9 @@ struct pci_controller_ops {
>   	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
>   	void		(*setup_bridge)(struct pci_bus *, unsigned long);
>   	void		(*reset_secondary_bus)(struct pci_dev *dev);
> +
> +	/* Called when PCI device is released */
> +	void		(*release_device)(struct pci_dev *);
>   };
>
>   /*
> diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
> index 7ed85a6..0040343 100644
> --- a/arch/powerpc/kernel/pci-hotplug.c
> +++ b/arch/powerpc/kernel/pci-hotplug.c
> @@ -29,6 +29,11 @@
>    */
>   void pcibios_release_device(struct pci_dev *dev)
>   {
> +	struct pci_controller *hose = pci_bus_to_host(dev->bus);
> +
> +	if (hose->controller_ops.release_device)
> +		hose->controller_ops.release_device(dev);
> +
>   	eeh_remove_device(dev);
>   }
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 910fb67..ef8c216 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -12,6 +12,8 @@
>   #undef DEBUG
>
>   #include <linux/kernel.h>
> +#include <linux/atomic.h>
> +#include <linux/kref.h>
>   #include <linux/pci.h>
>   #include <linux/crash_dump.h>
>   #include <linux/debugfs.h>
> @@ -47,6 +49,8 @@
>   /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>   #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
>
> +static void pnv_ioda_release_pe(struct kref *kref);
> +
>   static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>   			    const char *fmt, ...)
>   {
> @@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>   		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>   }
>
> -static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
> +static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>   {
> -	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
> -		pr_warn("%s: Invalid PE %d on PHB#%x\n",
> -			__func__, pe_no, phb->hose->global_number);
> +	if (!pe)
> +		return;
> +
> +	kref_get(&pe->kref);
> +}
> +
> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
> +{
> +	unsigned int count;
> +
> +	if (!pe)
>   		return;
> +
> +	/*
> +	 * The count is initialized to 1 and increased with 1 when
> +	 * a new PCI device is bound with the PE. Once the last PCI
> +	 * device is leaving from the PE, the PE is going to be
> +	 * released.
> +	 */
> +	count = atomic_read(&pe->kref.refcount);
> +	if (count == 2)
> +		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
> +	else
> +		kref_put(&pe->kref, pnv_ioda_release_pe);


What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?


> +}
> +
> +static void pnv_pci_release_device(struct pci_dev *pdev)
> +{
> +	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> +	struct pnv_phb *phb = hose->private_data;
> +	struct pci_dn *pdn = pci_get_pdn(pdev);
> +	struct pnv_ioda_pe *pe;
> +
> +	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
> +		pe = &phb->ioda.pe_array[pdn->pe_number];
> +		pnv_ioda_pe_put(pe);
> +		pdn->pe_number = IODA_INVALID_PE;
>   	}
> +}
>
> -	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
> -		pr_warn("%s: PE %d was assigned on PHB#%x\n",
> -			__func__, pe_no, phb->hose->global_number);
> +static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
> +{
> +	struct pnv_phb *phb = pe->phb;
> +	int index, count;
> +	unsigned long tbl_addr, tbl_size;
> +
> +	/* No DMA capability for slave PEs */
> +	if (pe->flags & PNV_IODA_PE_SLAVE)
> +		return;
> +
> +	/* Bypass DMA window */
> +	if (phb->type == PNV_PHB_IODA2 &&
> +	    pe->tce_bypass_enabled &&
> +	    pe->tce32_table &&
> +	    pe->tce32_table->set_bypass)
> +		pe->tce32_table->set_bypass(pe->tce32_table, false);
> +
> +	/* 32-bits DMA window */
> +	count = pe->tce32_seg_end - pe->tce32_seg_start;
> +	tbl_addr = pe->tce32_table->it_base;
> +	if (!count)
>   		return;
> +
> +	/* Free IOMMU table */
> +	iommu_free_table(pe->tce32_table,
> +			 of_node_full_name(phb->hose->dn));
> +
> +	/* Deconfigure TCE table */
> +	switch (phb->type) {
> +	case PNV_PHB_IODA1:
> +		for (index = 0; index < count; index++)
> +			opal_pci_map_pe_dma_window(phb->opal_id,
> +						   pe->pe_number,
> +						   pe->tce32_seg_start + index,
> +						   1,
> +						   __pa(tbl_addr) +
> +						   index * TCE32_TABLE_SIZE,
> +						   0,
> +						   0x1000);
> +		bitmap_clear(phb->ioda.tce32_segmap,
> +			     pe->tce32_seg_start,
> +			     count);
> +		tbl_size = TCE32_TABLE_SIZE * count;
> +		break;
> +	case PNV_PHB_IODA2:
> +		opal_pci_map_pe_dma_window(phb->opal_id,
> +					   pe->pe_number,
> +					   pe->pe_number << 1,
> +					   1,
> +					   __pa(tbl_addr),
> +					   0,
> +					   0x1000);
> +		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
> +		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
> +		break;
> +	default:
> +		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
> +		return;
> +	}
> +
> +	/* Free memory of IOMMU table */
> +	free_pages(tbl_addr, get_order(tbl_size));


You just programmed the table address to TVT and then you are releasing the 
pages. It does not seem right, it will leave garbage in TVT. Also, I am 
adding helpers to alloc/free TCE pages in DDW patchset, you could reuse 
bits from there (I'll post v10 soon, you'll be in copy and you'll have to 
review that ;) ).


> +	pe->tce32_table = NULL;
> +	pe->tce32_seg_start = 0;
> +	pe->tce32_seg_end = 0;
> +}
> +
> +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
> +{
> +	struct pnv_phb *phb = pe->phb;
> +	unsigned long *segmap = NULL, *pe_segmap = NULL;
> +	int i;
> +	uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
> +				     OPAL_M32_WINDOW_TYPE,
> +				     OPAL_M64_WINDOW_TYPE };
> +
> +	for (win = 0; win < ARRAY_SIZE(win_type); win++) {
> +		switch (win_type[win]) {
> +		case OPAL_IO_WINDOW_TYPE:
> +			segmap = phb->ioda.io_segmap;
> +			pe_segmap = pe->io_segmap;
> +			break;
> +		case OPAL_M32_WINDOW_TYPE:
> +			segmap = phb->ioda.m32_segmap;
> +			pe_segmap = pe->m32_segmap;
> +			break;
> +		case OPAL_M64_WINDOW_TYPE:
> +			segmap = phb->ioda.m64_segmap;
> +			pe_segmap = pe->m64_segmap;
> +			break;
> +		}
> +		i = -1;
> +		while ((i = find_next_bit(pe_segmap,
> +			phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
> +			if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
> +			    win_type[win] == OPAL_M32_WINDOW_TYPE)
> +				opal_pci_map_pe_mmio_window(phb->opal_id,
> +						phb->ioda.reserved_pe,
> +						win_type[win], 0, i);
> +			else if (phb->type == PNV_PHB_IODA1)
> +				opal_pci_map_pe_mmio_window(phb->opal_id,
> +						phb->ioda.reserved_pe,
> +						win_type[win],
> +						i / 8, i % 8);

The function is called ""release" but it programs something what looks like 
reasonable values, is it correct?



> +
> +			clear_bit(i, pe_segmap);
> +			clear_bit(i, segmap);
> +		}
> +	}
> +}
> +
> +static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
> +				  struct pnv_ioda_pe *parent,
> +				  struct pnv_ioda_pe *child,
> +				  bool is_add)
> +{
> +	const char *desc = is_add ? "adding" : "removing";
> +	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
> +			      OPAL_REMOVE_PE_FROM_DOMAIN;
> +	struct pnv_ioda_pe *slave;
> +	long rc;
> +
> +	/* Parent PE affects child PE */
> +	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
> +				child->pe_number, op);
> +	if (rc != OPAL_SUCCESS) {
> +		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
> +			rc, desc);
> +		return -ENXIO;
> +	}
> +
> +	if (!(child->flags & PNV_IODA_PE_MASTER))
> +		return 0;
> +
> +	/* Compound case: parent PE affects slave PEs */
> +	list_for_each_entry(slave, &child->slaves, list) {
> +		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
> +					slave->pe_number, op);
> +		if (rc != OPAL_SUCCESS) {
> +			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
> +				rc, desc);
> +			return -ENXIO;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
> +{
> +	struct pnv_phb *phb = pe->phb;
> +	struct pnv_ioda_pe *slave;
> +	struct pci_dev *pdev = NULL;
> +	int ret;
> +
> +	/*
> +	 * Clear PE frozen state. If it's master PE, we need
> +	 * clear slave PE frozen state as well.
> +	 */
> +	opal_pci_eeh_freeze_clear(phb->opal_id,
> +				  pe->pe_number,
> +				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> +	if (pe->flags & PNV_IODA_PE_MASTER) {
> +		list_for_each_entry(slave, &pe->slaves, list) {
> +			opal_pci_eeh_freeze_clear(phb->opal_id,
> +						  slave->pe_number,
> +						  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> +		}
> +	}
> +
> +	/*
> +	 * Associate PE in PELT. We need add the PE into the
> +	 * corresponding PELT-V as well. Otherwise, the error
> +	 * originated from the PE might contribute to other
> +	 * PEs.
> +	 */
> +	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
> +	if (ret)
> +		return ret;
> +
> +	/* For compound PEs, any one affects all of them */
> +	if (pe->flags & PNV_IODA_PE_MASTER) {
> +		list_for_each_entry(slave, &pe->slaves, list) {
> +			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
> +		pdev = pe->pbus->self;
> +	else if (pe->flags & PNV_IODA_PE_DEV)
> +		pdev = pe->pdev->bus->self;
> +#ifdef CONFIG_PCI_IOV
> +	else if (pe->flags & PNV_IODA_PE_VF)
> +		pdev = pe->parent_dev->bus->self;
> +#endif /* CONFIG_PCI_IOV */
> +
> +	while (pdev) {
> +		struct pci_dn *pdn = pci_get_pdn(pdev);
> +		struct pnv_ioda_pe *parent;
> +
> +		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
> +			parent = &phb->ioda.pe_array[pdn->pe_number];
> +			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
> +			if (ret)
> +				return ret;
> +		}
> +
> +		pdev = pdev->bus->self;
> +	}
> +
> +	return 0;
> +}
> +
> +static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)


It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just 
moving of this function to a different place deserves a separate patch with 
a comment why ("it is going to be used now for non-SRIOV case too" may be?).



> +{
> +	struct pnv_phb *phb = pe->phb;
> +	struct pci_dev *parent;
> +	uint8_t bcomp, dcomp, fcomp;
> +	long rid_end, rid;
> +	int64_t rc;
> +
> +	/* Tear down MVE */
> +	if (phb->type == PNV_PHB_IODA1 &&
> +	    pe->mve_number != -1) {
> +		rc = opal_pci_set_mve(phb->opal_id,
> +				      pe->mve_number,
> +				      phb->ioda.reserved_pe);
> +		if (rc != OPAL_SUCCESS)
> +			pe_warn(pe, "Error %lld unmapping MVE#%d\n",
> +				rc, pe->mve_number);
> +		rc = opal_pci_set_mve_enable(phb->opal_id,
> +					     pe->mve_number,
> +					     OPAL_DISABLE_MVE);
> +		if (rc != OPAL_SUCCESS)
> +			pe_warn(pe, "Error %lld disabling MVE#%d\n",
> +				rc, pe->mve_number);
> +		pe->mve_number = -1;
> +	}
> +
> +	/* Unmapping PELTV */
> +	pnv_ioda_set_peltv(pe, false);
> +
> +	/* To unmap PELTM */
> +	if (pe->pbus) {
> +		int count;
> +
> +		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
> +		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
> +		parent = pe->pbus->self;
> +		if (pe->flags & PNV_IODA_PE_BUS_ALL)
> +			count = pe->pbus->busn_res.end -
> +				pe->pbus->busn_res.start + 1;
> +		else
> +			count = 1;
> +
> +		switch(count) {
> +		case  1: bcomp = OpalPciBusAll;   break;
> +		case  2: bcomp = OpalPciBus7Bits; break;
> +		case  4: bcomp = OpalPciBus6Bits; break;
> +		case  8: bcomp = OpalPciBus5Bits; break;
> +		case 16: bcomp = OpalPciBus4Bits; break;
> +		case 32: bcomp = OpalPciBus3Bits; break;
> +		default:
> +			/* Fail back to case of one bus */
> +			pe_warn(pe, "Cannot support %d buses\n", count);
> +			bcomp = OpalPciBusAll;
> +		}
> +		rid_end = pe->rid + (count << 8);
> +	} else {
> +#ifdef CONFIG_PCI_IOV
> +		if (pe->flags & PNV_IODA_PE_VF)
> +			parent = pe->parent_dev;
> +		else
> +#endif
> +			parent = pe->pdev->bus->self;
> +		bcomp = OpalPciBusAll;
> +		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
> +		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
> +		rid_end = pe->rid + 1;
> +	}
> +
> +	/* Clear RID mapping */
> +	for (rid = pe->rid; rid < rid_end; rid++)
> +		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
> +
> +	/* Unmapping PELTM */
> +	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
> +			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
> +	if (rc)
> +		pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
> +}
> +
> +static void pnv_ioda_release_pe(struct kref *kref)
> +{
> +	struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
> +	struct pnv_ioda_pe *tmp, *slave;
> +	struct pnv_phb *phb = pe->phb;
> +
> +	pnv_ioda_release_pe_dma(pe);
> +	pnv_ioda_release_pe_seg(pe);
> +	pnv_ioda_deconfigure_pe(pe);
> +
> +	/* Release slave PEs for compound PE */
> +	if (pe->flags & PNV_IODA_PE_MASTER) {
> +		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
> +			pnv_ioda_pe_put(slave);
> +	}
> +
> +	/* Remove the PE from various list. We need remove slave
> +	 * PE from master's list.
> +	 */
> +	list_del(&pe->dma_link);
> +	list_del(&pe->list);
> +
> +	/* Free PE number */
> +	clear_bit(pe->pe_number, phb->ioda.pe_alloc);
> +}
> +
> +static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
> +					    int pe_no)
> +{
> +	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
> +
> +	kref_init(&pe->kref);
> +	pe->phb = phb;
> +	pe->pe_number = pe_no;
> +	INIT_LIST_HEAD(&pe->dma_link);
> +	INIT_LIST_HEAD(&pe->list);
> +
> +	return pe;
> +}
> +
> +static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
> +					       int pe_no)
> +{
> +	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
> +		pr_warn("%s: Invalid PE %d on PHB#%x\n",
> +			__func__, pe_no, phb->hose->global_number);
> +		return NULL;
>   	}
>
> -	phb->ioda.pe_array[pe_no].phb = phb;
> -	phb->ioda.pe_array[pe_no].pe_number = pe_no;
> +	/*
> +	 * Same PE might be reserved for multiple times, which
> +	 * is out of problem actually.
> +	 */
> +	set_bit(pe_no, phb->ioda.pe_alloc);
> +	return pnv_ioda_init_pe(phb, pe_no);
>   }
>
> -static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
> +static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>   {
>   	unsigned long pe_no;
>   	unsigned long limit = phb->ioda.total_pe - 1;
> @@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>   			break;
>
>   		if (--limit >= phb->ioda.total_pe)
> -			return IODA_INVALID_PE;
> +			return NULL;
>   	} while(1);
>
> -	phb->ioda.pe_array[pe_no].phb = phb;
> -	phb->ioda.pe_array[pe_no].pe_number = pe_no;
> -	return pe_no;
> -}
> -
> -static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
> -{
> -	WARN_ON(phb->ioda.pe_array[pe].pdev);
> -
> -	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
> -	clear_bit(pe, phb->ioda.pe_alloc);
> +	return pnv_ioda_init_pe(phb, pe_no);
>   }
>
>   static int pnv_ioda1_init_m64(struct pnv_phb *phb)
> @@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
>   	}
>   }
>
> -static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
> -				struct pci_bus *bus, int all)
> +static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
> +						struct pci_bus *bus,
> +						int all)


Mechanic changes like this could easily go to a separate patch.


>   {
>   	resource_size_t segsz = phb->ioda.m64_segsize;
>   	struct pci_dev *pdev;
> @@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>   	int i;
>
>   	if (!pnv_ioda_need_m64_pe(phb, bus))
> -		return IODA_INVALID_PE;
> +		return NULL;
>
>           /* Allocate bitmap */
>   	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
>   	pe_bitsmap = kzalloc(size, GFP_KERNEL);
>   	if (!pe_bitsmap) {
>   		pr_warn("%s: Out of memory !\n", __func__);
> -		return IODA_INVALID_PE;
> +		return NULL;
>   	}
>
>   	/* The bridge's M64 window might be extended to PHB's M64
> @@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>   	/* No M64 window found ? */
>   	if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
>   		kfree(pe_bitsmap);
> -		return IODA_INVALID_PE;
> +		return NULL;
>   	}
>
>   	/* Figure out the master PE and put all slave PEs
> @@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>   	}
>
>   	kfree(pe_bitsmap);
> -	return master_pe->pe_number;
> +	return master_pe;
>   }
>
>   static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
> @@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
>    * but in the meantime, we need to protect them to avoid warnings
>    */
>   #ifdef CONFIG_PCI_MSI
> -static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
> +static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
>   {
>   	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>   	struct pnv_phb *phb = hose->private_data;
> @@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>   }
>   #endif /* CONFIG_PCI_MSI */
>
> -static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
> -				  struct pnv_ioda_pe *parent,
> -				  struct pnv_ioda_pe *child,
> -				  bool is_add)
> -{
> -	const char *desc = is_add ? "adding" : "removing";
> -	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
> -			      OPAL_REMOVE_PE_FROM_DOMAIN;
> -	struct pnv_ioda_pe *slave;
> -	long rc;
> -
> -	/* Parent PE affects child PE */
> -	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
> -				child->pe_number, op);
> -	if (rc != OPAL_SUCCESS) {
> -		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
> -			rc, desc);
> -		return -ENXIO;
> -	}
> -
> -	if (!(child->flags & PNV_IODA_PE_MASTER))
> -		return 0;
> -
> -	/* Compound case: parent PE affects slave PEs */
> -	list_for_each_entry(slave, &child->slaves, list) {
> -		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
> -					slave->pe_number, op);
> -		if (rc != OPAL_SUCCESS) {
> -			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
> -				rc, desc);
> -			return -ENXIO;
> -		}
> -	}
> -
> -	return 0;
> -}
> -
> -static int pnv_ioda_set_peltv(struct pnv_phb *phb,
> -			      struct pnv_ioda_pe *pe,
> -			      bool is_add)
> -{
> -	struct pnv_ioda_pe *slave;
> -	struct pci_dev *pdev = NULL;
> -	int ret;
> -
> -	/*
> -	 * Clear PE frozen state. If it's master PE, we need
> -	 * clear slave PE frozen state as well.
> -	 */
> -	if (is_add) {
> -		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
> -					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> -		if (pe->flags & PNV_IODA_PE_MASTER) {
> -			list_for_each_entry(slave, &pe->slaves, list)
> -				opal_pci_eeh_freeze_clear(phb->opal_id,
> -							  slave->pe_number,
> -							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> -		}
> -	}
> -
> -	/*
> -	 * Associate PE in PELT. We need add the PE into the
> -	 * corresponding PELT-V as well. Otherwise, the error
> -	 * originated from the PE might contribute to other
> -	 * PEs.
> -	 */
> -	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
> -	if (ret)
> -		return ret;
> -
> -	/* For compound PEs, any one affects all of them */
> -	if (pe->flags & PNV_IODA_PE_MASTER) {
> -		list_for_each_entry(slave, &pe->slaves, list) {
> -			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
> -			if (ret)
> -				return ret;
> -		}
> -	}
> -
> -	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
> -		pdev = pe->pbus->self;
> -	else if (pe->flags & PNV_IODA_PE_DEV)
> -		pdev = pe->pdev->bus->self;
> -#ifdef CONFIG_PCI_IOV
> -	else if (pe->flags & PNV_IODA_PE_VF)
> -		pdev = pe->parent_dev->bus->self;
> -#endif /* CONFIG_PCI_IOV */
> -	while (pdev) {
> -		struct pci_dn *pdn = pci_get_pdn(pdev);
> -		struct pnv_ioda_pe *parent;
> -
> -		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
> -			parent = &phb->ioda.pe_array[pdn->pe_number];
> -			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
> -			if (ret)
> -				return ret;
> -		}
> -
> -		pdev = pdev->bus->self;
> -	}
> -
> -	return 0;
> -}
> -
> -#ifdef CONFIG_PCI_IOV
> -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> -{
> -	struct pci_dev *parent;
> -	uint8_t bcomp, dcomp, fcomp;
> -	int64_t rc;
> -	long rid_end, rid;
> -
> -	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
> -	if (pe->pbus) {
> -		int count;
> -
> -		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
> -		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
> -		parent = pe->pbus->self;
> -		if (pe->flags & PNV_IODA_PE_BUS_ALL)
> -			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
> -		else
> -			count = 1;
> -
> -		switch(count) {
> -		case  1: bcomp = OpalPciBusAll;         break;
> -		case  2: bcomp = OpalPciBus7Bits;       break;
> -		case  4: bcomp = OpalPciBus6Bits;       break;
> -		case  8: bcomp = OpalPciBus5Bits;       break;
> -		case 16: bcomp = OpalPciBus4Bits;       break;
> -		case 32: bcomp = OpalPciBus3Bits;       break;
> -		default:
> -			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
> -			        count);
> -			/* Do an exact match only */
> -			bcomp = OpalPciBusAll;
> -		}
> -		rid_end = pe->rid + (count << 8);
> -	} else {
> -		if (pe->flags & PNV_IODA_PE_VF)
> -			parent = pe->parent_dev;
> -		else
> -			parent = pe->pdev->bus->self;
> -		bcomp = OpalPciBusAll;
> -		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
> -		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
> -		rid_end = pe->rid + 1;
> -	}
> -
> -	/* Clear the reverse map */
> -	for (rid = pe->rid; rid < rid_end; rid++)
> -		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
> -
> -	/* Release from all parents PELT-V */
> -	while (parent) {
> -		struct pci_dn *pdn = pci_get_pdn(parent);
> -		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
> -			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
> -						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
> -			/* XXX What to do in case of error ? */


Not much :) Free associated memory and mark it "dead" so it won't be used 
again till reboot. In what circumstance can this opal_pci_set_peltv() fail 
at all?


> -		}
> -		parent = parent->bus->self;
> -	}
> -
> -	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
> -				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> -
> -	/* Disassociate PE in PELT */
> -	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
> -				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
> -	if (rc)
> -		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
> -	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
> -			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
> -	if (rc)
> -		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
> -
> -	pe->pbus = NULL;
> -	pe->pdev = NULL;
> -	pe->parent_dev = NULL;
> -
> -	return 0;
> -}
> -#endif /* CONFIG_PCI_IOV */
> -
>   static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>   {
>   	struct pci_dev *parent;
> @@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>   	}
>
>   	/* Configure PELTV */
> -	pnv_ioda_set_peltv(phb, pe, true);
> +	pnv_ioda_set_peltv(pe, true);
>
>   	/* Setup reverse map */
>   	for (rid = pe->rid; rid < rid_end; rid++)
> @@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>   		if (pdn->pe_number != IODA_INVALID_PE)
>   			continue;
>
> +		/* Increase reference count of the parent PE */

When you comment like this, I read it as the comment belongs to the whole 
next chunk till the first empty line, i.e. to all 5 lines below, which is 
not the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() 
name suggests incrementing the reference counter 2) "pe" is always parent 
in this function. I do not insist though.


> +		pnv_ioda_pe_get(pe);
>   		pdn->pe_number = pe->pe_number;
>   		pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
>   		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> @@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>   {
>   	struct pci_controller *hose = pci_bus_to_host(bus);
>   	struct pnv_phb *phb = hose->private_data;
> -	struct pnv_ioda_pe *pe;
> +	struct pnv_ioda_pe *pe = NULL;
>   	int pe_num = IODA_INVALID_PE;
>
>   	/* For partial hotplug case, the PE instance hasn't been destroyed
> @@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>   	}
>
>   	/* PE number for root bus should have been reserved */
> -	if (pci_is_root_bus(bus))
> -		pe_num = phb->ioda.root_pe_no;
> +	if (pci_is_root_bus(bus) &&
> +	    phb->ioda.root_pe_no != IODA_INVALID_PE)
> +		pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
>
>   	/* Check if PE is determined by M64 */
> -	if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
> -		pe_num = phb->pick_m64_pe(phb, bus, all);
> +	if (!pe && phb->pick_m64_pe)
> +		pe = phb->pick_m64_pe(phb, bus, all);
>
>   	/* The PE number isn't pinned by M64 */
> -	if (pe_num == IODA_INVALID_PE)
> -		pe_num = pnv_ioda_alloc_pe(phb);
> +	if (!pe)
> +		pe = pnv_ioda_alloc_pe(phb);
>
> -	if (pe_num == IODA_INVALID_PE) {
> -		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
> +	if (!pe) {
> +		pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
>   			__func__, pci_domain_nr(bus), bus->number);
>   		return NULL;
>   	}
>
> -	pe = &phb->ioda.pe_array[pe_num];
>   	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>   	pe->pbus = bus;
>   	pe->pdev = NULL;
> @@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>
>   	if (pnv_ioda_configure_pe(phb, pe)) {
>   		/* XXX What do we do here ? */
> -		if (pe_num)
> -			pnv_ioda_free_pe(phb, pe_num);
> -		pe->pbus = NULL;
> +		pnv_ioda_pe_put(pe);
>   		return NULL;
>   	}
>
>   	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
> -			GFP_KERNEL, hose->node);
> +				       GFP_KERNEL, hose->node);

Seems like spaces change only - if you really want this change (which I 
hate - makes code look inaccurate to my taste but it seems I am in minority 
here :) ), please put it to the separate patch.


>   	pe->tce32_table->data = pe;
>
>   	/* Associate it with all child devices */
> @@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>   		list_del(&pe->list);
>   		mutex_unlock(&phb->ioda.pe_list_mutex);
>
> -		pnv_ioda_deconfigure_pe(phb, pe);
> +		pnv_ioda_deconfigure_pe(pe);


Is this change necessary to get "Release PEs dynamically" working? Move it 
to mechanical changes patch may be?


>
> -		pnv_ioda_free_pe(phb, pe->pe_number);
> +		pnv_ioda_pe_put(pe);
>   	}
>   }
>
> @@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>
>   		if (pnv_ioda_configure_pe(phb, pe)) {
>   			/* XXX What do we do here ? */
> -			if (pe_num)
> -				pnv_ioda_free_pe(phb, pe_num);
> -			pe->pdev = NULL;
> +			pnv_ioda_pe_put(pe);
>   			continue;
>   		}
>
> @@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
>   	struct pnv_ioda_pe *pe;
>   	int rc;
>
> -	pe = pnv_ioda_get_pe(dev);
> +	pe = pnv_ioda_pci_dev_to_pe(dev);


And this change could to separately. Not clear how this helps to "Release 
PEs dynamically".


>   	if (!pe)
>   		return -ENODEV;
>
> @@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
>   	struct pnv_ioda_pe *pe;
>   	int rc;
>
> -	if (!(pe = pnv_ioda_get_pe(dev)))
> +	if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
>   		return -ENODEV;
>
>   	/* Assign XIVE to PE */
> @@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
>   				  unsigned int hwirq, unsigned int virq,
>   				  unsigned int is_64, struct msi_msg *msg)
>   {
> -	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
> +	struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
>   	unsigned int xive_num = hwirq - phb->msi_base;
>   	__be32 data;
>   	int rc;
> @@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>   	pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
>   	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
>   	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
> +	pnv_pci_controller_ops.release_device = pnv_pci_release_device;
>   	hose->controller_ops = pnv_pci_controller_ops;
>
>   #ifdef CONFIG_PCI_IOV
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 1bea3a8..8b10f01 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -28,6 +28,7 @@ enum pnv_phb_model {
>   /* Data associated with a PE, including IOMMU tracking etc.. */
>   struct pnv_phb;
>   struct pnv_ioda_pe {
> +	struct kref		kref;
>   	unsigned long		flags;
>   	struct pnv_phb		*phb;
>
> @@ -120,7 +121,8 @@ struct pnv_phb {
>   	void (*shutdown)(struct pnv_phb *phb);
>   	int (*init_m64)(struct pnv_phb *phb);
>   	void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
> -	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
> +	struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
> +					   struct pci_bus *bus, int all);
>   	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>   	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
>   	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
>
Gavin Shan May 11, 2015, 6:25 a.m. UTC | #2
On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote:
>On 05/01/2015 04:02 PM, Gavin Shan wrote:
>>The original code doesn't support releasing PEs dynamically, meaning
>>that PE and the associated resources (IO, M32, M64 and DMA) can't
>>be released when unplugging a PCI adapter from one hotpluggable slot.
>>
>>The patch takes object oriented methodology, introducs reference
>>count to PE, which is initialized to 1 and increased with 1 when a
>>new PCI device joins the PE. Once the last PCI device leaves the
>>PE, the PE is going to be release together with its associated
>>(IO, M32, M64, DMA) resources.
>
>
>Too little commit log for non-trivial non-cut-n-paste 30KB patch...
>

Ok. I'll add more details in next revision.

>>
>>Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>---
>>  arch/powerpc/include/asm/pci-bridge.h     |   3 +
>>  arch/powerpc/kernel/pci-hotplug.c         |   5 +
>>  arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
>>  arch/powerpc/platforms/powernv/pci.h      |   4 +-
>>  4 files changed, 432 insertions(+), 238 deletions(-)
>>
>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>index 5367eb3..a6ad4b1 100644
>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>@@ -31,6 +31,9 @@ struct pci_controller_ops {
>>  	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
>>  	void		(*setup_bridge)(struct pci_bus *, unsigned long);
>>  	void		(*reset_secondary_bus)(struct pci_dev *dev);
>>+
>>+	/* Called when PCI device is released */
>>+	void		(*release_device)(struct pci_dev *);
>>  };
>>
>>  /*
>>diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
>>index 7ed85a6..0040343 100644
>>--- a/arch/powerpc/kernel/pci-hotplug.c
>>+++ b/arch/powerpc/kernel/pci-hotplug.c
>>@@ -29,6 +29,11 @@
>>   */
>>  void pcibios_release_device(struct pci_dev *dev)
>>  {
>>+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>+
>>+	if (hose->controller_ops.release_device)
>>+		hose->controller_ops.release_device(dev);
>>+
>>  	eeh_remove_device(dev);
>>  }
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index 910fb67..ef8c216 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -12,6 +12,8 @@
>>  #undef DEBUG
>>
>>  #include <linux/kernel.h>
>>+#include <linux/atomic.h>
>>+#include <linux/kref.h>
>>  #include <linux/pci.h>
>>  #include <linux/crash_dump.h>
>>  #include <linux/debugfs.h>
>>@@ -47,6 +49,8 @@
>>  /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>>  #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
>>
>>+static void pnv_ioda_release_pe(struct kref *kref);
>>+
>>  static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>>  			    const char *fmt, ...)
>>  {
>>@@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>>  		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>>  }
>>
>>-static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>>+static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>  {
>>-	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>-		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>-			__func__, pe_no, phb->hose->global_number);
>>+	if (!pe)
>>+		return;
>>+
>>+	kref_get(&pe->kref);
>>+}
>>+
>>+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>+{
>>+	unsigned int count;
>>+
>>+	if (!pe)
>>  		return;
>>+
>>+	/*
>>+	 * The count is initialized to 1 and increased with 1 when
>>+	 * a new PCI device is bound with the PE. Once the last PCI
>>+	 * device is leaving from the PE, the PE is going to be
>>+	 * released.
>>+	 */
>>+	count = atomic_read(&pe->kref.refcount);
>>+	if (count == 2)
>>+		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
>>+	else
>>+		kref_put(&pe->kref, pnv_ioda_release_pe);
>
>
>What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?
>

Yeah, that would have problem. But it shouldn't happen because the
PCI devices are joining the parent PE# in strictly serialized mode.
Same thing happens when detaching PCI devices from its parent PE.

>>+}
>>+
>>+static void pnv_pci_release_device(struct pci_dev *pdev)
>>+{
>>+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>+	struct pnv_phb *phb = hose->private_data;
>>+	struct pci_dn *pdn = pci_get_pdn(pdev);
>>+	struct pnv_ioda_pe *pe;
>>+
>>+	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>+		pe = &phb->ioda.pe_array[pdn->pe_number];
>>+		pnv_ioda_pe_put(pe);
>>+		pdn->pe_number = IODA_INVALID_PE;
>>  	}
>>+}
>>
>>-	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
>>-		pr_warn("%s: PE %d was assigned on PHB#%x\n",
>>-			__func__, pe_no, phb->hose->global_number);
>>+static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	int index, count;
>>+	unsigned long tbl_addr, tbl_size;
>>+
>>+	/* No DMA capability for slave PEs */
>>+	if (pe->flags & PNV_IODA_PE_SLAVE)
>>+		return;
>>+
>>+	/* Bypass DMA window */
>>+	if (phb->type == PNV_PHB_IODA2 &&
>>+	    pe->tce_bypass_enabled &&
>>+	    pe->tce32_table &&
>>+	    pe->tce32_table->set_bypass)
>>+		pe->tce32_table->set_bypass(pe->tce32_table, false);
>>+
>>+	/* 32-bits DMA window */
>>+	count = pe->tce32_seg_end - pe->tce32_seg_start;
>>+	tbl_addr = pe->tce32_table->it_base;
>>+	if (!count)
>>  		return;
>>+
>>+	/* Free IOMMU table */
>>+	iommu_free_table(pe->tce32_table,
>>+			 of_node_full_name(phb->hose->dn));
>>+
>>+	/* Deconfigure TCE table */
>>+	switch (phb->type) {
>>+	case PNV_PHB_IODA1:
>>+		for (index = 0; index < count; index++)
>>+			opal_pci_map_pe_dma_window(phb->opal_id,
>>+						   pe->pe_number,
>>+						   pe->tce32_seg_start + index,
>>+						   1,
>>+						   __pa(tbl_addr) +
>>+						   index * TCE32_TABLE_SIZE,
>>+						   0,
>>+						   0x1000);
>>+		bitmap_clear(phb->ioda.tce32_segmap,
>>+			     pe->tce32_seg_start,
>>+			     count);
>>+		tbl_size = TCE32_TABLE_SIZE * count;
>>+		break;
>>+	case PNV_PHB_IODA2:
>>+		opal_pci_map_pe_dma_window(phb->opal_id,
>>+					   pe->pe_number,
>>+					   pe->pe_number << 1,
>>+					   1,
>>+					   __pa(tbl_addr),
>>+					   0,
>>+					   0x1000);
>>+		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
>>+		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
>>+		break;
>>+	default:
>>+		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
>>+		return;
>>+	}
>>+
>>+	/* Free memory of IOMMU table */
>>+	free_pages(tbl_addr, get_order(tbl_size));
>
>
>You just programmed the table address to TVT and then you are releasing the
>pages. It does not seem right, it will leave garbage in TVT. Also, I am
>adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits
>from there (I'll post v10 soon, you'll be in copy and you'll have to review
>that ;) ).
>

I assume you're talking about TVE. I don't understand how garbage will be left
in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE
with zero'ed "tce_table_size". The pages previously allocated for TCE table is
released to buddy system, which can be allocated by somebody else (from buddy
or slab).

Ok. Please put me into the cc list. I guess the whole series of patches is
better to rebased on your DDW patchset, which is to be merged first, I believe.

>
>>+	pe->tce32_table = NULL;
>>+	pe->tce32_seg_start = 0;
>>+	pe->tce32_seg_end = 0;
>>+}
>>+
>>+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	unsigned long *segmap = NULL, *pe_segmap = NULL;
>>+	int i;
>>+	uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
>>+				     OPAL_M32_WINDOW_TYPE,
>>+				     OPAL_M64_WINDOW_TYPE };
>>+
>>+	for (win = 0; win < ARRAY_SIZE(win_type); win++) {
>>+		switch (win_type[win]) {
>>+		case OPAL_IO_WINDOW_TYPE:
>>+			segmap = phb->ioda.io_segmap;
>>+			pe_segmap = pe->io_segmap;
>>+			break;
>>+		case OPAL_M32_WINDOW_TYPE:
>>+			segmap = phb->ioda.m32_segmap;
>>+			pe_segmap = pe->m32_segmap;
>>+			break;
>>+		case OPAL_M64_WINDOW_TYPE:
>>+			segmap = phb->ioda.m64_segmap;
>>+			pe_segmap = pe->m64_segmap;
>>+			break;
>>+		}
>>+		i = -1;
>>+		while ((i = find_next_bit(pe_segmap,
>>+			phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
>>+			if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
>>+			    win_type[win] == OPAL_M32_WINDOW_TYPE)
>>+				opal_pci_map_pe_mmio_window(phb->opal_id,
>>+						phb->ioda.reserved_pe,
>>+						win_type[win], 0, i);
>>+			else if (phb->type == PNV_PHB_IODA1)
>>+				opal_pci_map_pe_mmio_window(phb->opal_id,
>>+						phb->ioda.reserved_pe,
>>+						win_type[win],
>>+						i / 8, i % 8);
>
>The function is called ""release" but it programs something what looks like
>reasonable values, is it correct?
>

It's out of problem, When the segment is deallocated, it's mapped to the
reserved PE#.

>
>
>>+
>>+			clear_bit(i, pe_segmap);
>>+			clear_bit(i, segmap);
>>+		}
>>+	}
>>+}
>>+
>>+static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>+				  struct pnv_ioda_pe *parent,
>>+				  struct pnv_ioda_pe *child,
>>+				  bool is_add)
>>+{
>>+	const char *desc = is_add ? "adding" : "removing";
>>+	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>+			      OPAL_REMOVE_PE_FROM_DOMAIN;
>>+	struct pnv_ioda_pe *slave;
>>+	long rc;
>>+
>>+	/* Parent PE affects child PE */
>>+	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>+				child->pe_number, op);
>>+	if (rc != OPAL_SUCCESS) {
>>+		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>+			rc, desc);
>>+		return -ENXIO;
>>+	}
>>+
>>+	if (!(child->flags & PNV_IODA_PE_MASTER))
>>+		return 0;
>>+
>>+	/* Compound case: parent PE affects slave PEs */
>>+	list_for_each_entry(slave, &child->slaves, list) {
>>+		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>+					slave->pe_number, op);
>>+		if (rc != OPAL_SUCCESS) {
>>+			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>+				rc, desc);
>>+			return -ENXIO;
>>+		}
>>+	}
>>+
>>+	return 0;
>>+}
>>+
>>+static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	struct pnv_ioda_pe *slave;
>>+	struct pci_dev *pdev = NULL;
>>+	int ret;
>>+
>>+	/*
>>+	 * Clear PE frozen state. If it's master PE, we need
>>+	 * clear slave PE frozen state as well.
>>+	 */
>>+	opal_pci_eeh_freeze_clear(phb->opal_id,
>>+				  pe->pe_number,
>>+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>+		list_for_each_entry(slave, &pe->slaves, list) {
>>+			opal_pci_eeh_freeze_clear(phb->opal_id,
>>+						  slave->pe_number,
>>+						  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>+		}
>>+	}
>>+
>>+	/*
>>+	 * Associate PE in PELT. We need add the PE into the
>>+	 * corresponding PELT-V as well. Otherwise, the error
>>+	 * originated from the PE might contribute to other
>>+	 * PEs.
>>+	 */
>>+	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>+	if (ret)
>>+		return ret;
>>+
>>+	/* For compound PEs, any one affects all of them */
>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>+		list_for_each_entry(slave, &pe->slaves, list) {
>>+			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>+			if (ret)
>>+				return ret;
>>+		}
>>+	}
>>+
>>+	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>+		pdev = pe->pbus->self;
>>+	else if (pe->flags & PNV_IODA_PE_DEV)
>>+		pdev = pe->pdev->bus->self;
>>+#ifdef CONFIG_PCI_IOV
>>+	else if (pe->flags & PNV_IODA_PE_VF)
>>+		pdev = pe->parent_dev->bus->self;
>>+#endif /* CONFIG_PCI_IOV */
>>+
>>+	while (pdev) {
>>+		struct pci_dn *pdn = pci_get_pdn(pdev);
>>+		struct pnv_ioda_pe *parent;
>>+
>>+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>+			parent = &phb->ioda.pe_array[pdn->pe_number];
>>+			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>+			if (ret)
>>+				return ret;
>>+		}
>>+
>>+		pdev = pdev->bus->self;
>>+	}
>>+
>>+	return 0;
>>+}
>>+
>>+static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)
>
>
>It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just
>moving of this function to a different place deserves a separate patch with a
>comment why ("it is going to be used now for non-SRIOV case too" may be?).
>

Yeah, it makes sense to me. Will fix it up.

>
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	struct pci_dev *parent;
>>+	uint8_t bcomp, dcomp, fcomp;
>>+	long rid_end, rid;
>>+	int64_t rc;
>>+
>>+	/* Tear down MVE */
>>+	if (phb->type == PNV_PHB_IODA1 &&
>>+	    pe->mve_number != -1) {
>>+		rc = opal_pci_set_mve(phb->opal_id,
>>+				      pe->mve_number,
>>+				      phb->ioda.reserved_pe);
>>+		if (rc != OPAL_SUCCESS)
>>+			pe_warn(pe, "Error %lld unmapping MVE#%d\n",
>>+				rc, pe->mve_number);
>>+		rc = opal_pci_set_mve_enable(phb->opal_id,
>>+					     pe->mve_number,
>>+					     OPAL_DISABLE_MVE);
>>+		if (rc != OPAL_SUCCESS)
>>+			pe_warn(pe, "Error %lld disabling MVE#%d\n",
>>+				rc, pe->mve_number);
>>+		pe->mve_number = -1;
>>+	}
>>+
>>+	/* Unmapping PELTV */
>>+	pnv_ioda_set_peltv(pe, false);
>>+
>>+	/* To unmap PELTM */
>>+	if (pe->pbus) {
>>+		int count;
>>+
>>+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>+		parent = pe->pbus->self;
>>+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>+			count = pe->pbus->busn_res.end -
>>+				pe->pbus->busn_res.start + 1;
>>+		else
>>+			count = 1;
>>+
>>+		switch(count) {
>>+		case  1: bcomp = OpalPciBusAll;   break;
>>+		case  2: bcomp = OpalPciBus7Bits; break;
>>+		case  4: bcomp = OpalPciBus6Bits; break;
>>+		case  8: bcomp = OpalPciBus5Bits; break;
>>+		case 16: bcomp = OpalPciBus4Bits; break;
>>+		case 32: bcomp = OpalPciBus3Bits; break;
>>+		default:
>>+			/* Fail back to case of one bus */
>>+			pe_warn(pe, "Cannot support %d buses\n", count);
>>+			bcomp = OpalPciBusAll;
>>+		}
>>+		rid_end = pe->rid + (count << 8);
>>+	} else {
>>+#ifdef CONFIG_PCI_IOV
>>+		if (pe->flags & PNV_IODA_PE_VF)
>>+			parent = pe->parent_dev;
>>+		else
>>+#endif
>>+			parent = pe->pdev->bus->self;
>>+		bcomp = OpalPciBusAll;
>>+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>+		rid_end = pe->rid + 1;
>>+	}
>>+
>>+	/* Clear RID mapping */
>>+	for (rid = pe->rid; rid < rid_end; rid++)
>>+		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>+
>>+	/* Unmapping PELTM */
>>+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>+			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>+	if (rc)
>>+		pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
>>+}
>>+
>>+static void pnv_ioda_release_pe(struct kref *kref)
>>+{
>>+	struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
>>+	struct pnv_ioda_pe *tmp, *slave;
>>+	struct pnv_phb *phb = pe->phb;
>>+
>>+	pnv_ioda_release_pe_dma(pe);
>>+	pnv_ioda_release_pe_seg(pe);
>>+	pnv_ioda_deconfigure_pe(pe);
>>+
>>+	/* Release slave PEs for compound PE */
>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>+		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
>>+			pnv_ioda_pe_put(slave);
>>+	}
>>+
>>+	/* Remove the PE from various list. We need remove slave
>>+	 * PE from master's list.
>>+	 */
>>+	list_del(&pe->dma_link);
>>+	list_del(&pe->list);
>>+
>>+	/* Free PE number */
>>+	clear_bit(pe->pe_number, phb->ioda.pe_alloc);
>>+}
>>+
>>+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
>>+					    int pe_no)
>>+{
>>+	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
>>+
>>+	kref_init(&pe->kref);
>>+	pe->phb = phb;
>>+	pe->pe_number = pe_no;
>>+	INIT_LIST_HEAD(&pe->dma_link);
>>+	INIT_LIST_HEAD(&pe->list);
>>+
>>+	return pe;
>>+}
>>+
>>+static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
>>+					       int pe_no)
>>+{
>>+	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>+		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>+			__func__, pe_no, phb->hose->global_number);
>>+		return NULL;
>>  	}
>>
>>-	phb->ioda.pe_array[pe_no].phb = phb;
>>-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>+	/*
>>+	 * Same PE might be reserved for multiple times, which
>>+	 * is out of problem actually.
>>+	 */
>>+	set_bit(pe_no, phb->ioda.pe_alloc);
>>+	return pnv_ioda_init_pe(phb, pe_no);
>>  }
>>
>>-static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>  {
>>  	unsigned long pe_no;
>>  	unsigned long limit = phb->ioda.total_pe - 1;
>>@@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>  			break;
>>
>>  		if (--limit >= phb->ioda.total_pe)
>>-			return IODA_INVALID_PE;
>>+			return NULL;
>>  	} while(1);
>>
>>-	phb->ioda.pe_array[pe_no].phb = phb;
>>-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>-	return pe_no;
>>-}
>>-
>>-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>>-{
>>-	WARN_ON(phb->ioda.pe_array[pe].pdev);
>>-
>>-	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
>>-	clear_bit(pe, phb->ioda.pe_alloc);
>>+	return pnv_ioda_init_pe(phb, pe_no);
>>  }
>>
>>  static int pnv_ioda1_init_m64(struct pnv_phb *phb)
>>@@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
>>  	}
>>  }
>>
>>-static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>-				struct pci_bus *bus, int all)
>>+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>+						struct pci_bus *bus,
>>+						int all)
>
>
>Mechanic changes like this could easily go to a separate patch.
>

Indeed. I'll see how I can split the patches up in next revision.
Thanks for the suggestion.

>>  {
>>  	resource_size_t segsz = phb->ioda.m64_segsize;
>>  	struct pci_dev *pdev;
>>@@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>  	int i;
>>
>>  	if (!pnv_ioda_need_m64_pe(phb, bus))
>>-		return IODA_INVALID_PE;
>>+		return NULL;
>>
>>          /* Allocate bitmap */
>>  	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
>>  	pe_bitsmap = kzalloc(size, GFP_KERNEL);
>>  	if (!pe_bitsmap) {
>>  		pr_warn("%s: Out of memory !\n", __func__);
>>-		return IODA_INVALID_PE;
>>+		return NULL;
>>  	}
>>
>>  	/* The bridge's M64 window might be extended to PHB's M64
>>@@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>  	/* No M64 window found ? */
>>  	if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
>>  		kfree(pe_bitsmap);
>>-		return IODA_INVALID_PE;
>>+		return NULL;
>>  	}
>>
>>  	/* Figure out the master PE and put all slave PEs
>>@@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>  	}
>>
>>  	kfree(pe_bitsmap);
>>-	return master_pe->pe_number;
>>+	return master_pe;
>>  }
>>
>>  static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
>>@@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
>>   * but in the meantime, we need to protect them to avoid warnings
>>   */
>>  #ifdef CONFIG_PCI_MSI
>>-static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>+static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
>>  {
>>  	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>  	struct pnv_phb *phb = hose->private_data;
>>@@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>  }
>>  #endif /* CONFIG_PCI_MSI */
>>
>>-static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>-				  struct pnv_ioda_pe *parent,
>>-				  struct pnv_ioda_pe *child,
>>-				  bool is_add)
>>-{
>>-	const char *desc = is_add ? "adding" : "removing";
>>-	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>-			      OPAL_REMOVE_PE_FROM_DOMAIN;
>>-	struct pnv_ioda_pe *slave;
>>-	long rc;
>>-
>>-	/* Parent PE affects child PE */
>>-	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>-				child->pe_number, op);
>>-	if (rc != OPAL_SUCCESS) {
>>-		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>-			rc, desc);
>>-		return -ENXIO;
>>-	}
>>-
>>-	if (!(child->flags & PNV_IODA_PE_MASTER))
>>-		return 0;
>>-
>>-	/* Compound case: parent PE affects slave PEs */
>>-	list_for_each_entry(slave, &child->slaves, list) {
>>-		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>-					slave->pe_number, op);
>>-		if (rc != OPAL_SUCCESS) {
>>-			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>-				rc, desc);
>>-			return -ENXIO;
>>-		}
>>-	}
>>-
>>-	return 0;
>>-}
>>-
>>-static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>>-			      struct pnv_ioda_pe *pe,
>>-			      bool is_add)
>>-{
>>-	struct pnv_ioda_pe *slave;
>>-	struct pci_dev *pdev = NULL;
>>-	int ret;
>>-
>>-	/*
>>-	 * Clear PE frozen state. If it's master PE, we need
>>-	 * clear slave PE frozen state as well.
>>-	 */
>>-	if (is_add) {
>>-		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
>>-					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>-		if (pe->flags & PNV_IODA_PE_MASTER) {
>>-			list_for_each_entry(slave, &pe->slaves, list)
>>-				opal_pci_eeh_freeze_clear(phb->opal_id,
>>-							  slave->pe_number,
>>-							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>-		}
>>-	}
>>-
>>-	/*
>>-	 * Associate PE in PELT. We need add the PE into the
>>-	 * corresponding PELT-V as well. Otherwise, the error
>>-	 * originated from the PE might contribute to other
>>-	 * PEs.
>>-	 */
>>-	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>-	if (ret)
>>-		return ret;
>>-
>>-	/* For compound PEs, any one affects all of them */
>>-	if (pe->flags & PNV_IODA_PE_MASTER) {
>>-		list_for_each_entry(slave, &pe->slaves, list) {
>>-			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>-			if (ret)
>>-				return ret;
>>-		}
>>-	}
>>-
>>-	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>-		pdev = pe->pbus->self;
>>-	else if (pe->flags & PNV_IODA_PE_DEV)
>>-		pdev = pe->pdev->bus->self;
>>-#ifdef CONFIG_PCI_IOV
>>-	else if (pe->flags & PNV_IODA_PE_VF)
>>-		pdev = pe->parent_dev->bus->self;
>>-#endif /* CONFIG_PCI_IOV */
>>-	while (pdev) {
>>-		struct pci_dn *pdn = pci_get_pdn(pdev);
>>-		struct pnv_ioda_pe *parent;
>>-
>>-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>-			parent = &phb->ioda.pe_array[pdn->pe_number];
>>-			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>-			if (ret)
>>-				return ret;
>>-		}
>>-
>>-		pdev = pdev->bus->self;
>>-	}
>>-
>>-	return 0;
>>-}
>>-
>>-#ifdef CONFIG_PCI_IOV
>>-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>-{
>>-	struct pci_dev *parent;
>>-	uint8_t bcomp, dcomp, fcomp;
>>-	int64_t rc;
>>-	long rid_end, rid;
>>-
>>-	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
>>-	if (pe->pbus) {
>>-		int count;
>>-
>>-		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>-		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>-		parent = pe->pbus->self;
>>-		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>-			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
>>-		else
>>-			count = 1;
>>-
>>-		switch(count) {
>>-		case  1: bcomp = OpalPciBusAll;         break;
>>-		case  2: bcomp = OpalPciBus7Bits;       break;
>>-		case  4: bcomp = OpalPciBus6Bits;       break;
>>-		case  8: bcomp = OpalPciBus5Bits;       break;
>>-		case 16: bcomp = OpalPciBus4Bits;       break;
>>-		case 32: bcomp = OpalPciBus3Bits;       break;
>>-		default:
>>-			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
>>-			        count);
>>-			/* Do an exact match only */
>>-			bcomp = OpalPciBusAll;
>>-		}
>>-		rid_end = pe->rid + (count << 8);
>>-	} else {
>>-		if (pe->flags & PNV_IODA_PE_VF)
>>-			parent = pe->parent_dev;
>>-		else
>>-			parent = pe->pdev->bus->self;
>>-		bcomp = OpalPciBusAll;
>>-		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>-		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>-		rid_end = pe->rid + 1;
>>-	}
>>-
>>-	/* Clear the reverse map */
>>-	for (rid = pe->rid; rid < rid_end; rid++)
>>-		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>-
>>-	/* Release from all parents PELT-V */
>>-	while (parent) {
>>-		struct pci_dn *pdn = pci_get_pdn(parent);
>>-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>-			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
>>-						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>-			/* XXX What to do in case of error ? */
>
>
>Not much :) Free associated memory and mark it "dead" so it won't be used
>again till reboot. In what circumstance can this opal_pci_set_peltv() fail at
>all?
>

Yeah, maybe. Until now, I didn't see this failure since the code is there
from the day. Note the code has been there for almost 4 years since the
day Ben wrote it.

>
>>-		}
>>-		parent = parent->bus->self;
>>-	}
>>-
>>-	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
>>-				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>-
>>-	/* Disassociate PE in PELT */
>>-	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
>>-				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>-	if (rc)
>>-		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
>>-	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>-			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>-	if (rc)
>>-		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
>>-
>>-	pe->pbus = NULL;
>>-	pe->pdev = NULL;
>>-	pe->parent_dev = NULL;
>>-
>>-	return 0;
>>-}
>>-#endif /* CONFIG_PCI_IOV */
>>-
>>  static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>  {
>>  	struct pci_dev *parent;
>>@@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>  	}
>>
>>  	/* Configure PELTV */
>>-	pnv_ioda_set_peltv(phb, pe, true);
>>+	pnv_ioda_set_peltv(pe, true);
>>
>>  	/* Setup reverse map */
>>  	for (rid = pe->rid; rid < rid_end; rid++)
>>@@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>>  		if (pdn->pe_number != IODA_INVALID_PE)
>>  			continue;
>>
>>+		/* Increase reference count of the parent PE */
>
>When you comment like this, I read it as the comment belongs to the whole
>next chunk till the first empty line, i.e. to all 5 lines below, which is not
>the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() name
>suggests incrementing the reference counter 2) "pe" is always parent in this
>function. I do not insist though.
>

Agree on your explaining. I'll remove this unuseful comments.

>
>>+		pnv_ioda_pe_get(pe);
>>  		pdn->pe_number = pe->pe_number;
>>  		pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
>>  		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>@@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>  {
>>  	struct pci_controller *hose = pci_bus_to_host(bus);
>>  	struct pnv_phb *phb = hose->private_data;
>>-	struct pnv_ioda_pe *pe;
>>+	struct pnv_ioda_pe *pe = NULL;
>>  	int pe_num = IODA_INVALID_PE;
>>
>>  	/* For partial hotplug case, the PE instance hasn't been destroyed
>>@@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>  	}
>>
>>  	/* PE number for root bus should have been reserved */
>>-	if (pci_is_root_bus(bus))
>>-		pe_num = phb->ioda.root_pe_no;
>>+	if (pci_is_root_bus(bus) &&
>>+	    phb->ioda.root_pe_no != IODA_INVALID_PE)
>>+		pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
>>
>>  	/* Check if PE is determined by M64 */
>>-	if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
>>-		pe_num = phb->pick_m64_pe(phb, bus, all);
>>+	if (!pe && phb->pick_m64_pe)
>>+		pe = phb->pick_m64_pe(phb, bus, all);
>>
>>  	/* The PE number isn't pinned by M64 */
>>-	if (pe_num == IODA_INVALID_PE)
>>-		pe_num = pnv_ioda_alloc_pe(phb);
>>+	if (!pe)
>>+		pe = pnv_ioda_alloc_pe(phb);
>>
>>-	if (pe_num == IODA_INVALID_PE) {
>>-		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
>>+	if (!pe) {
>>+		pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
>>  			__func__, pci_domain_nr(bus), bus->number);
>>  		return NULL;
>>  	}
>>
>>-	pe = &phb->ioda.pe_array[pe_num];
>>  	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>>  	pe->pbus = bus;
>>  	pe->pdev = NULL;
>>@@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>
>>  	if (pnv_ioda_configure_pe(phb, pe)) {
>>  		/* XXX What do we do here ? */
>>-		if (pe_num)
>>-			pnv_ioda_free_pe(phb, pe_num);
>>-		pe->pbus = NULL;
>>+		pnv_ioda_pe_put(pe);
>>  		return NULL;
>>  	}
>>
>>  	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>>-			GFP_KERNEL, hose->node);
>>+				       GFP_KERNEL, hose->node);
>
>Seems like spaces change only - if you really want this change (which I hate
>- makes code look inaccurate to my taste but it seems I am in minority here
>:) ), please put it to the separate patch.
>

Ok. Confirm with you: You prefer the original format? I don't know
why I prefer the later one. Maybe my eyes are quite broken :-)

>
>>  	pe->tce32_table->data = pe;
>>
>>  	/* Associate it with all child devices */
>>@@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>  		list_del(&pe->list);
>>  		mutex_unlock(&phb->ioda.pe_list_mutex);
>>
>>-		pnv_ioda_deconfigure_pe(phb, pe);
>>+		pnv_ioda_deconfigure_pe(pe);
>
>
>Is this change necessary to get "Release PEs dynamically" working? Move it to
>mechanical changes patch may be?
>

Ok. I'll try to do that.

>
>>
>>-		pnv_ioda_free_pe(phb, pe->pe_number);
>>+		pnv_ioda_pe_put(pe);
>>  	}
>>  }
>>
>>@@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>
>>  		if (pnv_ioda_configure_pe(phb, pe)) {
>>  			/* XXX What do we do here ? */
>>-			if (pe_num)
>>-				pnv_ioda_free_pe(phb, pe_num);
>>-			pe->pdev = NULL;
>>+			pnv_ioda_pe_put(pe);
>>  			continue;
>>  		}
>>
>>@@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
>>  	struct pnv_ioda_pe *pe;
>>  	int rc;
>>
>>-	pe = pnv_ioda_get_pe(dev);
>>+	pe = pnv_ioda_pci_dev_to_pe(dev);
>
>
>And this change could to separately. Not clear how this helps to "Release PEs
>dynamically".
>
>

It's not related to "Release PEs dynamically". The change is introduced by
the function rename: Original pnv_ioda_get_pe() is renamed to pnv_ioda_pci_dev_to_pe().

>>  	if (!pe)
>>  		return -ENODEV;
>>
>>@@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
>>  	struct pnv_ioda_pe *pe;
>>  	int rc;
>>
>>-	if (!(pe = pnv_ioda_get_pe(dev)))
>>+	if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
>>  		return -ENODEV;
>>
>>  	/* Assign XIVE to PE */
>>@@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
>>  				  unsigned int hwirq, unsigned int virq,
>>  				  unsigned int is_64, struct msi_msg *msg)
>>  {
>>-	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
>>+	struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
>>  	unsigned int xive_num = hwirq - phb->msi_base;
>>  	__be32 data;
>>  	int rc;
>>@@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>>  	pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
>>  	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
>>  	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
>>+	pnv_pci_controller_ops.release_device = pnv_pci_release_device;
>>  	hose->controller_ops = pnv_pci_controller_ops;
>>
>>  #ifdef CONFIG_PCI_IOV
>>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>index 1bea3a8..8b10f01 100644
>>--- a/arch/powerpc/platforms/powernv/pci.h
>>+++ b/arch/powerpc/platforms/powernv/pci.h
>>@@ -28,6 +28,7 @@ enum pnv_phb_model {
>>  /* Data associated with a PE, including IOMMU tracking etc.. */
>>  struct pnv_phb;
>>  struct pnv_ioda_pe {
>>+	struct kref		kref;
>>  	unsigned long		flags;
>>  	struct pnv_phb		*phb;
>>
>>@@ -120,7 +121,8 @@ struct pnv_phb {
>>  	void (*shutdown)(struct pnv_phb *phb);
>>  	int (*init_m64)(struct pnv_phb *phb);
>>  	void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
>>-	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
>>+	struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
>>+					   struct pci_bus *bus, int all);
>>  	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>>  	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
>>  	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
>>

Thanks,
Gavin
Alexey Kardashevskiy May 11, 2015, 7:02 a.m. UTC | #3
On 05/11/2015 04:25 PM, Gavin Shan wrote:
> On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote:
>> On 05/01/2015 04:02 PM, Gavin Shan wrote:
>>> The original code doesn't support releasing PEs dynamically, meaning
>>> that PE and the associated resources (IO, M32, M64 and DMA) can't
>>> be released when unplugging a PCI adapter from one hotpluggable slot.
>>>
>>> The patch takes object oriented methodology, introducs reference
>>> count to PE, which is initialized to 1 and increased with 1 when a
>>> new PCI device joins the PE. Once the last PCI device leaves the
>>> PE, the PE is going to be release together with its associated
>>> (IO, M32, M64, DMA) resources.
>>
>>
>> Too little commit log for non-trivial non-cut-n-paste 30KB patch...
>>
>
> Ok. I'll add more details in next revision.
>
>>>
>>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>> ---
>>>   arch/powerpc/include/asm/pci-bridge.h     |   3 +
>>>   arch/powerpc/kernel/pci-hotplug.c         |   5 +
>>>   arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
>>>   arch/powerpc/platforms/powernv/pci.h      |   4 +-
>>>   4 files changed, 432 insertions(+), 238 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>> index 5367eb3..a6ad4b1 100644
>>> --- a/arch/powerpc/include/asm/pci-bridge.h
>>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>>> @@ -31,6 +31,9 @@ struct pci_controller_ops {
>>>   	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
>>>   	void		(*setup_bridge)(struct pci_bus *, unsigned long);
>>>   	void		(*reset_secondary_bus)(struct pci_dev *dev);
>>> +
>>> +	/* Called when PCI device is released */
>>> +	void		(*release_device)(struct pci_dev *);
>>>   };
>>>
>>>   /*
>>> diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
>>> index 7ed85a6..0040343 100644
>>> --- a/arch/powerpc/kernel/pci-hotplug.c
>>> +++ b/arch/powerpc/kernel/pci-hotplug.c
>>> @@ -29,6 +29,11 @@
>>>    */
>>>   void pcibios_release_device(struct pci_dev *dev)
>>>   {
>>> +	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>> +
>>> +	if (hose->controller_ops.release_device)
>>> +		hose->controller_ops.release_device(dev);
>>> +
>>>   	eeh_remove_device(dev);
>>>   }
>>>
>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> index 910fb67..ef8c216 100644
>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> @@ -12,6 +12,8 @@
>>>   #undef DEBUG
>>>
>>>   #include <linux/kernel.h>
>>> +#include <linux/atomic.h>
>>> +#include <linux/kref.h>
>>>   #include <linux/pci.h>
>>>   #include <linux/crash_dump.h>
>>>   #include <linux/debugfs.h>
>>> @@ -47,6 +49,8 @@
>>>   /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>>>   #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
>>>
>>> +static void pnv_ioda_release_pe(struct kref *kref);
>>> +
>>>   static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>>>   			    const char *fmt, ...)
>>>   {
>>> @@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>>>   		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>>>   }
>>>
>>> -static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>>> +static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>>   {
>>> -	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>> -		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>> -			__func__, pe_no, phb->hose->global_number);
>>> +	if (!pe)
>>> +		return;
>>> +
>>> +	kref_get(&pe->kref);
>>> +}
>>> +
>>> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>> +{
>>> +	unsigned int count;
>>> +
>>> +	if (!pe)
>>>   		return;
>>> +
>>> +	/*
>>> +	 * The count is initialized to 1 and increased with 1 when
>>> +	 * a new PCI device is bound with the PE. Once the last PCI
>>> +	 * device is leaving from the PE, the PE is going to be
>>> +	 * released.
>>> +	 */
>>> +	count = atomic_read(&pe->kref.refcount);
>>> +	if (count == 2)
>>> +		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
>>> +	else
>>> +		kref_put(&pe->kref, pnv_ioda_release_pe);
>>
>>
>> What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?
>>
>
> Yeah, that would have problem. But it shouldn't happen because the
> PCI devices are joining the parent PE# in strictly serialized mode.
> Same thing happens when detaching PCI devices from its parent PE.


oookay. Another thing then - why is this kref counter initialized to 1?
It would make sense if you did something special when the counter becomes 1 
after decrement but you do not.

Also, this kref thing makes sense if you do kref_put() in multiple places 
and do not know which one will be the last one so you pass the callback to 
all of them. Here you do kref_put/sub in one place and you read the counter 
- so you can call pnv_ioda_release_pe() directly. And it feels like a 
simple atomic_t would do the job just fine. If you still feel that the 
counter should start from 1, there are atomic_dec_if_positive() and 
atomic_inc_not_zero() and others.




>>> +}
>>> +
>>> +static void pnv_pci_release_device(struct pci_dev *pdev)
>>> +{
>>> +	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>> +	struct pnv_phb *phb = hose->private_data;
>>> +	struct pci_dn *pdn = pci_get_pdn(pdev);
>>> +	struct pnv_ioda_pe *pe;
>>> +
>>> +	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>> +		pe = &phb->ioda.pe_array[pdn->pe_number];
>>> +		pnv_ioda_pe_put(pe);
>>> +		pdn->pe_number = IODA_INVALID_PE;
>>>   	}
>>> +}
>>>
>>> -	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
>>> -		pr_warn("%s: PE %d was assigned on PHB#%x\n",
>>> -			__func__, pe_no, phb->hose->global_number);
>>> +static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>> +{
>>> +	struct pnv_phb *phb = pe->phb;
>>> +	int index, count;
>>> +	unsigned long tbl_addr, tbl_size;
>>> +
>>> +	/* No DMA capability for slave PEs */
>>> +	if (pe->flags & PNV_IODA_PE_SLAVE)
>>> +		return;
>>> +
>>> +	/* Bypass DMA window */
>>> +	if (phb->type == PNV_PHB_IODA2 &&
>>> +	    pe->tce_bypass_enabled &&
>>> +	    pe->tce32_table &&
>>> +	    pe->tce32_table->set_bypass)
>>> +		pe->tce32_table->set_bypass(pe->tce32_table, false);
>>> +
>>> +	/* 32-bits DMA window */
>>> +	count = pe->tce32_seg_end - pe->tce32_seg_start;
>>> +	tbl_addr = pe->tce32_table->it_base;
>>> +	if (!count)
>>>   		return;
>>> +
>>> +	/* Free IOMMU table */
>>> +	iommu_free_table(pe->tce32_table,
>>> +			 of_node_full_name(phb->hose->dn));
>>> +
>>> +	/* Deconfigure TCE table */
>>> +	switch (phb->type) {
>>> +	case PNV_PHB_IODA1:
>>> +		for (index = 0; index < count; index++)
>>> +			opal_pci_map_pe_dma_window(phb->opal_id,
>>> +						   pe->pe_number,
>>> +						   pe->tce32_seg_start + index,
>>> +						   1,
>>> +						   __pa(tbl_addr) +
>>> +						   index * TCE32_TABLE_SIZE,
>>> +						   0,
>>> +						   0x1000);
>>> +		bitmap_clear(phb->ioda.tce32_segmap,
>>> +			     pe->tce32_seg_start,
>>> +			     count);
>>> +		tbl_size = TCE32_TABLE_SIZE * count;
>>> +		break;
>>> +	case PNV_PHB_IODA2:
>>> +		opal_pci_map_pe_dma_window(phb->opal_id,
>>> +					   pe->pe_number,
>>> +					   pe->pe_number << 1,
>>> +					   1,
>>> +					   __pa(tbl_addr),
>>> +					   0,
>>> +					   0x1000);
>>> +		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
>>> +		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
>>> +		break;
>>> +	default:
>>> +		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
>>> +		return;
>>> +	}
>>> +
>>> +	/* Free memory of IOMMU table */
>>> +	free_pages(tbl_addr, get_order(tbl_size));
>>
>>
>> You just programmed the table address to TVT and then you are releasing the
>> pages. It does not seem right, it will leave garbage in TVT. Also, I am
>> adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits
>>from there (I'll post v10 soon, you'll be in copy and you'll have to review
>> that ;) ).
>>
>
> I assume you're talking about TVE. I don't understand how garbage will be left
> in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE
> with zero'ed "tce_table_size". The pages previously allocated for TCE table is
> released to buddy system, which can be allocated by somebody else (from buddy
> or slab).

opal_pci_map_pe_dma_window() takes __pa(tbl_addr) which points to some 
memory which is still allocated. This value goes to a table (which has 2 
entries per PE, one for 32bit DMA window and one for bypass/hugewindow) 
which PHB uses to get the actual TCE table address. What is the name of 
this table? :) Anyway, you write an address there and then you call 
free_pages() so after free_pages(), the value in that TVE/TVT/whatever 
table is a garbage.


>
> Ok. Please put me into the cc list. I guess the whole series of patches is
> better to rebased on your DDW patchset, which is to be merged first, I believe.
>
>>
>>> +	pe->tce32_table = NULL;
>>> +	pe->tce32_seg_start = 0;
>>> +	pe->tce32_seg_end = 0;
>>> +}
>>> +
>>> +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>>> +{
>>> +	struct pnv_phb *phb = pe->phb;
>>> +	unsigned long *segmap = NULL, *pe_segmap = NULL;
>>> +	int i;
>>> +	uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
>>> +				     OPAL_M32_WINDOW_TYPE,
>>> +				     OPAL_M64_WINDOW_TYPE };
>>> +
>>> +	for (win = 0; win < ARRAY_SIZE(win_type); win++) {
>>> +		switch (win_type[win]) {
>>> +		case OPAL_IO_WINDOW_TYPE:
>>> +			segmap = phb->ioda.io_segmap;
>>> +			pe_segmap = pe->io_segmap;
>>> +			break;
>>> +		case OPAL_M32_WINDOW_TYPE:
>>> +			segmap = phb->ioda.m32_segmap;
>>> +			pe_segmap = pe->m32_segmap;
>>> +			break;
>>> +		case OPAL_M64_WINDOW_TYPE:
>>> +			segmap = phb->ioda.m64_segmap;
>>> +			pe_segmap = pe->m64_segmap;
>>> +			break;
>>> +		}
>>> +		i = -1;
>>> +		while ((i = find_next_bit(pe_segmap,
>>> +			phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
>>> +			if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
>>> +			    win_type[win] == OPAL_M32_WINDOW_TYPE)
>>> +				opal_pci_map_pe_mmio_window(phb->opal_id,
>>> +						phb->ioda.reserved_pe,
>>> +						win_type[win], 0, i);
>>> +			else if (phb->type == PNV_PHB_IODA1)
>>> +				opal_pci_map_pe_mmio_window(phb->opal_id,
>>> +						phb->ioda.reserved_pe,
>>> +						win_type[win],
>>> +						i / 8, i % 8);
>>
>> The function is called ""release" but it programs something what looks like
>> reasonable values, is it correct?
>>
>
> It's out of problem, When the segment is deallocated, it's mapped to the
> reserved PE#.
>
>>
>>
>>> +
>>> +			clear_bit(i, pe_segmap);
>>> +			clear_bit(i, segmap);
>>> +		}
>>> +	}
>>> +}
>>> +
>>> +static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>> +				  struct pnv_ioda_pe *parent,
>>> +				  struct pnv_ioda_pe *child,
>>> +				  bool is_add)
>>> +{
>>> +	const char *desc = is_add ? "adding" : "removing";
>>> +	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>> +			      OPAL_REMOVE_PE_FROM_DOMAIN;
>>> +	struct pnv_ioda_pe *slave;
>>> +	long rc;
>>> +
>>> +	/* Parent PE affects child PE */
>>> +	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>> +				child->pe_number, op);
>>> +	if (rc != OPAL_SUCCESS) {
>>> +		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>> +			rc, desc);
>>> +		return -ENXIO;
>>> +	}
>>> +
>>> +	if (!(child->flags & PNV_IODA_PE_MASTER))
>>> +		return 0;
>>> +
>>> +	/* Compound case: parent PE affects slave PEs */
>>> +	list_for_each_entry(slave, &child->slaves, list) {
>>> +		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>> +					slave->pe_number, op);
>>> +		if (rc != OPAL_SUCCESS) {
>>> +			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>> +				rc, desc);
>>> +			return -ENXIO;
>>> +		}
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
>>> +{
>>> +	struct pnv_phb *phb = pe->phb;
>>> +	struct pnv_ioda_pe *slave;
>>> +	struct pci_dev *pdev = NULL;
>>> +	int ret;
>>> +
>>> +	/*
>>> +	 * Clear PE frozen state. If it's master PE, we need
>>> +	 * clear slave PE frozen state as well.
>>> +	 */
>>> +	opal_pci_eeh_freeze_clear(phb->opal_id,
>>> +				  pe->pe_number,
>>> +				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> +	if (pe->flags & PNV_IODA_PE_MASTER) {
>>> +		list_for_each_entry(slave, &pe->slaves, list) {
>>> +			opal_pci_eeh_freeze_clear(phb->opal_id,
>>> +						  slave->pe_number,
>>> +						  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> +		}
>>> +	}
>>> +
>>> +	/*
>>> +	 * Associate PE in PELT. We need add the PE into the
>>> +	 * corresponding PELT-V as well. Otherwise, the error
>>> +	 * originated from the PE might contribute to other
>>> +	 * PEs.
>>> +	 */
>>> +	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	/* For compound PEs, any one affects all of them */
>>> +	if (pe->flags & PNV_IODA_PE_MASTER) {
>>> +		list_for_each_entry(slave, &pe->slaves, list) {
>>> +			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>> +			if (ret)
>>> +				return ret;
>>> +		}
>>> +	}
>>> +
>>> +	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>> +		pdev = pe->pbus->self;
>>> +	else if (pe->flags & PNV_IODA_PE_DEV)
>>> +		pdev = pe->pdev->bus->self;
>>> +#ifdef CONFIG_PCI_IOV
>>> +	else if (pe->flags & PNV_IODA_PE_VF)
>>> +		pdev = pe->parent_dev->bus->self;
>>> +#endif /* CONFIG_PCI_IOV */
>>> +
>>> +	while (pdev) {
>>> +		struct pci_dn *pdn = pci_get_pdn(pdev);
>>> +		struct pnv_ioda_pe *parent;
>>> +
>>> +		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>> +			parent = &phb->ioda.pe_array[pdn->pe_number];
>>> +			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>> +			if (ret)
>>> +				return ret;
>>> +		}
>>> +
>>> +		pdev = pdev->bus->self;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)
>>
>>
>> It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just
>> moving of this function to a different place deserves a separate patch with a
>> comment why ("it is going to be used now for non-SRIOV case too" may be?).
>>
>
> Yeah, it makes sense to me. Will fix it up.
>
>>
>>> +{
>>> +	struct pnv_phb *phb = pe->phb;
>>> +	struct pci_dev *parent;
>>> +	uint8_t bcomp, dcomp, fcomp;
>>> +	long rid_end, rid;
>>> +	int64_t rc;
>>> +
>>> +	/* Tear down MVE */
>>> +	if (phb->type == PNV_PHB_IODA1 &&
>>> +	    pe->mve_number != -1) {
>>> +		rc = opal_pci_set_mve(phb->opal_id,
>>> +				      pe->mve_number,
>>> +				      phb->ioda.reserved_pe);
>>> +		if (rc != OPAL_SUCCESS)
>>> +			pe_warn(pe, "Error %lld unmapping MVE#%d\n",
>>> +				rc, pe->mve_number);
>>> +		rc = opal_pci_set_mve_enable(phb->opal_id,
>>> +					     pe->mve_number,
>>> +					     OPAL_DISABLE_MVE);
>>> +		if (rc != OPAL_SUCCESS)
>>> +			pe_warn(pe, "Error %lld disabling MVE#%d\n",
>>> +				rc, pe->mve_number);
>>> +		pe->mve_number = -1;
>>> +	}
>>> +
>>> +	/* Unmapping PELTV */
>>> +	pnv_ioda_set_peltv(pe, false);
>>> +
>>> +	/* To unmap PELTM */
>>> +	if (pe->pbus) {
>>> +		int count;
>>> +
>>> +		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>> +		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>> +		parent = pe->pbus->self;
>>> +		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>> +			count = pe->pbus->busn_res.end -
>>> +				pe->pbus->busn_res.start + 1;
>>> +		else
>>> +			count = 1;
>>> +
>>> +		switch(count) {
>>> +		case  1: bcomp = OpalPciBusAll;   break;
>>> +		case  2: bcomp = OpalPciBus7Bits; break;
>>> +		case  4: bcomp = OpalPciBus6Bits; break;
>>> +		case  8: bcomp = OpalPciBus5Bits; break;
>>> +		case 16: bcomp = OpalPciBus4Bits; break;
>>> +		case 32: bcomp = OpalPciBus3Bits; break;
>>> +		default:
>>> +			/* Fail back to case of one bus */
>>> +			pe_warn(pe, "Cannot support %d buses\n", count);
>>> +			bcomp = OpalPciBusAll;
>>> +		}
>>> +		rid_end = pe->rid + (count << 8);
>>> +	} else {
>>> +#ifdef CONFIG_PCI_IOV
>>> +		if (pe->flags & PNV_IODA_PE_VF)
>>> +			parent = pe->parent_dev;
>>> +		else
>>> +#endif
>>> +			parent = pe->pdev->bus->self;
>>> +		bcomp = OpalPciBusAll;
>>> +		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>> +		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>> +		rid_end = pe->rid + 1;
>>> +	}
>>> +
>>> +	/* Clear RID mapping */
>>> +	for (rid = pe->rid; rid < rid_end; rid++)
>>> +		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>> +
>>> +	/* Unmapping PELTM */
>>> +	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>> +			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>> +	if (rc)
>>> +		pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
>>> +}
>>> +
>>> +static void pnv_ioda_release_pe(struct kref *kref)
>>> +{
>>> +	struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
>>> +	struct pnv_ioda_pe *tmp, *slave;
>>> +	struct pnv_phb *phb = pe->phb;
>>> +
>>> +	pnv_ioda_release_pe_dma(pe);
>>> +	pnv_ioda_release_pe_seg(pe);
>>> +	pnv_ioda_deconfigure_pe(pe);
>>> +
>>> +	/* Release slave PEs for compound PE */
>>> +	if (pe->flags & PNV_IODA_PE_MASTER) {
>>> +		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
>>> +			pnv_ioda_pe_put(slave);
>>> +	}
>>> +
>>> +	/* Remove the PE from various list. We need remove slave
>>> +	 * PE from master's list.
>>> +	 */
>>> +	list_del(&pe->dma_link);
>>> +	list_del(&pe->list);
>>> +
>>> +	/* Free PE number */
>>> +	clear_bit(pe->pe_number, phb->ioda.pe_alloc);
>>> +}
>>> +
>>> +static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
>>> +					    int pe_no)
>>> +{
>>> +	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
>>> +
>>> +	kref_init(&pe->kref);
>>> +	pe->phb = phb;
>>> +	pe->pe_number = pe_no;
>>> +	INIT_LIST_HEAD(&pe->dma_link);
>>> +	INIT_LIST_HEAD(&pe->list);
>>> +
>>> +	return pe;
>>> +}
>>> +
>>> +static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
>>> +					       int pe_no)
>>> +{
>>> +	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>> +		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>> +			__func__, pe_no, phb->hose->global_number);
>>> +		return NULL;
>>>   	}
>>>
>>> -	phb->ioda.pe_array[pe_no].phb = phb;
>>> -	phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>> +	/*
>>> +	 * Same PE might be reserved for multiple times, which
>>> +	 * is out of problem actually.
>>> +	 */
>>> +	set_bit(pe_no, phb->ioda.pe_alloc);
>>> +	return pnv_ioda_init_pe(phb, pe_no);
>>>   }
>>>
>>> -static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>> +static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>>   {
>>>   	unsigned long pe_no;
>>>   	unsigned long limit = phb->ioda.total_pe - 1;
>>> @@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>>   			break;
>>>
>>>   		if (--limit >= phb->ioda.total_pe)
>>> -			return IODA_INVALID_PE;
>>> +			return NULL;
>>>   	} while(1);
>>>
>>> -	phb->ioda.pe_array[pe_no].phb = phb;
>>> -	phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>> -	return pe_no;
>>> -}
>>> -
>>> -static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>>> -{
>>> -	WARN_ON(phb->ioda.pe_array[pe].pdev);
>>> -
>>> -	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
>>> -	clear_bit(pe, phb->ioda.pe_alloc);
>>> +	return pnv_ioda_init_pe(phb, pe_no);
>>>   }
>>>
>>>   static int pnv_ioda1_init_m64(struct pnv_phb *phb)
>>> @@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
>>>   	}
>>>   }
>>>
>>> -static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>> -				struct pci_bus *bus, int all)
>>> +static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>> +						struct pci_bus *bus,
>>> +						int all)
>>
>>
>> Mechanic changes like this could easily go to a separate patch.
>>
>
> Indeed. I'll see how I can split the patches up in next revision.
> Thanks for the suggestion.
>
>>>   {
>>>   	resource_size_t segsz = phb->ioda.m64_segsize;
>>>   	struct pci_dev *pdev;
>>> @@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>   	int i;
>>>
>>>   	if (!pnv_ioda_need_m64_pe(phb, bus))
>>> -		return IODA_INVALID_PE;
>>> +		return NULL;
>>>
>>>           /* Allocate bitmap */
>>>   	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
>>>   	pe_bitsmap = kzalloc(size, GFP_KERNEL);
>>>   	if (!pe_bitsmap) {
>>>   		pr_warn("%s: Out of memory !\n", __func__);
>>> -		return IODA_INVALID_PE;
>>> +		return NULL;
>>>   	}
>>>
>>>   	/* The bridge's M64 window might be extended to PHB's M64
>>> @@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>   	/* No M64 window found ? */
>>>   	if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
>>>   		kfree(pe_bitsmap);
>>> -		return IODA_INVALID_PE;
>>> +		return NULL;
>>>   	}
>>>
>>>   	/* Figure out the master PE and put all slave PEs
>>> @@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>   	}
>>>
>>>   	kfree(pe_bitsmap);
>>> -	return master_pe->pe_number;
>>> +	return master_pe;
>>>   }
>>>
>>>   static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
>>> @@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
>>>    * but in the meantime, we need to protect them to avoid warnings
>>>    */
>>>   #ifdef CONFIG_PCI_MSI
>>> -static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>> +static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
>>>   {
>>>   	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>>   	struct pnv_phb *phb = hose->private_data;
>>> @@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>>   }
>>>   #endif /* CONFIG_PCI_MSI */
>>>
>>> -static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>> -				  struct pnv_ioda_pe *parent,
>>> -				  struct pnv_ioda_pe *child,
>>> -				  bool is_add)
>>> -{
>>> -	const char *desc = is_add ? "adding" : "removing";
>>> -	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>> -			      OPAL_REMOVE_PE_FROM_DOMAIN;
>>> -	struct pnv_ioda_pe *slave;
>>> -	long rc;
>>> -
>>> -	/* Parent PE affects child PE */
>>> -	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>> -				child->pe_number, op);
>>> -	if (rc != OPAL_SUCCESS) {
>>> -		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>> -			rc, desc);
>>> -		return -ENXIO;
>>> -	}
>>> -
>>> -	if (!(child->flags & PNV_IODA_PE_MASTER))
>>> -		return 0;
>>> -
>>> -	/* Compound case: parent PE affects slave PEs */
>>> -	list_for_each_entry(slave, &child->slaves, list) {
>>> -		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>> -					slave->pe_number, op);
>>> -		if (rc != OPAL_SUCCESS) {
>>> -			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>> -				rc, desc);
>>> -			return -ENXIO;
>>> -		}
>>> -	}
>>> -
>>> -	return 0;
>>> -}
>>> -
>>> -static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>>> -			      struct pnv_ioda_pe *pe,
>>> -			      bool is_add)
>>> -{
>>> -	struct pnv_ioda_pe *slave;
>>> -	struct pci_dev *pdev = NULL;
>>> -	int ret;
>>> -
>>> -	/*
>>> -	 * Clear PE frozen state. If it's master PE, we need
>>> -	 * clear slave PE frozen state as well.
>>> -	 */
>>> -	if (is_add) {
>>> -		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
>>> -					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> -		if (pe->flags & PNV_IODA_PE_MASTER) {
>>> -			list_for_each_entry(slave, &pe->slaves, list)
>>> -				opal_pci_eeh_freeze_clear(phb->opal_id,
>>> -							  slave->pe_number,
>>> -							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> -		}
>>> -	}
>>> -
>>> -	/*
>>> -	 * Associate PE in PELT. We need add the PE into the
>>> -	 * corresponding PELT-V as well. Otherwise, the error
>>> -	 * originated from the PE might contribute to other
>>> -	 * PEs.
>>> -	 */
>>> -	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>> -	if (ret)
>>> -		return ret;
>>> -
>>> -	/* For compound PEs, any one affects all of them */
>>> -	if (pe->flags & PNV_IODA_PE_MASTER) {
>>> -		list_for_each_entry(slave, &pe->slaves, list) {
>>> -			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>> -			if (ret)
>>> -				return ret;
>>> -		}
>>> -	}
>>> -
>>> -	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>> -		pdev = pe->pbus->self;
>>> -	else if (pe->flags & PNV_IODA_PE_DEV)
>>> -		pdev = pe->pdev->bus->self;
>>> -#ifdef CONFIG_PCI_IOV
>>> -	else if (pe->flags & PNV_IODA_PE_VF)
>>> -		pdev = pe->parent_dev->bus->self;
>>> -#endif /* CONFIG_PCI_IOV */
>>> -	while (pdev) {
>>> -		struct pci_dn *pdn = pci_get_pdn(pdev);
>>> -		struct pnv_ioda_pe *parent;
>>> -
>>> -		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>> -			parent = &phb->ioda.pe_array[pdn->pe_number];
>>> -			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>> -			if (ret)
>>> -				return ret;
>>> -		}
>>> -
>>> -		pdev = pdev->bus->self;
>>> -	}
>>> -
>>> -	return 0;
>>> -}
>>> -
>>> -#ifdef CONFIG_PCI_IOV
>>> -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>> -{
>>> -	struct pci_dev *parent;
>>> -	uint8_t bcomp, dcomp, fcomp;
>>> -	int64_t rc;
>>> -	long rid_end, rid;
>>> -
>>> -	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
>>> -	if (pe->pbus) {
>>> -		int count;
>>> -
>>> -		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>> -		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>> -		parent = pe->pbus->self;
>>> -		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>> -			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
>>> -		else
>>> -			count = 1;
>>> -
>>> -		switch(count) {
>>> -		case  1: bcomp = OpalPciBusAll;         break;
>>> -		case  2: bcomp = OpalPciBus7Bits;       break;
>>> -		case  4: bcomp = OpalPciBus6Bits;       break;
>>> -		case  8: bcomp = OpalPciBus5Bits;       break;
>>> -		case 16: bcomp = OpalPciBus4Bits;       break;
>>> -		case 32: bcomp = OpalPciBus3Bits;       break;
>>> -		default:
>>> -			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
>>> -			        count);
>>> -			/* Do an exact match only */
>>> -			bcomp = OpalPciBusAll;
>>> -		}
>>> -		rid_end = pe->rid + (count << 8);
>>> -	} else {
>>> -		if (pe->flags & PNV_IODA_PE_VF)
>>> -			parent = pe->parent_dev;
>>> -		else
>>> -			parent = pe->pdev->bus->self;
>>> -		bcomp = OpalPciBusAll;
>>> -		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>> -		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>> -		rid_end = pe->rid + 1;
>>> -	}
>>> -
>>> -	/* Clear the reverse map */
>>> -	for (rid = pe->rid; rid < rid_end; rid++)
>>> -		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>> -
>>> -	/* Release from all parents PELT-V */
>>> -	while (parent) {
>>> -		struct pci_dn *pdn = pci_get_pdn(parent);
>>> -		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>> -			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
>>> -						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>> -			/* XXX What to do in case of error ? */
>>
>>
>> Not much :) Free associated memory and mark it "dead" so it won't be used
>> again till reboot. In what circumstance can this opal_pci_set_peltv() fail at
>> all?
>>
>
> Yeah, maybe. Until now, I didn't see this failure since the code is there
> from the day. Note the code has been there for almost 4 years since the
> day Ben wrote it.


Sure. But if it starts failing, we won't even notice it - there is no even 
pr_err() or WARN_ON.


>
>>
>>> -		}
>>> -		parent = parent->bus->self;
>>> -	}
>>> -
>>> -	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
>>> -				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> -
>>> -	/* Disassociate PE in PELT */
>>> -	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
>>> -				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>> -	if (rc)
>>> -		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
>>> -	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>> -			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>> -	if (rc)
>>> -		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
>>> -
>>> -	pe->pbus = NULL;
>>> -	pe->pdev = NULL;
>>> -	pe->parent_dev = NULL;
>>> -
>>> -	return 0;
>>> -}
>>> -#endif /* CONFIG_PCI_IOV */
>>> -
>>>   static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>>   {
>>>   	struct pci_dev *parent;
>>> @@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>>   	}
>>>
>>>   	/* Configure PELTV */
>>> -	pnv_ioda_set_peltv(phb, pe, true);
>>> +	pnv_ioda_set_peltv(pe, true);
>>>
>>>   	/* Setup reverse map */
>>>   	for (rid = pe->rid; rid < rid_end; rid++)
>>> @@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>>>   		if (pdn->pe_number != IODA_INVALID_PE)
>>>   			continue;
>>>
>>> +		/* Increase reference count of the parent PE */
>>
>> When you comment like this, I read it as the comment belongs to the whole
>> next chunk till the first empty line, i.e. to all 5 lines below, which is not
>> the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() name
>> suggests incrementing the reference counter 2) "pe" is always parent in this
>> function. I do not insist though.
>>
>
> Agree on your explaining. I'll remove this unuseful comments.
>
>>
>>> +		pnv_ioda_pe_get(pe);
>>>   		pdn->pe_number = pe->pe_number;
>>>   		pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
>>>   		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>> @@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>   {
>>>   	struct pci_controller *hose = pci_bus_to_host(bus);
>>>   	struct pnv_phb *phb = hose->private_data;
>>> -	struct pnv_ioda_pe *pe;
>>> +	struct pnv_ioda_pe *pe = NULL;
>>>   	int pe_num = IODA_INVALID_PE;
>>>
>>>   	/* For partial hotplug case, the PE instance hasn't been destroyed
>>> @@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>   	}
>>>
>>>   	/* PE number for root bus should have been reserved */
>>> -	if (pci_is_root_bus(bus))
>>> -		pe_num = phb->ioda.root_pe_no;
>>> +	if (pci_is_root_bus(bus) &&
>>> +	    phb->ioda.root_pe_no != IODA_INVALID_PE)
>>> +		pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
>>>
>>>   	/* Check if PE is determined by M64 */
>>> -	if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
>>> -		pe_num = phb->pick_m64_pe(phb, bus, all);
>>> +	if (!pe && phb->pick_m64_pe)
>>> +		pe = phb->pick_m64_pe(phb, bus, all);
>>>
>>>   	/* The PE number isn't pinned by M64 */
>>> -	if (pe_num == IODA_INVALID_PE)
>>> -		pe_num = pnv_ioda_alloc_pe(phb);
>>> +	if (!pe)
>>> +		pe = pnv_ioda_alloc_pe(phb);
>>>
>>> -	if (pe_num == IODA_INVALID_PE) {
>>> -		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
>>> +	if (!pe) {
>>> +		pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
>>>   			__func__, pci_domain_nr(bus), bus->number);
>>>   		return NULL;
>>>   	}
>>>
>>> -	pe = &phb->ioda.pe_array[pe_num];
>>>   	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>>>   	pe->pbus = bus;
>>>   	pe->pdev = NULL;
>>> @@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>
>>>   	if (pnv_ioda_configure_pe(phb, pe)) {
>>>   		/* XXX What do we do here ? */
>>> -		if (pe_num)
>>> -			pnv_ioda_free_pe(phb, pe_num);
>>> -		pe->pbus = NULL;
>>> +		pnv_ioda_pe_put(pe);
>>>   		return NULL;
>>>   	}
>>>
>>>   	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>>> -			GFP_KERNEL, hose->node);
>>> +				       GFP_KERNEL, hose->node);
>>
>> Seems like spaces change only - if you really want this change (which I hate
>> - makes code look inaccurate to my taste but it seems I am in minority here
>> :) ), please put it to the separate patch.
>>
>
> Ok. Confirm with you: You prefer the original format? I don't know
> why I prefer the later one. Maybe my eyes are quite broken :-)


I prefer not to change existing whitespaces unless it is done once and for 
the entire file :) Just remove this change from the patch.



>>
>>>   	pe->tce32_table->data = pe;
>>>
>>>   	/* Associate it with all child devices */
>>> @@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>   		list_del(&pe->list);
>>>   		mutex_unlock(&phb->ioda.pe_list_mutex);
>>>
>>> -		pnv_ioda_deconfigure_pe(phb, pe);
>>> +		pnv_ioda_deconfigure_pe(pe);
>>
>>
>> Is this change necessary to get "Release PEs dynamically" working? Move it to
>> mechanical changes patch may be?
>>
>
> Ok. I'll try to do that.
>
>>
>>>
>>> -		pnv_ioda_free_pe(phb, pe->pe_number);
>>> +		pnv_ioda_pe_put(pe);
>>>   	}
>>>   }
>>>
>>> @@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>
>>>   		if (pnv_ioda_configure_pe(phb, pe)) {
>>>   			/* XXX What do we do here ? */
>>> -			if (pe_num)
>>> -				pnv_ioda_free_pe(phb, pe_num);
>>> -			pe->pdev = NULL;
>>> +			pnv_ioda_pe_put(pe);
>>>   			continue;
>>>   		}
>>>
>>> @@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
>>>   	struct pnv_ioda_pe *pe;
>>>   	int rc;
>>>
>>> -	pe = pnv_ioda_get_pe(dev);
>>> +	pe = pnv_ioda_pci_dev_to_pe(dev);
>>
>>
>> And this change could to separately. Not clear how this helps to "Release PEs
>> dynamically".
>>
>>
>
> It's not related to "Release PEs dynamically". The change is introduced by
> the function rename: Original pnv_ioda_get_pe() is renamed to pnv_ioda_pci_dev_to_pe().


But the rename happened in this patch and the patch's subj is "Release PEs 
dynamically" so it should be related somehow or move it to a simple 
separate patch "let's give the lalala function a better name to reflect 
what it actually does" (but in this case the new name does not make any 
more sense than the old one).



>>>   	if (!pe)
>>>   		return -ENODEV;
>>>
>>> @@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
>>>   	struct pnv_ioda_pe *pe;
>>>   	int rc;
>>>
>>> -	if (!(pe = pnv_ioda_get_pe(dev)))
>>> +	if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
>>>   		return -ENODEV;
>>>
>>>   	/* Assign XIVE to PE */
>>> @@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
>>>   				  unsigned int hwirq, unsigned int virq,
>>>   				  unsigned int is_64, struct msi_msg *msg)
>>>   {
>>> -	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
>>> +	struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
>>>   	unsigned int xive_num = hwirq - phb->msi_base;
>>>   	__be32 data;
>>>   	int rc;
>>> @@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>>>   	pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
>>>   	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
>>>   	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
>>> +	pnv_pci_controller_ops.release_device = pnv_pci_release_device;
>>>   	hose->controller_ops = pnv_pci_controller_ops;
>>>
>>>   #ifdef CONFIG_PCI_IOV
>>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>> index 1bea3a8..8b10f01 100644
>>> --- a/arch/powerpc/platforms/powernv/pci.h
>>> +++ b/arch/powerpc/platforms/powernv/pci.h
>>> @@ -28,6 +28,7 @@ enum pnv_phb_model {
>>>   /* Data associated with a PE, including IOMMU tracking etc.. */
>>>   struct pnv_phb;
>>>   struct pnv_ioda_pe {
>>> +	struct kref		kref;
>>>   	unsigned long		flags;
>>>   	struct pnv_phb		*phb;
>>>
>>> @@ -120,7 +121,8 @@ struct pnv_phb {
>>>   	void (*shutdown)(struct pnv_phb *phb);
>>>   	int (*init_m64)(struct pnv_phb *phb);
>>>   	void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
>>> -	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
>>> +	struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
>>> +					   struct pci_bus *bus, int all);
>>>   	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>>>   	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
>>>   	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
>>>
>
> Thanks,
> Gavin
>
Gavin Shan May 12, 2015, 12:03 a.m. UTC | #4
On Mon, May 11, 2015 at 05:02:08PM +1000, Alexey Kardashevskiy wrote:
>On 05/11/2015 04:25 PM, Gavin Shan wrote:
>>On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote:
>>>On 05/01/2015 04:02 PM, Gavin Shan wrote:
>>>>The original code doesn't support releasing PEs dynamically, meaning
>>>>that PE and the associated resources (IO, M32, M64 and DMA) can't
>>>>be released when unplugging a PCI adapter from one hotpluggable slot.
>>>>
>>>>The patch takes object oriented methodology, introducs reference
>>>>count to PE, which is initialized to 1 and increased with 1 when a
>>>>new PCI device joins the PE. Once the last PCI device leaves the
>>>>PE, the PE is going to be release together with its associated
>>>>(IO, M32, M64, DMA) resources.
>>>
>>>
>>>Too little commit log for non-trivial non-cut-n-paste 30KB patch...
>>>
>>
>>Ok. I'll add more details in next revision.
>>
>>>>
>>>>Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>>>---
>>>>  arch/powerpc/include/asm/pci-bridge.h     |   3 +
>>>>  arch/powerpc/kernel/pci-hotplug.c         |   5 +
>>>>  arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
>>>>  arch/powerpc/platforms/powernv/pci.h      |   4 +-
>>>>  4 files changed, 432 insertions(+), 238 deletions(-)
>>>>
>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>index 5367eb3..a6ad4b1 100644
>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>@@ -31,6 +31,9 @@ struct pci_controller_ops {
>>>>  	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
>>>>  	void		(*setup_bridge)(struct pci_bus *, unsigned long);
>>>>  	void		(*reset_secondary_bus)(struct pci_dev *dev);
>>>>+
>>>>+	/* Called when PCI device is released */
>>>>+	void		(*release_device)(struct pci_dev *);
>>>>  };
>>>>
>>>>  /*
>>>>diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
>>>>index 7ed85a6..0040343 100644
>>>>--- a/arch/powerpc/kernel/pci-hotplug.c
>>>>+++ b/arch/powerpc/kernel/pci-hotplug.c
>>>>@@ -29,6 +29,11 @@
>>>>   */
>>>>  void pcibios_release_device(struct pci_dev *dev)
>>>>  {
>>>>+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>>>+
>>>>+	if (hose->controller_ops.release_device)
>>>>+		hose->controller_ops.release_device(dev);
>>>>+
>>>>  	eeh_remove_device(dev);
>>>>  }
>>>>
>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>index 910fb67..ef8c216 100644
>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>@@ -12,6 +12,8 @@
>>>>  #undef DEBUG
>>>>
>>>>  #include <linux/kernel.h>
>>>>+#include <linux/atomic.h>
>>>>+#include <linux/kref.h>
>>>>  #include <linux/pci.h>
>>>>  #include <linux/crash_dump.h>
>>>>  #include <linux/debugfs.h>
>>>>@@ -47,6 +49,8 @@
>>>>  /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>>>>  #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
>>>>
>>>>+static void pnv_ioda_release_pe(struct kref *kref);
>>>>+
>>>>  static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>>>>  			    const char *fmt, ...)
>>>>  {
>>>>@@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>>>>  		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>>>>  }
>>>>
>>>>-static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>>>>+static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>>>  {
>>>>-	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>>>-		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>>>-			__func__, pe_no, phb->hose->global_number);
>>>>+	if (!pe)
>>>>+		return;
>>>>+
>>>>+	kref_get(&pe->kref);
>>>>+}
>>>>+
>>>>+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>>>+{
>>>>+	unsigned int count;
>>>>+
>>>>+	if (!pe)
>>>>  		return;
>>>>+
>>>>+	/*
>>>>+	 * The count is initialized to 1 and increased with 1 when
>>>>+	 * a new PCI device is bound with the PE. Once the last PCI
>>>>+	 * device is leaving from the PE, the PE is going to be
>>>>+	 * released.
>>>>+	 */
>>>>+	count = atomic_read(&pe->kref.refcount);
>>>>+	if (count == 2)
>>>>+		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
>>>>+	else
>>>>+		kref_put(&pe->kref, pnv_ioda_release_pe);
>>>
>>>
>>>What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?
>>>
>>
>>Yeah, that would have problem. But it shouldn't happen because the
>>PCI devices are joining the parent PE# in strictly serialized mode.
>>Same thing happens when detaching PCI devices from its parent PE.
>
>
>oookay. Another thing then - why is this kref counter initialized to 1?
>It would make sense if you did something special when the counter becomes 1
>after decrement but you do not.
>
>Also, this kref thing makes sense if you do kref_put() in multiple places and
>do not know which one will be the last one so you pass the callback to all of
>them. Here you do kref_put/sub in one place and you read the counter - so you
>can call pnv_ioda_release_pe() directly. And it feels like a simple atomic_t
>would do the job just fine. If you still feel that the counter should start
>from 1, there are atomic_dec_if_positive() and atomic_inc_not_zero() and
>others.
>

It's good question actually. The counter is initialized to 1 when the PE
is reserved because of M64 requirement or allocated for non-M64 case. If
we reserve or allocate PE#, there is one thing for sure: the PCI bus has
one PCI device (including PCI bridge) at least. After the PE# is reserved
or allocated, the PCI device joins the PE with the result of increasing
the counter with 1. It means the counter is 2 when PE contains one PCI
device, and 3 when there're 2 devices. One reason for this design is that
we just need decrease the counter if we have to release this PE in the
window between PE reservation/allocation and first PCI device joins. I
think you're correct that we can call pnv_ioda_release_pe() in this window.
In this way, the counter is always reflecting the number of PCI devices
the PE contains.

>>>>+}
>>>>+
>>>>+static void pnv_pci_release_device(struct pci_dev *pdev)
>>>>+{
>>>>+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>>>+	struct pnv_phb *phb = hose->private_data;
>>>>+	struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>+	struct pnv_ioda_pe *pe;
>>>>+
>>>>+	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>+		pe = &phb->ioda.pe_array[pdn->pe_number];
>>>>+		pnv_ioda_pe_put(pe);
>>>>+		pdn->pe_number = IODA_INVALID_PE;
>>>>  	}
>>>>+}
>>>>
>>>>-	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
>>>>-		pr_warn("%s: PE %d was assigned on PHB#%x\n",
>>>>-			__func__, pe_no, phb->hose->global_number);
>>>>+static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>>>+{
>>>>+	struct pnv_phb *phb = pe->phb;
>>>>+	int index, count;
>>>>+	unsigned long tbl_addr, tbl_size;
>>>>+
>>>>+	/* No DMA capability for slave PEs */
>>>>+	if (pe->flags & PNV_IODA_PE_SLAVE)
>>>>+		return;
>>>>+
>>>>+	/* Bypass DMA window */
>>>>+	if (phb->type == PNV_PHB_IODA2 &&
>>>>+	    pe->tce_bypass_enabled &&
>>>>+	    pe->tce32_table &&
>>>>+	    pe->tce32_table->set_bypass)
>>>>+		pe->tce32_table->set_bypass(pe->tce32_table, false);
>>>>+
>>>>+	/* 32-bits DMA window */
>>>>+	count = pe->tce32_seg_end - pe->tce32_seg_start;
>>>>+	tbl_addr = pe->tce32_table->it_base;
>>>>+	if (!count)
>>>>  		return;
>>>>+
>>>>+	/* Free IOMMU table */
>>>>+	iommu_free_table(pe->tce32_table,
>>>>+			 of_node_full_name(phb->hose->dn));
>>>>+
>>>>+	/* Deconfigure TCE table */
>>>>+	switch (phb->type) {
>>>>+	case PNV_PHB_IODA1:
>>>>+		for (index = 0; index < count; index++)
>>>>+			opal_pci_map_pe_dma_window(phb->opal_id,
>>>>+						   pe->pe_number,
>>>>+						   pe->tce32_seg_start + index,
>>>>+						   1,
>>>>+						   __pa(tbl_addr) +
>>>>+						   index * TCE32_TABLE_SIZE,
>>>>+						   0,
>>>>+						   0x1000);
>>>>+		bitmap_clear(phb->ioda.tce32_segmap,
>>>>+			     pe->tce32_seg_start,
>>>>+			     count);
>>>>+		tbl_size = TCE32_TABLE_SIZE * count;
>>>>+		break;
>>>>+	case PNV_PHB_IODA2:
>>>>+		opal_pci_map_pe_dma_window(phb->opal_id,
>>>>+					   pe->pe_number,
>>>>+					   pe->pe_number << 1,
>>>>+					   1,
>>>>+					   __pa(tbl_addr),
>>>>+					   0,
>>>>+					   0x1000);
>>>>+		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
>>>>+		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
>>>>+		break;
>>>>+	default:
>>>>+		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
>>>>+		return;
>>>>+	}
>>>>+
>>>>+	/* Free memory of IOMMU table */
>>>>+	free_pages(tbl_addr, get_order(tbl_size));
>>>
>>>
>>>You just programmed the table address to TVT and then you are releasing the
>>>pages. It does not seem right, it will leave garbage in TVT. Also, I am
>>>adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits
>>>from there (I'll post v10 soon, you'll be in copy and you'll have to review
>>>that ;) ).
>>>
>>
>>I assume you're talking about TVE. I don't understand how garbage will be left
>>in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE
>>with zero'ed "tce_table_size". The pages previously allocated for TCE table is
>>released to buddy system, which can be allocated by somebody else (from buddy
>>or slab).
>
>opal_pci_map_pe_dma_window() takes __pa(tbl_addr) which points to some memory
>which is still allocated. This value goes to a table (which has 2 entries per
>PE, one for 32bit DMA window and one for bypass/hugewindow) which PHB uses to
>get the actual TCE table address. What is the name of this table? :) Anyway,
>you write an address there and then you call free_pages() so after
>free_pages(), the value in that TVE/TVT/whatever table is a garbage.
>

I don't look into your DDW code yet. Before we have DDW patchset, the bypass
TVE (window) isn't supposed to have corresponding TCE table. I guess you might
change the behaviour in your DDW patchset and I'll take a close look on that.
For DMA32 window, which is the name of the table, the TVE is cleared by skiboot
when having zero "tce_table_size" argument.

	opal_pci_map_pe_dma_window(phb->opal_id,
				   pe->pe_number,
				   pe->pe_number << 1,
				   1,
				   __pa(tbl_addr),
				   0,			<<<< "tce_table_size".
				   0x1000);

>
>>
>>Ok. Please put me into the cc list. I guess the whole series of patches is
>>better to rebased on your DDW patchset, which is to be merged first, I believe.
>>
>>>
>>>>+	pe->tce32_table = NULL;
>>>>+	pe->tce32_seg_start = 0;
>>>>+	pe->tce32_seg_end = 0;
>>>>+}
>>>>+
>>>>+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>>>>+{
>>>>+	struct pnv_phb *phb = pe->phb;
>>>>+	unsigned long *segmap = NULL, *pe_segmap = NULL;
>>>>+	int i;
>>>>+	uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
>>>>+				     OPAL_M32_WINDOW_TYPE,
>>>>+				     OPAL_M64_WINDOW_TYPE };
>>>>+
>>>>+	for (win = 0; win < ARRAY_SIZE(win_type); win++) {
>>>>+		switch (win_type[win]) {
>>>>+		case OPAL_IO_WINDOW_TYPE:
>>>>+			segmap = phb->ioda.io_segmap;
>>>>+			pe_segmap = pe->io_segmap;
>>>>+			break;
>>>>+		case OPAL_M32_WINDOW_TYPE:
>>>>+			segmap = phb->ioda.m32_segmap;
>>>>+			pe_segmap = pe->m32_segmap;
>>>>+			break;
>>>>+		case OPAL_M64_WINDOW_TYPE:
>>>>+			segmap = phb->ioda.m64_segmap;
>>>>+			pe_segmap = pe->m64_segmap;
>>>>+			break;
>>>>+		}
>>>>+		i = -1;
>>>>+		while ((i = find_next_bit(pe_segmap,
>>>>+			phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
>>>>+			if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
>>>>+			    win_type[win] == OPAL_M32_WINDOW_TYPE)
>>>>+				opal_pci_map_pe_mmio_window(phb->opal_id,
>>>>+						phb->ioda.reserved_pe,
>>>>+						win_type[win], 0, i);
>>>>+			else if (phb->type == PNV_PHB_IODA1)
>>>>+				opal_pci_map_pe_mmio_window(phb->opal_id,
>>>>+						phb->ioda.reserved_pe,
>>>>+						win_type[win],
>>>>+						i / 8, i % 8);
>>>
>>>The function is called ""release" but it programs something what looks like
>>>reasonable values, is it correct?
>>>
>>
>>It's out of problem, When the segment is deallocated, it's mapped to the
>>reserved PE#.
>>
>>>
>>>
>>>>+
>>>>+			clear_bit(i, pe_segmap);
>>>>+			clear_bit(i, segmap);
>>>>+		}
>>>>+	}
>>>>+}
>>>>+
>>>>+static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>>>+				  struct pnv_ioda_pe *parent,
>>>>+				  struct pnv_ioda_pe *child,
>>>>+				  bool is_add)
>>>>+{
>>>>+	const char *desc = is_add ? "adding" : "removing";
>>>>+	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>>>+			      OPAL_REMOVE_PE_FROM_DOMAIN;
>>>>+	struct pnv_ioda_pe *slave;
>>>>+	long rc;
>>>>+
>>>>+	/* Parent PE affects child PE */
>>>>+	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>>>+				child->pe_number, op);
>>>>+	if (rc != OPAL_SUCCESS) {
>>>>+		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>>>+			rc, desc);
>>>>+		return -ENXIO;
>>>>+	}
>>>>+
>>>>+	if (!(child->flags & PNV_IODA_PE_MASTER))
>>>>+		return 0;
>>>>+
>>>>+	/* Compound case: parent PE affects slave PEs */
>>>>+	list_for_each_entry(slave, &child->slaves, list) {
>>>>+		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>>>+					slave->pe_number, op);
>>>>+		if (rc != OPAL_SUCCESS) {
>>>>+			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>>>+				rc, desc);
>>>>+			return -ENXIO;
>>>>+		}
>>>>+	}
>>>>+
>>>>+	return 0;
>>>>+}
>>>>+
>>>>+static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
>>>>+{
>>>>+	struct pnv_phb *phb = pe->phb;
>>>>+	struct pnv_ioda_pe *slave;
>>>>+	struct pci_dev *pdev = NULL;
>>>>+	int ret;
>>>>+
>>>>+	/*
>>>>+	 * Clear PE frozen state. If it's master PE, we need
>>>>+	 * clear slave PE frozen state as well.
>>>>+	 */
>>>>+	opal_pci_eeh_freeze_clear(phb->opal_id,
>>>>+				  pe->pe_number,
>>>>+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>+		list_for_each_entry(slave, &pe->slaves, list) {
>>>>+			opal_pci_eeh_freeze_clear(phb->opal_id,
>>>>+						  slave->pe_number,
>>>>+						  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>+		}
>>>>+	}
>>>>+
>>>>+	/*
>>>>+	 * Associate PE in PELT. We need add the PE into the
>>>>+	 * corresponding PELT-V as well. Otherwise, the error
>>>>+	 * originated from the PE might contribute to other
>>>>+	 * PEs.
>>>>+	 */
>>>>+	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>>>+	if (ret)
>>>>+		return ret;
>>>>+
>>>>+	/* For compound PEs, any one affects all of them */
>>>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>+		list_for_each_entry(slave, &pe->slaves, list) {
>>>>+			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>>>+			if (ret)
>>>>+				return ret;
>>>>+		}
>>>>+	}
>>>>+
>>>>+	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>>>+		pdev = pe->pbus->self;
>>>>+	else if (pe->flags & PNV_IODA_PE_DEV)
>>>>+		pdev = pe->pdev->bus->self;
>>>>+#ifdef CONFIG_PCI_IOV
>>>>+	else if (pe->flags & PNV_IODA_PE_VF)
>>>>+		pdev = pe->parent_dev->bus->self;
>>>>+#endif /* CONFIG_PCI_IOV */
>>>>+
>>>>+	while (pdev) {
>>>>+		struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>+		struct pnv_ioda_pe *parent;
>>>>+
>>>>+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>+			parent = &phb->ioda.pe_array[pdn->pe_number];
>>>>+			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>>>+			if (ret)
>>>>+				return ret;
>>>>+		}
>>>>+
>>>>+		pdev = pdev->bus->self;
>>>>+	}
>>>>+
>>>>+	return 0;
>>>>+}
>>>>+
>>>>+static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)
>>>
>>>
>>>It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just
>>>moving of this function to a different place deserves a separate patch with a
>>>comment why ("it is going to be used now for non-SRIOV case too" may be?).
>>>
>>
>>Yeah, it makes sense to me. Will fix it up.
>>
>>>
>>>>+{
>>>>+	struct pnv_phb *phb = pe->phb;
>>>>+	struct pci_dev *parent;
>>>>+	uint8_t bcomp, dcomp, fcomp;
>>>>+	long rid_end, rid;
>>>>+	int64_t rc;
>>>>+
>>>>+	/* Tear down MVE */
>>>>+	if (phb->type == PNV_PHB_IODA1 &&
>>>>+	    pe->mve_number != -1) {
>>>>+		rc = opal_pci_set_mve(phb->opal_id,
>>>>+				      pe->mve_number,
>>>>+				      phb->ioda.reserved_pe);
>>>>+		if (rc != OPAL_SUCCESS)
>>>>+			pe_warn(pe, "Error %lld unmapping MVE#%d\n",
>>>>+				rc, pe->mve_number);
>>>>+		rc = opal_pci_set_mve_enable(phb->opal_id,
>>>>+					     pe->mve_number,
>>>>+					     OPAL_DISABLE_MVE);
>>>>+		if (rc != OPAL_SUCCESS)
>>>>+			pe_warn(pe, "Error %lld disabling MVE#%d\n",
>>>>+				rc, pe->mve_number);
>>>>+		pe->mve_number = -1;
>>>>+	}
>>>>+
>>>>+	/* Unmapping PELTV */
>>>>+	pnv_ioda_set_peltv(pe, false);
>>>>+
>>>>+	/* To unmap PELTM */
>>>>+	if (pe->pbus) {
>>>>+		int count;
>>>>+
>>>>+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>>>+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>>>+		parent = pe->pbus->self;
>>>>+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>>>+			count = pe->pbus->busn_res.end -
>>>>+				pe->pbus->busn_res.start + 1;
>>>>+		else
>>>>+			count = 1;
>>>>+
>>>>+		switch(count) {
>>>>+		case  1: bcomp = OpalPciBusAll;   break;
>>>>+		case  2: bcomp = OpalPciBus7Bits; break;
>>>>+		case  4: bcomp = OpalPciBus6Bits; break;
>>>>+		case  8: bcomp = OpalPciBus5Bits; break;
>>>>+		case 16: bcomp = OpalPciBus4Bits; break;
>>>>+		case 32: bcomp = OpalPciBus3Bits; break;
>>>>+		default:
>>>>+			/* Fail back to case of one bus */
>>>>+			pe_warn(pe, "Cannot support %d buses\n", count);
>>>>+			bcomp = OpalPciBusAll;
>>>>+		}
>>>>+		rid_end = pe->rid + (count << 8);
>>>>+	} else {
>>>>+#ifdef CONFIG_PCI_IOV
>>>>+		if (pe->flags & PNV_IODA_PE_VF)
>>>>+			parent = pe->parent_dev;
>>>>+		else
>>>>+#endif
>>>>+			parent = pe->pdev->bus->self;
>>>>+		bcomp = OpalPciBusAll;
>>>>+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>>>+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>>>+		rid_end = pe->rid + 1;
>>>>+	}
>>>>+
>>>>+	/* Clear RID mapping */
>>>>+	for (rid = pe->rid; rid < rid_end; rid++)
>>>>+		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>>>+
>>>>+	/* Unmapping PELTM */
>>>>+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>>>+			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>>>+	if (rc)
>>>>+		pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
>>>>+}
>>>>+
>>>>+static void pnv_ioda_release_pe(struct kref *kref)
>>>>+{
>>>>+	struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
>>>>+	struct pnv_ioda_pe *tmp, *slave;
>>>>+	struct pnv_phb *phb = pe->phb;
>>>>+
>>>>+	pnv_ioda_release_pe_dma(pe);
>>>>+	pnv_ioda_release_pe_seg(pe);
>>>>+	pnv_ioda_deconfigure_pe(pe);
>>>>+
>>>>+	/* Release slave PEs for compound PE */
>>>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>+		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
>>>>+			pnv_ioda_pe_put(slave);
>>>>+	}
>>>>+
>>>>+	/* Remove the PE from various list. We need remove slave
>>>>+	 * PE from master's list.
>>>>+	 */
>>>>+	list_del(&pe->dma_link);
>>>>+	list_del(&pe->list);
>>>>+
>>>>+	/* Free PE number */
>>>>+	clear_bit(pe->pe_number, phb->ioda.pe_alloc);
>>>>+}
>>>>+
>>>>+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
>>>>+					    int pe_no)
>>>>+{
>>>>+	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
>>>>+
>>>>+	kref_init(&pe->kref);
>>>>+	pe->phb = phb;
>>>>+	pe->pe_number = pe_no;
>>>>+	INIT_LIST_HEAD(&pe->dma_link);
>>>>+	INIT_LIST_HEAD(&pe->list);
>>>>+
>>>>+	return pe;
>>>>+}
>>>>+
>>>>+static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
>>>>+					       int pe_no)
>>>>+{
>>>>+	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>>>+		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>>>+			__func__, pe_no, phb->hose->global_number);
>>>>+		return NULL;
>>>>  	}
>>>>
>>>>-	phb->ioda.pe_array[pe_no].phb = phb;
>>>>-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>>>+	/*
>>>>+	 * Same PE might be reserved for multiple times, which
>>>>+	 * is out of problem actually.
>>>>+	 */
>>>>+	set_bit(pe_no, phb->ioda.pe_alloc);
>>>>+	return pnv_ioda_init_pe(phb, pe_no);
>>>>  }
>>>>
>>>>-static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>>>+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>>>  {
>>>>  	unsigned long pe_no;
>>>>  	unsigned long limit = phb->ioda.total_pe - 1;
>>>>@@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>>>  			break;
>>>>
>>>>  		if (--limit >= phb->ioda.total_pe)
>>>>-			return IODA_INVALID_PE;
>>>>+			return NULL;
>>>>  	} while(1);
>>>>
>>>>-	phb->ioda.pe_array[pe_no].phb = phb;
>>>>-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>>>-	return pe_no;
>>>>-}
>>>>-
>>>>-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>>>>-{
>>>>-	WARN_ON(phb->ioda.pe_array[pe].pdev);
>>>>-
>>>>-	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
>>>>-	clear_bit(pe, phb->ioda.pe_alloc);
>>>>+	return pnv_ioda_init_pe(phb, pe_no);
>>>>  }
>>>>
>>>>  static int pnv_ioda1_init_m64(struct pnv_phb *phb)
>>>>@@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
>>>>  	}
>>>>  }
>>>>
>>>>-static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>-				struct pci_bus *bus, int all)
>>>>+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>+						struct pci_bus *bus,
>>>>+						int all)
>>>
>>>
>>>Mechanic changes like this could easily go to a separate patch.
>>>
>>
>>Indeed. I'll see how I can split the patches up in next revision.
>>Thanks for the suggestion.
>>
>>>>  {
>>>>  	resource_size_t segsz = phb->ioda.m64_segsize;
>>>>  	struct pci_dev *pdev;
>>>>@@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>  	int i;
>>>>
>>>>  	if (!pnv_ioda_need_m64_pe(phb, bus))
>>>>-		return IODA_INVALID_PE;
>>>>+		return NULL;
>>>>
>>>>          /* Allocate bitmap */
>>>>  	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
>>>>  	pe_bitsmap = kzalloc(size, GFP_KERNEL);
>>>>  	if (!pe_bitsmap) {
>>>>  		pr_warn("%s: Out of memory !\n", __func__);
>>>>-		return IODA_INVALID_PE;
>>>>+		return NULL;
>>>>  	}
>>>>
>>>>  	/* The bridge's M64 window might be extended to PHB's M64
>>>>@@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>  	/* No M64 window found ? */
>>>>  	if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
>>>>  		kfree(pe_bitsmap);
>>>>-		return IODA_INVALID_PE;
>>>>+		return NULL;
>>>>  	}
>>>>
>>>>  	/* Figure out the master PE and put all slave PEs
>>>>@@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>  	}
>>>>
>>>>  	kfree(pe_bitsmap);
>>>>-	return master_pe->pe_number;
>>>>+	return master_pe;
>>>>  }
>>>>
>>>>  static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
>>>>@@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
>>>>   * but in the meantime, we need to protect them to avoid warnings
>>>>   */
>>>>  #ifdef CONFIG_PCI_MSI
>>>>-static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>>>+static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
>>>>  {
>>>>  	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>>>  	struct pnv_phb *phb = hose->private_data;
>>>>@@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>>>  }
>>>>  #endif /* CONFIG_PCI_MSI */
>>>>
>>>>-static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>>>-				  struct pnv_ioda_pe *parent,
>>>>-				  struct pnv_ioda_pe *child,
>>>>-				  bool is_add)
>>>>-{
>>>>-	const char *desc = is_add ? "adding" : "removing";
>>>>-	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>>>-			      OPAL_REMOVE_PE_FROM_DOMAIN;
>>>>-	struct pnv_ioda_pe *slave;
>>>>-	long rc;
>>>>-
>>>>-	/* Parent PE affects child PE */
>>>>-	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>>>-				child->pe_number, op);
>>>>-	if (rc != OPAL_SUCCESS) {
>>>>-		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>>>-			rc, desc);
>>>>-		return -ENXIO;
>>>>-	}
>>>>-
>>>>-	if (!(child->flags & PNV_IODA_PE_MASTER))
>>>>-		return 0;
>>>>-
>>>>-	/* Compound case: parent PE affects slave PEs */
>>>>-	list_for_each_entry(slave, &child->slaves, list) {
>>>>-		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>>>-					slave->pe_number, op);
>>>>-		if (rc != OPAL_SUCCESS) {
>>>>-			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>>>-				rc, desc);
>>>>-			return -ENXIO;
>>>>-		}
>>>>-	}
>>>>-
>>>>-	return 0;
>>>>-}
>>>>-
>>>>-static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>>>>-			      struct pnv_ioda_pe *pe,
>>>>-			      bool is_add)
>>>>-{
>>>>-	struct pnv_ioda_pe *slave;
>>>>-	struct pci_dev *pdev = NULL;
>>>>-	int ret;
>>>>-
>>>>-	/*
>>>>-	 * Clear PE frozen state. If it's master PE, we need
>>>>-	 * clear slave PE frozen state as well.
>>>>-	 */
>>>>-	if (is_add) {
>>>>-		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
>>>>-					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>-		if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>-			list_for_each_entry(slave, &pe->slaves, list)
>>>>-				opal_pci_eeh_freeze_clear(phb->opal_id,
>>>>-							  slave->pe_number,
>>>>-							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>-		}
>>>>-	}
>>>>-
>>>>-	/*
>>>>-	 * Associate PE in PELT. We need add the PE into the
>>>>-	 * corresponding PELT-V as well. Otherwise, the error
>>>>-	 * originated from the PE might contribute to other
>>>>-	 * PEs.
>>>>-	 */
>>>>-	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>>>-	if (ret)
>>>>-		return ret;
>>>>-
>>>>-	/* For compound PEs, any one affects all of them */
>>>>-	if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>-		list_for_each_entry(slave, &pe->slaves, list) {
>>>>-			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>>>-			if (ret)
>>>>-				return ret;
>>>>-		}
>>>>-	}
>>>>-
>>>>-	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>>>-		pdev = pe->pbus->self;
>>>>-	else if (pe->flags & PNV_IODA_PE_DEV)
>>>>-		pdev = pe->pdev->bus->self;
>>>>-#ifdef CONFIG_PCI_IOV
>>>>-	else if (pe->flags & PNV_IODA_PE_VF)
>>>>-		pdev = pe->parent_dev->bus->self;
>>>>-#endif /* CONFIG_PCI_IOV */
>>>>-	while (pdev) {
>>>>-		struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>-		struct pnv_ioda_pe *parent;
>>>>-
>>>>-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>-			parent = &phb->ioda.pe_array[pdn->pe_number];
>>>>-			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>>>-			if (ret)
>>>>-				return ret;
>>>>-		}
>>>>-
>>>>-		pdev = pdev->bus->self;
>>>>-	}
>>>>-
>>>>-	return 0;
>>>>-}
>>>>-
>>>>-#ifdef CONFIG_PCI_IOV
>>>>-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>>>-{
>>>>-	struct pci_dev *parent;
>>>>-	uint8_t bcomp, dcomp, fcomp;
>>>>-	int64_t rc;
>>>>-	long rid_end, rid;
>>>>-
>>>>-	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
>>>>-	if (pe->pbus) {
>>>>-		int count;
>>>>-
>>>>-		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>>>-		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>>>-		parent = pe->pbus->self;
>>>>-		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>>>-			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
>>>>-		else
>>>>-			count = 1;
>>>>-
>>>>-		switch(count) {
>>>>-		case  1: bcomp = OpalPciBusAll;         break;
>>>>-		case  2: bcomp = OpalPciBus7Bits;       break;
>>>>-		case  4: bcomp = OpalPciBus6Bits;       break;
>>>>-		case  8: bcomp = OpalPciBus5Bits;       break;
>>>>-		case 16: bcomp = OpalPciBus4Bits;       break;
>>>>-		case 32: bcomp = OpalPciBus3Bits;       break;
>>>>-		default:
>>>>-			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
>>>>-			        count);
>>>>-			/* Do an exact match only */
>>>>-			bcomp = OpalPciBusAll;
>>>>-		}
>>>>-		rid_end = pe->rid + (count << 8);
>>>>-	} else {
>>>>-		if (pe->flags & PNV_IODA_PE_VF)
>>>>-			parent = pe->parent_dev;
>>>>-		else
>>>>-			parent = pe->pdev->bus->self;
>>>>-		bcomp = OpalPciBusAll;
>>>>-		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>>>-		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>>>-		rid_end = pe->rid + 1;
>>>>-	}
>>>>-
>>>>-	/* Clear the reverse map */
>>>>-	for (rid = pe->rid; rid < rid_end; rid++)
>>>>-		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>>>-
>>>>-	/* Release from all parents PELT-V */
>>>>-	while (parent) {
>>>>-		struct pci_dn *pdn = pci_get_pdn(parent);
>>>>-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>-			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
>>>>-						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>>>-			/* XXX What to do in case of error ? */
>>>
>>>
>>>Not much :) Free associated memory and mark it "dead" so it won't be used
>>>again till reboot. In what circumstance can this opal_pci_set_peltv() fail at
>>>all?
>>>
>>
>>Yeah, maybe. Until now, I didn't see this failure since the code is there
>>from the day. Note the code has been there for almost 4 years since the
>>day Ben wrote it.
>
>
>Sure. But if it starts failing, we won't even notice it - there is no even
>pr_err() or WARN_ON.
>

Agree. I'll see what I can do. At least I can have error message to alert.

>>
>>>
>>>>-		}
>>>>-		parent = parent->bus->self;
>>>>-	}
>>>>-
>>>>-	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
>>>>-				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>-
>>>>-	/* Disassociate PE in PELT */
>>>>-	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
>>>>-				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>>>-	if (rc)
>>>>-		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
>>>>-	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>>>-			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>>>-	if (rc)
>>>>-		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
>>>>-
>>>>-	pe->pbus = NULL;
>>>>-	pe->pdev = NULL;
>>>>-	pe->parent_dev = NULL;
>>>>-
>>>>-	return 0;
>>>>-}
>>>>-#endif /* CONFIG_PCI_IOV */
>>>>-
>>>>  static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>>>  {
>>>>  	struct pci_dev *parent;
>>>>@@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>>>  	}
>>>>
>>>>  	/* Configure PELTV */
>>>>-	pnv_ioda_set_peltv(phb, pe, true);
>>>>+	pnv_ioda_set_peltv(pe, true);
>>>>
>>>>  	/* Setup reverse map */
>>>>  	for (rid = pe->rid; rid < rid_end; rid++)
>>>>@@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>>>>  		if (pdn->pe_number != IODA_INVALID_PE)
>>>>  			continue;
>>>>
>>>>+		/* Increase reference count of the parent PE */
>>>
>>>When you comment like this, I read it as the comment belongs to the whole
>>>next chunk till the first empty line, i.e. to all 5 lines below, which is not
>>>the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() name
>>>suggests incrementing the reference counter 2) "pe" is always parent in this
>>>function. I do not insist though.
>>>
>>
>>Agree on your explaining. I'll remove this unuseful comments.
>>
>>>
>>>>+		pnv_ioda_pe_get(pe);
>>>>  		pdn->pe_number = pe->pe_number;
>>>>  		pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
>>>>  		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>>>@@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>>  {
>>>>  	struct pci_controller *hose = pci_bus_to_host(bus);
>>>>  	struct pnv_phb *phb = hose->private_data;
>>>>-	struct pnv_ioda_pe *pe;
>>>>+	struct pnv_ioda_pe *pe = NULL;
>>>>  	int pe_num = IODA_INVALID_PE;
>>>>
>>>>  	/* For partial hotplug case, the PE instance hasn't been destroyed
>>>>@@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>>  	}
>>>>
>>>>  	/* PE number for root bus should have been reserved */
>>>>-	if (pci_is_root_bus(bus))
>>>>-		pe_num = phb->ioda.root_pe_no;
>>>>+	if (pci_is_root_bus(bus) &&
>>>>+	    phb->ioda.root_pe_no != IODA_INVALID_PE)
>>>>+		pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
>>>>
>>>>  	/* Check if PE is determined by M64 */
>>>>-	if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
>>>>-		pe_num = phb->pick_m64_pe(phb, bus, all);
>>>>+	if (!pe && phb->pick_m64_pe)
>>>>+		pe = phb->pick_m64_pe(phb, bus, all);
>>>>
>>>>  	/* The PE number isn't pinned by M64 */
>>>>-	if (pe_num == IODA_INVALID_PE)
>>>>-		pe_num = pnv_ioda_alloc_pe(phb);
>>>>+	if (!pe)
>>>>+		pe = pnv_ioda_alloc_pe(phb);
>>>>
>>>>-	if (pe_num == IODA_INVALID_PE) {
>>>>-		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
>>>>+	if (!pe) {
>>>>+		pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
>>>>  			__func__, pci_domain_nr(bus), bus->number);
>>>>  		return NULL;
>>>>  	}
>>>>
>>>>-	pe = &phb->ioda.pe_array[pe_num];
>>>>  	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>>>>  	pe->pbus = bus;
>>>>  	pe->pdev = NULL;
>>>>@@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>>
>>>>  	if (pnv_ioda_configure_pe(phb, pe)) {
>>>>  		/* XXX What do we do here ? */
>>>>-		if (pe_num)
>>>>-			pnv_ioda_free_pe(phb, pe_num);
>>>>-		pe->pbus = NULL;
>>>>+		pnv_ioda_pe_put(pe);
>>>>  		return NULL;
>>>>  	}
>>>>
>>>>  	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>>>>-			GFP_KERNEL, hose->node);
>>>>+				       GFP_KERNEL, hose->node);
>>>
>>>Seems like spaces change only - if you really want this change (which I hate
>>>- makes code look inaccurate to my taste but it seems I am in minority here
>>>:) ), please put it to the separate patch.
>>>
>>
>>Ok. Confirm with you: You prefer the original format? I don't know
>>why I prefer the later one. Maybe my eyes are quite broken :-)
>
>
>I prefer not to change existing whitespaces unless it is done once and for
>the entire file :) Just remove this change from the patch.
>

Sure.

>>>
>>>>  	pe->tce32_table->data = pe;
>>>>
>>>>  	/* Associate it with all child devices */
>>>>@@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>>  		list_del(&pe->list);
>>>>  		mutex_unlock(&phb->ioda.pe_list_mutex);
>>>>
>>>>-		pnv_ioda_deconfigure_pe(phb, pe);
>>>>+		pnv_ioda_deconfigure_pe(pe);
>>>
>>>
>>>Is this change necessary to get "Release PEs dynamically" working? Move it to
>>>mechanical changes patch may be?
>>>
>>
>>Ok. I'll try to do that.
>>
>>>
>>>>
>>>>-		pnv_ioda_free_pe(phb, pe->pe_number);
>>>>+		pnv_ioda_pe_put(pe);
>>>>  	}
>>>>  }
>>>>
>>>>@@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>>
>>>>  		if (pnv_ioda_configure_pe(phb, pe)) {
>>>>  			/* XXX What do we do here ? */
>>>>-			if (pe_num)
>>>>-				pnv_ioda_free_pe(phb, pe_num);
>>>>-			pe->pdev = NULL;
>>>>+			pnv_ioda_pe_put(pe);
>>>>  			continue;
>>>>  		}
>>>>
>>>>@@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
>>>>  	struct pnv_ioda_pe *pe;
>>>>  	int rc;
>>>>
>>>>-	pe = pnv_ioda_get_pe(dev);
>>>>+	pe = pnv_ioda_pci_dev_to_pe(dev);
>>>
>>>
>>>And this change could to separately. Not clear how this helps to "Release PEs
>>>dynamically".
>>>
>>>
>>
>>It's not related to "Release PEs dynamically". The change is introduced by
>>the function rename: Original pnv_ioda_get_pe() is renamed to pnv_ioda_pci_dev_to_pe().
>
>
>But the rename happened in this patch and the patch's subj is "Release PEs
>dynamically" so it should be related somehow or move it to a simple separate
>patch "let's give the lalala function a better name to reflect what it
>actually does" (but in this case the new name does not make any more sense
>than the old one).
>

Yeah, I'll try to split the patches to separate blala and walala :-)

>>>>  	if (!pe)
>>>>  		return -ENODEV;
>>>>
>>>>@@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
>>>>  	struct pnv_ioda_pe *pe;
>>>>  	int rc;
>>>>
>>>>-	if (!(pe = pnv_ioda_get_pe(dev)))
>>>>+	if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
>>>>  		return -ENODEV;
>>>>
>>>>  	/* Assign XIVE to PE */
>>>>@@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
>>>>  				  unsigned int hwirq, unsigned int virq,
>>>>  				  unsigned int is_64, struct msi_msg *msg)
>>>>  {
>>>>-	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
>>>>+	struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
>>>>  	unsigned int xive_num = hwirq - phb->msi_base;
>>>>  	__be32 data;
>>>>  	int rc;
>>>>@@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>>>>  	pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
>>>>  	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
>>>>  	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
>>>>+	pnv_pci_controller_ops.release_device = pnv_pci_release_device;
>>>>  	hose->controller_ops = pnv_pci_controller_ops;
>>>>
>>>>  #ifdef CONFIG_PCI_IOV
>>>>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>>>index 1bea3a8..8b10f01 100644
>>>>--- a/arch/powerpc/platforms/powernv/pci.h
>>>>+++ b/arch/powerpc/platforms/powernv/pci.h
>>>>@@ -28,6 +28,7 @@ enum pnv_phb_model {
>>>>  /* Data associated with a PE, including IOMMU tracking etc.. */
>>>>  struct pnv_phb;
>>>>  struct pnv_ioda_pe {
>>>>+	struct kref		kref;
>>>>  	unsigned long		flags;
>>>>  	struct pnv_phb		*phb;
>>>>
>>>>@@ -120,7 +121,8 @@ struct pnv_phb {
>>>>  	void (*shutdown)(struct pnv_phb *phb);
>>>>  	int (*init_m64)(struct pnv_phb *phb);
>>>>  	void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
>>>>-	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
>>>>+	struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
>>>>+					   struct pci_bus *bus, int all);
>>>>  	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>>>>  	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
>>>>  	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
>>>>

Thanks,
Gavin
Alexey Kardashevskiy May 12, 2015, 12:53 a.m. UTC | #5
On 05/12/2015 10:03 AM, Gavin Shan wrote:
> On Mon, May 11, 2015 at 05:02:08PM +1000, Alexey Kardashevskiy wrote:
>> On 05/11/2015 04:25 PM, Gavin Shan wrote:
>>> On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote:
>>>> On 05/01/2015 04:02 PM, Gavin Shan wrote:
>>>>> The original code doesn't support releasing PEs dynamically, meaning
>>>>> that PE and the associated resources (IO, M32, M64 and DMA) can't
>>>>> be released when unplugging a PCI adapter from one hotpluggable slot.
>>>>>
>>>>> The patch takes object oriented methodology, introducs reference
>>>>> count to PE, which is initialized to 1 and increased with 1 when a
>>>>> new PCI device joins the PE. Once the last PCI device leaves the
>>>>> PE, the PE is going to be release together with its associated
>>>>> (IO, M32, M64, DMA) resources.
>>>>
>>>>
>>>> Too little commit log for non-trivial non-cut-n-paste 30KB patch...
>>>>
>>>
>>> Ok. I'll add more details in next revision.
>>>
>>>>>
>>>>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>>>> ---
>>>>>   arch/powerpc/include/asm/pci-bridge.h     |   3 +
>>>>>   arch/powerpc/kernel/pci-hotplug.c         |   5 +
>>>>>   arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
>>>>>   arch/powerpc/platforms/powernv/pci.h      |   4 +-
>>>>>   4 files changed, 432 insertions(+), 238 deletions(-)
>>>>>
>>>>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>> index 5367eb3..a6ad4b1 100644
>>>>> --- a/arch/powerpc/include/asm/pci-bridge.h
>>>>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>> @@ -31,6 +31,9 @@ struct pci_controller_ops {
>>>>>   	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
>>>>>   	void		(*setup_bridge)(struct pci_bus *, unsigned long);
>>>>>   	void		(*reset_secondary_bus)(struct pci_dev *dev);
>>>>> +
>>>>> +	/* Called when PCI device is released */
>>>>> +	void		(*release_device)(struct pci_dev *);
>>>>>   };
>>>>>
>>>>>   /*
>>>>> diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
>>>>> index 7ed85a6..0040343 100644
>>>>> --- a/arch/powerpc/kernel/pci-hotplug.c
>>>>> +++ b/arch/powerpc/kernel/pci-hotplug.c
>>>>> @@ -29,6 +29,11 @@
>>>>>    */
>>>>>   void pcibios_release_device(struct pci_dev *dev)
>>>>>   {
>>>>> +	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>>>> +
>>>>> +	if (hose->controller_ops.release_device)
>>>>> +		hose->controller_ops.release_device(dev);
>>>>> +
>>>>>   	eeh_remove_device(dev);
>>>>>   }
>>>>>
>>>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>> index 910fb67..ef8c216 100644
>>>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>> @@ -12,6 +12,8 @@
>>>>>   #undef DEBUG
>>>>>
>>>>>   #include <linux/kernel.h>
>>>>> +#include <linux/atomic.h>
>>>>> +#include <linux/kref.h>
>>>>>   #include <linux/pci.h>
>>>>>   #include <linux/crash_dump.h>
>>>>>   #include <linux/debugfs.h>
>>>>> @@ -47,6 +49,8 @@
>>>>>   /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>>>>>   #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
>>>>>
>>>>> +static void pnv_ioda_release_pe(struct kref *kref);
>>>>> +
>>>>>   static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>>>>>   			    const char *fmt, ...)
>>>>>   {
>>>>> @@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>>>>>   		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>>>>>   }
>>>>>
>>>>> -static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>>>>> +static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>>>>   {
>>>>> -	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>>>> -		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>>>> -			__func__, pe_no, phb->hose->global_number);
>>>>> +	if (!pe)
>>>>> +		return;
>>>>> +
>>>>> +	kref_get(&pe->kref);
>>>>> +}
>>>>> +
>>>>> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>>>> +{
>>>>> +	unsigned int count;
>>>>> +
>>>>> +	if (!pe)
>>>>>   		return;
>>>>> +
>>>>> +	/*
>>>>> +	 * The count is initialized to 1 and increased with 1 when
>>>>> +	 * a new PCI device is bound with the PE. Once the last PCI
>>>>> +	 * device is leaving from the PE, the PE is going to be
>>>>> +	 * released.
>>>>> +	 */
>>>>> +	count = atomic_read(&pe->kref.refcount);
>>>>> +	if (count == 2)
>>>>> +		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
>>>>> +	else
>>>>> +		kref_put(&pe->kref, pnv_ioda_release_pe);
>>>>
>>>>
>>>> What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?
>>>>
>>>
>>> Yeah, that would have problem. But it shouldn't happen because the
>>> PCI devices are joining the parent PE# in strictly serialized mode.
>>> Same thing happens when detaching PCI devices from its parent PE.
>>
>>
>> oookay. Another thing then - why is this kref counter initialized to 1?
>> It would make sense if you did something special when the counter becomes 1
>> after decrement but you do not.
>>
>> Also, this kref thing makes sense if you do kref_put() in multiple places and
>> do not know which one will be the last one so you pass the callback to all of
>> them. Here you do kref_put/sub in one place and you read the counter - so you
>> can call pnv_ioda_release_pe() directly. And it feels like a simple atomic_t
>> would do the job just fine. If you still feel that the counter should start
>>from 1, there are atomic_dec_if_positive() and atomic_inc_not_zero() and
>> others.
>>
>
> It's good question actually. The counter is initialized to 1 when the PE
> is reserved because of M64 requirement or allocated for non-M64 case. If
> we reserve or allocate PE#, there is one thing for sure: the PCI bus has
> one PCI device (including PCI bridge) at least. After the PE# is reserved
> or allocated, the PCI device joins the PE with the result of increasing
> the counter with 1. It means the counter is 2 when PE contains one PCI
> device, and 3 when there're 2 devices. One reason for this design is that
> we just need decrease the counter if we have to release this PE in the
> window between PE reservation/allocation and first PCI device joins. I
> think you're correct that we can call pnv_ioda_release_pe() in this window.
> In this way, the counter is always reflecting the number of PCI devices
> the PE contains.


Good :) I believe it was something different 2-3 versions ago and evolved 
to this so you do not notice it straight away :)


>
>>>>> +}
>>>>> +
>>>>> +static void pnv_pci_release_device(struct pci_dev *pdev)
>>>>> +{
>>>>> +	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>>>> +	struct pnv_phb *phb = hose->private_data;
>>>>> +	struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>> +	struct pnv_ioda_pe *pe;
>>>>> +
>>>>> +	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>> +		pe = &phb->ioda.pe_array[pdn->pe_number];
>>>>> +		pnv_ioda_pe_put(pe);
>>>>> +		pdn->pe_number = IODA_INVALID_PE;
>>>>>   	}
>>>>> +}
>>>>>
>>>>> -	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
>>>>> -		pr_warn("%s: PE %d was assigned on PHB#%x\n",
>>>>> -			__func__, pe_no, phb->hose->global_number);
>>>>> +static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>>>> +{
>>>>> +	struct pnv_phb *phb = pe->phb;
>>>>> +	int index, count;
>>>>> +	unsigned long tbl_addr, tbl_size;
>>>>> +
>>>>> +	/* No DMA capability for slave PEs */
>>>>> +	if (pe->flags & PNV_IODA_PE_SLAVE)
>>>>> +		return;
>>>>> +
>>>>> +	/* Bypass DMA window */
>>>>> +	if (phb->type == PNV_PHB_IODA2 &&
>>>>> +	    pe->tce_bypass_enabled &&
>>>>> +	    pe->tce32_table &&
>>>>> +	    pe->tce32_table->set_bypass)
>>>>> +		pe->tce32_table->set_bypass(pe->tce32_table, false);
>>>>> +
>>>>> +	/* 32-bits DMA window */
>>>>> +	count = pe->tce32_seg_end - pe->tce32_seg_start;
>>>>> +	tbl_addr = pe->tce32_table->it_base;
>>>>> +	if (!count)
>>>>>   		return;
>>>>> +
>>>>> +	/* Free IOMMU table */
>>>>> +	iommu_free_table(pe->tce32_table,
>>>>> +			 of_node_full_name(phb->hose->dn));
>>>>> +
>>>>> +	/* Deconfigure TCE table */
>>>>> +	switch (phb->type) {
>>>>> +	case PNV_PHB_IODA1:
>>>>> +		for (index = 0; index < count; index++)
>>>>> +			opal_pci_map_pe_dma_window(phb->opal_id,
>>>>> +						   pe->pe_number,
>>>>> +						   pe->tce32_seg_start + index,
>>>>> +						   1,
>>>>> +						   __pa(tbl_addr) +
>>>>> +						   index * TCE32_TABLE_SIZE,
>>>>> +						   0,
>>>>> +						   0x1000);
>>>>> +		bitmap_clear(phb->ioda.tce32_segmap,
>>>>> +			     pe->tce32_seg_start,
>>>>> +			     count);
>>>>> +		tbl_size = TCE32_TABLE_SIZE * count;
>>>>> +		break;
>>>>> +	case PNV_PHB_IODA2:
>>>>> +		opal_pci_map_pe_dma_window(phb->opal_id,
>>>>> +					   pe->pe_number,
>>>>> +					   pe->pe_number << 1,
>>>>> +					   1,
>>>>> +					   __pa(tbl_addr),
>>>>> +					   0,
>>>>> +					   0x1000);
>>>>> +		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
>>>>> +		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
>>>>> +		break;
>>>>> +	default:
>>>>> +		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
>>>>> +		return;
>>>>> +	}
>>>>> +
>>>>> +	/* Free memory of IOMMU table */
>>>>> +	free_pages(tbl_addr, get_order(tbl_size));
>>>>
>>>>
>>>> You just programmed the table address to TVT and then you are releasing the
>>>> pages. It does not seem right, it will leave garbage in TVT. Also, I am
>>>> adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits
>>> >from there (I'll post v10 soon, you'll be in copy and you'll have to review
>>>> that ;) ).
>>>>
>>>
>>> I assume you're talking about TVE. I don't understand how garbage will be left
>>> in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE
>>> with zero'ed "tce_table_size". The pages previously allocated for TCE table is
>>> released to buddy system, which can be allocated by somebody else (from buddy
>>> or slab).
>>
>> opal_pci_map_pe_dma_window() takes __pa(tbl_addr) which points to some memory
>> which is still allocated. This value goes to a table (which has 2 entries per
>> PE, one for 32bit DMA window and one for bypass/hugewindow) which PHB uses to
>> get the actual TCE table address. What is the name of this table? :) Anyway,
>> you write an address there and then you call free_pages() so after
>> free_pages(), the value in that TVE/TVT/whatever table is a garbage.
>>
>
> I don't look into your DDW code yet. Before we have DDW patchset, the bypass
> TVE (window) isn't supposed to have corresponding TCE table. I guess you might
> change the behaviour in your DDW patchset and I'll take a close look on that.
> For DMA32 window, which is the name of the table, the TVE is cleared by skiboot
> when having zero "tce_table_size" argument.
>
> 	opal_pci_map_pe_dma_window(phb->opal_id,
> 				   pe->pe_number,
> 				   pe->pe_number << 1,
> 				   1,
> 				   __pa(tbl_addr),
> 				   0,			<<<< "tce_table_size".
> 				   0x1000);


Then please, when you pass tce_table_size==0, also pass zero address/zero 
page size/zero levels, unless you have very good reason to pass non-zero 
values for these. What you have now is confusing - it looks like you are 
initializing the table - it is not obvious that "0" is the size and not 
some flags.

When people see this (which does the same thing, please correct me if I am 
wrong), they do not have questions what you are actually trying to do:

  	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
  				   pe->pe_number << 1, 0, 0, 0, 0);
Gavin Shan May 12, 2015, 1:25 a.m. UTC | #6
On Tue, May 12, 2015 at 10:53:29AM +1000, Alexey Kardashevskiy wrote:
>On 05/12/2015 10:03 AM, Gavin Shan wrote:
>>On Mon, May 11, 2015 at 05:02:08PM +1000, Alexey Kardashevskiy wrote:
>>>On 05/11/2015 04:25 PM, Gavin Shan wrote:
>>>>On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote:
>>>>>On 05/01/2015 04:02 PM, Gavin Shan wrote:
>>>>>>The original code doesn't support releasing PEs dynamically, meaning
>>>>>>that PE and the associated resources (IO, M32, M64 and DMA) can't
>>>>>>be released when unplugging a PCI adapter from one hotpluggable slot.
>>>>>>
>>>>>>The patch takes object oriented methodology, introducs reference
>>>>>>count to PE, which is initialized to 1 and increased with 1 when a
>>>>>>new PCI device joins the PE. Once the last PCI device leaves the
>>>>>>PE, the PE is going to be release together with its associated
>>>>>>(IO, M32, M64, DMA) resources.
>>>>>
>>>>>
>>>>>Too little commit log for non-trivial non-cut-n-paste 30KB patch...
>>>>>
>>>>
>>>>Ok. I'll add more details in next revision.
>>>>
>>>>>>
>>>>>>Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>>>>>---
>>>>>>  arch/powerpc/include/asm/pci-bridge.h     |   3 +
>>>>>>  arch/powerpc/kernel/pci-hotplug.c         |   5 +
>>>>>>  arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
>>>>>>  arch/powerpc/platforms/powernv/pci.h      |   4 +-
>>>>>>  4 files changed, 432 insertions(+), 238 deletions(-)
>>>>>>
>>>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>index 5367eb3..a6ad4b1 100644
>>>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>@@ -31,6 +31,9 @@ struct pci_controller_ops {
>>>>>>  	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
>>>>>>  	void		(*setup_bridge)(struct pci_bus *, unsigned long);
>>>>>>  	void		(*reset_secondary_bus)(struct pci_dev *dev);
>>>>>>+
>>>>>>+	/* Called when PCI device is released */
>>>>>>+	void		(*release_device)(struct pci_dev *);
>>>>>>  };
>>>>>>
>>>>>>  /*
>>>>>>diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
>>>>>>index 7ed85a6..0040343 100644
>>>>>>--- a/arch/powerpc/kernel/pci-hotplug.c
>>>>>>+++ b/arch/powerpc/kernel/pci-hotplug.c
>>>>>>@@ -29,6 +29,11 @@
>>>>>>   */
>>>>>>  void pcibios_release_device(struct pci_dev *dev)
>>>>>>  {
>>>>>>+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>>>>>+
>>>>>>+	if (hose->controller_ops.release_device)
>>>>>>+		hose->controller_ops.release_device(dev);
>>>>>>+
>>>>>>  	eeh_remove_device(dev);
>>>>>>  }
>>>>>>
>>>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>>index 910fb67..ef8c216 100644
>>>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>>@@ -12,6 +12,8 @@
>>>>>>  #undef DEBUG
>>>>>>
>>>>>>  #include <linux/kernel.h>
>>>>>>+#include <linux/atomic.h>
>>>>>>+#include <linux/kref.h>
>>>>>>  #include <linux/pci.h>
>>>>>>  #include <linux/crash_dump.h>
>>>>>>  #include <linux/debugfs.h>
>>>>>>@@ -47,6 +49,8 @@
>>>>>>  /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>>>>>>  #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
>>>>>>
>>>>>>+static void pnv_ioda_release_pe(struct kref *kref);
>>>>>>+
>>>>>>  static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>>>>>>  			    const char *fmt, ...)
>>>>>>  {
>>>>>>@@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>>>>>>  		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>>>>>>  }
>>>>>>
>>>>>>-static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>>>>>>+static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>>>>>  {
>>>>>>-	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>>>>>-		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>>>>>-			__func__, pe_no, phb->hose->global_number);
>>>>>>+	if (!pe)
>>>>>>+		return;
>>>>>>+
>>>>>>+	kref_get(&pe->kref);
>>>>>>+}
>>>>>>+
>>>>>>+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>>>>>+{
>>>>>>+	unsigned int count;
>>>>>>+
>>>>>>+	if (!pe)
>>>>>>  		return;
>>>>>>+
>>>>>>+	/*
>>>>>>+	 * The count is initialized to 1 and increased with 1 when
>>>>>>+	 * a new PCI device is bound with the PE. Once the last PCI
>>>>>>+	 * device is leaving from the PE, the PE is going to be
>>>>>>+	 * released.
>>>>>>+	 */
>>>>>>+	count = atomic_read(&pe->kref.refcount);
>>>>>>+	if (count == 2)
>>>>>>+		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
>>>>>>+	else
>>>>>>+		kref_put(&pe->kref, pnv_ioda_release_pe);
>>>>>
>>>>>
>>>>>What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?
>>>>>
>>>>
>>>>Yeah, that would have problem. But it shouldn't happen because the
>>>>PCI devices are joining the parent PE# in strictly serialized mode.
>>>>Same thing happens when detaching PCI devices from its parent PE.
>>>
>>>
>>>oookay. Another thing then - why is this kref counter initialized to 1?
>>>It would make sense if you did something special when the counter becomes 1
>>>after decrement but you do not.
>>>
>>>Also, this kref thing makes sense if you do kref_put() in multiple places and
>>>do not know which one will be the last one so you pass the callback to all of
>>>them. Here you do kref_put/sub in one place and you read the counter - so you
>>>can call pnv_ioda_release_pe() directly. And it feels like a simple atomic_t
>>>would do the job just fine. If you still feel that the counter should start
>>>from 1, there are atomic_dec_if_positive() and atomic_inc_not_zero() and
>>>others.
>>>
>>
>>It's good question actually. The counter is initialized to 1 when the PE
>>is reserved because of M64 requirement or allocated for non-M64 case. If
>>we reserve or allocate PE#, there is one thing for sure: the PCI bus has
>>one PCI device (including PCI bridge) at least. After the PE# is reserved
>>or allocated, the PCI device joins the PE with the result of increasing
>>the counter with 1. It means the counter is 2 when PE contains one PCI
>>device, and 3 when there're 2 devices. One reason for this design is that
>>we just need decrease the counter if we have to release this PE in the
>>window between PE reservation/allocation and first PCI device joins. I
>>think you're correct that we can call pnv_ioda_release_pe() in this window.
>>In this way, the counter is always reflecting the number of PCI devices
>>the PE contains.
>
>
>Good :) I believe it was something different 2-3 versions ago and evolved to
>this so you do not notice it straight away :)
>

Thanks :)

>>
>>>>>>+}
>>>>>>+
>>>>>>+static void pnv_pci_release_device(struct pci_dev *pdev)
>>>>>>+{
>>>>>>+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>>>>>+	struct pnv_phb *phb = hose->private_data;
>>>>>>+	struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>>>+	struct pnv_ioda_pe *pe;
>>>>>>+
>>>>>>+	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>>>+		pe = &phb->ioda.pe_array[pdn->pe_number];
>>>>>>+		pnv_ioda_pe_put(pe);
>>>>>>+		pdn->pe_number = IODA_INVALID_PE;
>>>>>>  	}
>>>>>>+}
>>>>>>
>>>>>>-	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
>>>>>>-		pr_warn("%s: PE %d was assigned on PHB#%x\n",
>>>>>>-			__func__, pe_no, phb->hose->global_number);
>>>>>>+static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>>>>>+{
>>>>>>+	struct pnv_phb *phb = pe->phb;
>>>>>>+	int index, count;
>>>>>>+	unsigned long tbl_addr, tbl_size;
>>>>>>+
>>>>>>+	/* No DMA capability for slave PEs */
>>>>>>+	if (pe->flags & PNV_IODA_PE_SLAVE)
>>>>>>+		return;
>>>>>>+
>>>>>>+	/* Bypass DMA window */
>>>>>>+	if (phb->type == PNV_PHB_IODA2 &&
>>>>>>+	    pe->tce_bypass_enabled &&
>>>>>>+	    pe->tce32_table &&
>>>>>>+	    pe->tce32_table->set_bypass)
>>>>>>+		pe->tce32_table->set_bypass(pe->tce32_table, false);
>>>>>>+
>>>>>>+	/* 32-bits DMA window */
>>>>>>+	count = pe->tce32_seg_end - pe->tce32_seg_start;
>>>>>>+	tbl_addr = pe->tce32_table->it_base;
>>>>>>+	if (!count)
>>>>>>  		return;
>>>>>>+
>>>>>>+	/* Free IOMMU table */
>>>>>>+	iommu_free_table(pe->tce32_table,
>>>>>>+			 of_node_full_name(phb->hose->dn));
>>>>>>+
>>>>>>+	/* Deconfigure TCE table */
>>>>>>+	switch (phb->type) {
>>>>>>+	case PNV_PHB_IODA1:
>>>>>>+		for (index = 0; index < count; index++)
>>>>>>+			opal_pci_map_pe_dma_window(phb->opal_id,
>>>>>>+						   pe->pe_number,
>>>>>>+						   pe->tce32_seg_start + index,
>>>>>>+						   1,
>>>>>>+						   __pa(tbl_addr) +
>>>>>>+						   index * TCE32_TABLE_SIZE,
>>>>>>+						   0,
>>>>>>+						   0x1000);
>>>>>>+		bitmap_clear(phb->ioda.tce32_segmap,
>>>>>>+			     pe->tce32_seg_start,
>>>>>>+			     count);
>>>>>>+		tbl_size = TCE32_TABLE_SIZE * count;
>>>>>>+		break;
>>>>>>+	case PNV_PHB_IODA2:
>>>>>>+		opal_pci_map_pe_dma_window(phb->opal_id,
>>>>>>+					   pe->pe_number,
>>>>>>+					   pe->pe_number << 1,
>>>>>>+					   1,
>>>>>>+					   __pa(tbl_addr),
>>>>>>+					   0,
>>>>>>+					   0x1000);
>>>>>>+		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
>>>>>>+		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
>>>>>>+		break;
>>>>>>+	default:
>>>>>>+		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
>>>>>>+		return;
>>>>>>+	}
>>>>>>+
>>>>>>+	/* Free memory of IOMMU table */
>>>>>>+	free_pages(tbl_addr, get_order(tbl_size));
>>>>>
>>>>>
>>>>>You just programmed the table address to TVT and then you are releasing the
>>>>>pages. It does not seem right, it will leave garbage in TVT. Also, I am
>>>>>adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits
>>>>>from there (I'll post v10 soon, you'll be in copy and you'll have to review
>>>>>that ;) ).
>>>>>
>>>>
>>>>I assume you're talking about TVE. I don't understand how garbage will be left
>>>>in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE
>>>>with zero'ed "tce_table_size". The pages previously allocated for TCE table is
>>>>released to buddy system, which can be allocated by somebody else (from buddy
>>>>or slab).
>>>
>>>opal_pci_map_pe_dma_window() takes __pa(tbl_addr) which points to some memory
>>>which is still allocated. This value goes to a table (which has 2 entries per
>>>PE, one for 32bit DMA window and one for bypass/hugewindow) which PHB uses to
>>>get the actual TCE table address. What is the name of this table? :) Anyway,
>>>you write an address there and then you call free_pages() so after
>>>free_pages(), the value in that TVE/TVT/whatever table is a garbage.
>>>
>>
>>I don't look into your DDW code yet. Before we have DDW patchset, the bypass
>>TVE (window) isn't supposed to have corresponding TCE table. I guess you might
>>change the behaviour in your DDW patchset and I'll take a close look on that.
>>For DMA32 window, which is the name of the table, the TVE is cleared by skiboot
>>when having zero "tce_table_size" argument.
>>
>>	opal_pci_map_pe_dma_window(phb->opal_id,
>>				   pe->pe_number,
>>				   pe->pe_number << 1,
>>				   1,
>>				   __pa(tbl_addr),
>>				   0,			<<<< "tce_table_size".
>>				   0x1000);
>
>
>Then please, when you pass tce_table_size==0, also pass zero address/zero
>page size/zero levels, unless you have very good reason to pass non-zero
>values for these. What you have now is confusing - it looks like you are
>initializing the table - it is not obvious that "0" is the size and not some
>flags.
>
>When people see this (which does the same thing, please correct me if I am
>wrong), they do not have questions what you are actually trying to do:
>
> 	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
> 				   pe->pe_number << 1, 0, 0, 0, 0);
>

Sure, with more zero'ed parameters to the OPAL call, the purpose will be
more clear. I also check the skiboot implementation of this function, it
should work. I'll use the code as you're suggesting.

Thanks,
Gavin
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 5367eb3..a6ad4b1 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -31,6 +31,9 @@  struct pci_controller_ops {
 	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
 	void		(*setup_bridge)(struct pci_bus *, unsigned long);
 	void		(*reset_secondary_bus)(struct pci_dev *dev);
+
+	/* Called when PCI device is released */
+	void		(*release_device)(struct pci_dev *);
 };
 
 /*
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 7ed85a6..0040343 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -29,6 +29,11 @@ 
  */
 void pcibios_release_device(struct pci_dev *dev)
 {
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+
+	if (hose->controller_ops.release_device)
+		hose->controller_ops.release_device(dev);
+
 	eeh_remove_device(dev);
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 910fb67..ef8c216 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -12,6 +12,8 @@ 
 #undef DEBUG
 
 #include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/kref.h>
 #include <linux/pci.h>
 #include <linux/crash_dump.h>
 #include <linux/debugfs.h>
@@ -47,6 +49,8 @@ 
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
 
+static void pnv_ioda_release_pe(struct kref *kref);
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 			    const char *fmt, ...)
 {
@@ -123,25 +127,400 @@  static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
 		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
 }
 
-static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
+static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
 {
-	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
-		pr_warn("%s: Invalid PE %d on PHB#%x\n",
-			__func__, pe_no, phb->hose->global_number);
+	if (!pe)
+		return;
+
+	kref_get(&pe->kref);
+}
+
+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
+{
+	unsigned int count;
+
+	if (!pe)
 		return;
+
+	/*
+	 * The count is initialized to 1 and increased with 1 when
+	 * a new PCI device is bound with the PE. Once the last PCI
+	 * device is leaving from the PE, the PE is going to be
+	 * released.
+	 */
+	count = atomic_read(&pe->kref.refcount);
+	if (count == 2)
+		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
+	else
+		kref_put(&pe->kref, pnv_ioda_release_pe);
+}
+
+static void pnv_pci_release_device(struct pci_dev *pdev)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pnv_ioda_pe *pe;
+
+	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+		pe = &phb->ioda.pe_array[pdn->pe_number];
+		pnv_ioda_pe_put(pe);
+		pdn->pe_number = IODA_INVALID_PE;
 	}
+}
 
-	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
-		pr_warn("%s: PE %d was assigned on PHB#%x\n",
-			__func__, pe_no, phb->hose->global_number);
+static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+	int index, count;
+	unsigned long tbl_addr, tbl_size;
+
+	/* No DMA capability for slave PEs */
+	if (pe->flags & PNV_IODA_PE_SLAVE)
+		return;
+
+	/* Bypass DMA window */
+	if (phb->type == PNV_PHB_IODA2 &&
+	    pe->tce_bypass_enabled &&
+	    pe->tce32_table &&
+	    pe->tce32_table->set_bypass)
+		pe->tce32_table->set_bypass(pe->tce32_table, false);
+
+	/* 32-bits DMA window */
+	count = pe->tce32_seg_end - pe->tce32_seg_start;
+	tbl_addr = pe->tce32_table->it_base;
+	if (!count)
 		return;
+
+	/* Free IOMMU table */
+	iommu_free_table(pe->tce32_table,
+			 of_node_full_name(phb->hose->dn));
+
+	/* Deconfigure TCE table */
+	switch (phb->type) {
+	case PNV_PHB_IODA1:
+		for (index = 0; index < count; index++)
+			opal_pci_map_pe_dma_window(phb->opal_id,
+						   pe->pe_number,
+						   pe->tce32_seg_start + index,
+						   1,
+						   __pa(tbl_addr) +
+						   index * TCE32_TABLE_SIZE,
+						   0,
+						   0x1000);
+		bitmap_clear(phb->ioda.tce32_segmap,
+			     pe->tce32_seg_start,
+			     count);
+		tbl_size = TCE32_TABLE_SIZE * count;
+		break;
+	case PNV_PHB_IODA2:
+		opal_pci_map_pe_dma_window(phb->opal_id,
+					   pe->pe_number,
+					   pe->pe_number << 1,
+					   1,
+					   __pa(tbl_addr),
+					   0,
+					   0x1000);
+		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
+		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
+		break;
+	default:
+		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
+		return;
+	}
+
+	/* Free memory of IOMMU table */
+	free_pages(tbl_addr, get_order(tbl_size));
+	pe->tce32_table = NULL;
+	pe->tce32_seg_start = 0;
+	pe->tce32_seg_end = 0;
+}
+
+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+	unsigned long *segmap = NULL, *pe_segmap = NULL;
+	int i;
+	uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
+				     OPAL_M32_WINDOW_TYPE,
+				     OPAL_M64_WINDOW_TYPE };
+
+	for (win = 0; win < ARRAY_SIZE(win_type); win++) {
+		switch (win_type[win]) {
+		case OPAL_IO_WINDOW_TYPE:
+			segmap = phb->ioda.io_segmap;
+			pe_segmap = pe->io_segmap;
+			break;
+		case OPAL_M32_WINDOW_TYPE:
+			segmap = phb->ioda.m32_segmap;
+			pe_segmap = pe->m32_segmap;
+			break;
+		case OPAL_M64_WINDOW_TYPE:
+			segmap = phb->ioda.m64_segmap;
+			pe_segmap = pe->m64_segmap;
+			break;
+		}
+		i = -1;
+		while ((i = find_next_bit(pe_segmap,
+			phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
+			if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
+			    win_type[win] == OPAL_M32_WINDOW_TYPE)
+				opal_pci_map_pe_mmio_window(phb->opal_id,
+						phb->ioda.reserved_pe,
+						win_type[win], 0, i);
+			else if (phb->type == PNV_PHB_IODA1)
+				opal_pci_map_pe_mmio_window(phb->opal_id,
+						phb->ioda.reserved_pe,
+						win_type[win],
+						i / 8, i % 8);
+
+			clear_bit(i, pe_segmap);
+			clear_bit(i, segmap);
+		}
+	}
+}
+
+static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
+				  struct pnv_ioda_pe *parent,
+				  struct pnv_ioda_pe *child,
+				  bool is_add)
+{
+	const char *desc = is_add ? "adding" : "removing";
+	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
+			      OPAL_REMOVE_PE_FROM_DOMAIN;
+	struct pnv_ioda_pe *slave;
+	long rc;
+
+	/* Parent PE affects child PE */
+	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+				child->pe_number, op);
+	if (rc != OPAL_SUCCESS) {
+		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
+			rc, desc);
+		return -ENXIO;
+	}
+
+	if (!(child->flags & PNV_IODA_PE_MASTER))
+		return 0;
+
+	/* Compound case: parent PE affects slave PEs */
+	list_for_each_entry(slave, &child->slaves, list) {
+		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+					slave->pe_number, op);
+		if (rc != OPAL_SUCCESS) {
+			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
+				rc, desc);
+			return -ENXIO;
+		}
+	}
+
+	return 0;
+}
+
+static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
+{
+	struct pnv_phb *phb = pe->phb;
+	struct pnv_ioda_pe *slave;
+	struct pci_dev *pdev = NULL;
+	int ret;
+
+	/*
+	 * Clear PE frozen state. If it's master PE, we need
+	 * clear slave PE frozen state as well.
+	 */
+	opal_pci_eeh_freeze_clear(phb->opal_id,
+				  pe->pe_number,
+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+	if (pe->flags & PNV_IODA_PE_MASTER) {
+		list_for_each_entry(slave, &pe->slaves, list) {
+			opal_pci_eeh_freeze_clear(phb->opal_id,
+						  slave->pe_number,
+						  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+		}
+	}
+
+	/*
+	 * Associate PE in PELT. We need add the PE into the
+	 * corresponding PELT-V as well. Otherwise, the error
+	 * originated from the PE might contribute to other
+	 * PEs.
+	 */
+	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
+	if (ret)
+		return ret;
+
+	/* For compound PEs, any one affects all of them */
+	if (pe->flags & PNV_IODA_PE_MASTER) {
+		list_for_each_entry(slave, &pe->slaves, list) {
+			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
+			if (ret)
+				return ret;
+		}
+	}
+
+	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
+		pdev = pe->pbus->self;
+	else if (pe->flags & PNV_IODA_PE_DEV)
+		pdev = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		pdev = pe->parent_dev->bus->self;
+#endif /* CONFIG_PCI_IOV */
+
+	while (pdev) {
+		struct pci_dn *pdn = pci_get_pdn(pdev);
+		struct pnv_ioda_pe *parent;
+
+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+			parent = &phb->ioda.pe_array[pdn->pe_number];
+			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
+			if (ret)
+				return ret;
+		}
+
+		pdev = pdev->bus->self;
+	}
+
+	return 0;
+}
+
+static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+	struct pci_dev *parent;
+	uint8_t bcomp, dcomp, fcomp;
+	long rid_end, rid;
+	int64_t rc;
+
+	/* Tear down MVE */
+	if (phb->type == PNV_PHB_IODA1 &&
+	    pe->mve_number != -1) {
+		rc = opal_pci_set_mve(phb->opal_id,
+				      pe->mve_number,
+				      phb->ioda.reserved_pe);
+		if (rc != OPAL_SUCCESS)
+			pe_warn(pe, "Error %lld unmapping MVE#%d\n",
+				rc, pe->mve_number);
+		rc = opal_pci_set_mve_enable(phb->opal_id,
+					     pe->mve_number,
+					     OPAL_DISABLE_MVE);
+		if (rc != OPAL_SUCCESS)
+			pe_warn(pe, "Error %lld disabling MVE#%d\n",
+				rc, pe->mve_number);
+		pe->mve_number = -1;
+	}
+
+	/* Unmapping PELTV */
+	pnv_ioda_set_peltv(pe, false);
+
+	/* To unmap PELTM */
+	if (pe->pbus) {
+		int count;
+
+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+		parent = pe->pbus->self;
+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
+			count = pe->pbus->busn_res.end -
+				pe->pbus->busn_res.start + 1;
+		else
+			count = 1;
+
+		switch(count) {
+		case  1: bcomp = OpalPciBusAll;   break;
+		case  2: bcomp = OpalPciBus7Bits; break;
+		case  4: bcomp = OpalPciBus6Bits; break;
+		case  8: bcomp = OpalPciBus5Bits; break;
+		case 16: bcomp = OpalPciBus4Bits; break;
+		case 32: bcomp = OpalPciBus3Bits; break;
+		default:
+			/* Fail back to case of one bus */
+			pe_warn(pe, "Cannot support %d buses\n", count);
+			bcomp = OpalPciBusAll;
+		}
+		rid_end = pe->rid + (count << 8);
+	} else {
+#ifdef CONFIG_PCI_IOV
+		if (pe->flags & PNV_IODA_PE_VF)
+			parent = pe->parent_dev;
+		else
+#endif
+			parent = pe->pdev->bus->self;
+		bcomp = OpalPciBusAll;
+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+		rid_end = pe->rid + 1;
+	}
+
+	/* Clear RID mapping */
+	for (rid = pe->rid; rid < rid_end; rid++)
+		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
+
+	/* Unmapping PELTM */
+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
+	if (rc)
+		pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
+}
+
+static void pnv_ioda_release_pe(struct kref *kref)
+{
+	struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
+	struct pnv_ioda_pe *tmp, *slave;
+	struct pnv_phb *phb = pe->phb;
+
+	pnv_ioda_release_pe_dma(pe);
+	pnv_ioda_release_pe_seg(pe);
+	pnv_ioda_deconfigure_pe(pe);
+
+	/* Release slave PEs for compound PE */
+	if (pe->flags & PNV_IODA_PE_MASTER) {
+		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
+			pnv_ioda_pe_put(slave);
+	}
+
+	/* Remove the PE from various list. We need remove slave
+	 * PE from master's list.
+	 */
+	list_del(&pe->dma_link);
+	list_del(&pe->list);
+
+	/* Free PE number */
+	clear_bit(pe->pe_number, phb->ioda.pe_alloc);
+}
+
+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
+					    int pe_no)
+{
+	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
+
+	kref_init(&pe->kref);
+	pe->phb = phb;
+	pe->pe_number = pe_no;
+	INIT_LIST_HEAD(&pe->dma_link);
+	INIT_LIST_HEAD(&pe->list);
+
+	return pe;
+}
+
+static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
+					       int pe_no)
+{
+	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
+		pr_warn("%s: Invalid PE %d on PHB#%x\n",
+			__func__, pe_no, phb->hose->global_number);
+		return NULL;
 	}
 
-	phb->ioda.pe_array[pe_no].phb = phb;
-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
+	/*
+	 * Same PE might be reserved for multiple times, which
+	 * is out of problem actually.
+	 */
+	set_bit(pe_no, phb->ioda.pe_alloc);
+	return pnv_ioda_init_pe(phb, pe_no);
 }
 
-static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
 {
 	unsigned long pe_no;
 	unsigned long limit = phb->ioda.total_pe - 1;
@@ -154,20 +533,10 @@  static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
 			break;
 
 		if (--limit >= phb->ioda.total_pe)
-			return IODA_INVALID_PE;
+			return NULL;
 	} while(1);
 
-	phb->ioda.pe_array[pe_no].phb = phb;
-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
-	return pe_no;
-}
-
-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
-{
-	WARN_ON(phb->ioda.pe_array[pe].pdev);
-
-	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
-	clear_bit(pe, phb->ioda.pe_alloc);
+	return pnv_ioda_init_pe(phb, pe_no);
 }
 
 static int pnv_ioda1_init_m64(struct pnv_phb *phb)
@@ -382,8 +751,9 @@  static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
 	}
 }
 
-static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
-				struct pci_bus *bus, int all)
+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
+						struct pci_bus *bus,
+						int all)
 {
 	resource_size_t segsz = phb->ioda.m64_segsize;
 	struct pci_dev *pdev;
@@ -394,14 +764,14 @@  static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
 	int i;
 
 	if (!pnv_ioda_need_m64_pe(phb, bus))
-		return IODA_INVALID_PE;
+		return NULL;
 
         /* Allocate bitmap */
 	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
 	pe_bitsmap = kzalloc(size, GFP_KERNEL);
 	if (!pe_bitsmap) {
 		pr_warn("%s: Out of memory !\n", __func__);
-		return IODA_INVALID_PE;
+		return NULL;
 	}
 
 	/* The bridge's M64 window might be extended to PHB's M64
@@ -438,7 +808,7 @@  static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
 	/* No M64 window found ? */
 	if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
 		kfree(pe_bitsmap);
-		return IODA_INVALID_PE;
+		return NULL;
 	}
 
 	/* Figure out the master PE and put all slave PEs
@@ -491,7 +861,7 @@  static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
 	}
 
 	kfree(pe_bitsmap);
-	return master_pe->pe_number;
+	return master_pe;
 }
 
 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
@@ -695,7 +1065,7 @@  static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
  * but in the meantime, we need to protect them to avoid warnings
  */
 #ifdef CONFIG_PCI_MSI
-static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
+static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
 {
 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
 	struct pnv_phb *phb = hose->private_data;
@@ -709,191 +1079,6 @@  static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 }
 #endif /* CONFIG_PCI_MSI */
 
-static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
-				  struct pnv_ioda_pe *parent,
-				  struct pnv_ioda_pe *child,
-				  bool is_add)
-{
-	const char *desc = is_add ? "adding" : "removing";
-	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
-			      OPAL_REMOVE_PE_FROM_DOMAIN;
-	struct pnv_ioda_pe *slave;
-	long rc;
-
-	/* Parent PE affects child PE */
-	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
-				child->pe_number, op);
-	if (rc != OPAL_SUCCESS) {
-		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
-			rc, desc);
-		return -ENXIO;
-	}
-
-	if (!(child->flags & PNV_IODA_PE_MASTER))
-		return 0;
-
-	/* Compound case: parent PE affects slave PEs */
-	list_for_each_entry(slave, &child->slaves, list) {
-		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
-					slave->pe_number, op);
-		if (rc != OPAL_SUCCESS) {
-			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
-				rc, desc);
-			return -ENXIO;
-		}
-	}
-
-	return 0;
-}
-
-static int pnv_ioda_set_peltv(struct pnv_phb *phb,
-			      struct pnv_ioda_pe *pe,
-			      bool is_add)
-{
-	struct pnv_ioda_pe *slave;
-	struct pci_dev *pdev = NULL;
-	int ret;
-
-	/*
-	 * Clear PE frozen state. If it's master PE, we need
-	 * clear slave PE frozen state as well.
-	 */
-	if (is_add) {
-		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
-					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-		if (pe->flags & PNV_IODA_PE_MASTER) {
-			list_for_each_entry(slave, &pe->slaves, list)
-				opal_pci_eeh_freeze_clear(phb->opal_id,
-							  slave->pe_number,
-							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-		}
-	}
-
-	/*
-	 * Associate PE in PELT. We need add the PE into the
-	 * corresponding PELT-V as well. Otherwise, the error
-	 * originated from the PE might contribute to other
-	 * PEs.
-	 */
-	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
-	if (ret)
-		return ret;
-
-	/* For compound PEs, any one affects all of them */
-	if (pe->flags & PNV_IODA_PE_MASTER) {
-		list_for_each_entry(slave, &pe->slaves, list) {
-			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
-			if (ret)
-				return ret;
-		}
-	}
-
-	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
-		pdev = pe->pbus->self;
-	else if (pe->flags & PNV_IODA_PE_DEV)
-		pdev = pe->pdev->bus->self;
-#ifdef CONFIG_PCI_IOV
-	else if (pe->flags & PNV_IODA_PE_VF)
-		pdev = pe->parent_dev->bus->self;
-#endif /* CONFIG_PCI_IOV */
-	while (pdev) {
-		struct pci_dn *pdn = pci_get_pdn(pdev);
-		struct pnv_ioda_pe *parent;
-
-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
-			parent = &phb->ioda.pe_array[pdn->pe_number];
-			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
-			if (ret)
-				return ret;
-		}
-
-		pdev = pdev->bus->self;
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_PCI_IOV
-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
-{
-	struct pci_dev *parent;
-	uint8_t bcomp, dcomp, fcomp;
-	int64_t rc;
-	long rid_end, rid;
-
-	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
-	if (pe->pbus) {
-		int count;
-
-		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
-		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
-		parent = pe->pbus->self;
-		if (pe->flags & PNV_IODA_PE_BUS_ALL)
-			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
-		else
-			count = 1;
-
-		switch(count) {
-		case  1: bcomp = OpalPciBusAll;         break;
-		case  2: bcomp = OpalPciBus7Bits;       break;
-		case  4: bcomp = OpalPciBus6Bits;       break;
-		case  8: bcomp = OpalPciBus5Bits;       break;
-		case 16: bcomp = OpalPciBus4Bits;       break;
-		case 32: bcomp = OpalPciBus3Bits;       break;
-		default:
-			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
-			        count);
-			/* Do an exact match only */
-			bcomp = OpalPciBusAll;
-		}
-		rid_end = pe->rid + (count << 8);
-	} else {
-		if (pe->flags & PNV_IODA_PE_VF)
-			parent = pe->parent_dev;
-		else
-			parent = pe->pdev->bus->self;
-		bcomp = OpalPciBusAll;
-		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
-		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
-		rid_end = pe->rid + 1;
-	}
-
-	/* Clear the reverse map */
-	for (rid = pe->rid; rid < rid_end; rid++)
-		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
-
-	/* Release from all parents PELT-V */
-	while (parent) {
-		struct pci_dn *pdn = pci_get_pdn(parent);
-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
-			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
-						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
-			/* XXX What to do in case of error ? */
-		}
-		parent = parent->bus->self;
-	}
-
-	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
-				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-
-	/* Disassociate PE in PELT */
-	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
-				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
-	if (rc)
-		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
-	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
-			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
-	if (rc)
-		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
-
-	pe->pbus = NULL;
-	pe->pdev = NULL;
-	pe->parent_dev = NULL;
-
-	return 0;
-}
-#endif /* CONFIG_PCI_IOV */
-
 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
 	struct pci_dev *parent;
@@ -953,7 +1138,7 @@  static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 	}
 
 	/* Configure PELTV */
-	pnv_ioda_set_peltv(phb, pe, true);
+	pnv_ioda_set_peltv(pe, true);
 
 	/* Setup reverse map */
 	for (rid = pe->rid; rid < rid_end; rid++)
@@ -1207,6 +1392,8 @@  static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
 		if (pdn->pe_number != IODA_INVALID_PE)
 			continue;
 
+		/* Increase reference count of the parent PE */
+		pnv_ioda_pe_get(pe);
 		pdn->pe_number = pe->pe_number;
 		pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
@@ -1224,7 +1411,7 @@  static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 {
 	struct pci_controller *hose = pci_bus_to_host(bus);
 	struct pnv_phb *phb = hose->private_data;
-	struct pnv_ioda_pe *pe;
+	struct pnv_ioda_pe *pe = NULL;
 	int pe_num = IODA_INVALID_PE;
 
 	/* For partial hotplug case, the PE instance hasn't been destroyed
@@ -1240,24 +1427,24 @@  static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 	}
 
 	/* PE number for root bus should have been reserved */
-	if (pci_is_root_bus(bus))
-		pe_num = phb->ioda.root_pe_no;
+	if (pci_is_root_bus(bus) &&
+	    phb->ioda.root_pe_no != IODA_INVALID_PE)
+		pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
 
 	/* Check if PE is determined by M64 */
-	if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
-		pe_num = phb->pick_m64_pe(phb, bus, all);
+	if (!pe && phb->pick_m64_pe)
+		pe = phb->pick_m64_pe(phb, bus, all);
 
 	/* The PE number isn't pinned by M64 */
-	if (pe_num == IODA_INVALID_PE)
-		pe_num = pnv_ioda_alloc_pe(phb);
+	if (!pe)
+		pe = pnv_ioda_alloc_pe(phb);
 
-	if (pe_num == IODA_INVALID_PE) {
-		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
+	if (!pe) {
+		pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
 			__func__, pci_domain_nr(bus), bus->number);
 		return NULL;
 	}
 
-	pe = &phb->ioda.pe_array[pe_num];
 	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
 	pe->pbus = bus;
 	pe->pdev = NULL;
@@ -1274,14 +1461,12 @@  static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 
 	if (pnv_ioda_configure_pe(phb, pe)) {
 		/* XXX What do we do here ? */
-		if (pe_num)
-			pnv_ioda_free_pe(phb, pe_num);
-		pe->pbus = NULL;
+		pnv_ioda_pe_put(pe);
 		return NULL;
 	}
 
 	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
-			GFP_KERNEL, hose->node);
+				       GFP_KERNEL, hose->node);
 	pe->tce32_table->data = pe;
 
 	/* Associate it with all child devices */
@@ -1521,9 +1706,9 @@  static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		list_del(&pe->list);
 		mutex_unlock(&phb->ioda.pe_list_mutex);
 
-		pnv_ioda_deconfigure_pe(phb, pe);
+		pnv_ioda_deconfigure_pe(pe);
 
-		pnv_ioda_free_pe(phb, pe->pe_number);
+		pnv_ioda_pe_put(pe);
 	}
 }
 
@@ -1601,9 +1786,7 @@  static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 		if (pnv_ioda_configure_pe(phb, pe)) {
 			/* XXX What do we do here ? */
-			if (pe_num)
-				pnv_ioda_free_pe(phb, pe_num);
-			pe->pdev = NULL;
+			pnv_ioda_pe_put(pe);
 			continue;
 		}
 
@@ -2263,7 +2446,7 @@  int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
 	struct pnv_ioda_pe *pe;
 	int rc;
 
-	pe = pnv_ioda_get_pe(dev);
+	pe = pnv_ioda_pci_dev_to_pe(dev);
 	if (!pe)
 		return -ENODEV;
 
@@ -2379,7 +2562,7 @@  int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
 	struct pnv_ioda_pe *pe;
 	int rc;
 
-	if (!(pe = pnv_ioda_get_pe(dev)))
+	if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
 		return -ENODEV;
 
 	/* Assign XIVE to PE */
@@ -2401,7 +2584,7 @@  static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
 				  unsigned int hwirq, unsigned int virq,
 				  unsigned int is_64, struct msi_msg *msg)
 {
-	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+	struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
 	unsigned int xive_num = hwirq - phb->msi_base;
 	__be32 data;
 	int rc;
@@ -3065,6 +3248,7 @@  static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
 	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
 	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
+	pnv_pci_controller_ops.release_device = pnv_pci_release_device;
 	hose->controller_ops = pnv_pci_controller_ops;
 
 #ifdef CONFIG_PCI_IOV
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 1bea3a8..8b10f01 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -28,6 +28,7 @@  enum pnv_phb_model {
 /* Data associated with a PE, including IOMMU tracking etc.. */
 struct pnv_phb;
 struct pnv_ioda_pe {
+	struct kref		kref;
 	unsigned long		flags;
 	struct pnv_phb		*phb;
 
@@ -120,7 +121,8 @@  struct pnv_phb {
 	void (*shutdown)(struct pnv_phb *phb);
 	int (*init_m64)(struct pnv_phb *phb);
 	void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
-	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
+	struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
+					   struct pci_bus *bus, int all);
 	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
 	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
 	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);