diff mbox

[v12,17/21] powerpc/powernv: Shift VF resource with an offset

Message ID 20150224083457.32124.55534.stgit@bhelgaas-glaptop2.roam.corp.google.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Bjorn Helgaas Feb. 24, 2015, 8:34 a.m. UTC
From: Wei Yang <weiyang@linux.vnet.ibm.com>

On PowerNV platform, resource position in M64 implies the PE# the resource
belongs to.  In some cases, adjustment of a resource is necessary to locate
it to a correct position in M64.

Add pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address
according to an offset.

[bhelgaas: rework loops, rework overlap check, index resource[]
conventionally, remove pci_regs.h include, squashed with next patch]
Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/powerpc/include/asm/pci-bridge.h     |    4 
 arch/powerpc/kernel/pci_dn.c              |   11 +
 arch/powerpc/platforms/powernv/pci-ioda.c |  520 ++++++++++++++++++++++++++++-
 arch/powerpc/platforms/powernv/pci.c      |   18 +
 arch/powerpc/platforms/powernv/pci.h      |    7 
 5 files changed, 543 insertions(+), 17 deletions(-)

Comments

Bjorn Helgaas Feb. 24, 2015, 9 a.m. UTC | #1
On Tue, Feb 24, 2015 at 02:34:57AM -0600, Bjorn Helgaas wrote:
> From: Wei Yang <weiyang@linux.vnet.ibm.com>
> 
> On PowerNV platform, resource position in M64 implies the PE# the resource
> belongs to.  In some cases, adjustment of a resource is necessary to locate
> it to a correct position in M64.
> 
> Add pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address
> according to an offset.
> 
> [bhelgaas: rework loops, rework overlap check, index resource[]
> conventionally, remove pci_regs.h include, squashed with next patch]
> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>

...

> +#ifdef CONFIG_PCI_IOV
> +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
> +{
> +	struct pci_dn *pdn = pci_get_pdn(dev);
> +	int i;
> +	struct resource *res, res2;
> +	resource_size_t size;
> +	u16 vf_num;
> +
> +	if (!dev->is_physfn)
> +		return -EINVAL;
> +
> +	/*
> +	 * "offset" is in VFs.  The M64 windows are sized so that when they
> +	 * are segmented, each segment is the same size as the IOV BAR.
> +	 * Each segment is in a separate PE, and the high order bits of the
> +	 * address are the PE number.  Therefore, each VF's BAR is in a
> +	 * separate PE, and changing the IOV BAR start address changes the
> +	 * range of PEs the VFs are in.
> +	 */
> +	vf_num = pdn->vf_pes;
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
> +		if (!res->flags || !res->parent)
> +			continue;
> +
> +		if (!pnv_pci_is_mem_pref_64(res->flags))
> +			continue;
> +
> +		/*
> +		 * The actual IOV BAR range is determined by the start address
> +		 * and the actual size for vf_num VFs BAR.  This check is to
> +		 * make sure that after shifting, the range will not overlap
> +		 * with another device.
> +		 */
> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> +		res2.flags = res->flags;
> +		res2.start = res->start + (size * offset);
> +		res2.end = res2.start + (size * vf_num) - 1;
> +
> +		if (res2.end > res->end) {
> +			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
> +				i, &res2, res, vf_num, offset);
> +			return -EBUSY;
> +		}
> +	}
> +
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
> +		if (!res->flags || !res->parent)
> +			continue;
> +
> +		if (!pnv_pci_is_mem_pref_64(res->flags))
> +			continue;
> +
> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> +		res2 = *res;
> +		res->start += size * offset;

I'm still not happy about this fiddling with res->start.

Increasing res->start means that in principle, the "size * offset" bytes
that we just removed from res are now available for allocation to somebody
else.  I don't think we *will* give that space to anything else because of
the alignment restrictions you're enforcing, but "res" now doesn't
correctly describe the real resource map.

Would you be able to just update the BAR here while leaving the struct
resource alone?  In that case, it would look a little funny that lspci
would show a BAR value in the middle of the region in /proc/iomem, but
the /proc/iomem region would be more correct.

> +
> +		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
> +			 i, &res2, res, vf_num, offset);
> +		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
> +	}
> +	pdn->max_vfs -= offset;
> +	return 0;
> +}
> +#endif /* CONFIG_PCI_IOV */
Bjorn Helgaas Feb. 24, 2015, 9:03 a.m. UTC | #2
On Tue, Feb 24, 2015 at 02:34:57AM -0600, Bjorn Helgaas wrote:
> From: Wei Yang <weiyang@linux.vnet.ibm.com>
> 
> On PowerNV platform, resource position in M64 implies the PE# the resource
> belongs to.  In some cases, adjustment of a resource is necessary to locate
> it to a correct position in M64.
> 
> Add pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address
> according to an offset.

I think I squashed the "powerpc/powernv: Allocate VF PE" into this one, but
I didn't merge the changelog into this one.  Those two patches don't seem
super related to each other, but I think there really was some dependency.

> [bhelgaas: rework loops, rework overlap check, index resource[]
> conventionally, remove pci_regs.h include, squashed with next patch]
> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> ---
>  arch/powerpc/include/asm/pci-bridge.h     |    4 
>  arch/powerpc/kernel/pci_dn.c              |   11 +
>  arch/powerpc/platforms/powernv/pci-ioda.c |  520 ++++++++++++++++++++++++++++-
>  arch/powerpc/platforms/powernv/pci.c      |   18 +
>  arch/powerpc/platforms/powernv/pci.h      |    7 
>  5 files changed, 543 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index de11de7d4547..011340df8583 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -177,6 +177,10 @@ struct pci_dn {
>  	int	pe_number;
>  #ifdef CONFIG_PCI_IOV
>  	u16     max_vfs;		/* number of VFs IOV BAR expanded */
> +	u16     vf_pes;			/* VF PE# under this PF */
> +	int     offset;			/* PE# for the first VF PE */
> +#define IODA_INVALID_M64        (-1)
> +	int     m64_wins[PCI_SRIOV_NUM_BARS];
>  #endif /* CONFIG_PCI_IOV */
>  #endif
>  	struct list_head child_list;
> diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
> index f3a1a81d112f..5faf7ca45434 100644
> --- a/arch/powerpc/kernel/pci_dn.c
> +++ b/arch/powerpc/kernel/pci_dn.c
> @@ -217,6 +217,17 @@ void remove_dev_pci_info(struct pci_dev *pdev)
>  	struct pci_dn *pdn, *tmp;
>  	int i;
>  
> +	/*
> +	 * VF and VF PE are created/released dynamically, so we need to
> +	 * bind/unbind them.  Otherwise the VF and VF PE would be mismatched
> +	 * when re-enabling SR-IOV.
> +	 */
> +	if (pdev->is_virtfn) {
> +		pdn = pci_get_pdn(pdev);
> +		pdn->pe_number = IODA_INVALID_PE;
> +		return;
> +	}
> +
>  	/* Only support IOV PF for now */
>  	if (!pdev->is_physfn)
>  		return;
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 6a86690bb8de..a3c2fbe35fc8 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -44,6 +44,9 @@
>  #include "powernv.h"
>  #include "pci.h"
>  
> +/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
> +#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
> +
>  static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>  			    const char *fmt, ...)
>  {
> @@ -56,11 +59,18 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>  	vaf.fmt = fmt;
>  	vaf.va = &args;
>  
> -	if (pe->pdev)
> +	if (pe->flags & PNV_IODA_PE_DEV)
>  		strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
> -	else
> +	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
>  		sprintf(pfix, "%04x:%02x     ",
>  			pci_domain_nr(pe->pbus), pe->pbus->number);
> +#ifdef CONFIG_PCI_IOV
> +	else if (pe->flags & PNV_IODA_PE_VF)
> +		sprintf(pfix, "%04x:%02x:%2x.%d",
> +			pci_domain_nr(pe->parent_dev->bus),
> +			(pe->rid & 0xff00) >> 8,
> +			PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
> +#endif /* CONFIG_PCI_IOV*/
>  
>  	printk("%spci %s: [PE# %.3d] %pV",
>  	       level, pfix, pe->pe_number, &vaf);
> @@ -591,7 +601,7 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>  			      bool is_add)
>  {
>  	struct pnv_ioda_pe *slave;
> -	struct pci_dev *pdev;
> +	struct pci_dev *pdev = NULL;
>  	int ret;
>  
>  	/*
> @@ -630,8 +640,12 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>  
>  	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>  		pdev = pe->pbus->self;
> -	else
> +	else if (pe->flags & PNV_IODA_PE_DEV)
>  		pdev = pe->pdev->bus->self;
> +#ifdef CONFIG_PCI_IOV
> +	else if (pe->flags & PNV_IODA_PE_VF)
> +		pdev = pe->parent_dev->bus->self;
> +#endif /* CONFIG_PCI_IOV */
>  	while (pdev) {
>  		struct pci_dn *pdn = pci_get_pdn(pdev);
>  		struct pnv_ioda_pe *parent;
> @@ -649,6 +663,87 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_PCI_IOV
> +static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> +{
> +	struct pci_dev *parent;
> +	uint8_t bcomp, dcomp, fcomp;
> +	int64_t rc;
> +	long rid_end, rid;
> +
> +	/* Currently, we just deconfigure VF PE. Bus PE will always be there. */
> +	if (pe->pbus) {
> +		int count;
> +
> +		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
> +		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
> +		parent = pe->pbus->self;
> +		if (pe->flags & PNV_IODA_PE_BUS_ALL)
> +			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
> +		else
> +			count = 1;
> +
> +		switch(count) {
> +		case  1: bcomp = OpalPciBusAll;         break;
> +		case  2: bcomp = OpalPciBus7Bits;       break;
> +		case  4: bcomp = OpalPciBus6Bits;       break;
> +		case  8: bcomp = OpalPciBus5Bits;       break;
> +		case 16: bcomp = OpalPciBus4Bits;       break;
> +		case 32: bcomp = OpalPciBus3Bits;       break;
> +		default:
> +			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
> +			        count);
> +			/* Do an exact match only */
> +			bcomp = OpalPciBusAll;
> +		}
> +		rid_end = pe->rid + (count << 8);
> +	} else {
> +		if (pe->flags & PNV_IODA_PE_VF)
> +			parent = pe->parent_dev;
> +		else
> +			parent = pe->pdev->bus->self;
> +		bcomp = OpalPciBusAll;
> +		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
> +		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
> +		rid_end = pe->rid + 1;
> +	}
> +
> +	/* Clear the reverse map */
> +	for (rid = pe->rid; rid < rid_end; rid++)
> +		phb->ioda.pe_rmap[rid] = 0;
> +
> +	/* Release from all parents PELT-V */
> +	while (parent) {
> +		struct pci_dn *pdn = pci_get_pdn(parent);
> +		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
> +			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
> +						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
> +			/* XXX What to do in case of error ? */
> +		}
> +		parent = parent->bus->self;
> +	}
> +
> +	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
> +				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> +
> +	/* Disassociate PE in PELT */
> +	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
> +				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
> +	if (rc)
> +		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
> +	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
> +			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
> +	if (rc)
> +		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
> +
> +	pe->pbus = NULL;
> +	pe->pdev = NULL;
> +	pe->parent_dev = NULL;
> +
> +	return 0;
> +}
> +#endif /* CONFIG_PCI_IOV */
> +
>  static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>  {
>  	struct pci_dev *parent;
> @@ -675,15 +770,19 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>  		case 16: bcomp = OpalPciBus4Bits;	break;
>  		case 32: bcomp = OpalPciBus3Bits;	break;
>  		default:
> -			pr_err("%s: Number of subordinate busses %d"
> -			       " unsupported\n",
> -			       pci_name(pe->pbus->self), count);
> +			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
> +			        count);
>  			/* Do an exact match only */
>  			bcomp = OpalPciBusAll;
>  		}
>  		rid_end = pe->rid + (count << 8);
>  	} else {
> -		parent = pe->pdev->bus->self;
> +#ifdef CONFIG_PCI_IOV
> +		if (pe->flags & PNV_IODA_PE_VF)
> +			parent = pe->parent_dev;
> +		else
> +#endif /* CONFIG_PCI_IOV */
> +			parent = pe->pdev->bus->self;
>  		bcomp = OpalPciBusAll;
>  		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>  		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
> @@ -774,6 +873,74 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
>  	return 10;
>  }
>  
> +#ifdef CONFIG_PCI_IOV
> +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
> +{
> +	struct pci_dn *pdn = pci_get_pdn(dev);
> +	int i;
> +	struct resource *res, res2;
> +	resource_size_t size;
> +	u16 vf_num;
> +
> +	if (!dev->is_physfn)
> +		return -EINVAL;
> +
> +	/*
> +	 * "offset" is in VFs.  The M64 windows are sized so that when they
> +	 * are segmented, each segment is the same size as the IOV BAR.
> +	 * Each segment is in a separate PE, and the high order bits of the
> +	 * address are the PE number.  Therefore, each VF's BAR is in a
> +	 * separate PE, and changing the IOV BAR start address changes the
> +	 * range of PEs the VFs are in.
> +	 */
> +	vf_num = pdn->vf_pes;
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
> +		if (!res->flags || !res->parent)
> +			continue;
> +
> +		if (!pnv_pci_is_mem_pref_64(res->flags))
> +			continue;
> +
> +		/*
> +		 * The actual IOV BAR range is determined by the start address
> +		 * and the actual size for vf_num VFs BAR.  This check is to
> +		 * make sure that after shifting, the range will not overlap
> +		 * with another device.
> +		 */
> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> +		res2.flags = res->flags;
> +		res2.start = res->start + (size * offset);
> +		res2.end = res2.start + (size * vf_num) - 1;
> +
> +		if (res2.end > res->end) {
> +			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
> +				i, &res2, res, vf_num, offset);
> +			return -EBUSY;
> +		}
> +	}
> +
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
> +		if (!res->flags || !res->parent)
> +			continue;
> +
> +		if (!pnv_pci_is_mem_pref_64(res->flags))
> +			continue;
> +
> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> +		res2 = *res;
> +		res->start += size * offset;
> +
> +		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
> +			 i, &res2, res, vf_num, offset);
> +		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
> +	}
> +	pdn->max_vfs -= offset;
> +	return 0;
> +}
> +#endif /* CONFIG_PCI_IOV */
> +
>  #if 0
>  static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>  {
> @@ -979,8 +1146,312 @@ static void pnv_pci_ioda_setup_PEs(void)
>  }
>  
>  #ifdef CONFIG_PCI_IOV
> +static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
> +{
> +	struct pci_bus        *bus;
> +	struct pci_controller *hose;
> +	struct pnv_phb        *phb;
> +	struct pci_dn         *pdn;
> +	int                    i;
> +
> +	bus = pdev->bus;
> +	hose = pci_bus_to_host(bus);
> +	phb = hose->private_data;
> +	pdn = pci_get_pdn(pdev);
> +
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		if (pdn->m64_wins[i] == IODA_INVALID_M64)
> +			continue;
> +		opal_pci_phb_mmio_enable(phb->opal_id,
> +				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 0);
> +		clear_bit(pdn->m64_wins[i], &phb->ioda.m64_bar_alloc);
> +		pdn->m64_wins[i] = IODA_INVALID_M64;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pnv_pci_vf_assign_m64(struct pci_dev *pdev)
> +{
> +	struct pci_bus        *bus;
> +	struct pci_controller *hose;
> +	struct pnv_phb        *phb;
> +	struct pci_dn         *pdn;
> +	unsigned int           win;
> +	struct resource       *res;
> +	int                    i;
> +	int64_t                rc;
> +
> +	bus = pdev->bus;
> +	hose = pci_bus_to_host(bus);
> +	phb = hose->private_data;
> +	pdn = pci_get_pdn(pdev);
> +
> +	/* Initialize the m64_wins to IODA_INVALID_M64 */
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
> +		pdn->m64_wins[i] = IODA_INVALID_M64;
> +
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> +		if (!res->flags || !res->parent)
> +			continue;
> +
> +		if (!pnv_pci_is_mem_pref_64(res->flags))
> +			continue;
> +
> +		do {
> +			win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
> +					phb->ioda.m64_bar_idx + 1, 0);
> +
> +			if (win >= phb->ioda.m64_bar_idx + 1)
> +				goto m64_failed;
> +		} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
> +
> +		pdn->m64_wins[i] = win;
> +
> +		/* Map the M64 here */
> +		rc = opal_pci_set_phb_mem_window(phb->opal_id,
> +						 OPAL_M64_WINDOW_TYPE,
> +						 pdn->m64_wins[i],
> +						 res->start,
> +						 0, /* unused */
> +						 resource_size(res));
> +		if (rc != OPAL_SUCCESS) {
> +			dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
> +				win, rc);
> +			goto m64_failed;
> +		}
> +
> +		rc = opal_pci_phb_mmio_enable(phb->opal_id,
> +				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 1);
> +		if (rc != OPAL_SUCCESS) {
> +			dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
> +				win, rc);
> +			goto m64_failed;
> +		}
> +	}
> +	return 0;
> +
> +m64_failed:
> +	pnv_pci_vf_release_m64(pdev);
> +	return -EBUSY;
> +}
> +
> +static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
> +{
> +	struct pci_bus        *bus;
> +	struct pci_controller *hose;
> +	struct pnv_phb        *phb;
> +	struct iommu_table    *tbl;
> +	unsigned long         addr;
> +	int64_t               rc;
> +
> +	bus = dev->bus;
> +	hose = pci_bus_to_host(bus);
> +	phb = hose->private_data;
> +	tbl = pe->tce32_table;
> +	addr = tbl->it_base;
> +
> +	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
> +				   pe->pe_number << 1, 1, __pa(addr),
> +				   0, 0x1000);
> +
> +	rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
> +				        pe->pe_number,
> +				        (pe->pe_number << 1) + 1,
> +				        pe->tce_bypass_base,
> +				        0);
> +	if (rc)
> +		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
> +
> +	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> +	free_pages(addr, get_order(TCE32_TABLE_SIZE));
> +	pe->tce32_table = NULL;
> +}
> +
> +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> +{
> +	struct pci_bus        *bus;
> +	struct pci_controller *hose;
> +	struct pnv_phb        *phb;
> +	struct pnv_ioda_pe    *pe, *pe_n;
> +	struct pci_dn         *pdn;
> +
> +	bus = pdev->bus;
> +	hose = pci_bus_to_host(bus);
> +	phb = hose->private_data;
> +
> +	if (!pdev->is_physfn)
> +		return;
> +
> +	pdn = pci_get_pdn(pdev);
> +	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
> +		if (pe->parent_dev != pdev)
> +			continue;
> +
> +		pnv_pci_ioda2_release_dma_pe(pdev, pe);
> +
> +		/* Remove from list */
> +		mutex_lock(&phb->ioda.pe_list_mutex);
> +		list_del(&pe->list);
> +		mutex_unlock(&phb->ioda.pe_list_mutex);
> +
> +		pnv_ioda_deconfigure_pe(phb, pe);
> +
> +		pnv_ioda_free_pe(phb, pe->pe_number);
> +	}
> +}
> +
> +void pnv_pci_sriov_disable(struct pci_dev *pdev)
> +{
> +	struct pci_bus        *bus;
> +	struct pci_controller *hose;
> +	struct pnv_phb        *phb;
> +	struct pci_dn         *pdn;
> +	struct pci_sriov      *iov;
> +	u16 vf_num;
> +
> +	bus = pdev->bus;
> +	hose = pci_bus_to_host(bus);
> +	phb = hose->private_data;
> +	pdn = pci_get_pdn(pdev);
> +	iov = pdev->sriov;
> +	vf_num = pdn->vf_pes;
> +
> +	/* Release VF PEs */
> +	pnv_ioda_release_vf_PE(pdev);
> +
> +	if (phb->type == PNV_PHB_IODA2) {
> +		pnv_pci_vf_resource_shift(pdev, -pdn->offset);
> +
> +		/* Release M64 windows */
> +		pnv_pci_vf_release_m64(pdev);
> +
> +		/* Release PE numbers */
> +		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, vf_num);
> +		pdn->offset = 0;
> +	}
> +}
> +
> +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> +				       struct pnv_ioda_pe *pe);
> +static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 vf_num)
> +{
> +	struct pci_bus        *bus;
> +	struct pci_controller *hose;
> +	struct pnv_phb        *phb;
> +	struct pnv_ioda_pe    *pe;
> +	int                    pe_num;
> +	u16                    vf_index;
> +	struct pci_dn         *pdn;
> +
> +	bus = pdev->bus;
> +	hose = pci_bus_to_host(bus);
> +	phb = hose->private_data;
> +	pdn = pci_get_pdn(pdev);
> +
> +	if (!pdev->is_physfn)
> +		return;
> +
> +	/* Reserve PE for each VF */
> +	for (vf_index = 0; vf_index < vf_num; vf_index++) {
> +		pe_num = pdn->offset + vf_index;
> +
> +		pe = &phb->ioda.pe_array[pe_num];
> +		pe->pe_number = pe_num;
> +		pe->phb = phb;
> +		pe->flags = PNV_IODA_PE_VF;
> +		pe->pbus = NULL;
> +		pe->parent_dev = pdev;
> +		pe->tce32_seg = -1;
> +		pe->mve_number = -1;
> +		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
> +			   pci_iov_virtfn_devfn(pdev, vf_index);
> +
> +		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
> +			hose->global_number, pdev->bus->number,
> +			PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
> +			PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
> +
> +		if (pnv_ioda_configure_pe(phb, pe)) {
> +			/* XXX What do we do here ? */
> +			if (pe_num)
> +				pnv_ioda_free_pe(phb, pe_num);
> +			pe->pdev = NULL;
> +			continue;
> +		}
> +
> +		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
> +				GFP_KERNEL, hose->node);
> +		pe->tce32_table->data = pe;
> +
> +		/* Put PE to the list */
> +		mutex_lock(&phb->ioda.pe_list_mutex);
> +		list_add_tail(&pe->list, &phb->ioda.pe_list);
> +		mutex_unlock(&phb->ioda.pe_list_mutex);
> +
> +		pnv_pci_ioda2_setup_dma_pe(phb, pe);
> +	}
> +}
> +
> +int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 vf_num)
> +{
> +	struct pci_bus        *bus;
> +	struct pci_controller *hose;
> +	struct pnv_phb        *phb;
> +	struct pci_dn         *pdn;
> +	int                    ret;
> +
> +	bus = pdev->bus;
> +	hose = pci_bus_to_host(bus);
> +	phb = hose->private_data;
> +	pdn = pci_get_pdn(pdev);
> +
> +	if (phb->type == PNV_PHB_IODA2) {
> +		/* Calculate available PE for required VFs */
> +		mutex_lock(&phb->ioda.pe_alloc_mutex);
> +		pdn->offset = bitmap_find_next_zero_area(
> +			phb->ioda.pe_alloc, phb->ioda.total_pe,
> +			0, vf_num, 0);
> +		if (pdn->offset >= phb->ioda.total_pe) {
> +			mutex_unlock(&phb->ioda.pe_alloc_mutex);
> +			dev_info(&pdev->dev, "Failed to enable VF%d\n", vf_num);
> +			pdn->offset = 0;
> +			return -EBUSY;
> +		}
> +		bitmap_set(phb->ioda.pe_alloc, pdn->offset, vf_num);
> +		pdn->vf_pes = vf_num;
> +		mutex_unlock(&phb->ioda.pe_alloc_mutex);
> +
> +		/* Assign M64 window accordingly */
> +		ret = pnv_pci_vf_assign_m64(pdev);
> +		if (ret) {
> +			dev_info(&pdev->dev, "Not enough M64 window resources\n");
> +			goto m64_failed;
> +		}
> +
> +		/* Do some magic shift */
> +		ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
> +		if (ret)
> +			goto m64_failed;
> +	}
> +
> +	/* Setup VF PEs */
> +	pnv_ioda_setup_vf_PE(pdev, vf_num);
> +
> +	return 0;
> +
> +m64_failed:
> +	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, vf_num);
> +	pdn->offset = 0;
> +
> +	return ret;
> +}
> +
>  int pcibios_sriov_disable(struct pci_dev *pdev)
>  {
> +	pnv_pci_sriov_disable(pdev);
> +
>  	/* Release firmware data */
>  	remove_dev_pci_info(pdev);
>  	return 0;
> @@ -990,6 +1461,8 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 vf_num)
>  {
>  	/* Allocate firmware data */
>  	add_dev_pci_info(pdev);
> +
> +	pnv_pci_sriov_enable(pdev, vf_num);
>  	return 0;
>  }
>  #endif /* CONFIG_PCI_IOV */
> @@ -1186,9 +1659,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>  	int64_t rc;
>  	void *addr;
>  
> -	/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
> -#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
> -
>  	/* XXX FIXME: Handle 64-bit only DMA devices */
>  	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
>  	/* XXX FIXME: Allocate multi-level tables on PHB3 */
> @@ -1251,12 +1721,19 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>  				 TCE_PCI_SWINV_PAIR);
>  	}
>  	iommu_init_table(tbl, phb->hose->node);
> -	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
>  
> -	if (pe->pdev)
> +	if (pe->flags & PNV_IODA_PE_DEV) {
> +		iommu_register_group(tbl, phb->hose->global_number,
> +				     pe->pe_number);
>  		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
> -	else
> +	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
> +		iommu_register_group(tbl, phb->hose->global_number,
> +				     pe->pe_number);
>  		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
> +	} else if (pe->flags & PNV_IODA_PE_VF) {
> +		iommu_register_group(tbl, phb->hose->global_number,
> +				     pe->pe_number);
> +	}
>  
>  	return;
>   fail:
> @@ -1383,12 +1860,19 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
>  	}
>  	iommu_init_table(tbl, phb->hose->node);
> -	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
>  
> -	if (pe->pdev)
> +	if (pe->flags & PNV_IODA_PE_DEV) {
> +		iommu_register_group(tbl, phb->hose->global_number,
> +				     pe->pe_number);
>  		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
> -	else
> +	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
> +		iommu_register_group(tbl, phb->hose->global_number,
> +				     pe->pe_number);
>  		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
> +	} else if (pe->flags & PNV_IODA_PE_VF) {
> +		iommu_register_group(tbl, phb->hose->global_number,
> +				     pe->pe_number);
> +	}
>  
>  	/* Also create a bypass window */
>  	if (!pnv_iommu_bypass_disabled)
> @@ -2083,6 +2567,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>  	phb->hub_id = hub_id;
>  	phb->opal_id = phb_id;
>  	phb->type = ioda_type;
> +	mutex_init(&phb->ioda.pe_alloc_mutex);
>  
>  	/* Detect specific models for error handling */
>  	if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
> @@ -2142,6 +2627,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>  
>  	INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
>  	INIT_LIST_HEAD(&phb->ioda.pe_list);
> +	mutex_init(&phb->ioda.pe_list_mutex);
>  
>  	/* Calculate how many 32-bit TCE segments we have */
>  	phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 6c20d6e70383..a88f915fc603 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -714,6 +714,24 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
>  {
>  	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>  	struct pnv_phb *phb = hose->private_data;
> +#ifdef CONFIG_PCI_IOV
> +	struct pnv_ioda_pe *pe;
> +	struct pci_dn *pdn;
> +
> +	/* Fix the VF pdn PE number */
> +	if (pdev->is_virtfn) {
> +		pdn = pci_get_pdn(pdev);
> +		WARN_ON(pdn->pe_number != IODA_INVALID_PE);
> +		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
> +			if (pe->rid == ((pdev->bus->number << 8) |
> +			    (pdev->devfn & 0xff))) {
> +				pdn->pe_number = pe->pe_number;
> +				pe->pdev = pdev;
> +				break;
> +			}
> +		}
> +	}
> +#endif /* CONFIG_PCI_IOV */
>  
>  	/* If we have no phb structure, try to setup a fallback based on
>  	 * the device-tree (RTAS PCI for example)
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 731777734bca..39d42f2b7a15 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -23,6 +23,7 @@ enum pnv_phb_model {
>  #define PNV_IODA_PE_BUS_ALL	(1 << 2)	/* PE has subordinate buses	*/
>  #define PNV_IODA_PE_MASTER	(1 << 3)	/* Master PE in compound case	*/
>  #define PNV_IODA_PE_SLAVE	(1 << 4)	/* Slave PE in compound case	*/
> +#define PNV_IODA_PE_VF		(1 << 5)	/* PE for one VF 		*/
>  
>  /* Data associated with a PE, including IOMMU tracking etc.. */
>  struct pnv_phb;
> @@ -34,6 +35,9 @@ struct pnv_ioda_pe {
>  	 * entire bus (& children). In the former case, pdev
>  	 * is populated, in the later case, pbus is.
>  	 */
> +#ifdef CONFIG_PCI_IOV
> +	struct pci_dev          *parent_dev;
> +#endif
>  	struct pci_dev		*pdev;
>  	struct pci_bus		*pbus;
>  
> @@ -165,6 +169,8 @@ struct pnv_phb {
>  
>  			/* PE allocation bitmap */
>  			unsigned long		*pe_alloc;
> +			/* PE allocation mutex */
> +			struct mutex		pe_alloc_mutex;
>  
>  			/* M32 & IO segment maps */
>  			unsigned int		*m32_segmap;
> @@ -179,6 +185,7 @@ struct pnv_phb {
>  			 * on the sequence of creation
>  			 */
>  			struct list_head	pe_list;
> +			struct mutex            pe_list_mutex;
>  
>  			/* Reverse map of PEs, will have to extend if
>  			 * we are to support more than 256 PEs, indexed
>
Bjorn Helgaas Feb. 24, 2015, 5:10 p.m. UTC | #3
On Tue, Feb 24, 2015 at 3:00 AM, Bjorn Helgaas <bhelgaas@google.com> wrote:
> On Tue, Feb 24, 2015 at 02:34:57AM -0600, Bjorn Helgaas wrote:
>> From: Wei Yang <weiyang@linux.vnet.ibm.com>
>>
>> On PowerNV platform, resource position in M64 implies the PE# the resource
>> belongs to.  In some cases, adjustment of a resource is necessary to locate
>> it to a correct position in M64.
>>
>> Add pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address
>> according to an offset.
>>
>> [bhelgaas: rework loops, rework overlap check, index resource[]
>> conventionally, remove pci_regs.h include, squashed with next patch]
>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>
> ...
>
>> +#ifdef CONFIG_PCI_IOV
>> +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>> +{
>> +     struct pci_dn *pdn = pci_get_pdn(dev);
>> +     int i;
>> +     struct resource *res, res2;
>> +     resource_size_t size;
>> +     u16 vf_num;
>> +
>> +     if (!dev->is_physfn)
>> +             return -EINVAL;
>> +
>> +     /*
>> +      * "offset" is in VFs.  The M64 windows are sized so that when they
>> +      * are segmented, each segment is the same size as the IOV BAR.
>> +      * Each segment is in a separate PE, and the high order bits of the
>> +      * address are the PE number.  Therefore, each VF's BAR is in a
>> +      * separate PE, and changing the IOV BAR start address changes the
>> +      * range of PEs the VFs are in.
>> +      */
>> +     vf_num = pdn->vf_pes;
>> +     for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> +             res = &dev->resource[i + PCI_IOV_RESOURCES];
>> +             if (!res->flags || !res->parent)
>> +                     continue;
>> +
>> +             if (!pnv_pci_is_mem_pref_64(res->flags))
>> +                     continue;
>> +
>> +             /*
>> +              * The actual IOV BAR range is determined by the start address
>> +              * and the actual size for vf_num VFs BAR.  This check is to
>> +              * make sure that after shifting, the range will not overlap
>> +              * with another device.
>> +              */
>> +             size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>> +             res2.flags = res->flags;
>> +             res2.start = res->start + (size * offset);
>> +             res2.end = res2.start + (size * vf_num) - 1;
>> +
>> +             if (res2.end > res->end) {
>> +                     dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
>> +                             i, &res2, res, vf_num, offset);
>> +                     return -EBUSY;
>> +             }
>> +     }
>> +
>> +     for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> +             res = &dev->resource[i + PCI_IOV_RESOURCES];
>> +             if (!res->flags || !res->parent)
>> +                     continue;
>> +
>> +             if (!pnv_pci_is_mem_pref_64(res->flags))
>> +                     continue;
>> +
>> +             size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>> +             res2 = *res;
>> +             res->start += size * offset;
>
> I'm still not happy about this fiddling with res->start.
>
> Increasing res->start means that in principle, the "size * offset" bytes
> that we just removed from res are now available for allocation to somebody
> else.  I don't think we *will* give that space to anything else because of
> the alignment restrictions you're enforcing, but "res" now doesn't
> correctly describe the real resource map.
>
> Would you be able to just update the BAR here while leaving the struct
> resource alone?  In that case, it would look a little funny that lspci
> would show a BAR value in the middle of the region in /proc/iomem, but
> the /proc/iomem region would be more correct.

I guess this would also require a tweak where we compute the addresses
of each of the VF resources.  Today it's probably just "base + VF_num
* size", where "base" is res->start.  We'd have to account for the
offset there if we don't adjust it here.

>> +
>> +             dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
>> +                      i, &res2, res, vf_num, offset);
>> +             pci_update_resource(dev, i + PCI_IOV_RESOURCES);
>> +     }
>> +     pdn->max_vfs -= offset;
>> +     return 0;
>> +}
>> +#endif /* CONFIG_PCI_IOV */
Wei Yang March 2, 2015, 7:58 a.m. UTC | #4
On Tue, Feb 24, 2015 at 11:10:33AM -0600, Bjorn Helgaas wrote:
>On Tue, Feb 24, 2015 at 3:00 AM, Bjorn Helgaas <bhelgaas@google.com> wrote:
>> On Tue, Feb 24, 2015 at 02:34:57AM -0600, Bjorn Helgaas wrote:
>>> From: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>
>>> On PowerNV platform, resource position in M64 implies the PE# the resource
>>> belongs to.  In some cases, adjustment of a resource is necessary to locate
>>> it to a correct position in M64.
>>>
>>> Add pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address
>>> according to an offset.
>>>
>>> [bhelgaas: rework loops, rework overlap check, index resource[]
>>> conventionally, remove pci_regs.h include, squashed with next patch]
>>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>>
>> ...
>>
>>> +#ifdef CONFIG_PCI_IOV
>>> +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>> +{
>>> +     struct pci_dn *pdn = pci_get_pdn(dev);
>>> +     int i;
>>> +     struct resource *res, res2;
>>> +     resource_size_t size;
>>> +     u16 vf_num;
>>> +
>>> +     if (!dev->is_physfn)
>>> +             return -EINVAL;
>>> +
>>> +     /*
>>> +      * "offset" is in VFs.  The M64 windows are sized so that when they
>>> +      * are segmented, each segment is the same size as the IOV BAR.
>>> +      * Each segment is in a separate PE, and the high order bits of the
>>> +      * address are the PE number.  Therefore, each VF's BAR is in a
>>> +      * separate PE, and changing the IOV BAR start address changes the
>>> +      * range of PEs the VFs are in.
>>> +      */
>>> +     vf_num = pdn->vf_pes;
>>> +     for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>> +             res = &dev->resource[i + PCI_IOV_RESOURCES];
>>> +             if (!res->flags || !res->parent)
>>> +                     continue;
>>> +
>>> +             if (!pnv_pci_is_mem_pref_64(res->flags))
>>> +                     continue;
>>> +
>>> +             /*
>>> +              * The actual IOV BAR range is determined by the start address
>>> +              * and the actual size for vf_num VFs BAR.  This check is to
>>> +              * make sure that after shifting, the range will not overlap
>>> +              * with another device.
>>> +              */
>>> +             size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>> +             res2.flags = res->flags;
>>> +             res2.start = res->start + (size * offset);
>>> +             res2.end = res2.start + (size * vf_num) - 1;
>>> +
>>> +             if (res2.end > res->end) {
>>> +                     dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
>>> +                             i, &res2, res, vf_num, offset);
>>> +                     return -EBUSY;
>>> +             }
>>> +     }
>>> +
>>> +     for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>> +             res = &dev->resource[i + PCI_IOV_RESOURCES];
>>> +             if (!res->flags || !res->parent)
>>> +                     continue;
>>> +
>>> +             if (!pnv_pci_is_mem_pref_64(res->flags))
>>> +                     continue;
>>> +
>>> +             size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>> +             res2 = *res;
>>> +             res->start += size * offset;
>>
>> I'm still not happy about this fiddling with res->start.
>>
>> Increasing res->start means that in principle, the "size * offset" bytes
>> that we just removed from res are now available for allocation to somebody
>> else.  I don't think we *will* give that space to anything else because of
>> the alignment restrictions you're enforcing, but "res" now doesn't
>> correctly describe the real resource map.
>>
>> Would you be able to just update the BAR here while leaving the struct
>> resource alone?  In that case, it would look a little funny that lspci
>> would show a BAR value in the middle of the region in /proc/iomem, but
>> the /proc/iomem region would be more correct.
>
>I guess this would also require a tweak where we compute the addresses
>of each of the VF resources.  Today it's probably just "base + VF_num
>* size", where "base" is res->start.  We'd have to account for the
>offset there if we don't adjust it here.
>

Oh, this is really an interesting idea.

I will do some tests to see the result.

>>> +
>>> +             dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
>>> +                      i, &res2, res, vf_num, offset);
>>> +             pci_update_resource(dev, i + PCI_IOV_RESOURCES);
>>> +     }
>>> +     pdn->max_vfs -= offset;
>>> +     return 0;
>>> +}
>>> +#endif /* CONFIG_PCI_IOV */
Wei Yang March 4, 2015, 3:01 a.m. UTC | #5
On Tue, Feb 24, 2015 at 03:00:37AM -0600, Bjorn Helgaas wrote:
>On Tue, Feb 24, 2015 at 02:34:57AM -0600, Bjorn Helgaas wrote:
>> From: Wei Yang <weiyang@linux.vnet.ibm.com>
>> 
>> On PowerNV platform, resource position in M64 implies the PE# the resource
>> belongs to.  In some cases, adjustment of a resource is necessary to locate
>> it to a correct position in M64.
>> 
>> Add pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address
>> according to an offset.
>> 
>> [bhelgaas: rework loops, rework overlap check, index resource[]
>> conventionally, remove pci_regs.h include, squashed with next patch]
>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>
>...
>
>> +#ifdef CONFIG_PCI_IOV
>> +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>> +{
>> +	struct pci_dn *pdn = pci_get_pdn(dev);
>> +	int i;
>> +	struct resource *res, res2;
>> +	resource_size_t size;
>> +	u16 vf_num;
>> +
>> +	if (!dev->is_physfn)
>> +		return -EINVAL;
>> +
>> +	/*
>> +	 * "offset" is in VFs.  The M64 windows are sized so that when they
>> +	 * are segmented, each segment is the same size as the IOV BAR.
>> +	 * Each segment is in a separate PE, and the high order bits of the
>> +	 * address are the PE number.  Therefore, each VF's BAR is in a
>> +	 * separate PE, and changing the IOV BAR start address changes the
>> +	 * range of PEs the VFs are in.
>> +	 */
>> +	vf_num = pdn->vf_pes;
>> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
>> +		if (!res->flags || !res->parent)
>> +			continue;
>> +
>> +		if (!pnv_pci_is_mem_pref_64(res->flags))
>> +			continue;
>> +
>> +		/*
>> +		 * The actual IOV BAR range is determined by the start address
>> +		 * and the actual size for vf_num VFs BAR.  This check is to
>> +		 * make sure that after shifting, the range will not overlap
>> +		 * with another device.
>> +		 */
>> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>> +		res2.flags = res->flags;
>> +		res2.start = res->start + (size * offset);
>> +		res2.end = res2.start + (size * vf_num) - 1;
>> +
>> +		if (res2.end > res->end) {
>> +			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
>> +				i, &res2, res, vf_num, offset);
>> +			return -EBUSY;
>> +		}
>> +	}
>> +
>> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
>> +		if (!res->flags || !res->parent)
>> +			continue;
>> +
>> +		if (!pnv_pci_is_mem_pref_64(res->flags))
>> +			continue;
>> +
>> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>> +		res2 = *res;
>> +		res->start += size * offset;
>
>I'm still not happy about this fiddling with res->start.
>
>Increasing res->start means that in principle, the "size * offset" bytes
>that we just removed from res are now available for allocation to somebody
>else.  I don't think we *will* give that space to anything else because of
>the alignment restrictions you're enforcing, but "res" now doesn't
>correctly describe the real resource map.
>
>Would you be able to just update the BAR here while leaving the struct
>resource alone?  In that case, it would look a little funny that lspci
>would show a BAR value in the middle of the region in /proc/iomem, but
>the /proc/iomem region would be more correct.

Bjorn,

I did some tests, while the result is not good.

What I did is still write the shifted resource address to the device by
pci_update_resource(), but I revert the res->start to the original one. If
this step is not correct, please let me know.

This can't work since after we revert the res->start, those VFs will be given
resources from res->start instead of (res->start + offset * size). This is not
what we expect.

I have rebased/cleaned/changed the code according to your comments based on this
patch set. Will send out v13 soon.

>
>> +
>> +		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
>> +			 i, &res2, res, vf_num, offset);
>> +		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
>> +	}
>> +	pdn->max_vfs -= offset;
>> +	return 0;
>> +}
>> +#endif /* CONFIG_PCI_IOV */
Bjorn Helgaas March 11, 2015, 2:55 a.m. UTC | #6
On Wed, Mar 04, 2015 at 11:01:24AM +0800, Wei Yang wrote:
> On Tue, Feb 24, 2015 at 03:00:37AM -0600, Bjorn Helgaas wrote:
> >On Tue, Feb 24, 2015 at 02:34:57AM -0600, Bjorn Helgaas wrote:
> >> From: Wei Yang <weiyang@linux.vnet.ibm.com>
> >> 
> >> On PowerNV platform, resource position in M64 implies the PE# the resource
> >> belongs to.  In some cases, adjustment of a resource is necessary to locate
> >> it to a correct position in M64.
> >> 
> >> Add pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address
> >> according to an offset.
> >> 
> >> [bhelgaas: rework loops, rework overlap check, index resource[]
> >> conventionally, remove pci_regs.h include, squashed with next patch]
> >> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
> >> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> >
> >...
> >
> >> +#ifdef CONFIG_PCI_IOV
> >> +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
> >> +{
> >> +	struct pci_dn *pdn = pci_get_pdn(dev);
> >> +	int i;
> >> +	struct resource *res, res2;
> >> +	resource_size_t size;
> >> +	u16 vf_num;
> >> +
> >> +	if (!dev->is_physfn)
> >> +		return -EINVAL;
> >> +
> >> +	/*
> >> +	 * "offset" is in VFs.  The M64 windows are sized so that when they
> >> +	 * are segmented, each segment is the same size as the IOV BAR.
> >> +	 * Each segment is in a separate PE, and the high order bits of the
> >> +	 * address are the PE number.  Therefore, each VF's BAR is in a
> >> +	 * separate PE, and changing the IOV BAR start address changes the
> >> +	 * range of PEs the VFs are in.
> >> +	 */
> >> +	vf_num = pdn->vf_pes;
> >> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> >> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
> >> +		if (!res->flags || !res->parent)
> >> +			continue;
> >> +
> >> +		if (!pnv_pci_is_mem_pref_64(res->flags))
> >> +			continue;
> >> +
> >> +		/*
> >> +		 * The actual IOV BAR range is determined by the start address
> >> +		 * and the actual size for vf_num VFs BAR.  This check is to
> >> +		 * make sure that after shifting, the range will not overlap
> >> +		 * with another device.
> >> +		 */
> >> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> >> +		res2.flags = res->flags;
> >> +		res2.start = res->start + (size * offset);
> >> +		res2.end = res2.start + (size * vf_num) - 1;
> >> +
> >> +		if (res2.end > res->end) {
> >> +			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
> >> +				i, &res2, res, vf_num, offset);
> >> +			return -EBUSY;
> >> +		}
> >> +	}
> >> +
> >> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> >> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
> >> +		if (!res->flags || !res->parent)
> >> +			continue;
> >> +
> >> +		if (!pnv_pci_is_mem_pref_64(res->flags))
> >> +			continue;
> >> +
> >> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> >> +		res2 = *res;
> >> +		res->start += size * offset;
> >
> >I'm still not happy about this fiddling with res->start.
> >
> >Increasing res->start means that in principle, the "size * offset" bytes
> >that we just removed from res are now available for allocation to somebody
> >else.  I don't think we *will* give that space to anything else because of
> >the alignment restrictions you're enforcing, but "res" now doesn't
> >correctly describe the real resource map.
> >
> >Would you be able to just update the BAR here while leaving the struct
> >resource alone?  In that case, it would look a little funny that lspci
> >would show a BAR value in the middle of the region in /proc/iomem, but
> >the /proc/iomem region would be more correct.
> 
> Bjorn,
> 
> I did some tests, while the result is not good.
> 
> What I did is still write the shifted resource address to the device by
> pci_update_resource(), but I revert the res->start to the original one. If
> this step is not correct, please let me know.
> 
> This can't work since after we revert the res->start, those VFs will be given
> resources from res->start instead of (res->start + offset * size). This is not
> what we expect.

Hmm, yes, I suppose we'd have to have a hook in pci_bus_alloc_from_region()
or something.  That's getting a little messy.  I still don't like messing
with the resource after it's in the resource tree, but I don't have a
better idea right now.  So let's just go with what you have.

> >> +
> >> +		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
> >> +			 i, &res2, res, vf_num, offset);
> >> +		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
> >> +	}
> >> +	pdn->max_vfs -= offset;
> >> +	return 0;
> >> +}
> >> +#endif /* CONFIG_PCI_IOV */
> 
> -- 
> Richard Yang
> Help you, Help me
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wei Yang March 11, 2015, 6:42 a.m. UTC | #7
On Tue, Mar 10, 2015 at 09:55:19PM -0500, Bjorn Helgaas wrote:
>On Wed, Mar 04, 2015 at 11:01:24AM +0800, Wei Yang wrote:
>> On Tue, Feb 24, 2015 at 03:00:37AM -0600, Bjorn Helgaas wrote:
>> >On Tue, Feb 24, 2015 at 02:34:57AM -0600, Bjorn Helgaas wrote:
>> >> From: Wei Yang <weiyang@linux.vnet.ibm.com>
>> >> 
>> >> On PowerNV platform, resource position in M64 implies the PE# the resource
>> >> belongs to.  In some cases, adjustment of a resource is necessary to locate
>> >> it to a correct position in M64.
>> >> 
>> >> Add pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address
>> >> according to an offset.
>> >> 
>> >> [bhelgaas: rework loops, rework overlap check, index resource[]
>> >> conventionally, remove pci_regs.h include, squashed with next patch]
>> >> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>> >> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>> >
>> >...
>> >
>> >> +#ifdef CONFIG_PCI_IOV
>> >> +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>> >> +{
>> >> +	struct pci_dn *pdn = pci_get_pdn(dev);
>> >> +	int i;
>> >> +	struct resource *res, res2;
>> >> +	resource_size_t size;
>> >> +	u16 vf_num;
>> >> +
>> >> +	if (!dev->is_physfn)
>> >> +		return -EINVAL;
>> >> +
>> >> +	/*
>> >> +	 * "offset" is in VFs.  The M64 windows are sized so that when they
>> >> +	 * are segmented, each segment is the same size as the IOV BAR.
>> >> +	 * Each segment is in a separate PE, and the high order bits of the
>> >> +	 * address are the PE number.  Therefore, each VF's BAR is in a
>> >> +	 * separate PE, and changing the IOV BAR start address changes the
>> >> +	 * range of PEs the VFs are in.
>> >> +	 */
>> >> +	vf_num = pdn->vf_pes;
>> >> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> >> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
>> >> +		if (!res->flags || !res->parent)
>> >> +			continue;
>> >> +
>> >> +		if (!pnv_pci_is_mem_pref_64(res->flags))
>> >> +			continue;
>> >> +
>> >> +		/*
>> >> +		 * The actual IOV BAR range is determined by the start address
>> >> +		 * and the actual size for vf_num VFs BAR.  This check is to
>> >> +		 * make sure that after shifting, the range will not overlap
>> >> +		 * with another device.
>> >> +		 */
>> >> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>> >> +		res2.flags = res->flags;
>> >> +		res2.start = res->start + (size * offset);
>> >> +		res2.end = res2.start + (size * vf_num) - 1;
>> >> +
>> >> +		if (res2.end > res->end) {
>> >> +			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
>> >> +				i, &res2, res, vf_num, offset);
>> >> +			return -EBUSY;
>> >> +		}
>> >> +	}
>> >> +
>> >> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> >> +		res = &dev->resource[i + PCI_IOV_RESOURCES];
>> >> +		if (!res->flags || !res->parent)
>> >> +			continue;
>> >> +
>> >> +		if (!pnv_pci_is_mem_pref_64(res->flags))
>> >> +			continue;
>> >> +
>> >> +		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>> >> +		res2 = *res;
>> >> +		res->start += size * offset;
>> >
>> >I'm still not happy about this fiddling with res->start.
>> >
>> >Increasing res->start means that in principle, the "size * offset" bytes
>> >that we just removed from res are now available for allocation to somebody
>> >else.  I don't think we *will* give that space to anything else because of
>> >the alignment restrictions you're enforcing, but "res" now doesn't
>> >correctly describe the real resource map.
>> >
>> >Would you be able to just update the BAR here while leaving the struct
>> >resource alone?  In that case, it would look a little funny that lspci
>> >would show a BAR value in the middle of the region in /proc/iomem, but
>> >the /proc/iomem region would be more correct.
>> 
>> Bjorn,
>> 
>> I did some tests, while the result is not good.
>> 
>> What I did is still write the shifted resource address to the device by
>> pci_update_resource(), but I revert the res->start to the original one. If
>> this step is not correct, please let me know.
>> 
>> This can't work since after we revert the res->start, those VFs will be given
>> resources from res->start instead of (res->start + offset * size). This is not
>> what we expect.
>
>Hmm, yes, I suppose we'd have to have a hook in pci_bus_alloc_from_region()
>or something.  That's getting a little messy.  I still don't like messing
>with the resource after it's in the resource tree, but I don't have a
>better idea right now.  So let's just go with what you have.
>

Thanks  :-)

I will state this in the change log and add a comment in the code to note
it down. Hope this will be a little helpful.

>> >> +
>> >> +		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
>> >> +			 i, &res2, res, vf_num, offset);
>> >> +		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
>> >> +	}
>> >> +	pdn->max_vfs -= offset;
>> >> +	return 0;
>> >> +}
>> >> +#endif /* CONFIG_PCI_IOV */
>> 
>> -- 
>> Richard Yang
>> Help you, Help me
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index de11de7d4547..011340df8583 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -177,6 +177,10 @@  struct pci_dn {
 	int	pe_number;
 #ifdef CONFIG_PCI_IOV
 	u16     max_vfs;		/* number of VFs IOV BAR expended */
+	u16     vf_pes;			/* VF PE# under this PF */
+	int     offset;			/* PE# for the first VF PE */
+#define IODA_INVALID_M64        (-1)
+	int     m64_wins[PCI_SRIOV_NUM_BARS];
 #endif /* CONFIG_PCI_IOV */
 #endif
 	struct list_head child_list;
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index f3a1a81d112f..5faf7ca45434 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -217,6 +217,17 @@  void remove_dev_pci_info(struct pci_dev *pdev)
 	struct pci_dn *pdn, *tmp;
 	int i;
 
+	/*
+	 * VF and VF PE are created/released dynamically, so we need to
+	 * bind/unbind them.  Otherwise the VF and VF PE would be mismatched
+	 * when re-enabling SR-IOV.
+	 */
+	if (pdev->is_virtfn) {
+		pdn = pci_get_pdn(pdev);
+		pdn->pe_number = IODA_INVALID_PE;
+		return;
+	}
+
 	/* Only support IOV PF for now */
 	if (!pdev->is_physfn)
 		return;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 6a86690bb8de..a3c2fbe35fc8 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -44,6 +44,9 @@ 
 #include "powernv.h"
 #include "pci.h"
 
+/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
+#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 			    const char *fmt, ...)
 {
@@ -56,11 +59,18 @@  static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 	vaf.fmt = fmt;
 	vaf.va = &args;
 
-	if (pe->pdev)
+	if (pe->flags & PNV_IODA_PE_DEV)
 		strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
-	else
+	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		sprintf(pfix, "%04x:%02x     ",
 			pci_domain_nr(pe->pbus), pe->pbus->number);
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		sprintf(pfix, "%04x:%02x:%2x.%d",
+			pci_domain_nr(pe->parent_dev->bus),
+			(pe->rid & 0xff00) >> 8,
+			PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
+#endif /* CONFIG_PCI_IOV*/
 
 	printk("%spci %s: [PE# %.3d] %pV",
 	       level, pfix, pe->pe_number, &vaf);
@@ -591,7 +601,7 @@  static int pnv_ioda_set_peltv(struct pnv_phb *phb,
 			      bool is_add)
 {
 	struct pnv_ioda_pe *slave;
-	struct pci_dev *pdev;
+	struct pci_dev *pdev = NULL;
 	int ret;
 
 	/*
@@ -630,8 +640,12 @@  static int pnv_ioda_set_peltv(struct pnv_phb *phb,
 
 	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
 		pdev = pe->pbus->self;
-	else
+	else if (pe->flags & PNV_IODA_PE_DEV)
 		pdev = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		pdev = pe->parent_dev->bus->self;
+#endif /* CONFIG_PCI_IOV */
 	while (pdev) {
 		struct pci_dn *pdn = pci_get_pdn(pdev);
 		struct pnv_ioda_pe *parent;
@@ -649,6 +663,87 @@  static int pnv_ioda_set_peltv(struct pnv_phb *phb,
 	return 0;
 }
 
+#ifdef CONFIG_PCI_IOV
+static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+	struct pci_dev *parent;
+	uint8_t bcomp, dcomp, fcomp;
+	int64_t rc;
+	long rid_end, rid;
+
+	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
+	if (pe->pbus) {
+		int count;
+
+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+		parent = pe->pbus->self;
+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
+			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+		else
+			count = 1;
+
+		switch(count) {
+		case  1: bcomp = OpalPciBusAll;         break;
+		case  2: bcomp = OpalPciBus7Bits;       break;
+		case  4: bcomp = OpalPciBus6Bits;       break;
+		case  8: bcomp = OpalPciBus5Bits;       break;
+		case 16: bcomp = OpalPciBus4Bits;       break;
+		case 32: bcomp = OpalPciBus3Bits;       break;
+		default:
+			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+			        count);
+			/* Do an exact match only */
+			bcomp = OpalPciBusAll;
+		}
+		rid_end = pe->rid + (count << 8);
+	} else {
+		if (pe->flags & PNV_IODA_PE_VF)
+			parent = pe->parent_dev;
+		else
+			parent = pe->pdev->bus->self;
+		bcomp = OpalPciBusAll;
+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+		rid_end = pe->rid + 1;
+	}
+
+	/* Clear the reverse map */
+	for (rid = pe->rid; rid < rid_end; rid++)
+		phb->ioda.pe_rmap[rid] = 0;
+
+	/* Release from all parents PELT-V */
+	while (parent) {
+		struct pci_dn *pdn = pci_get_pdn(parent);
+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+			/* XXX What to do in case of error ? */
+		}
+		parent = parent->bus->self;
+	}
+
+	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+	/* Disassociate PE in PELT */
+	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+	if (rc)
+		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
+	if (rc)
+		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+
+	pe->pbus = NULL;
+	pe->pdev = NULL;
+	pe->parent_dev = NULL;
+
+	return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
 	struct pci_dev *parent;
@@ -675,15 +770,19 @@  static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 		case 16: bcomp = OpalPciBus4Bits;	break;
 		case 32: bcomp = OpalPciBus3Bits;	break;
 		default:
-			pr_err("%s: Number of subordinate busses %d"
-			       " unsupported\n",
-			       pci_name(pe->pbus->self), count);
+			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+			        count);
 			/* Do an exact match only */
 			bcomp = OpalPciBusAll;
 		}
 		rid_end = pe->rid + (count << 8);
 	} else {
-		parent = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+		if (pe->flags & PNV_IODA_PE_VF)
+			parent = pe->parent_dev;
+		else
+#endif /* CONFIG_PCI_IOV */
+			parent = pe->pdev->bus->self;
 		bcomp = OpalPciBusAll;
 		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
 		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
@@ -774,6 +873,74 @@  static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
 	return 10;
 }
 
+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+	struct pci_dn *pdn = pci_get_pdn(dev);
+	int i;
+	struct resource *res, res2;
+	resource_size_t size;
+	u16 vf_num;
+
+	if (!dev->is_physfn)
+		return -EINVAL;
+
+	/*
+	 * "offset" is in VFs.  The M64 windows are sized so that when they
+	 * are segmented, each segment is the same size as the IOV BAR.
+	 * Each segment is in a separate PE, and the high order bits of the
+	 * address are the PE number.  Therefore, each VF's BAR is in a
+	 * separate PE, and changing the IOV BAR start address changes the
+	 * range of PEs the VFs are in.
+	 */
+	vf_num = pdn->vf_pes;
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		if (!pnv_pci_is_mem_pref_64(res->flags))
+			continue;
+
+		/*
+		 * The actual IOV BAR range is determined by the start address
+		 * and the actual size for vf_num VFs BAR.  This check is to
+		 * make sure that after shifting, the range will not overlap
+		 * with another device.
+		 */
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2.flags = res->flags;
+		res2.start = res->start + (size * offset);
+		res2.end = res2.start + (size * vf_num) - 1;
+
+		if (res2.end > res->end) {
+			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
+				i, &res2, res, vf_num, offset);
+			return -EBUSY;
+		}
+	}
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		if (!pnv_pci_is_mem_pref_64(res->flags))
+			continue;
+
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2 = *res;
+		res->start += size * offset;
+
+		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
+			 i, &res2, res, vf_num, offset);
+		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+	}
+	pdn->max_vfs -= offset;
+	return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
 #if 0
 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 {
@@ -979,8 +1146,312 @@  static void pnv_pci_ioda_setup_PEs(void)
 }
 
 #ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	int                    i;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		if (pdn->m64_wins[i] == IODA_INVALID_M64)
+			continue;
+		opal_pci_phb_mmio_enable(phb->opal_id,
+				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 0);
+		clear_bit(pdn->m64_wins[i], &phb->ioda.m64_bar_alloc);
+		pdn->m64_wins[i] = IODA_INVALID_M64;
+	}
+
+	return 0;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	unsigned int           win;
+	struct resource       *res;
+	int                    i;
+	int64_t                rc;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	/* Initialize the m64_wins to IODA_INVALID_M64 */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+		pdn->m64_wins[i] = IODA_INVALID_M64;
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		if (!pnv_pci_is_mem_pref_64(res->flags))
+			continue;
+
+		do {
+			win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+					phb->ioda.m64_bar_idx + 1, 0);
+
+			if (win >= phb->ioda.m64_bar_idx + 1)
+				goto m64_failed;
+		} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+		pdn->m64_wins[i] = win;
+
+		/* Map the M64 here */
+		rc = opal_pci_set_phb_mem_window(phb->opal_id,
+						 OPAL_M64_WINDOW_TYPE,
+						 pdn->m64_wins[i],
+						 res->start,
+						 0, /* unused */
+						 resource_size(res));
+		if (rc != OPAL_SUCCESS) {
+			dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
+				win, rc);
+			goto m64_failed;
+		}
+
+		rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 1);
+		if (rc != OPAL_SUCCESS) {
+			dev_err(&pdev->dev, "Failed to enable M64 window #%d: %lld\n",
+				win, rc);
+			goto m64_failed;
+		}
+	}
+	return 0;
+
+m64_failed:
+	pnv_pci_vf_release_m64(pdev);
+	return -EBUSY;
+}
+
+static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct iommu_table    *tbl;
+	unsigned long         addr;
+	int64_t               rc;
+
+	bus = dev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	tbl = pe->tce32_table;
+	addr = tbl->it_base;
+
+	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+				   pe->pe_number << 1, 1, __pa(addr),
+				   0, 0x1000);
+
+	rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+				        pe->pe_number,
+				        (pe->pe_number << 1) + 1,
+				        pe->tce_bypass_base,
+				        0);
+	if (rc)
+		pe_warn(pe, "OPAL error %lld releasing DMA window\n", rc);
+
+	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+	free_pages(addr, get_order(TCE32_TABLE_SIZE));
+	pe->tce32_table = NULL;
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe, *pe_n;
+	struct pci_dn         *pdn;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+
+	if (!pdev->is_physfn)
+		return;
+
+	pdn = pci_get_pdn(pdev);
+	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+		if (pe->parent_dev != pdev)
+			continue;
+
+		pnv_pci_ioda2_release_dma_pe(pdev, pe);
+
+		/* Remove from list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_del(&pe->list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		pnv_ioda_deconfigure_pe(phb, pe);
+
+		pnv_ioda_free_pe(phb, pe->pe_number);
+	}
+}
+
+void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	struct pci_sriov      *iov;
+	u16 vf_num;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+	iov = pdev->sriov;
+	vf_num = pdn->vf_pes;
+
+	/* Release VF PEs */
+	pnv_ioda_release_vf_PE(pdev);
+
+	if (phb->type == PNV_PHB_IODA2) {
+		pnv_pci_vf_resource_shift(pdev, -pdn->offset);
+
+		/* Release M64 windows */
+		pnv_pci_vf_release_m64(pdev);
+
+		/* Release PE numbers */
+		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, vf_num);
+		pdn->offset = 0;
+	}
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe);
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 vf_num)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe;
+	int                    pe_num;
+	u16                    vf_index;
+	struct pci_dn         *pdn;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	if (!pdev->is_physfn)
+		return;
+
+	/* Reserve PE for each VF */
+	for (vf_index = 0; vf_index < vf_num; vf_index++) {
+		pe_num = pdn->offset + vf_index;
+
+		pe = &phb->ioda.pe_array[pe_num];
+		pe->pe_number = pe_num;
+		pe->phb = phb;
+		pe->flags = PNV_IODA_PE_VF;
+		pe->pbus = NULL;
+		pe->parent_dev = pdev;
+		pe->tce32_seg = -1;
+		pe->mve_number = -1;
+		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
+			   pci_iov_virtfn_devfn(pdev, vf_index);
+
+		pe_info(pe, "VF %04x:%02x:%02x.%d associated with PE#%d\n",
+			hose->global_number, pdev->bus->number,
+			PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
+			PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
+
+		if (pnv_ioda_configure_pe(phb, pe)) {
+			/* XXX What do we do here ? */
+			if (pe_num)
+				pnv_ioda_free_pe(phb, pe_num);
+			pe->pdev = NULL;
+			continue;
+		}
+
+		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
+				GFP_KERNEL, hose->node);
+		pe->tce32_table->data = pe;
+
+		/* Put PE to the list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_add_tail(&pe->list, &phb->ioda.pe_list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		pnv_pci_ioda2_setup_dma_pe(phb, pe);
+	}
+}
+
+int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 vf_num)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	int                    ret;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	if (phb->type == PNV_PHB_IODA2) {
+		/* Calculate available PE for required VFs */
+		mutex_lock(&phb->ioda.pe_alloc_mutex);
+		pdn->offset = bitmap_find_next_zero_area(
+			phb->ioda.pe_alloc, phb->ioda.total_pe,
+			0, vf_num, 0);
+		if (pdn->offset >= phb->ioda.total_pe) {
+			mutex_unlock(&phb->ioda.pe_alloc_mutex);
+			dev_info(&pdev->dev, "Failed to enable %d VFs, not enough free PEs\n", vf_num);
+			pdn->offset = 0;
+			return -EBUSY;
+		}
+		bitmap_set(phb->ioda.pe_alloc, pdn->offset, vf_num);
+		pdn->vf_pes = vf_num;
+		mutex_unlock(&phb->ioda.pe_alloc_mutex);
+
+		/* Assign M64 window accordingly */
+		ret = pnv_pci_vf_assign_m64(pdev);
+		if (ret) {
+			dev_info(&pdev->dev, "Not enough M64 window resources\n");
+			goto m64_failed;
+		}
+
+		/* Do some magic shift */
+		ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
+		if (ret)
+			goto m64_failed;
+	}
+
+	/* Setup VF PEs */
+	pnv_ioda_setup_vf_PE(pdev, vf_num);
+
+	return 0;
+
+m64_failed:
+	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, vf_num);
+	pdn->offset = 0;
+
+	return ret;
+}
+
 int pcibios_sriov_disable(struct pci_dev *pdev)
 {
+	pnv_pci_sriov_disable(pdev);
+
 	/* Release firmware data */
 	remove_dev_pci_info(pdev);
 	return 0;
@@ -990,6 +1461,8 @@  int pcibios_sriov_enable(struct pci_dev *pdev, u16 vf_num)
 {
 	/* Allocate firmware data */
 	add_dev_pci_info(pdev);
+
+	pnv_pci_sriov_enable(pdev, vf_num);
 	return 0;
 }
 #endif /* CONFIG_PCI_IOV */
@@ -1186,9 +1659,6 @@  static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	int64_t rc;
 	void *addr;
 
-	/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
-#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
-
 	/* XXX FIXME: Handle 64-bit only DMA devices */
 	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
 	/* XXX FIXME: Allocate multi-level tables on PHB3 */
@@ -1251,12 +1721,19 @@  static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				 TCE_PCI_SWINV_PAIR);
 	}
 	iommu_init_table(tbl, phb->hose->node);
-	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
 
-	if (pe->pdev)
+	if (pe->flags & PNV_IODA_PE_DEV) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
 		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
-	else
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+	} else if (pe->flags & PNV_IODA_PE_VF) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
+	}
 
 	return;
  fail:
@@ -1383,12 +1860,19 @@  static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
 	}
 	iommu_init_table(tbl, phb->hose->node);
-	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
 
-	if (pe->pdev)
+	if (pe->flags & PNV_IODA_PE_DEV) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
 		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
-	else
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+	} else if (pe->flags & PNV_IODA_PE_VF) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
+	}
 
 	/* Also create a bypass window */
 	if (!pnv_iommu_bypass_disabled)
@@ -2083,6 +2567,7 @@  static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	phb->hub_id = hub_id;
 	phb->opal_id = phb_id;
 	phb->type = ioda_type;
+	mutex_init(&phb->ioda.pe_alloc_mutex);
 
 	/* Detect specific models for error handling */
 	if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
@@ -2142,6 +2627,7 @@  static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 
 	INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
 	INIT_LIST_HEAD(&phb->ioda.pe_list);
+	mutex_init(&phb->ioda.pe_list_mutex);
 
 	/* Calculate how many 32-bit TCE segments we have */
 	phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 6c20d6e70383..a88f915fc603 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -714,6 +714,24 @@  static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 	struct pnv_phb *phb = hose->private_data;
+#ifdef CONFIG_PCI_IOV
+	struct pnv_ioda_pe *pe;
+	struct pci_dn *pdn;
+
+	/* Fix the VF pdn PE number */
+	if (pdev->is_virtfn) {
+		pdn = pci_get_pdn(pdev);
+		WARN_ON(pdn->pe_number != IODA_INVALID_PE);
+		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+			if (pe->rid == ((pdev->bus->number << 8) |
+			    (pdev->devfn & 0xff))) {
+				pdn->pe_number = pe->pe_number;
+				pe->pdev = pdev;
+				break;
+			}
+		}
+	}
+#endif /* CONFIG_PCI_IOV */
 
 	/* If we have no phb structure, try to setup a fallback based on
 	 * the device-tree (RTAS PCI for example)
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 731777734bca..39d42f2b7a15 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -23,6 +23,7 @@  enum pnv_phb_model {
 #define PNV_IODA_PE_BUS_ALL	(1 << 2)	/* PE has subordinate buses	*/
 #define PNV_IODA_PE_MASTER	(1 << 3)	/* Master PE in compound case	*/
 #define PNV_IODA_PE_SLAVE	(1 << 4)	/* Slave PE in compound case	*/
+#define PNV_IODA_PE_VF		(1 << 5)	/* PE for one VF 		*/
 
 /* Data associated with a PE, including IOMMU tracking etc.. */
 struct pnv_phb;
@@ -34,6 +35,9 @@  struct pnv_ioda_pe {
 	 * entire bus (& children). In the former case, pdev
 	 * is populated, in the later case, pbus is.
 	 */
+#ifdef CONFIG_PCI_IOV
+	struct pci_dev          *parent_dev;
+#endif
 	struct pci_dev		*pdev;
 	struct pci_bus		*pbus;
 
@@ -165,6 +169,8 @@  struct pnv_phb {
 
 			/* PE allocation bitmap */
 			unsigned long		*pe_alloc;
+			/* PE allocation mutex */
+			struct mutex		pe_alloc_mutex;
 
 			/* M32 & IO segment maps */
 			unsigned int		*m32_segmap;
@@ -179,6 +185,7 @@  struct pnv_phb {
 			 * on the sequence of creation
 			 */
 			struct list_head	pe_list;
+			struct mutex            pe_list_mutex;
 
 			/* Reverse map of PEs, will have to extend if
 			 * we are to support more than 256 PEs, indexed