Patchwork [1/7] powerpc: Add interface to get msi region information

login
register
mail settings
Submitter Bharat Bhushan
Date Sept. 19, 2013, 7:29 a.m.
Message ID <1379575763-2091-2-git-send-email-Bharat.Bhushan@freescale.com>
Download mbox | patch
Permalink /patch/275889/
State Not Applicable
Headers show

Comments

Bharat Bhushan - Sept. 19, 2013, 7:29 a.m.
This patch adds interface to get following information
  - Number of MSI regions (which is number of MSI banks for powerpc).
  - Get the region address range: Physical page which have the
     address/addresses used for generating MSI interrupt
     and size of the page.

These are required to create IOMMU (Freescale PAMU) mapping for
devices which are directly assigned using VFIO.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
 arch/powerpc/include/asm/machdep.h |    8 +++++++
 arch/powerpc/include/asm/pci.h     |    2 +
 arch/powerpc/kernel/msi.c          |   18 ++++++++++++++++
 arch/powerpc/sysdev/fsl_msi.c      |   39 +++++++++++++++++++++++++++++++++--
 arch/powerpc/sysdev/fsl_msi.h      |   11 ++++++++-
 drivers/pci/msi.c                  |   26 ++++++++++++++++++++++++
 include/linux/msi.h                |    8 +++++++
 include/linux/pci.h                |   13 ++++++++++++
 8 files changed, 120 insertions(+), 5 deletions(-)
Bjorn Helgaas - Sept. 24, 2013, 11:58 p.m.
On Thu, Sep 19, 2013 at 12:59:17PM +0530, Bharat Bhushan wrote:
> This patch adds interface to get following information
>   - Number of MSI regions (which is number of MSI banks for powerpc).
>   - Get the region address range: Physical page which have the
>      address/addresses used for generating MSI interrupt
>      and size of the page.
> 
> These are required to create IOMMU (Freescale PAMU) mapping for
> devices which are directly assigned using VFIO.
> 
> Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> ---
>  arch/powerpc/include/asm/machdep.h |    8 +++++++
>  arch/powerpc/include/asm/pci.h     |    2 +
>  arch/powerpc/kernel/msi.c          |   18 ++++++++++++++++
>  arch/powerpc/sysdev/fsl_msi.c      |   39 +++++++++++++++++++++++++++++++++--
>  arch/powerpc/sysdev/fsl_msi.h      |   11 ++++++++-
>  drivers/pci/msi.c                  |   26 ++++++++++++++++++++++++
>  include/linux/msi.h                |    8 +++++++
>  include/linux/pci.h                |   13 ++++++++++++
>  8 files changed, 120 insertions(+), 5 deletions(-)
> 
> ...

> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index aca7578..6d85c15 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -30,6 +30,20 @@ static int pci_msi_enable = 1;
>  
>  /* Arch hooks */
>  
> +#ifndef arch_msi_get_region_count
> +int arch_msi_get_region_count(void)
> +{
> +	return 0;
> +}
> +#endif
> +
> +#ifndef arch_msi_get_region
> +int arch_msi_get_region(int region_num, struct msi_region *region)
> +{
> +	return 0;
> +}
> +#endif

This #define strategy is gone; see 4287d824 ("PCI: use weak functions for
MSI arch-specific functions").  Please use the weak function strategy
for your new MSI region functions.

> +
>  #ifndef arch_msi_check_device
>  int arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
>  {
> @@ -903,6 +917,18 @@ void pci_disable_msi(struct pci_dev *dev)
>  }
>  EXPORT_SYMBOL(pci_disable_msi);
>  
> +int msi_get_region_count(void)
> +{
> +	return arch_msi_get_region_count();
> +}
> +EXPORT_SYMBOL(msi_get_region_count);
> +
> +int msi_get_region(int region_num, struct msi_region *region)
> +{
> +	return arch_msi_get_region(region_num, region);
> +}
> +EXPORT_SYMBOL(msi_get_region);

Please split these interface additions, i.e., the drivers/pci/msi.c,
include/linux/msi.h, and include/linux/pci.h changes, into a separate
patch.

I don't know enough about VFIO to understand why these new interfaces
are needed.  Is this the first VFIO IOMMU driver?  I see
vfio_iommu_spapr_tce.c and vfio_iommu_type1.c but I don't know if
they're comparable to the Freescale PAMU.  Do other VFIO IOMMU
implementations support MSI?  If so, do they handle the problem of
mapping the MSI regions in a different way?

>  /**
>   * pci_msix_table_size - return the number of device's MSI-X table entries
>   * @dev: pointer to the pci_dev data structure of MSI-X device function
> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index ee66f3a..ae32601 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -50,6 +50,12 @@ struct msi_desc {
>  	struct kobject kobj;
>  };
>  
> +struct msi_region {
> +	int region_num;
> +	dma_addr_t addr;
> +	size_t size;
> +};

This needs some sort of explanatory comment.

>  /*
>   * The arch hook for setup up msi irqs
>   */
> @@ -58,5 +64,7 @@ void arch_teardown_msi_irq(unsigned int irq);
>  int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
>  void arch_teardown_msi_irqs(struct pci_dev *dev);
>  int arch_msi_check_device(struct pci_dev* dev, int nvec, int type);
> +int arch_msi_get_region_count(void);
> +int arch_msi_get_region(int region_num, struct msi_region *region);
>  
>  #endif /* LINUX_MSI_H */
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 186540d..2b26a59 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1126,6 +1126,7 @@ struct msix_entry {
>  	u16	entry;	/* driver uses to specify entry, OS writes */
>  };
>  
> +struct msi_region;
>  
>  #ifndef CONFIG_PCI_MSI
>  static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
> @@ -1168,6 +1169,16 @@ static inline int pci_msi_enabled(void)
>  {
>  	return 0;
>  }
> +
> +static inline int msi_get_region_count(void)
> +{
> +	return 0;
> +}
> +
> +static inline int msi_get_region(int region_num, struct msi_region *region)
> +{
> +	return 0;
> +}
>  #else
>  int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec);
>  int pci_enable_msi_block_auto(struct pci_dev *dev, unsigned int *maxvec);
> @@ -1180,6 +1191,8 @@ void pci_disable_msix(struct pci_dev *dev);
>  void msi_remove_pci_irq_vectors(struct pci_dev *dev);
>  void pci_restore_msi_state(struct pci_dev *dev);
>  int pci_msi_enabled(void);
> +int msi_get_region_count(void);
> +int msi_get_region(int region_num, struct msi_region *region);
>  #endif
>  
>  #ifdef CONFIG_PCIEPORTBUS
> -- 
> 1.7.0.4
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan - Oct. 4, 2013, 5:19 a.m.
> -----Original Message-----
> From: linux-pci-owner@vger.kernel.org [mailto:linux-pci-owner@vger.kernel.org]
> On Behalf Of Bjorn Helgaas
> Sent: Wednesday, September 25, 2013 5:28 AM
> To: Bhushan Bharat-R65777
> Cc: alex.williamson@redhat.com; joro@8bytes.org; benh@kernel.crashing.org;
> galak@kernel.crashing.org; linux-kernel@vger.kernel.org; linuxppc-
> dev@lists.ozlabs.org; linux-pci@vger.kernel.org; agraf@suse.de; Wood Scott-
> B07421; iommu@lists.linux-foundation.org; Bhushan Bharat-R65777
> Subject: Re: [PATCH 1/7] powerpc: Add interface to get msi region information
> 
> On Thu, Sep 19, 2013 at 12:59:17PM +0530, Bharat Bhushan wrote:
> > This patch adds interface to get following information
> >   - Number of MSI regions (which is number of MSI banks for powerpc).
> >   - Get the region address range: Physical page which have the
> >      address/addresses used for generating MSI interrupt
> >      and size of the page.
> >
> > These are required to create IOMMU (Freescale PAMU) mapping for
> > devices which are directly assigned using VFIO.
> >
> > Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> > ---
> >  arch/powerpc/include/asm/machdep.h |    8 +++++++
> >  arch/powerpc/include/asm/pci.h     |    2 +
> >  arch/powerpc/kernel/msi.c          |   18 ++++++++++++++++
> >  arch/powerpc/sysdev/fsl_msi.c      |   39 +++++++++++++++++++++++++++++++++--
> >  arch/powerpc/sysdev/fsl_msi.h      |   11 ++++++++-
> >  drivers/pci/msi.c                  |   26 ++++++++++++++++++++++++
> >  include/linux/msi.h                |    8 +++++++
> >  include/linux/pci.h                |   13 ++++++++++++
> >  8 files changed, 120 insertions(+), 5 deletions(-)
> >
> > ...
> 
> > diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index
> > aca7578..6d85c15 100644
> > --- a/drivers/pci/msi.c
> > +++ b/drivers/pci/msi.c
> > @@ -30,6 +30,20 @@ static int pci_msi_enable = 1;
> >
> >  /* Arch hooks */
> >
> > +#ifndef arch_msi_get_region_count
> > +int arch_msi_get_region_count(void)
> > +{
> > +	return 0;
> > +}
> > +#endif
> > +
> > +#ifndef arch_msi_get_region
> > +int arch_msi_get_region(int region_num, struct msi_region *region) {
> > +	return 0;
> > +}
> > +#endif
> 
> This #define strategy is gone; see 4287d824 ("PCI: use weak functions for MSI
> arch-specific functions").  Please use the weak function strategy for your new
> MSI region functions.

ok

> 
> > +
> >  #ifndef arch_msi_check_device
> >  int arch_msi_check_device(struct pci_dev *dev, int nvec, int type)  {
> > @@ -903,6 +917,18 @@ void pci_disable_msi(struct pci_dev *dev)  }
> > EXPORT_SYMBOL(pci_disable_msi);
> >
> > +int msi_get_region_count(void)
> > +{
> > +	return arch_msi_get_region_count();
> > +}
> > +EXPORT_SYMBOL(msi_get_region_count);
> > +
> > +int msi_get_region(int region_num, struct msi_region *region) {
> > +	return arch_msi_get_region(region_num, region); }
> > +EXPORT_SYMBOL(msi_get_region);
> 
> Please split these interface additions, i.e., the drivers/pci/msi.c,
> include/linux/msi.h, and include/linux/pci.h changes, into a separate patch.

ok

> 
> I don't know enough about VFIO to understand why these new interfaces are
> needed.  Is this the first VFIO IOMMU driver?  I see vfio_iommu_spapr_tce.c and
> vfio_iommu_type1.c but I don't know if they're comparable to the Freescale PAMU.
> Do other VFIO IOMMU implementations support MSI?  If so, do they handle the
> problem of mapping the MSI regions in a different way?

PAMU is an aperture type of IOMMU while other are paging type, So they are completely different from what PAMU is and handle that differently.

> 
> >  /**
> >   * pci_msix_table_size - return the number of device's MSI-X table entries
> >   * @dev: pointer to the pci_dev data structure of MSI-X device
> > function diff --git a/include/linux/msi.h b/include/linux/msi.h index
> > ee66f3a..ae32601 100644
> > --- a/include/linux/msi.h
> > +++ b/include/linux/msi.h
> > @@ -50,6 +50,12 @@ struct msi_desc {
> >  	struct kobject kobj;
> >  };
> >
> > +struct msi_region {
> > +	int region_num;
> > +	dma_addr_t addr;
> > +	size_t size;
> > +};
> 
> This needs some sort of explanatory comment.

Ok

-Bharat

> 
> >  /*
> >   * The arch hook for setup up msi irqs
> >   */
> > @@ -58,5 +64,7 @@ void arch_teardown_msi_irq(unsigned int irq);  int
> > arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);  void
> > arch_teardown_msi_irqs(struct pci_dev *dev);  int
> > arch_msi_check_device(struct pci_dev* dev, int nvec, int type);
> > +int arch_msi_get_region_count(void);
> > +int arch_msi_get_region(int region_num, struct msi_region *region);
> >
> >  #endif /* LINUX_MSI_H */
> > diff --git a/include/linux/pci.h b/include/linux/pci.h index
> > 186540d..2b26a59 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -1126,6 +1126,7 @@ struct msix_entry {
> >  	u16	entry;	/* driver uses to specify entry, OS writes */
> >  };
> >
> > +struct msi_region;
> >
> >  #ifndef CONFIG_PCI_MSI
> >  static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned
> > int nvec) @@ -1168,6 +1169,16 @@ static inline int
> > pci_msi_enabled(void)  {
> >  	return 0;
> >  }
> > +
> > +static inline int msi_get_region_count(void) {
> > +	return 0;
> > +}
> > +
> > +static inline int msi_get_region(int region_num, struct msi_region
> > +*region) {
> > +	return 0;
> > +}
> >  #else
> >  int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec);
> > int pci_enable_msi_block_auto(struct pci_dev *dev, unsigned int
> > *maxvec); @@ -1180,6 +1191,8 @@ void pci_disable_msix(struct pci_dev
> > *dev);  void msi_remove_pci_irq_vectors(struct pci_dev *dev);  void
> > pci_restore_msi_state(struct pci_dev *dev);  int
> > pci_msi_enabled(void);
> > +int msi_get_region_count(void);
> > +int msi_get_region(int region_num, struct msi_region *region);
> >  #endif
> >
> >  #ifdef CONFIG_PCIEPORTBUS
> > --
> > 1.7.0.4
> >
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-pci"
> > in the body of a message to majordomo@vger.kernel.org More majordomo
> > info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body
> of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas - Oct. 8, 2013, 4:47 p.m.
On Thu, Oct 3, 2013 at 11:19 PM, Bhushan Bharat-R65777
<R65777@freescale.com> wrote:

>> I don't know enough about VFIO to understand why these new interfaces are
>> needed.  Is this the first VFIO IOMMU driver?  I see vfio_iommu_spapr_tce.c and
>> vfio_iommu_type1.c but I don't know if they're comparable to the Freescale PAMU.
>> Do other VFIO IOMMU implementations support MSI?  If so, do they handle the
>> problem of mapping the MSI regions in a different way?
>
> PAMU is an aperture type of IOMMU while other are paging type, So they are completely different from what PAMU is and handle that differently.

This is not an explanation or a justification for adding new
interfaces.  I still have no idea what an "aperture type IOMMU" is,
other than that it is "different."  But I see that Alex is working on
this issue with you in a different thread, so I'm sure you guys will
sort it out.

Bjorn
Joerg Roedel - Oct. 8, 2013, 5:02 p.m.
On Tue, Oct 08, 2013 at 10:47:49AM -0600, Bjorn Helgaas wrote:
> I still have no idea what an "aperture type IOMMU" is,
> other than that it is "different."

An aperture based IOMMU is basically any GART-like IOMMU which can only
remap a small window (the aperture) of the DMA address space. DMA
outside of that window is either blocked completly or passed through
untranslated.


	Joerg
Scott Wood - Oct. 8, 2013, 5:09 p.m.
On Tue, 2013-10-08 at 10:47 -0600, Bjorn Helgaas wrote:
> On Thu, Oct 3, 2013 at 11:19 PM, Bhushan Bharat-R65777
> <R65777@freescale.com> wrote:
> 
> >> I don't know enough about VFIO to understand why these new interfaces are
> >> needed.  Is this the first VFIO IOMMU driver?  I see vfio_iommu_spapr_tce.c and
> >> vfio_iommu_type1.c but I don't know if they're comparable to the Freescale PAMU.
> >> Do other VFIO IOMMU implementations support MSI?  If so, do they handle the
> >> problem of mapping the MSI regions in a different way?
> >
> > PAMU is an aperture type of IOMMU while other are paging type, So they are completely different from what PAMU is and handle that differently.
> 
> This is not an explanation or a justification for adding new
> interfaces.  I still have no idea what an "aperture type IOMMU" is,
> other than that it is "different."  But I see that Alex is working on
> this issue with you in a different thread, so I'm sure you guys will
> sort it out.

PAMU is a very constrained IOMMU that cannot do arbitrary page mappings.
Due to these constraints, we cannot map the MSI I/O page at its normal
address while also mapping RAM at the address we want.  The address we
can map it at depends on the addresses of other mappings, so it can't be
hidden in the IOMMU driver -- the user needs to be in control.

Another difference is that (if I understand correctly) PCs handle MSIs
specially, via interrupt remapping, rather than being translated as a
normal memory access through the IOMMU.

-Scott
Bharat Bhushan - Oct. 8, 2013, 5:09 p.m.
> -----Original Message-----
> From: joro@8bytes.org [mailto:joro@8bytes.org]
> Sent: Tuesday, October 08, 2013 10:32 PM
> To: Bjorn Helgaas
> Cc: Bhushan Bharat-R65777; alex.williamson@redhat.com; benh@kernel.crashing.org;
> galak@kernel.crashing.org; linux-kernel@vger.kernel.org; linuxppc-
> dev@lists.ozlabs.org; linux-pci@vger.kernel.org; agraf@suse.de; Wood Scott-
> B07421; iommu@lists.linux-foundation.org
> Subject: Re: [PATCH 1/7] powerpc: Add interface to get msi region information
> 
> On Tue, Oct 08, 2013 at 10:47:49AM -0600, Bjorn Helgaas wrote:
> > I still have no idea what an "aperture type IOMMU" is, other than that
> > it is "different."
> 
> An aperture based IOMMU is basically any GART-like IOMMU which can only remap a
> small window (the aperture) of the DMA address space. DMA outside of that window
> is either blocked completly or passed through untranslated.

It is completely blocked for Freescale PAMU. 
So for this type of iommu what we have to do is to create a MSI mapping just after guest physical address, Example: guest have a 512M of memory then we create window of 1G (because of power of 2 requirement), then we have to FIT MSI just after 512M of guest.
And for that we need
	1) to know the physical address of MSI's in interrupt controller (for that this patch was all about of).

	2) When guest enable MSI interrupt then we write MSI-address and MSI-DATA in device. The discussion with Alex Williamson is about that interface.

Thanks
-Bharat

> 
> 
> 	Joerg
> 
>
Scott Wood - Oct. 8, 2013, 10:57 p.m.
On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> @@ -376,6 +405,7 @@ static int fsl_of_msi_probe(struct platform_device *dev)
>  	int len;
>  	u32 offset;
>  	static const u32 all_avail[] = { 0, NR_MSI_IRQS };
> +	static int bank_index;
>  
>  	match = of_match_device(fsl_of_msi_ids, &dev->dev);
>  	if (!match)
> @@ -419,8 +449,8 @@ static int fsl_of_msi_probe(struct platform_device *dev)
>  				dev->dev.of_node->full_name);
>  			goto error_out;
>  		}
> -		msi->msiir_offset =
> -			features->msiir_offset + (res.start & 0xfffff);
> +		msi->msiir = res.start + features->msiir_offset;
> +		printk("msi->msiir = %llx\n", msi->msiir);

dev_dbg or remove

>  	}
>  
>  	msi->feature = features->fsl_pic_ip;
> @@ -470,6 +500,7 @@ static int fsl_of_msi_probe(struct platform_device *dev)
>  		}
>  	}
>  
> +	msi->bank_index = bank_index++;

What if multiple MSIs are boing probed in parallel?  bank_index is not
atomic.

> diff --git a/arch/powerpc/sysdev/fsl_msi.h b/arch/powerpc/sysdev/fsl_msi.h
> index 8225f86..6bd5cfc 100644
> --- a/arch/powerpc/sysdev/fsl_msi.h
> +++ b/arch/powerpc/sysdev/fsl_msi.h
> @@ -29,12 +29,19 @@ struct fsl_msi {
>  	struct irq_domain *irqhost;
>  
>  	unsigned long cascade_irq;
> -
> -	u32 msiir_offset; /* Offset of MSIIR, relative to start of CCSR */
> +	dma_addr_t msiir; /* MSIIR Address in CCSR */

Are you sure dma_addr_t is right here, versus phys_addr_t?  It implies
that it's the output of the DMA API, but I don't think the DMA API is
used in the MSI driver.  Perhaps it should be, but we still want the raw
physical address to pass on to VFIO.

>  	void __iomem *msi_regs;
>  	u32 feature;
>  	int msi_virqs[NR_MSI_REG];
>  
> +	/*
> +	 * During probe each bank is assigned a index number.
> +	 * index number ranges from 0 to 2^32.
> +	 * Example  MSI bank 1 = 0
> +	 * MSI bank 2 = 1, and so on.
> +	 */
> +	int bank_index;

2^32 doesn't fit in "int" (nor does 2^32 - 1).

Just say that indices start at 0.

-Scott
Bjorn Helgaas - Oct. 8, 2013, 11:25 p.m.
>> -     u32 msiir_offset; /* Offset of MSIIR, relative to start of CCSR */
>> +     dma_addr_t msiir; /* MSIIR Address in CCSR */
>
> Are you sure dma_addr_t is right here, versus phys_addr_t?  It implies
> that it's the output of the DMA API, but I don't think the DMA API is
> used in the MSI driver.  Perhaps it should be, but we still want the raw
> physical address to pass on to VFIO.

I don't know what "msiir" is used for, but if it's an address you
program into a PCI device, then it's a dma_addr_t even if you didn't
get it from the DMA API.  Maybe "bus_addr_t" would have been a more
suggestive name than "dma_addr_t".  That said, I have no idea how this
relates to VFIO.

Bjorn
Scott Wood - Oct. 8, 2013, 11:35 p.m.
On Tue, 2013-10-08 at 17:25 -0600, Bjorn Helgaas wrote:
> >> -     u32 msiir_offset; /* Offset of MSIIR, relative to start of CCSR */
> >> +     dma_addr_t msiir; /* MSIIR Address in CCSR */
> >
> > Are you sure dma_addr_t is right here, versus phys_addr_t?  It implies
> > that it's the output of the DMA API, but I don't think the DMA API is
> > used in the MSI driver.  Perhaps it should be, but we still want the raw
> > physical address to pass on to VFIO.
> 
> I don't know what "msiir" is used for, but if it's an address you
> program into a PCI device, then it's a dma_addr_t even if you didn't
> get it from the DMA API.  Maybe "bus_addr_t" would have been a more
> suggestive name than "dma_addr_t".  That said, I have no idea how this
> relates to VFIO.

It's a bit awkward because it gets used both as something to program
into a PCI device (and it's probably a bug that the DMA API doesn't get
used), and also (if I understand the current plans correctly) as a
physical address to give to VFIO to be a destination address in an IOMMU
mapping.  So I think the value we keep here should be a phys_addr_t (it
comes straight from the MMIO address in the device tree), which gets
trivially turned into a dma_addr_t by the non-VFIO code path because
there's currently no translation there.

-Scott
Bharat Bhushan - Oct. 9, 2013, 4:47 a.m.
> -----Original Message-----
> From: Wood Scott-B07421
> Sent: Wednesday, October 09, 2013 4:27 AM
> To: Bhushan Bharat-R65777
> Cc: alex.williamson@redhat.com; joro@8bytes.org; benh@kernel.crashing.org;
> galak@kernel.crashing.org; linux-kernel@vger.kernel.org; linuxppc-
> dev@lists.ozlabs.org; linux-pci@vger.kernel.org; agraf@suse.de;
> iommu@lists.linux-foundation.org; Bhushan Bharat-R65777
> Subject: Re: [PATCH 1/7] powerpc: Add interface to get msi region information
> 
> On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> > @@ -376,6 +405,7 @@ static int fsl_of_msi_probe(struct platform_device *dev)
> >  	int len;
> >  	u32 offset;
> >  	static const u32 all_avail[] = { 0, NR_MSI_IRQS };
> > +	static int bank_index;
> >
> >  	match = of_match_device(fsl_of_msi_ids, &dev->dev);
> >  	if (!match)
> > @@ -419,8 +449,8 @@ static int fsl_of_msi_probe(struct platform_device *dev)
> >  				dev->dev.of_node->full_name);
> >  			goto error_out;
> >  		}
> > -		msi->msiir_offset =
> > -			features->msiir_offset + (res.start & 0xfffff);
> > +		msi->msiir = res.start + features->msiir_offset;
> > +		printk("msi->msiir = %llx\n", msi->msiir);
> 
> dev_dbg or remove

Oops, sorry it was leftover of debugging :(

> 
> >  	}
> >
> >  	msi->feature = features->fsl_pic_ip; @@ -470,6 +500,7 @@ static int
> > fsl_of_msi_probe(struct platform_device *dev)
> >  		}
> >  	}
> >
> > +	msi->bank_index = bank_index++;
> 
> What if multiple MSIs are boing probed in parallel?

Ohh, I have not thought that it can be called in parallel

>  bank_index is not atomic.

Will declare bank_intex as atomic_t and use atomic_inc_return(&bank_index)

> 
> > diff --git a/arch/powerpc/sysdev/fsl_msi.h
> > b/arch/powerpc/sysdev/fsl_msi.h index 8225f86..6bd5cfc 100644
> > --- a/arch/powerpc/sysdev/fsl_msi.h
> > +++ b/arch/powerpc/sysdev/fsl_msi.h
> > @@ -29,12 +29,19 @@ struct fsl_msi {
> >  	struct irq_domain *irqhost;
> >
> >  	unsigned long cascade_irq;
> > -
> > -	u32 msiir_offset; /* Offset of MSIIR, relative to start of CCSR */
> > +	dma_addr_t msiir; /* MSIIR Address in CCSR */
> 
> Are you sure dma_addr_t is right here, versus phys_addr_t?  It implies that it's
> the output of the DMA API, but I don't think the DMA API is used in the MSI
> driver.  Perhaps it should be, but we still want the raw physical address to
> pass on to VFIO.

Looking through the conversation I will make this phys_addr_t

> 
> >  	void __iomem *msi_regs;
> >  	u32 feature;
> >  	int msi_virqs[NR_MSI_REG];
> >
> > +	/*
> > +	 * During probe each bank is assigned a index number.
> > +	 * index number ranges from 0 to 2^32.
> > +	 * Example  MSI bank 1 = 0
> > +	 * MSI bank 2 = 1, and so on.
> > +	 */
> > +	int bank_index;
> 
> 2^32 doesn't fit in "int" (nor does 2^32 - 1).

Right :(

> 
> Just say that indices start at 0.

Will correct this

Thanks
-Bharat

> 
> -Scott
>
Sethi Varun-B16395 - Oct. 10, 2013, 9:13 p.m.
> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> owner@vger.kernel.org] On Behalf Of Bhushan Bharat-R65777
> Sent: Tuesday, October 08, 2013 10:40 PM
> To: joro@8bytes.org; Bjorn Helgaas
> Cc: alex.williamson@redhat.com; benh@kernel.crashing.org;
> galak@kernel.crashing.org; linux-kernel@vger.kernel.org; linuxppc-
> dev@lists.ozlabs.org; linux-pci@vger.kernel.org; agraf@suse.de; Wood
> Scott-B07421; iommu@lists.linux-foundation.org
> Subject: RE: [PATCH 1/7] powerpc: Add interface to get msi region
> information
> 
> 
> 
> > -----Original Message-----
> > From: joro@8bytes.org [mailto:joro@8bytes.org]
> > Sent: Tuesday, October 08, 2013 10:32 PM
> > To: Bjorn Helgaas
> > Cc: Bhushan Bharat-R65777; alex.williamson@redhat.com;
> > benh@kernel.crashing.org; galak@kernel.crashing.org;
> > linux-kernel@vger.kernel.org; linuxppc- dev@lists.ozlabs.org;
> > linux-pci@vger.kernel.org; agraf@suse.de; Wood Scott- B07421;
> > iommu@lists.linux-foundation.org
> > Subject: Re: [PATCH 1/7] powerpc: Add interface to get msi region
> > information
> >
> > On Tue, Oct 08, 2013 at 10:47:49AM -0600, Bjorn Helgaas wrote:
> > > I still have no idea what an "aperture type IOMMU" is, other than
> > > that it is "different."
> >
> > An aperture based IOMMU is basically any GART-like IOMMU which can
> > only remap a small window (the aperture) of the DMA address space. DMA
> > outside of that window is either blocked completly or passed through
> untranslated.
> 
> It is completely blocked for Freescale PAMU.
> So for this type of iommu what we have to do is to create a MSI mapping
> just after guest physical address, Example: guest have a 512M of memory
> then we create window of 1G (because of power of 2 requirement), then we
> have to FIT MSI just after 512M of guest.

[Sethi Varun-B16395] PAMU (FSL IOMMU) has a concept of primary window and subwindows. Primary window corresponds to the complete guest iova address space (including MSI space), with respect to IOMMU_API this is termed as geometry . IOVA Base of subwindow is determined from the number of subwindows (configurable using iommu API). Subwindows allow for handling physically discontiguous memory. PAMU translates device iova accesses to actual physical address. MSI mapping would be addressed by a subwindow, with iova base starting at the end of the guest iova space. 

VFIO code creates a PAMU  window (also defines number of subwindow) to map the guest iova space + msi space. The interface defined by this patch queries the PAMU driver to get the iova mapping for the msi region assigned to the PCIe device (assigned to the guest).

-Varun

Patch

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 8b48090..8d1b787 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -30,6 +30,7 @@  struct file;
 struct pci_controller;
 struct kimage;
 struct pci_host_bridge;
+struct msi_region;
 
 struct machdep_calls {
 	char		*name;
@@ -124,6 +125,13 @@  struct machdep_calls {
 	int		(*setup_msi_irqs)(struct pci_dev *dev,
 					  int nvec, int type);
 	void		(*teardown_msi_irqs)(struct pci_dev *dev);
+
+	/* Returns the number of MSI regions (banks) */
+	int		(*msi_get_region_count)(void);
+
+	/* Returns the requested region's address and size */
+	int		(*msi_get_region)(int region_num,
+					  struct msi_region *region);
 #endif
 
 	void		(*restart)(char *cmd);
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 6653f27..e575349 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -117,6 +117,8 @@  extern int pci_proc_domain(struct pci_bus *bus);
 #define arch_setup_msi_irqs arch_setup_msi_irqs
 #define arch_teardown_msi_irqs arch_teardown_msi_irqs
 #define arch_msi_check_device arch_msi_check_device
+#define arch_msi_get_region_count arch_msi_get_region_count
+#define arch_msi_get_region arch_msi_get_region
 
 struct vm_area_struct;
 /* Map a range of PCI memory or I/O space for a device into user space */
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index 8bbc12d..1a67787 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -13,6 +13,24 @@ 
 
 #include <asm/machdep.h>
 
+int arch_msi_get_region_count(void)
+{
+	if (ppc_md.msi_get_region_count) {
+		pr_debug("msi: Using platform get_region_count routine.\n");
+		return ppc_md.msi_get_region_count();
+	}
+	return 0;
+}
+
+int arch_msi_get_region(int region_num, struct msi_region *region)
+{
+	if (ppc_md.msi_get_region) {
+		pr_debug("msi: Using platform get_region routine.\n");
+		return ppc_md.msi_get_region(region_num, region);
+	}
+	return 0;
+}
+
 int arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
 {
 	if (!ppc_md.setup_msi_irqs || !ppc_md.teardown_msi_irqs) {
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index ab02db3..ed045cb 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -96,6 +96,34 @@  static int fsl_msi_init_allocator(struct fsl_msi *msi_data)
 	return 0;
 }
 
+static int fsl_msi_get_region_count(void)
+{
+	int count = 0;
+	struct fsl_msi *msi_data;
+
+	list_for_each_entry(msi_data, &msi_head, list)
+		count++;
+
+	return count;
+}
+
+static int fsl_msi_get_region(int region_num, struct msi_region *region)
+{
+	struct fsl_msi *msi_data;
+
+	list_for_each_entry(msi_data, &msi_head, list) {
+		if (msi_data->bank_index == region_num) {
+			region->region_num = msi_data->bank_index;
+			/* Setting PAGE_SIZE as MSIIR is a 4 byte register */
+			region->size = PAGE_SIZE;
+			region->addr = msi_data->msiir & ~(region->size - 1);
+			return 0;
+		}
+	}
+
+	return -ENODEV;
+}
+
 static int fsl_msi_check_device(struct pci_dev *pdev, int nvec, int type)
 {
 	if (type == PCI_CAP_ID_MSIX)
@@ -137,7 +165,8 @@  static void fsl_compose_msi_msg(struct pci_dev *pdev, int hwirq,
 	if (reg && (len == sizeof(u64)))
 		address = be64_to_cpup(reg);
 	else
-		address = fsl_pci_immrbar_base(hose) + msi_data->msiir_offset;
+		address = fsl_pci_immrbar_base(hose) +
+			   (msi_data->msiir & 0xfffff);
 
 	msg->address_lo = lower_32_bits(address);
 	msg->address_hi = upper_32_bits(address);
@@ -376,6 +405,7 @@  static int fsl_of_msi_probe(struct platform_device *dev)
 	int len;
 	u32 offset;
 	static const u32 all_avail[] = { 0, NR_MSI_IRQS };
+	static int bank_index;
 
 	match = of_match_device(fsl_of_msi_ids, &dev->dev);
 	if (!match)
@@ -419,8 +449,8 @@  static int fsl_of_msi_probe(struct platform_device *dev)
 				dev->dev.of_node->full_name);
 			goto error_out;
 		}
-		msi->msiir_offset =
-			features->msiir_offset + (res.start & 0xfffff);
+		msi->msiir = res.start + features->msiir_offset;
+		printk("msi->msiir = %llx\n", msi->msiir);
 	}
 
 	msi->feature = features->fsl_pic_ip;
@@ -470,6 +500,7 @@  static int fsl_of_msi_probe(struct platform_device *dev)
 		}
 	}
 
+	msi->bank_index = bank_index++;
 	list_add_tail(&msi->list, &msi_head);
 
 	/* The multiple setting ppc_md.setup_msi_irqs will not harm things */
@@ -477,6 +508,8 @@  static int fsl_of_msi_probe(struct platform_device *dev)
 		ppc_md.setup_msi_irqs = fsl_setup_msi_irqs;
 		ppc_md.teardown_msi_irqs = fsl_teardown_msi_irqs;
 		ppc_md.msi_check_device = fsl_msi_check_device;
+		ppc_md.msi_get_region_count = fsl_msi_get_region_count;
+		ppc_md.msi_get_region = fsl_msi_get_region;
 	} else if (ppc_md.setup_msi_irqs != fsl_setup_msi_irqs) {
 		dev_err(&dev->dev, "Different MSI driver already installed!\n");
 		err = -ENODEV;
diff --git a/arch/powerpc/sysdev/fsl_msi.h b/arch/powerpc/sysdev/fsl_msi.h
index 8225f86..6bd5cfc 100644
--- a/arch/powerpc/sysdev/fsl_msi.h
+++ b/arch/powerpc/sysdev/fsl_msi.h
@@ -29,12 +29,19 @@  struct fsl_msi {
 	struct irq_domain *irqhost;
 
 	unsigned long cascade_irq;
-
-	u32 msiir_offset; /* Offset of MSIIR, relative to start of CCSR */
+	dma_addr_t msiir; /* MSIIR Address in CCSR */
 	void __iomem *msi_regs;
 	u32 feature;
 	int msi_virqs[NR_MSI_REG];
 
+	/*
+	 * During probe each bank is assigned a index number.
+	 * index number ranges from 0 to 2^32.
+	 * Example  MSI bank 1 = 0
+	 * MSI bank 2 = 1, and so on.
+	 */
+	int bank_index;
+
 	struct msi_bitmap bitmap;
 
 	struct list_head list;          /* support multiple MSI banks */
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index aca7578..6d85c15 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -30,6 +30,20 @@  static int pci_msi_enable = 1;
 
 /* Arch hooks */
 
+#ifndef arch_msi_get_region_count
+int arch_msi_get_region_count(void)
+{
+	return 0;
+}
+#endif
+
+#ifndef arch_msi_get_region
+int arch_msi_get_region(int region_num, struct msi_region *region)
+{
+	return 0;
+}
+#endif
+
 #ifndef arch_msi_check_device
 int arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
 {
@@ -903,6 +917,18 @@  void pci_disable_msi(struct pci_dev *dev)
 }
 EXPORT_SYMBOL(pci_disable_msi);
 
+int msi_get_region_count(void)
+{
+	return arch_msi_get_region_count();
+}
+EXPORT_SYMBOL(msi_get_region_count);
+
+int msi_get_region(int region_num, struct msi_region *region)
+{
+	return arch_msi_get_region(region_num, region);
+}
+EXPORT_SYMBOL(msi_get_region);
+
 /**
  * pci_msix_table_size - return the number of device's MSI-X table entries
  * @dev: pointer to the pci_dev data structure of MSI-X device function
diff --git a/include/linux/msi.h b/include/linux/msi.h
index ee66f3a..ae32601 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -50,6 +50,12 @@  struct msi_desc {
 	struct kobject kobj;
 };
 
+struct msi_region {
+	int region_num;
+	dma_addr_t addr;
+	size_t size;
+};
+
 /*
  * The arch hook for setup up msi irqs
  */
@@ -58,5 +64,7 @@  void arch_teardown_msi_irq(unsigned int irq);
 int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void arch_teardown_msi_irqs(struct pci_dev *dev);
 int arch_msi_check_device(struct pci_dev* dev, int nvec, int type);
+int arch_msi_get_region_count(void);
+int arch_msi_get_region(int region_num, struct msi_region *region);
 
 #endif /* LINUX_MSI_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 186540d..2b26a59 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1126,6 +1126,7 @@  struct msix_entry {
 	u16	entry;	/* driver uses to specify entry, OS writes */
 };
 
+struct msi_region;
 
 #ifndef CONFIG_PCI_MSI
 static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
@@ -1168,6 +1169,16 @@  static inline int pci_msi_enabled(void)
 {
 	return 0;
 }
+
+static inline int msi_get_region_count(void)
+{
+	return 0;
+}
+
+static inline int msi_get_region(int region_num, struct msi_region *region)
+{
+	return 0;
+}
 #else
 int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec);
 int pci_enable_msi_block_auto(struct pci_dev *dev, unsigned int *maxvec);
@@ -1180,6 +1191,8 @@  void pci_disable_msix(struct pci_dev *dev);
 void msi_remove_pci_irq_vectors(struct pci_dev *dev);
 void pci_restore_msi_state(struct pci_dev *dev);
 int pci_msi_enabled(void);
+int msi_get_region_count(void);
+int msi_get_region(int region_num, struct msi_region *region);
 #endif
 
 #ifdef CONFIG_PCIEPORTBUS