[2/2] NTB: PCI Quirk to Enable Switchtec NT Functionality with IOMMU On

Message ID 1526558413-23113-3-git-send-email-dmeyer@gigaio.com
State Superseded
Delegated to: Bjorn Helgaas
Headers show
Series
  • PCI Quirk Patchset for Microsemi Switchtec NTB
Related show

Commit Message

Doug Meyer May 17, 2018, noon
From: Doug Meyer <dmeyer@gigaio.com>

Here we add the PCI quirk for the Microsemi Switchtec parts to allow
non-transparent bridging to work when the IOMMU is turned on.

When a requestor on one NT endpoint accesses memory on another NT
endpoint, it does this via a devfn proxy ID. Proxy IDs are uniquely
assigned on a per-requestor basis within the NT hardware, and are not
visible via PCI enumeration. As a result, a memory access from a peer
NT endpoint will have an unrecognized requestor ID devfn which the
IOMMU will reject.

The quirk introduced here interrogates each configured remote NT
endpoint at runtime to obtain the proxy IDs that have been assigned,
and aliases every proxy ID to the local (enumerated) NT endpoint's
device. The IOMMU then accepts the remote proxy IDs as if they were
requests coming directly from the enumerated endpoint, giving remote
requestors access to memory resources which a given host has made
available. Operation with the IOMMU off is unchanged by this quirk.

Signed-off-by: Doug Meyer <dmeyer@gigaio.com>
---
 drivers/pci/quirks.c | 196 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 196 insertions(+)

Comments

Bjorn Helgaas May 17, 2018, 1:45 p.m. | #1
On Thu, May 17, 2018 at 05:00:13AM -0700, dmeyer@gigaio.com wrote:
> From: Doug Meyer <dmeyer@gigaio.com>
> 
> Here we add the PCI quirk for the Microsemi Switchtec parts to allow
> non-transparent bridging to work when the IOMMU is turned on.

I'm not an NTB expert, but it *sounds* like you're specifically fixing
DMA initiated by an NT endpoint.  I assume other aspects of
non-transparent bridging, e.g., routing of config accesses,
interrupts, doorbells, etc., might work even without this quirk.

> When a requestor on one NT endpoint accesses memory on another NT
> endpoint, it does this via a devfn proxy ID. Proxy IDs are uniquely
> assigned on a per-requestor basis within the NT hardware, and are not
> visible via PCI enumeration. As a result, a memory access from a peer
> NT endpoint will have an unrecognized requestor ID devfn which the
> IOMMU will reject.

It would be very helpful if you could include a reference to the
relevant section of a publicly available spec.

> The quirk introduced here interrogates each configured remote NT
> endpoint at runtime to obtain the proxy IDs that have been assigned,
> and aliases every proxy ID to the local (enumerated) NT endpoint's
> device. The IOMMU then accepts the remote proxy IDs as if they were
> requests coming directly from the enumerated endpoint, giving remote
> requestors access to memory resources which a given host has made
> available. Operation with the IOMMU off is unchanged by this quirk.

Who assigns these proxy IDs?  Quirks run before drivers claim the
device, so it looks like this assumes the proxy ID assignments are
either hard-wired into the device or programmed by firmware.  If the
latter, how would they be programmed for hot-added NTBs?

I'm wondering if all this could or should be done in the switchtec
driver instead of in a quirk.  But I really don't know how that driver
works.

> Signed-off-by: Doug Meyer <dmeyer@gigaio.com>
> ---
>  drivers/pci/quirks.c | 196 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 196 insertions(+)
> 
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index 2990ad1..c7e27b3 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -27,6 +27,7 @@
>  #include <linux/mm.h>
>  #include <linux/platform_data/x86/apple.h>
>  #include <linux/pm_runtime.h>
> +#include <linux/switchtec.h>
>  #include <asm/dma.h>	/* isa_dma_bridge_buggy */
>  #include "pci.h"
>  
> @@ -4741,3 +4742,198 @@ static void quirk_gpu_hda(struct pci_dev *hda)
>  			      PCI_CLASS_MULTIMEDIA_HD_AUDIO, 8, quirk_gpu_hda);
>  DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
>  			      PCI_CLASS_MULTIMEDIA_HD_AUDIO, 8, quirk_gpu_hda);
> +
> +/*
> + * Microsemi Switchtec NTB uses devfn proxy IDs to move TLPs between
> + * NT endpoints via the internal switch fabric. These IDs replace the
> + * originating requestor ID TLPs which access host memory on peer NTB
> + * ports. Therefore, all proxy IDs must be aliased to the NTB device
> + * to permit access when the IOMMU is turned on.
> + */
> +static void quirk_switchtec_ntb_dma_alias(struct pci_dev *pdev)
> +{
> +	void __iomem *mmio = NULL;

Don't initialize variables unless there's a path through the
code that would otherwise use an uninitialized value.

> +	struct ntb_info_regs __iomem *mmio_ntb = NULL;
> +	struct ntb_ctrl_regs __iomem *mmio_ctrl = NULL;
> +	struct sys_info_regs __iomem *mmio_sys_info = NULL;
> +

No blank line here.

> +	u64 partition_map;
> +	u8 partition;
> +	int pp;
> +
> +	if (pci_enable_device(pdev)) {
> +		dev_err(&pdev->dev, "Cannot enable Switchtec device\n");

Use "pci_err(pdev, ...)" here and below.

> +		return;
> +	}
> +
> +	mmio = pci_iomap(pdev, 0, 0);
> +	if (mmio == NULL) {
> +		dev_err(&pdev->dev, "Cannot iomap Switchtec device\n");
> +		return;
> +	}
> +
> +	dev_info(&pdev->dev, "Setting Switchtec proxy ID aliases\n");
> +
> +	mmio_ntb = mmio + SWITCHTEC_GAS_NTB_OFFSET;
> +	mmio_ctrl = (void * __iomem) mmio_ntb + SWITCHTEC_NTB_REG_CTRL_OFFSET;
> +	mmio_sys_info = mmio + SWITCHTEC_GAS_SYS_INFO_OFFSET;
> +
> +	partition = ioread8(&mmio_ntb->partition_id);
> +
> +	partition_map = (u64) ioread32((void * __iomem) &mmio_ntb->ep_map);
> +	partition_map |=
> +		((u64) ioread32((void * __iomem) &mmio_ntb->ep_map + 4)) << 32;
> +	partition_map &= ~(1ULL << partition);
> +
> +	for (pp = 0; pp < (sizeof(partition_map) * 8); pp++) {
> +		struct ntb_ctrl_regs __iomem *mmio_peer_ctrl;
> +		u32 table_sz = 0;
> +		int te;
> +
> +		if (!(partition_map & (1ULL << pp)))
> +			continue;
> +
> +		dev_dbg(&pdev->dev, "Processing partition %d\n", pp);
> +
> +		mmio_peer_ctrl = &mmio_ctrl[pp];
> +
> +		table_sz = ioread16(&mmio_peer_ctrl->req_id_table_size);
> +		if (!table_sz) {
> +			dev_warn(&pdev->dev, "Partition %d table_sz 0\n", pp);
> +			continue;
> +		}
> +
> +		if (table_sz > 512) {
> +			dev_warn(&pdev->dev,
> +				 "Invalid Switchtec partition %d table_sz %d\n",
> +				 pp, table_sz);
> +			continue;
> +		}
> +
> +		for (te = 0; te < table_sz; te++) {
> +			u32 rid_entry;
> +			u8 devfn;
> +
> +			rid_entry = ioread32(&mmio_peer_ctrl->req_id_table[te]);
> +			devfn = (rid_entry >> 1) & 0xFF;
> +			dev_dbg(&pdev->dev,
> +				"Aliasing Partition %d Proxy ID %02d.%d\n",
> +				pp, PCI_SLOT(devfn), PCI_FUNC(devfn));
> +			pci_add_dma_alias(pdev, devfn);
> +		}
> +	}
> +
> +	pci_iounmap(pdev, mmio);

I think you should probably disable the device here (and in the error
return path) to balance the enable above.

> +}
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFX24XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFX32XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFX48XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFX64XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFX80XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFX96XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PSX48XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PSX64XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PSX80XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PSX96XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PAX24XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PAX32XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PAX48XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PAX64XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PAX80XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PAX96XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXL24XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXL32XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXL48XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXL64XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXL80XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXL96XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXI24XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXI32XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXI48XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXI64XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXI80XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
> +			      PCI_DEVICE_ID_MICROSEMI_PFXI96XG3,
> +			      PCI_CLASS_BRIDGE_OTHER, 8,
> +			      quirk_switchtec_ntb_dma_alias);
> -- 
> 1.8.3.1
>
Logan Gunthorpe May 17, 2018, 3:48 p.m. | #2
Thanks for your hard work on this Doug!

On 17/05/18 06:00 AM, dmeyer@gigaio.com wrote:

> +	if (pci_enable_device(pdev)) {
> +		dev_err(&pdev->dev, "Cannot enable Switchtec device\n");
> +		return;
> +	}

I suspect we should probably cass pci_disable_device() before leaving
this function so it's in the same state we started. Just like we unmap
the mmio.

Logan
Logan Gunthorpe May 17, 2018, 4:06 p.m. | #3
On 17/05/18 07:45 AM, Bjorn Helgaas wrote:
> On Thu, May 17, 2018 at 05:00:13AM -0700, dmeyer@gigaio.com wrote:
>> From: Doug Meyer <dmeyer@gigaio.com>
>>
>> Here we add the PCI quirk for the Microsemi Switchtec parts to allow
>> non-transparent bridging to work when the IOMMU is turned on.
> 
> I'm not an NTB expert, but it *sounds* like you're specifically fixing
> DMA initiated by an NT endpoint.  I assume other aspects of
> non-transparent bridging, e.g., routing of config accesses,
> interrupts, doorbells, etc., might work even without this quirk.

Yes, that's correct.

>> When a requestor on one NT endpoint accesses memory on another NT
>> endpoint, it does this via a devfn proxy ID. Proxy IDs are uniquely
>> assigned on a per-requestor basis within the NT hardware, and are not
>> visible via PCI enumeration. As a result, a memory access from a peer
>> NT endpoint will have an unrecognized requestor ID devfn which the
>> IOMMU will reject.
> 
> It would be very helpful if you could include a reference to the
> relevant section of a publicly available spec.

I'm not aware of any public specs on this. The basic idea is that a peer
accesses a BAR memory window on it's own side and the NTB translates it
to a memory request on the local side. This request has it's requester
ID translated through a LUT seeing the requester ID on the initiating
side isn't valid on the receiving side. The LUT has multiple entries so
that multiple requester IDs can be translated and the responses routed
back to the original requester.

> Who assigns these proxy IDs?  Quirks run before drivers claim the
> device, so it looks like this assumes the proxy ID assignments are
> either hard-wired into the device or programmed by firmware.  If the
> latter, how would they be programmed for hot-added NTBs?

The local side of the requester ID LUT are fixed by the hardware so we
can determine all the possible IDs coming from the NTB device just by
reading some registers. The peer's side of the LUT are programmed with
all possible source requester IDs by the driver but these don't have any
effect on the ID's on the receiving side.

> I'm wondering if all this could or should be done in the switchtec
> driver instead of in a quirk.  But I really don't know how that driver
> works.

We'd like it to be done in the driver too but it seems
pci_add_dma_alias() must be called before the driver is initialized and
therefore in a quirk. Presently, it seems nearly all calls to that
function are in quirks.c for this reason.

Thanks,

Logan
Doug Meyer May 22, 2018, 9:08 p.m. | #4
[resending because I forgot the mailing lists want plain text mode]


Dear Logan,

I agree. I'll add pci_disable_device().

Blessings,
Doug



On Thu, May 17, 2018 at 8:48 AM, Logan Gunthorpe <logang@deltatee.com> wrote:
>
> Thanks for your hard work on this Doug!
>
> On 17/05/18 06:00 AM, dmeyer@gigaio.com wrote:
>
> > +     if (pci_enable_device(pdev)) {
> > +             dev_err(&pdev->dev, "Cannot enable Switchtec device\n");
> > +             return;
> > +     }
>
> I suspect we should probably cass pci_disable_device() before leaving
> this function so it's in the same state we started. Just like we unmap
> the mmio.
>
> Logan
>
> --
> You received this message because you are subscribed to the Google Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to linux-ntb+unsubscribe@googlegroups.com.
> To post to this group, send email to linux-ntb@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/linux-ntb/a0997fec-ba06-0d1d-a43d-fcc1600484f5%40deltatee.com.
> For more options, visit https://groups.google.com/d/optout.
Bjorn Helgaas May 22, 2018, 9:51 p.m. | #5
[+cc Alex]

On Tue, May 22, 2018 at 02:09:59PM -0700, Doug Meyer wrote:
> Logan answered the questions quite thoroughly. (Thanks, Logan!)

When you repost it, please rework the commit log so it answers the
questions directly.  Otherwise the next reader may have the same
questions again.  E.g., say something about how the proxy IDs are not
programmable and are fixed in the hardware so all we have to do is
read them.

I don't think the question of when the aliases need to be added is
quite closed.  Logan said "it seems pci_add_dma_alias() must be called
before the driver is initialized and therefore in a quirk", but that
doesn't make clear *why* the alias needs to be added before the driver
is initialized.  The alias shouldn't be needed until the device does a
DMA, and it shouldn't do that until after the driver initializes.

I suspect the reason the existing quirks are in drivers/pci/quirks.c
is because the IOMMU driver is in the host OS, but the host may not
have a driver for the device if the device is passed through to a
guest OS.  In that case, the only way to add the alias is by using a
quirk that is always built into the host OS.

We could argue that the driver in the guest should be able to tell the
host's IOMMU driver about these aliases, but I doubt there's an
interface for that.

Bjorn
Alex Williamson May 22, 2018, 10:13 p.m. | #6
On Tue, 22 May 2018 16:51:26 -0500
Bjorn Helgaas <helgaas@kernel.org> wrote:

> [+cc Alex]
> 
> On Tue, May 22, 2018 at 02:09:59PM -0700, Doug Meyer wrote:
> > Logan answered the questions quite thoroughly. (Thanks, Logan!)  
> 
> When you repost it, please rework the commit log so it answers the
> questions directly.  Otherwise the next reader may have the same
> questions again.  E.g., say something about how the proxy IDs are not
> programmable and are fixed in the hardware so all we have to do is
> read them.
> 
> I don't think the question of when the aliases need to be added is
> quite closed.  Logan said "it seems pci_add_dma_alias() must be called
> before the driver is initialized and therefore in a quirk", but that
> doesn't make clear *why* the alias needs to be added before the driver
> is initialized.  The alias shouldn't be needed until the device does a
> DMA, and it shouldn't do that until after the driver initializes.

Aliases for devices that don't have a representation on the bus is only
one use for pci_add_dma_alias(), we can also use it when the aliased
device is visible on the bus and then it factors not only into the IOMMU
context entries for a given device, but also the grouping of multiple
devices that must be done without a host endpoint driver.
 
> I suspect the reason the existing quirks are in drivers/pci/quirks.c
> is because the IOMMU driver is in the host OS, but the host may not
> have a driver for the device if the device is passed through to a
> guest OS.  In that case, the only way to add the alias is by using a
> quirk that is always built into the host OS.
> 
> We could argue that the driver in the guest should be able to tell the
> host's IOMMU driver about these aliases, but I doubt there's an
> interface for that.

Sounds like a dangerous interface, imagine two physical functions on a
device, each assigned to separate guests where one guest could usurp
context entries for hidden devices from the other guest.  Thanks,

Alex
Logan Gunthorpe May 22, 2018, 10:23 p.m. | #7
On 22/05/18 03:51 PM, Bjorn Helgaas wrote:
> I don't think the question of when the aliases need to be added is
> quite closed.  Logan said "it seems pci_add_dma_alias() must be called
> before the driver is initialized and therefore in a quirk", but that
> doesn't make clear *why* the alias needs to be added before the driver
> is initialized.  The alias shouldn't be needed until the device does a
> DMA, and it shouldn't do that until after the driver initializes.

No, Doug tried it in the driver first and it didn't work. The symbol is
also not exported which was probably done because it can't be used in
the driver.

> I suspect the reason the existing quirks are in drivers/pci/quirks.c
> is because the IOMMU driver is in the host OS, but the host may not
> have a driver for the device if the device is passed through to a
> guest OS.  In that case, the only way to add the alias is by using a
> quirk that is always built into the host OS.

Digging into the code a bit, it's not because it must be done by the
Host OS but because it must be done before the IOMMU groups are created.
The IOMMU code registers a bus_notifier and creates the groups based on
the dma_alias mask when it receives the BUS_NOTIFY_ADD_DEVICE event.
This event is notified in device_add() just before a call to
bus_probe_device()[1]. Therefore, if a driver attempts to use
pci_add_dma_alias() as part of it's probe routine, it will be too late
as the IOMMU has already setup the groups based on the original version
of the dma_alias_mask.

I suspect this is by design as the groups must be created before and any
dma_maps are done on the device and some drivers may create dma_maps
during probe.

Logan

[1]
https://elixir.bootlin.com/linux/v4.17-rc6/source/drivers/base/core.c#L1863
Bjorn Helgaas May 23, 2018, 1:33 p.m. | #8
On Tue, May 22, 2018 at 04:23:13PM -0600, Logan Gunthorpe wrote:
> On 22/05/18 03:51 PM, Bjorn Helgaas wrote:
> > I don't think the question of when the aliases need to be added is
> > quite closed.  Logan said "it seems pci_add_dma_alias() must be called
> > before the driver is initialized and therefore in a quirk", but that
> > doesn't make clear *why* the alias needs to be added before the driver
> > is initialized.  The alias shouldn't be needed until the device does a
> > DMA, and it shouldn't do that until after the driver initializes.
> 
> No, Doug tried it in the driver first and it didn't work. The symbol is
> also not exported which was probably done because it can't be used in
> the driver.
> 
> > I suspect the reason the existing quirks are in drivers/pci/quirks.c
> > is because the IOMMU driver is in the host OS, but the host may not
> > have a driver for the device if the device is passed through to a
> > guest OS.  In that case, the only way to add the alias is by using a
> > quirk that is always built into the host OS.
> 
> Digging into the code a bit, it's not because it must be done by the
> Host OS but because it must be done before the IOMMU groups are created.
> The IOMMU code registers a bus_notifier and creates the groups based on
> the dma_alias mask when it receives the BUS_NOTIFY_ADD_DEVICE event.
> This event is notified in device_add() just before a call to
> bus_probe_device()[1]. Therefore, if a driver attempts to use
> pci_add_dma_alias() as part of it's probe routine, it will be too late
> as the IOMMU has already setup the groups based on the original version
> of the dma_alias_mask.

This (and Alex's) analysis is very useful and I'd like to capture it
somehow, perhaps by expanding the poor pci_add_dma_alias() function
comment I added with f0af9593372a ("PCI: Add pci_add_dma_alias() to
abstract implementation").

The admonition to "call early" without any details about *how* early
or *why* it needs to be called early is not really very useful.

If we added your analysis, it would be a great help to anybody who
reworks IOMMU groups in the future.

> I suspect this is by design as the groups must be created before and any
> dma_maps are done on the device and some drivers may create dma_maps
> during probe.
> 
> Logan
> 
> [1]
> https://elixir.bootlin.com/linux/v4.17-rc6/source/drivers/base/core.c#L1863
Logan Gunthorpe May 23, 2018, 8:21 p.m. | #9
On 23/05/18 07:33 AM, Bjorn Helgaas wrote:
> This (and Alex's) analysis is very useful and I'd like to capture it
> somehow, perhaps by expanding the poor pci_add_dma_alias() function
> comment I added with f0af9593372a ("PCI: Add pci_add_dma_alias() to
> abstract implementation").
> 
> The admonition to "call early" without any details about *how* early
> or *why* it needs to be called early is not really very useful.
> 
> If we added your analysis, it would be a great help to anybody who
> reworks IOMMU groups in the future.

Sure, I'll work up a patch and send it on shortly.

Logan

Patch

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 2990ad1..c7e27b3 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -27,6 +27,7 @@ 
 #include <linux/mm.h>
 #include <linux/platform_data/x86/apple.h>
 #include <linux/pm_runtime.h>
+#include <linux/switchtec.h>
 #include <asm/dma.h>	/* isa_dma_bridge_buggy */
 #include "pci.h"
 
@@ -4741,3 +4742,198 @@  static void quirk_gpu_hda(struct pci_dev *hda)
 			      PCI_CLASS_MULTIMEDIA_HD_AUDIO, 8, quirk_gpu_hda);
 DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
 			      PCI_CLASS_MULTIMEDIA_HD_AUDIO, 8, quirk_gpu_hda);
+
+/*
+ * Microsemi Switchtec NTB uses devfn proxy IDs to move TLPs between
+ * NT endpoints via the internal switch fabric. These IDs replace the
+ * originating requestor ID TLPs which access host memory on peer NTB
+ * ports. Therefore, all proxy IDs must be aliased to the NTB device
+ * to permit access when the IOMMU is turned on.
+ */
+static void quirk_switchtec_ntb_dma_alias(struct pci_dev *pdev)
+{
+	void __iomem *mmio = NULL;
+	struct ntb_info_regs __iomem *mmio_ntb = NULL;
+	struct ntb_ctrl_regs __iomem *mmio_ctrl = NULL;
+	struct sys_info_regs __iomem *mmio_sys_info = NULL;
+
+	u64 partition_map;
+	u8 partition;
+	int pp;
+
+	if (pci_enable_device(pdev)) {
+		dev_err(&pdev->dev, "Cannot enable Switchtec device\n");
+		return;
+	}
+
+	mmio = pci_iomap(pdev, 0, 0);
+	if (mmio == NULL) {
+		dev_err(&pdev->dev, "Cannot iomap Switchtec device\n");
+		return;
+	}
+
+	dev_info(&pdev->dev, "Setting Switchtec proxy ID aliases\n");
+
+	mmio_ntb = mmio + SWITCHTEC_GAS_NTB_OFFSET;
+	mmio_ctrl = (void * __iomem) mmio_ntb + SWITCHTEC_NTB_REG_CTRL_OFFSET;
+	mmio_sys_info = mmio + SWITCHTEC_GAS_SYS_INFO_OFFSET;
+
+	partition = ioread8(&mmio_ntb->partition_id);
+
+	partition_map = (u64) ioread32((void * __iomem) &mmio_ntb->ep_map);
+	partition_map |=
+		((u64) ioread32((void * __iomem) &mmio_ntb->ep_map + 4)) << 32;
+	partition_map &= ~(1ULL << partition);
+
+	for (pp = 0; pp < (sizeof(partition_map) * 8); pp++) {
+		struct ntb_ctrl_regs __iomem *mmio_peer_ctrl;
+		u32 table_sz = 0;
+		int te;
+
+		if (!(partition_map & (1ULL << pp)))
+			continue;
+
+		dev_dbg(&pdev->dev, "Processing partition %d\n", pp);
+
+		mmio_peer_ctrl = &mmio_ctrl[pp];
+
+		table_sz = ioread16(&mmio_peer_ctrl->req_id_table_size);
+		if (!table_sz) {
+			dev_warn(&pdev->dev, "Partition %d table_sz 0\n", pp);
+			continue;
+		}
+
+		if (table_sz > 512) {
+			dev_warn(&pdev->dev,
+				 "Invalid Switchtec partition %d table_sz %d\n",
+				 pp, table_sz);
+			continue;
+		}
+
+		for (te = 0; te < table_sz; te++) {
+			u32 rid_entry;
+			u8 devfn;
+
+			rid_entry = ioread32(&mmio_peer_ctrl->req_id_table[te]);
+			devfn = (rid_entry >> 1) & 0xFF;
+			dev_dbg(&pdev->dev,
+				"Aliasing Partition %d Proxy ID %02d.%d\n",
+				pp, PCI_SLOT(devfn), PCI_FUNC(devfn));
+			pci_add_dma_alias(pdev, devfn);
+		}
+	}
+
+	pci_iounmap(pdev, mmio);
+}
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFX24XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFX32XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFX48XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFX64XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFX80XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFX96XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PSX48XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PSX64XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PSX80XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PSX96XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PAX24XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PAX32XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PAX48XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PAX64XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PAX80XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PAX96XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXL24XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXL32XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXL48XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXL64XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXL80XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXL96XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXI24XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXI32XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXI48XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXI64XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXI80XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_MICROSEMI,
+			      PCI_DEVICE_ID_MICROSEMI_PFXI96XG3,
+			      PCI_CLASS_BRIDGE_OTHER, 8,
+			      quirk_switchtec_ntb_dma_alias);