Patchwork Quirk for buggy dma source tags with Intel IOMMU.

login
register
mail settings
Submitter George Spelvin
Date Jan. 20, 2014, 1:13 p.m.
Message ID <20140120131359.26728.qmail@science.horizon.com>
Download mbox | patch
Permalink /patch/312524/
State Rejected
Headers show

Comments

George Spelvin - Jan. 20, 2014, 1:13 p.m.
This is requried for me to get a Marvell 9172 SATA controller working
with VT-d.

Andrew Cooks wrote the original; see
https://bugzilla.kernel.org/show_bug.cgi?id=42679#c22

Without it, I see the same symptoms as in that bug report:
the device is recognized, but probing the attached disk fails.

I massaged it a bit to fit my personal idea of "cleaner".
The biggest logic change is the elimination of the fn_mapped
variable from quirk_map_multi_requester_ids.

I also got rid of the "fn_map << fn" logic (which is never false).

I can vouch for quirk_map_multi_requester_ids working in my case;
I can't speak for quirk_map_requester_id.

Who do I nudge about actually getting some variant of this merged?

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Cooks - Jan. 20, 2014, 3:45 p.m.
On Mon, Jan 20, 2014 at 9:13 PM, George Spelvin <linux@horizon.com> wrote:
> This is requried for me to get a Marvell 9172 SATA controller working
> with VT-d.
>
> Andrew Cooks wrote the original; see
> https://bugzilla.kernel.org/show_bug.cgi?id=42679#c22
>
> Without it, I see the same symptoms as in that bug report:
> the device is recognized, but probing the attached disk fails.
>
> I massaged it a bit to fit my personal idea of "cleaner".
> The biggest logic change is the elimination of the fn_mapped
> variable from quirk_map_multi_requester_ids.

Thanks, that was detritus left over from debugging.

> I also got rid of the "fn_map << fn" logic (which is never false).

What if function 0 is not present? There were cases where the Marvell
9172 only used function 1 (when only one port is in use). I don't
think it's safe to assume that function 0 is present when you're
trying to compensate for broken devices.

> I can vouch for quirk_map_multi_requester_ids working in my case;
> I can't speak for quirk_map_requester_id.

I've tested quirk_map_requester_id on a Thinkpad T410 with Ricoh IEEE
1394 Controller.

> Who do I nudge about actually getting some variant of this merged?

Thanks for your efforts to get this merged. If you look through the
list archives you'll see that I've posted previous iterations of this
before, without much enthusiasm.

>
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> index 43b9bfea48..4695865051 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -1677,6 +1677,111 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
>         return 0;
>  }
>
> +static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn);
> +
> +static void quirk_unmap_multi_requesters(struct pci_dev *pdev, u8 fn_map)
> +{
> +       int fn;
> +       struct intel_iommu *iommu = device_to_iommu(pci_domain_nr(pdev->bus),
> +                                               pdev->bus->number, pdev->devfn);
> +
> +       /* something must be seriously wrong if we can't lookup the iommu. */
> +       BUG_ON(!iommu);
> +
> +       fn_map &= ~(1<<PCI_FUNC(pdev->devfn));  /* Skip the normal case */

Do you really find this "cleaner", as in more readable?

> +
> +       for (fn = 0; fn_map >> fn; fn++) {
> +               if (fn_map & (1<<fn)) {
> +                       iommu_detach_dev(iommu,
> +                                       pdev->bus->number,
> +                                       PCI_DEVFN(PCI_SLOT(pdev->devfn), fn));
> +                       dev_dbg(&pdev->dev,
> +                               "requester id quirk; ghost func %d unmapped",
> +                               fn);
> +               }
> +       }
> +}
> +
> +/* For quirky devices that use multiple requester ids. */
> +static int quirk_map_multi_requester_ids(struct dmar_domain *domain,
> +               struct pci_dev *pdev,
> +               int translation)
> +{
> +       int fn, err = 0;
> +       u8 fn_map = pci_multi_requesters(pdev);
> +
> +       /* this is the common, non-quirky case. */
> +       if (!fn_map)
> +               return 0;
> +
> +       fn_map &= ~(1<<PCI_FUNC(pdev->devfn));  /* Skip the normal case */
> +
> +       for (fn = 0; fn_map >> fn; fn++) {
> +               if (fn_map & (1<<fn)) {
> +                       err = domain_context_mapping_one(domain,
> +                                       pci_domain_nr(pdev->bus),
> +                                       pdev->bus->number,
> +                                       PCI_DEVFN(PCI_SLOT(pdev->devfn), fn),
> +                                       translation);
> +                       if (err) {
> +                               dev_err(&pdev->dev,
> +                                       "mapping ghost func %d failed", fn);
> +                               quirk_unmap_multi_requesters(pdev,
> +                                       fn_map & ((1<<fn)-1));
> +                               return err;
> +                       }
> +                       dev_dbg(&pdev->dev,
> +                               "requester id quirk; ghost func %d mapped", fn);
> +               }
> +       }
> +       return 0;
> +}
> +
> +
> +static void quirk_unmap_requester_id(struct pci_dev *pdev)
> +{
> +       u8 devfn = pci_requester(pdev);
> +       struct intel_iommu *iommu = device_to_iommu(pci_domain_nr(pdev->bus),
> +                                               pdev->bus->number, pdev->devfn);
> +
> +       /* something must be seriously wrong if we can't lookup the iommu. */
> +       BUG_ON(!iommu);
> +
> +
> +       if (pdev->devfn == devfn)
> +               return;
> +
> +       iommu_detach_dev(iommu, pdev->bus->number, devfn);
> +       dev_dbg(&pdev->dev, "requester id quirk; bugged device unmapped");
> +}
> +
> +static int quirk_map_requester_id(struct dmar_domain *domain,
> +               struct pci_dev *pdev,
> +               int translation)
> +{
> +       u8 devfn = pci_requester(pdev);
> +       int err;
> +
> +       dev_dbg(&pdev->dev,
> +               "checking for incorrect pci requester id quirk...");
> +
> +       if (pdev->devfn == devfn)
> +               return 0;
> +
> +       err = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
> +                       pdev->bus->number, devfn, translation);
> +       if (err) {
> +               dev_err(&pdev->dev,
> +                       "requester id quirk: mapping dev %02x:%02x.%d failed",
> +                       pdev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn));
> +               return err;
> +       }
> +       dev_dbg(&pdev->dev,
> +               "requester id quirk; dmar context entry added: %02x:%02x.%d",
> +               pdev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn));
> +       return 0;
> +}
> +
>  static int
>  domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
>                         int translation)
> @@ -1690,6 +1795,16 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
>         if (ret)
>                 return ret;
>
> +       /* quirk for devices using multiple pci requester ids */
> +       ret = quirk_map_multi_requester_ids(domain, pdev, translation);
> +       if (ret)
> +               return ret;
> +
> +       /* quirk for devices single incorrect pci requester id */
> +       ret = quirk_map_requester_id(domain, pdev, translation);
> +       if (ret)
> +               return ret;
> +
>         /* dependent device mapping */
>         tmp = pci_find_upstream_pcie_bridge(pdev);
>         if (!tmp)
> @@ -3802,6 +3917,9 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
>                         iommu_disable_dev_iotlb(info);
>                         iommu_detach_dev(iommu, info->bus, info->devfn);
>                         iommu_detach_dependent_devices(iommu, pdev);
> +                       quirk_unmap_multi_requesters(pdev,
> +                                               pci_multi_requesters(pdev));
> +                       quirk_unmap_requester_id(pdev);
>                         free_devinfo_mem(info);
>
>                         spin_lock_irqsave(&device_domain_lock, flags);
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index 3a02717473..500edde3d3 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -3336,6 +3336,10 @@ static struct pci_dev *pci_func_0_dma_source(struct pci_dev *dev)
>         return pci_get_slot(dev->bus, PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
>  }
>
> +/* Table of source functions for real devices. The DMA requests for the
> + * device are tagged with a different real function as source. This is
> + * relevant to multifunction devices.
> + */
>  static const struct pci_dev_dma_source {
>         u16 vendor;
>         u16 device;
> @@ -3362,7 +3366,8 @@ static const struct pci_dev_dma_source {
>   * the device doing the DMA, but sometimes hardware is broken and will
>   * tag the DMA as being sourced from a different device.  This function
>   * allows that translation.  Note that the reference count of the
> - * returned device is incremented on all paths.
> + * returned device is incremented on all paths. Translation is done when
> + * the device is added to an IOMMU group.
>   */
>  struct pci_dev *pci_get_dma_source(struct pci_dev *dev)
>  {
> @@ -3423,6 +3428,144 @@ static int pci_quirk_amd_sb_acs(struct pci_dev *dev, u16 acs_flags)
>  #endif
>  }
>
> +/* Table of multiple (ghost) source functions. Devices that may need this quirk
> + * show the following behaviour:
> + * 1. the device may use multiple PCI requester IDs during operation,
> + *     (eg. one pci transaction uses xx:yy.0, the next uses xx:yy.1)
> + * 2. the requester ID may not match a known device.
> + *     (eg. lspci does not show xx:yy.1 to be present)
> + *
> + * The bitmap contains all of the functions "in use" by the device.
> + * See  https://bugzilla.redhat.com/show_bug.cgi?id=757166,
> + * https://bugzilla.kernel.org/show_bug.cgi?id=42679
> + * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1089768
> + */
> +static const struct pci_dev_dma_multi_source_map {
> +       u16 vendor;
> +       u16 device;
> +       u8 func_map;    /* bit map. lsb is fn 0. */
> +} pci_dev_dma_multi_source_map[] = {
> +        /* Reported by Patrick Bregman
> +         * https://bugzilla.redhat.com/show_bug.cgi?id=863653 */
> +       { PCI_VENDOR_ID_MARVELL_EXT, 0x9120, (1<<0)|(1<<1)},
> +
> +       /* Reported by  Paweł Żak, Korneliusz Jarzębski, Daniel Mayer
> +        * https://bugzilla.kernel.org/show_bug.cgi?id=42679 and by
> +        * Justin Piszcz  https://lkml.org/lkml/2012/11/24/94 */
> +       { PCI_VENDOR_ID_MARVELL_EXT, 0x9123, (1<<0)|(1<<1)},
> +
> +       /* Used in a patch by Ying Chu
> +        * https://bugzilla.redhat.com/show_bug.cgi?id=757166 */
> +       { PCI_VENDOR_ID_MARVELL_EXT, 0x9125, (1<<0)|(1<<1)},
> +
> +       /* Reported by Robert Cicconetti
> +        * https://bugzilla.kernel.org/show_bug.cgi?id=42679 and by
> +        * Fernando https://bugzilla.redhat.com/show_bug.cgi?id=757166 */
> +       { PCI_VENDOR_ID_MARVELL_EXT, 0x9128, (1<<0)|(1<<1)},
> +
> +       /* Reported by Stijn Tintel
> +        * https://bugzilla.kernel.org/show_bug.cgi?id=42679 */
> +       { PCI_VENDOR_ID_MARVELL_EXT, 0x9130, (1<<0)|(1<<1)},
> +
> +       /* Reported by Gaudenz Steinlin
> +        * https://lkml.org/lkml/2013/3/5/288 */
> +       { PCI_VENDOR_ID_MARVELL_EXT, 0x9143, (1<<0)|(1<<1)},
> +
> +       /* Reported by Andrew Cooks
> +        * https://bugzilla.kernel.org/show_bug.cgi?id=42679 */
> +       { PCI_VENDOR_ID_MARVELL_EXT, 0x9172, (1<<0)|(1<<1)},
> +
> +       { 0 }
> +};
> +
> +/*
> + * The mapping of quirky requester ids is used when the device driver sets up
> + * dma, if iommu is enabled.
> + */
> +u8 pci_multi_requesters(struct pci_dev *dev)
> +{
> +       const struct pci_dev_dma_multi_source_map *i;
> +
> +       for (i = pci_dev_dma_multi_source_map; i->func_map; i++) {
> +               if ((i->vendor == dev->vendor ||
> +                    i->vendor == (u16)PCI_ANY_ID) &&
> +                   (i->device == dev->device ||
> +                    i->device == (u16)PCI_ANY_ID)) {
> +                       return i->func_map;
> +               }
> +       }
> +       return 0;
> +}
> +
> +/* These are one-to-one translations for devices that use a single incorrect
> + * requester ID. The requester id may not be the BDF of a real device.
> + */
> +static const struct pci_dev_dma_source_map {
> +       u16 vendor;
> +       u16 device;
> +       u8  devfn;
> +       u8  dma_devfn;
> +} pci_dev_dma_source_map[] = {
> +
> +       /* Ricoh IEEE 1394 Controller */
> +       {
> +               PCI_VENDOR_ID_RICOH,
> +               0xe832,
> +               PCI_DEVFN(0x00, 3),
> +               PCI_DEVFN(0x00, 0)
> +       },
> +
> +       /* Nils Caspar - Adaptec 3405
> +        * http://www.mail-archive.com/centos@centos.org/msg90986.html
> +        * Jonathan McCune
> +        * http://old-list-archives.xen.org/archives/html/xen-users/2010-04/msg00535.html */
> +       {
> +               PCI_VENDOR_ID_ADAPTEC2,
> +               0x028b,
> +               PCI_DEVFN(0x0e, 0),
> +               PCI_DEVFN(0x01, 0)
> +       },
> +
> +       /* Mateusz Murawski - LSI SAS based MegaRAID
> +        * https://lkml.org/lkml/2011/9/12/104
> +        * M. Nunberg - Dell PERC 5/i Integrated RAID Controller
> +        * http://lists.xen.org/archives/html/xen-devel/2010-05/msg01563.html */
> +       {
> +               PCI_VENDOR_ID_LSI_LOGIC,
> +               0x0411,
> +               PCI_DEVFN(0x0e, 0),
> +               PCI_DEVFN(0x08, 0)
> +       },
> +
> +       /* Steven Dake, Markus Stockhausen - Mellanox 26428
> +        * https://bugzilla.redhat.com/show_bug.cgi?id=713774
> +        * Note: mellanox uses decimal product numbers, convert to hex for PCI
> +        * device ID. ie. 26428 == 0x673c */
> +       {
> +               PCI_VENDOR_ID_MELLANOX,
> +               0x673c,
> +               PCI_DEVFN(0x00, 0),
> +               PCI_DEVFN(0x00, 6)
> +       },
> +
> +       { 0 }
> +};
> +
> +u8 pci_requester(struct pci_dev *dev)
> +{
> +       const struct pci_dev_dma_source_map *i;
> +
> +       for (i = pci_dev_dma_source_map; i->devfn; i++) {

The bug that Martin Öhrling pointed out in the ticket is still there.

> +               if ((i->vendor == dev->vendor) &&
> +                   (i->device == dev->device) &&
> +                   (i->devfn == dev->devfn)) {
> +                       return i->dma_devfn;
> +               }
> +       }
> +       return dev->devfn;
> +}
> +
> +
>  static const struct pci_dev_acs_enabled {
>         u16 vendor;
>         u16 device;
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index a13d6825e5..3214fd2514 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1632,6 +1632,8 @@ enum pci_fixup_pass {
>  #ifdef CONFIG_PCI_QUIRKS
>  void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev);
>  struct pci_dev *pci_get_dma_source(struct pci_dev *dev);
> +u8 pci_multi_requesters(struct pci_dev *dev);
> +u8 pci_requester(struct pci_dev *dev);
>  int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags);
>  #else
>  static inline void pci_fixup_device(enum pci_fixup_pass pass,
> @@ -1640,6 +1642,14 @@ static inline struct pci_dev *pci_get_dma_source(struct pci_dev *dev)
>  {
>         return pci_dev_get(dev);
>  }
> +u8 pci_multi_requesters(struct pci_dev *dev)
> +{
> +       return 0;
> +}
> +u8 pci_requester(struct pci_dev *dev)
> +{
> +       return dev->devfn;
> +}
>  static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev,
>                                                u16 acs_flags)
>  {

Thanks,

a.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
George Spelvin - Jan. 20, 2014, 7:01 p.m.
>> I massaged it a bit to fit my personal idea of "cleaner".
>> The biggest logic change is the elimination of the fn_mapped
>> variable from quirk_map_multi_requester_ids.

> Thanks, that was detritus left over from debugging.

The code still does the exact same thing; I just derive the value from
(fn_map & ((1<<fn)-1)) if it's required.

>> I also got rid of the "fn_map << fn" logic (which is never false).

> What if function 0 is not present? There were cases where the Marvell
> 9172 only used function 1 (when only one port is in use). I don't
> think it's safe to assume that function 0 is present when you're
> trying to compensate for broken devices.

Er... can you explain?  I did not change what the code *does* in the
slightest, only the was it figures out what to do.

I took this it out because it does nothing; it can be statically proved
that the expression is always non-zero.

Because "fn_map" is a non-zero 8-bit value and "fn" is an integer in
the range 0-7, the value "fn_map << fn" is a non-zero 15-bit value.
15 bits can't overflow (even on a PDP-11 with 16-bit ints), so this is
a non-zero value, and the loop will never terminate because of it.

It sure looked like a bug to me, but I couldn't figure out what it
was supposed to do.  Can you enlighten me?  "fn_map >> fn" (shifting
right rather than left) achieves early termination of the loop, but
isn't essential.

>> +       fn_map &= ~(1<<PCI_FUNC(pdev->devfn));  /* Skip the normal case */

> Do you really find this "cleaner", as in more readable?

I find it easier to think about fn_map as the bitmap of functions to
be operated on.  Rather than have two conditions in the following loop.

The confusing thing is "why are we skipping that function?"  (Because this
is a quirk and the "normal case" has already been handled.)  It would be
nicer if all the domain_context_mapping_one calls were grouped together.

I would like to see if there's a clean way to combine this with phantom
function support (which I haven't find the code for yet).

>> +       for (i = pci_dev_dma_source_map; i->devfn; i++) {

> The bug that Martin Oehrling pointed out in the ticket is still there.

Ah, thank you; I had missed that that.  Yes, it should be testing
i->vendor.  You really get devices using the wrong slot as well as
function?  Wow!

Using multiple functions is at least anticipated in the PCI spec,
even if they do it wrong.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
George Spelvin - Jan. 20, 2014, 11:35 p.m.
I'm looking over the bug reports, and I'm trying to understand why you
need two mechanisms.  You have one table (pci_dev_dma_source_map)
that maps one devfn to a secondary one, and a second table
(pci_dev_dma_multi_source_map) that gives a 2-bit bitmap of function
numbers.

But as I understand it, all the Marvell devices listed there always have
a devfn of 00.0, so you could just move them to the first table with a
mapping to 00.1.

If they also have device/function 00.1 using a requester ID of 00.0
you could add a second entry.

The only time you'd need a bitmap would be if the device ended up on
function 2 or higher and still used functions 0 and 1 in request IDs.


Now, unfortunately this does not cover the phantom functions case, but
that could be done with separate code.


The other question is when to do the lookup.  I don't know how often
IOMMU mapping is done, and if linear search through a list of PCI devices
would be a performance problem.  It would be nicer to have a standard
PCI fixup done once which sets a flag in the pci_dev to either note the
actual secondary devfn, or set a flag to trigger the secondary lookup
only on the devices that need it.

There's certainly room in the pci_dev for a second 8-bit devfn field,
Let me work up a patch based on that...
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Cooks - Jan. 20, 2014, 11:45 p.m.
On Tue, Jan 21, 2014 at 3:01 AM, George Spelvin <linux@horizon.com> wrote:
>>> I massaged it a bit to fit my personal idea of "cleaner".
>>> The biggest logic change is the elimination of the fn_mapped
>>> variable from quirk_map_multi_requester_ids.
>
>> Thanks, that was detritus left over from debugging.
>
> The code still does the exact same thing; I just derive the value from
> (fn_map & ((1<<fn)-1)) if it's required.

I see that now - was over excited and tired last night.

>>> I also got rid of the "fn_map << fn" logic (which is never false).
>
>> What if function 0 is not present? There were cases where the Marvell
>> 9172 only used function 1 (when only one port is in use). I don't
>> think it's safe to assume that function 0 is present when you're
>> trying to compensate for broken devices.
>
> Er... can you explain?  I did not change what the code *does* in the
> slightest, only the was it figures out what to do.
>
> I took this it out because it does nothing; it can be statically proved
> that the expression is always non-zero.
>
> Because "fn_map" is a non-zero 8-bit value and "fn" is an integer in
> the range 0-7, the value "fn_map << fn" is a non-zero 15-bit value.
> 15 bits can't overflow (even on a PDP-11 with 16-bit ints), so this is
> a non-zero value, and the loop will never terminate because of it.
>
> It sure looked like a bug to me, but I couldn't figure out what it
> was supposed to do.  Can you enlighten me?  "fn_map >> fn" (shifting
> right rather than left) achieves early termination of the loop, but
> isn't essential.

OK, I understand the problem now. Early exit was the intended effect
and for some reason I thought the new version could exit too early,
but it seems fine.

>>> +       fn_map &= ~(1<<PCI_FUNC(pdev->devfn));  /* Skip the normal case */
>
>> Do you really find this "cleaner", as in more readable?
>
> I find it easier to think about fn_map as the bitmap of functions to
> be operated on.  Rather than have two conditions in the following loop.

I see that now.

> The confusing thing is "why are we skipping that function?"  (Because this
> is a quirk and the "normal case" has already been handled.)  It would be
> nicer if all the domain_context_mapping_one calls were grouped together.

Sure, but since this was the first non-trivial patch I submitted, I
tried to keep it non-intrusive and simple.

> I would like to see if there's a clean way to combine this with phantom
> function support (which I haven't find the code for yet).

I'm willing to believe that if phantom functions haven't shown up and
caused problems yet, it probably doesn't need fixing in a hurry.
SR-IOV seems like a better and more common way for devices to use
multiple lightweight functions.

>
>>> +       for (i = pci_dev_dma_source_map; i->devfn; i++) {
>
>> The bug that Martin Oehrling pointed out in the ticket is still there.
>
> Ah, thank you; I had missed that that.  Yes, it should be testing
> i->vendor.  You really get devices using the wrong slot as well as
> function?  Wow!
>
> Using multiple functions is at least anticipated in the PCI spec,
> even if they do it wrong.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Cooks - Jan. 21, 2014, 1:54 a.m.
On Tue, Jan 21, 2014 at 7:35 AM, George Spelvin <linux@horizon.com> wrote:
> I'm looking over the bug reports, and I'm trying to understand why you
> need two mechanisms.  You have one table (pci_dev_dma_source_map)
> that maps one devfn to a secondary one, and a second table
> (pci_dev_dma_multi_source_map) that gives a 2-bit bitmap of function
> numbers.
I haven't found a simple way to combine the two maps that still makes
the two distinct problems apparent.
pci_dev_dma_multi_source_map deals with devices that use multiple
functions, but the correct device number.
pci_dev_dma_source_map deals with devices that use a single incorrect
devfn that's mangled in both device and function numbers.

> But as I understand it, all the Marvell devices listed there always have
> a devfn of 00.0, so you could just move them to the first table with a
> mapping to 00.1.
>
> If they also have device/function 00.1 using a requester ID of 00.0
> you could add a second entry.
>
> The only time you'd need a bitmap would be if the device ended up on
> function 2 or higher and still used functions 0 and 1 in request IDs.

Sure, that sounds reasonable and certainly needs to be considered, but
since there's no complete record of what devices get this wrong and
how they get this wrong, I wanted to
* show all requester ids are used for each device in a concise entry,
* build up as much of a record of this problem as possible (hence the
comments and bug references),
* make it easy to see the general problem and have this discussion,
* not make assumptions about which functions would be visible and
which requester ids are actually used (like a device showing up at
00.0 that uses 00.1 and 00.2 only).

Who knows what will show up when iommu is turned on by default again.

> Now, unfortunately this does not cover the phantom functions case, but
> that could be done with separate code.
>
>
> The other question is when to do the lookup.  I don't know how often
> IOMMU mapping is done, and if linear search through a list of PCI devices
> would be a performance problem.  It would be nicer to have a standard
> PCI fixup done once which sets a flag in the pci_dev to either note the
> actual secondary devfn, or set a flag to trigger the secondary lookup
> only on the devices that need it.

Yes, I'd also like to hear what the gatekeepers think about this.

Thanks,

a.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
George Spelvin - Jan. 21, 2014, 2 a.m.
New recipient: Alex Williamson, who added the pci_get_dma_source()
quirk that resembles this one.

Alex, Andrew is working on a patch to fix some more buggy
PCIe devices like the Ricoh ones that you added a quirk for.

His current patch can be found at

https://bugzilla.kernel.org/show_bug.cgi?id=42679#c22

Ir only works on Intel IOMMUs, and sort of duplicates the functionality
of your quirk.

Obviously, I hate the duplication and would like to fold them together,
but there are a few fundamental differences.

Most notably, Andrew's patch enables the incorrect requester ID
*in addition to* the correct one.  I'm not sure if this is necessary,
but it seems like a nice defensive feature in case the vendor fixes
the problem in a minor rev, and the ability is also needed by PCIe
phantom function support,

The other thing is that pci_get_dma_source returns a pci_dev, but
Andrew's patch deals with devices which use requester IDs which
don't correspond to existing PCI devices at all:

- Marvell SATA controllers which exist only as devid 00.0, but generate
  request devids of 00.1.  (The latter function might be PATA support
  on chips which support that.)
- Mellanox 26428 Infiniband controllers which also only provide one
  function, but use a source devid of 00.6
- An LSI MegaRAID SAS 2078 PCI-X controller that is device 0e.0, but
  generates request devids of 08.0
- An Adaptec RAID controller ie 0e.0 that makes requests as 01.0.

My thought for implementing Andrew's version was to add an 8-bit devfn_quirk
field to struct pci_dev that contains the delta to the devfn that will
appear in the source requester ID.

(By using a delta rather than the target value, the default value of 0
means "no quirk".)

This would be initialized at boot time using a standard PCI quirk,
avoiding the need for linearly searching potentially long device lists
every time an IOMMU mapping is set up.

(Even if I only use a single-bit flag, that would avoid the list
searching for all the non-buggy devices.)


But I'm not quite sure what the locking rules are on the whole swap_pci_ref
business.  Or if the Ricoh devices have to be handled more carefully
because multiple functions need to arbitrate over the IOMMU tables
for function 0.

Can I request some sage guidance from someone with more experience in the
area?

Thank you!
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Cooks - Jan. 21, 2014, 2:52 a.m.
On Tue, Jan 21, 2014 at 10:00 AM, George Spelvin <linux@horizon.com> wrote:
> New recipient: Alex Williamson, who added the pci_get_dma_source()
> quirk that resembles this one.
>
> Alex, Andrew is working on a patch to fix some more buggy
> PCIe devices like the Ricoh ones that you added a quirk for.
>
> His current patch can be found at
>
> https://bugzilla.kernel.org/show_bug.cgi?id=42679#c22
>
> Ir only works on Intel IOMMUs, and sort of duplicates the functionality
> of your quirk.
>
> Obviously, I hate the duplication and would like to fold them together,
> but there are a few fundamental differences.

Hang on! The existing quirk only worked for vfio groups last time I
checked and didn't actually solve the problem for using the device on
the host.

Alex had done a bunch of work to support requester id quirks (see
http://marc.info/?l=linux-pci&m=137357663626523&w=3), but that seems
blocked.

> Most notably, Andrew's patch enables the incorrect requester ID
> *in addition to* the correct one.  I'm not sure if this is necessary,
> but it seems like a nice defensive feature in case the vendor fixes
> the problem in a minor rev, and the ability is also needed by PCIe
> phantom function support,
>
> The other thing is that pci_get_dma_source returns a pci_dev, but
> Andrew's patch deals with devices which use requester IDs which
> don't correspond to existing PCI devices at all:
>
> - Marvell SATA controllers which exist only as devid 00.0, but generate
>   request devids of 00.1.  (The latter function might be PATA support
>   on chips which support that.)

No, I can prove that both .0 and .1 are used on the 9172 controller,
depending on which of the two SATA port(s) is(are) in use. If you have
the same hardware, I suggest you verify this.

> - Mellanox 26428 Infiniband controllers which also only provide one
>   function, but use a source devid of 00.6
> - An LSI MegaRAID SAS 2078 PCI-X controller that is device 0e.0, but
>   generates request devids of 08.0
> - An Adaptec RAID controller ie 0e.0 that makes requests as 01.0.
>
> My thought for implementing Andrew's version was to add an 8-bit devfn_quirk
> field to struct pci_dev that contains the delta to the devfn that will
> appear in the source requester ID.
>
> (By using a delta rather than the target value, the default value of 0
> means "no quirk".)
>
> This would be initialized at boot time using a standard PCI quirk,
> avoiding the need for linearly searching potentially long device lists
> every time an IOMMU mapping is set up.
>
> (Even if I only use a single-bit flag, that would avoid the list
> searching for all the non-buggy devices.)
>
>
> But I'm not quite sure what the locking rules are on the whole swap_pci_ref
> business.  Or if the Ricoh devices have to be handled more carefully
> because multiple functions need to arbitrate over the IOMMU tables
> for function 0.

The Ricoh firewire function simply uses the wrong requester ID. The
other functions work correctly. Remapping only the firewire specific
function of the multifunction device (as in my patch) fixes the
problem.

a.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
George Spelvin - Jan. 21, 2014, 3:53 a.m.
>> It only works on Intel IOMMUs, and sort of duplicates the functionality
>> of your quirk.
>>
>> Obviously, I hate the duplication and would like to fold them together,
>> but there are a few fundamental differences.

> Hang on! The existing quirk only worked for vfio groups last time I
> checked and didn't actually solve the problem for using the device on
> the host.

I'd believe there are additional differences I haven't figured out,
but it sure seems similar enough to be potentially mergeable.

I want to either do that, or understand the differences well enough
to explain them in a comment and tell future coders which should
be used in whatr circumstances.

> Alex had done a bunch of work to support requester id quirks (see
> http://marc.info/?l=linux-pci&m=137357663626523&w=3), but that seems
> blocked.

Thank you for this pointer.  If only I understood it better, :-(

>> - Marvell SATA controllers which exist only as devid 00.0, but generate
>>   request devids of 00.1.  (The latter function might be PATA support
>>   on chips which support that.)

> No, I can prove that both .0 and .1 are used on the 9172 controller,
> depending on which of the two SATA port(s) is(are) in use. If you have
> the same hardware, I suggest you verify this.

Thank you for the confirmation (that's a second thing that makes Alex's
current solution incompatible), but I wasn't trying to imply that they
didn't also generate DMA as .0; I was just saying that the only function
which shows up in PCI config space and thus has a struct pci_dev allocated
for it is .0.

I was trying to draw attention to the fact that there is no struct pci_dev
for .1 which can be used for Alex's interface.

>> But I'm not quite sure what the locking rules are on the whole swap_pci_ref
>> business.  Or if the Ricoh devices have to be handled more carefully
>> because multiple functions need to arbitrate over the IOMMU tables
>> for function 0.

> The Ricoh firewire function simply uses the wrong requester ID. The
> other functions work correctly. Remapping only the firewire specific
> function of the multifunction device (as in my patch) fixes the
> problem.

Good to know, but can function 0 generate DMA that we need to worry about?
(I believe it's an SD host controller, 1180:e822.)

That's what I was worried about.  Both devices trying to do IOMMU mapping
at the same time and stepping on each other.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 43b9bfea48..4695865051 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1677,6 +1677,111 @@  static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 	return 0;
 }
 
+static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn);
+
+static void quirk_unmap_multi_requesters(struct pci_dev *pdev, u8 fn_map)
+{
+	int fn;
+	struct intel_iommu *iommu = device_to_iommu(pci_domain_nr(pdev->bus),
+						pdev->bus->number, pdev->devfn);
+
+	/* something must be seriously wrong if we can't lookup the iommu. */
+	BUG_ON(!iommu);
+
+	fn_map &= ~(1<<PCI_FUNC(pdev->devfn));	/* Skip the normal case */
+
+	for (fn = 0; fn_map >> fn; fn++) {
+		if (fn_map & (1<<fn)) {
+			iommu_detach_dev(iommu,
+					pdev->bus->number,
+					PCI_DEVFN(PCI_SLOT(pdev->devfn), fn));
+			dev_dbg(&pdev->dev,
+				"requester id quirk; ghost func %d unmapped",
+				fn);
+		}
+	}
+}
+
+/* For quirky devices that use multiple requester ids. */
+static int quirk_map_multi_requester_ids(struct dmar_domain *domain,
+		struct pci_dev *pdev,
+		int translation)
+{
+	int fn, err = 0;
+	u8 fn_map = pci_multi_requesters(pdev);
+
+	/* this is the common, non-quirky case. */
+	if (!fn_map)
+		return 0;
+
+	fn_map &= ~(1<<PCI_FUNC(pdev->devfn));	/* Skip the normal case */
+
+	for (fn = 0; fn_map >> fn; fn++) {
+		if (fn_map & (1<<fn)) {
+			err = domain_context_mapping_one(domain,
+					pci_domain_nr(pdev->bus),
+					pdev->bus->number,
+					PCI_DEVFN(PCI_SLOT(pdev->devfn), fn),
+					translation);
+			if (err) {
+				dev_err(&pdev->dev,
+					"mapping ghost func %d failed", fn);
+				quirk_unmap_multi_requesters(pdev,
+					fn_map & ((1<<fn)-1));
+				return err;
+			}
+			dev_dbg(&pdev->dev,
+				"requester id quirk; ghost func %d mapped", fn);
+		}
+	}
+	return 0;
+}
+
+
+static void quirk_unmap_requester_id(struct pci_dev *pdev)
+{
+	u8 devfn = pci_requester(pdev);
+	struct intel_iommu *iommu = device_to_iommu(pci_domain_nr(pdev->bus),
+						pdev->bus->number, pdev->devfn);
+
+	/* something must be seriously wrong if we can't lookup the iommu. */
+	BUG_ON(!iommu);
+
+
+	if (pdev->devfn == devfn)
+		return;
+
+	iommu_detach_dev(iommu,	pdev->bus->number, devfn);
+	dev_dbg(&pdev->dev, "requester id quirk; bugged device unmapped");
+}
+
+static int quirk_map_requester_id(struct dmar_domain *domain,
+		struct pci_dev *pdev,
+		int translation)
+{
+	u8 devfn = pci_requester(pdev);
+	int err;
+
+	dev_dbg(&pdev->dev,
+		"checking for incorrect pci requester id quirk...");
+
+	if (pdev->devfn == devfn)
+		return 0;
+
+	err = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
+			pdev->bus->number, devfn, translation);
+	if (err) {
+		dev_err(&pdev->dev,
+			"requester id quirk: mapping dev %02x:%02x.%d failed",
+			pdev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn));
+		return err;
+	}
+	dev_dbg(&pdev->dev,
+		"requester id quirk; dmar context entry added: %02x:%02x.%d",
+		pdev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn));
+	return 0;
+}
+
 static int
 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
 			int translation)
@@ -1690,6 +1795,16 @@  domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
 	if (ret)
 		return ret;
 
+	/* quirk for devices using multiple pci requester ids */
+	ret = quirk_map_multi_requester_ids(domain, pdev, translation);
+	if (ret)
+		return ret;
+
+	/* quirk for devices single incorrect pci requester id */
+	ret = quirk_map_requester_id(domain, pdev, translation);
+	if (ret)
+		return ret;
+
 	/* dependent device mapping */
 	tmp = pci_find_upstream_pcie_bridge(pdev);
 	if (!tmp)
@@ -3802,6 +3917,9 @@  static void domain_remove_one_dev_info(struct dmar_domain *domain,
 			iommu_disable_dev_iotlb(info);
 			iommu_detach_dev(iommu, info->bus, info->devfn);
 			iommu_detach_dependent_devices(iommu, pdev);
+			quirk_unmap_multi_requesters(pdev,
+						pci_multi_requesters(pdev));
+			quirk_unmap_requester_id(pdev);
 			free_devinfo_mem(info);
 
 			spin_lock_irqsave(&device_domain_lock, flags);
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 3a02717473..500edde3d3 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3336,6 +3336,10 @@  static struct pci_dev *pci_func_0_dma_source(struct pci_dev *dev)
 	return pci_get_slot(dev->bus, PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
 }
 
+/* Table of source functions for real devices. The DMA requests for the
+ * device are tagged with a different real function as source. This is
+ * relevant to multifunction devices.
+ */
 static const struct pci_dev_dma_source {
 	u16 vendor;
 	u16 device;
@@ -3362,7 +3366,8 @@  static const struct pci_dev_dma_source {
  * the device doing the DMA, but sometimes hardware is broken and will
  * tag the DMA as being sourced from a different device.  This function
  * allows that translation.  Note that the reference count of the
- * returned device is incremented on all paths.
+ * returned device is incremented on all paths. Translation is done when
+ * the device is added to an IOMMU group.
  */
 struct pci_dev *pci_get_dma_source(struct pci_dev *dev)
 {
@@ -3423,6 +3428,144 @@  static int pci_quirk_amd_sb_acs(struct pci_dev *dev, u16 acs_flags)
 #endif
 }
 
+/* Table of multiple (ghost) source functions. Devices that may need this quirk
+ * show the following behaviour:
+ * 1. the device may use multiple PCI requester IDs during operation,
+ *     (eg. one pci transaction uses xx:yy.0, the next uses xx:yy.1)
+ * 2. the requester ID may not match a known device.
+ *     (eg. lspci does not show xx:yy.1 to be present)
+ *
+ * The bitmap contains all of the functions "in use" by the device.
+ * See  https://bugzilla.redhat.com/show_bug.cgi?id=757166,
+ * https://bugzilla.kernel.org/show_bug.cgi?id=42679
+ * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1089768
+ */
+static const struct pci_dev_dma_multi_source_map {
+	u16 vendor;
+	u16 device;
+	u8 func_map;	/* bit map. lsb is fn 0. */
+} pci_dev_dma_multi_source_map[] = {
+	 /* Reported by Patrick Bregman
+	  * https://bugzilla.redhat.com/show_bug.cgi?id=863653 */
+	{ PCI_VENDOR_ID_MARVELL_EXT, 0x9120, (1<<0)|(1<<1)},
+
+	/* Reported by  Paweł Żak, Korneliusz Jarzębski, Daniel Mayer
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=42679 and by
+	 * Justin Piszcz  https://lkml.org/lkml/2012/11/24/94 */
+	{ PCI_VENDOR_ID_MARVELL_EXT, 0x9123, (1<<0)|(1<<1)},
+
+	/* Used in a patch by Ying Chu
+	 * https://bugzilla.redhat.com/show_bug.cgi?id=757166 */
+	{ PCI_VENDOR_ID_MARVELL_EXT, 0x9125, (1<<0)|(1<<1)},
+
+	/* Reported by Robert Cicconetti
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=42679 and by
+	 * Fernando https://bugzilla.redhat.com/show_bug.cgi?id=757166 */
+	{ PCI_VENDOR_ID_MARVELL_EXT, 0x9128, (1<<0)|(1<<1)},
+
+	/* Reported by Stijn Tintel
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=42679 */
+	{ PCI_VENDOR_ID_MARVELL_EXT, 0x9130, (1<<0)|(1<<1)},
+
+	/* Reported by Gaudenz Steinlin
+	 * https://lkml.org/lkml/2013/3/5/288 */
+	{ PCI_VENDOR_ID_MARVELL_EXT, 0x9143, (1<<0)|(1<<1)},
+
+	/* Reported by Andrew Cooks
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=42679 */
+	{ PCI_VENDOR_ID_MARVELL_EXT, 0x9172, (1<<0)|(1<<1)},
+
+	{ 0 }
+};
+
+/*
+ * The mapping of quirky requester ids is used when the device driver sets up
+ * dma, if iommu is enabled.
+ */
+u8 pci_multi_requesters(struct pci_dev *dev)
+{
+	const struct pci_dev_dma_multi_source_map *i;
+
+	for (i = pci_dev_dma_multi_source_map; i->func_map; i++) {
+		if ((i->vendor == dev->vendor ||
+		     i->vendor == (u16)PCI_ANY_ID) &&
+		    (i->device == dev->device ||
+		     i->device == (u16)PCI_ANY_ID)) {
+			return i->func_map;
+		}
+	}
+	return 0;
+}
+
+/* These are one-to-one translations for devices that use a single incorrect
+ * requester ID. The requester id may not be the BDF of a real device.
+ */
+static const struct pci_dev_dma_source_map {
+	u16 vendor;
+	u16 device;
+	u8  devfn;
+	u8  dma_devfn;
+} pci_dev_dma_source_map[] = {
+
+	/* Ricoh IEEE 1394 Controller */
+	{
+		PCI_VENDOR_ID_RICOH,
+		0xe832,
+		PCI_DEVFN(0x00, 3),
+		PCI_DEVFN(0x00, 0)
+	},
+
+	/* Nils Caspar - Adaptec 3405
+	 * http://www.mail-archive.com/centos@centos.org/msg90986.html
+	 * Jonathan McCune
+	 * http://old-list-archives.xen.org/archives/html/xen-users/2010-04/msg00535.html */
+	{
+		PCI_VENDOR_ID_ADAPTEC2,
+		0x028b,
+		PCI_DEVFN(0x0e, 0),
+		PCI_DEVFN(0x01, 0)
+	},
+
+	/* Mateusz Murawski - LSI SAS based MegaRAID
+	 * https://lkml.org/lkml/2011/9/12/104
+	 * M. Nunberg - Dell PERC 5/i Integrated RAID Controller
+	 * http://lists.xen.org/archives/html/xen-devel/2010-05/msg01563.html */
+	{
+		PCI_VENDOR_ID_LSI_LOGIC,
+		0x0411,
+		PCI_DEVFN(0x0e, 0),
+		PCI_DEVFN(0x08, 0)
+	},
+
+	/* Steven Dake, Markus Stockhausen - Mellanox 26428
+	 * https://bugzilla.redhat.com/show_bug.cgi?id=713774
+	 * Note: mellanox uses decimal product numbers, convert to hex for PCI
+	 * device ID. ie. 26428 == 0x673c */
+	{
+		PCI_VENDOR_ID_MELLANOX,
+		0x673c,
+		PCI_DEVFN(0x00, 0),
+		PCI_DEVFN(0x00, 6)
+	},
+
+	{ 0 }
+};
+
+u8 pci_requester(struct pci_dev *dev)
+{
+	const struct pci_dev_dma_source_map *i;
+
+	for (i = pci_dev_dma_source_map; i->devfn; i++) {
+		if ((i->vendor == dev->vendor) &&
+		    (i->device == dev->device) &&
+		    (i->devfn == dev->devfn)) {
+			return i->dma_devfn;
+		}
+	}
+	return dev->devfn;
+}
+
+
 static const struct pci_dev_acs_enabled {
 	u16 vendor;
 	u16 device;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index a13d6825e5..3214fd2514 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1632,6 +1632,8 @@  enum pci_fixup_pass {
 #ifdef CONFIG_PCI_QUIRKS
 void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev);
 struct pci_dev *pci_get_dma_source(struct pci_dev *dev);
+u8 pci_multi_requesters(struct pci_dev *dev);
+u8 pci_requester(struct pci_dev *dev);
 int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags);
 #else
 static inline void pci_fixup_device(enum pci_fixup_pass pass,
@@ -1640,6 +1642,14 @@  static inline struct pci_dev *pci_get_dma_source(struct pci_dev *dev)
 {
 	return pci_dev_get(dev);
 }
+u8 pci_multi_requesters(struct pci_dev *dev)
+{
+	return 0;
+}
+u8 pci_requester(struct pci_dev *dev)
+{
+	return dev->devfn;
+}
 static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev,
 					       u16 acs_flags)
 {