diff mbox

[v2,RESEND] Add NumaChip remote PCI support

Message ID 1353487196-10204-1-git-send-email-daniel@numascale-asia.com
State Superseded
Headers show

Commit Message

Daniel J Blueman Nov. 21, 2012, 8:39 a.m. UTC
Add NumaChip-specific PCI access mechanism via MMCONFIG cycles, but
preventing access to AMD Northbridges which shouldn't respond.

v2: Use PCI_DEVFN in precomputed constant limit; drop unneeded includes

Signed-off-by: Daniel J Blueman <daniel@numascale-asia.com>
---
 arch/x86/include/asm/numachip/numachip.h |   20 +++++
 arch/x86/kernel/apic/apic_numachip.c     |    2 +
 arch/x86/pci/Makefile                    |    1 +
 arch/x86/pci/numachip.c                  |  134 ++++++++++++++++++++++++++++++
 4 files changed, 157 insertions(+)
 create mode 100644 arch/x86/include/asm/numachip/numachip.h
 create mode 100644 arch/x86/pci/numachip.c

Comments

Bjorn Helgaas Nov. 28, 2012, 11:08 p.m. UTC | #1
On Wed, Nov 21, 2012 at 1:39 AM, Daniel J Blueman
<daniel@numascale-asia.com> wrote:
> Add NumaChip-specific PCI access mechanism via MMCONFIG cycles, but
> preventing access to AMD Northbridges which shouldn't respond.
>
> v2: Use PCI_DEVFN in precomputed constant limit; drop unneeded includes
>
> Signed-off-by: Daniel J Blueman <daniel@numascale-asia.com>
> ---
>  arch/x86/include/asm/numachip/numachip.h |   20 +++++
>  arch/x86/kernel/apic/apic_numachip.c     |    2 +
>  arch/x86/pci/Makefile                    |    1 +
>  arch/x86/pci/numachip.c                  |  134 ++++++++++++++++++++++++++++++
>  4 files changed, 157 insertions(+)
>  create mode 100644 arch/x86/include/asm/numachip/numachip.h
>  create mode 100644 arch/x86/pci/numachip.c
>
> diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h
> new file mode 100644
> index 0000000..d35e71a
> --- /dev/null
> +++ b/arch/x86/include/asm/numachip/numachip.h
> @@ -0,0 +1,20 @@
> +/*
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License.  See the file "COPYING" in the main directory of this archive
> + * for more details.
> + *
> + * Numascale NumaConnect-specific header file
> + *
> + * Copyright (C) 2012 Numascale AS. All rights reserved.
> + *
> + * Send feedback to <support@numascale.com>
> + *
> + */
> +
> +#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H
> +#define _ASM_X86_NUMACHIP_NUMACHIP_H
> +
> +extern int __init pci_numachip_init(void);
> +
> +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */
> +
> diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
> index a65829a..9c2aa89 100644
> --- a/arch/x86/kernel/apic/apic_numachip.c
> +++ b/arch/x86/kernel/apic/apic_numachip.c
> @@ -22,6 +22,7 @@
>  #include <linux/hardirq.h>
>  #include <linux/delay.h>
>
> +#include <asm/numachip/numachip.h>
>  #include <asm/numachip/numachip_csr.h>
>  #include <asm/smp.h>
>  #include <asm/apic.h>
> @@ -179,6 +180,7 @@ static int __init numachip_system_init(void)
>                 return 0;
>
>         x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
> +       x86_init.pci.arch_init = pci_numachip_init;
>
>         map_csrs();
>
> diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
> index 3af5a1e..ee0af58 100644
> --- a/arch/x86/pci/Makefile
> +++ b/arch/x86/pci/Makefile
> @@ -16,6 +16,7 @@ obj-$(CONFIG_STA2X11)           += sta2x11-fixup.o
>  obj-$(CONFIG_X86_VISWS)                += visws.o
>
>  obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
> +obj-$(CONFIG_X86_NUMACHIP)     += numachip.o

It looks like this depends on CONFIG_PCI_MMCONFIG for
pci_mmconfig_lookup().  Are there config constraints that force
CONFIG_PCI_MMCONFIG=y when CONFIG_X86_NUMACHIP=y?

>  obj-$(CONFIG_X86_INTEL_MID)    += mrst.o
>
> diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
> new file mode 100644
> index 0000000..3773e05
> --- /dev/null
> +++ b/arch/x86/pci/numachip.c
> @@ -0,0 +1,129 @@
> +/*
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License.  See the file "COPYING" in the main directory of this archive
> + * for more details.
> + *
> + * Numascale NumaConnect-specific PCI code
> + *
> + * Copyright (C) 2012 Numascale AS. All rights reserved.
> + *
> + * Send feedback to <support@numascale.com>
> + *
> + * PCI accessor functions derived from mmconfig_64.c
> + *
> + */
> +
> +#include <linux/pci.h>
> +#include <asm/pci_x86.h>
> +
> +static u8 limit __read_mostly;
> +
> +static inline char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
> +{
> +       struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
> +
> +       if (cfg && cfg->virt)
> +               return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
> +       return NULL;
> +}

Most of this file is copied directly from mmconfig_64.c (as you
mentioned above).  I wonder if we could avoid the code duplication by
making the pci_dev_base() implementation in mmconfig_64.c a weak
definition.  Then you could just supply a non-weak pci_dev_base() here
that would override that default version.  Your version would look
something like:

  char __iomem *pci_dev_base(unsigned int seg, unsigned int bus,
unsigned int devfn)
  {
      struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);

      if (cfg && cfg->virt && devfn < limit)
          return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
      return NULL;
  }

That would be different from what you have in this patch because reads
& writes to devices above "limit" would return -EINVAL rather than 0
as you do here.  Would that be a problem?

> +static int pci_mmcfg_read_numachip(unsigned int seg, unsigned int bus,
> +                         unsigned int devfn, int reg, int len, u32 *value)
> +{
> +       char __iomem *addr;
> +
> +       /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
> +       if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
> +err:           *value = -1;
> +               return -EINVAL;
> +       }
> +
> +       /* Ensure AMD Northbridges don't decode reads to other devices */
> +       if (unlikely(bus == 0 && devfn >= limit)) {
> +               *value = -1;
> +               return 0;
> +       }
> +
> +       rcu_read_lock();
> +       addr = pci_dev_base(seg, bus, devfn);
> +       if (!addr) {
> +               rcu_read_unlock();
> +               goto err;
> +       }
> +
> +       switch (len) {
> +       case 1:
> +               *value = mmio_config_readb(addr + reg);
> +               break;
> +       case 2:
> +               *value = mmio_config_readw(addr + reg);
> +               break;
> +       case 4:
> +               *value = mmio_config_readl(addr + reg);
> +               break;
> +       }
> +       rcu_read_unlock();
> +
> +       return 0;
> +}
> +
> +static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus,
> +                          unsigned int devfn, int reg, int len, u32 value)
> +{
> +       char __iomem *addr;
> +
> +       /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
> +       if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
> +               return -EINVAL;
> +
> +       /* Ensure AMD Northbridges don't decode writes to other devices */
> +       if (unlikely(bus == 0 && devfn >= limit))
> +               return 0;
> +
> +       rcu_read_lock();
> +       addr = pci_dev_base(seg, bus, devfn);
> +       if (!addr) {
> +               rcu_read_unlock();
> +               return -EINVAL;
> +       }
> +
> +       switch (len) {
> +       case 1:
> +               mmio_config_writeb(addr + reg, value);
> +               break;
> +       case 2:
> +               mmio_config_writew(addr + reg, value);
> +               break;
> +       case 4:
> +               mmio_config_writel(addr + reg, value);
> +               break;
> +       }
> +       rcu_read_unlock();
> +
> +       return 0;
> +}
> +
> +const struct pci_raw_ops pci_mmcfg_numachip = {
> +       .read = pci_mmcfg_read_numachip,
> +       .write = pci_mmcfg_write_numachip,
> +};
> +
> +int __init pci_numachip_init(void)
> +{
> +       int ret = 0;
> +       u32 val;
> +
> +       /* For remote I/O, restrict bus 0 access to the actual number of AMD
> +          Northbridges, which starts at device number 0x18 */
> +       ret = raw_pci_read(0, 0, PCI_DEVFN(0x18, 0), 0x60, sizeof(val), &val);
> +       if (ret)
> +               goto out;
> +
> +       /* HyperTransport fabric size in bits 6:4 */
> +       limit = PCI_DEVFN(0x18 + ((val >> 4) & 7) + 1, 0);
> +
> +       /* Use NumaChip PCI accessors for non-extended and extended access */
> +       raw_pci_ops = raw_pci_ext_ops = &pci_mmcfg_numachip;
> +out:
> +       return ret;
> +}
> --
> 1.7.9.5
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel J Blueman Nov. 30, 2012, 5:28 a.m. UTC | #2
Hi Bjorn,

On 29/11/2012 07:08, Bjorn Helgaas wrote:
> On Wed, Nov 21, 2012 at 1:39 AM, Daniel J Blueman
> <daniel@numascale-asia.com> wrote:
>> Add NumaChip-specific PCI access mechanism via MMCONFIG cycles, but
>> preventing access to AMD Northbridges which shouldn't respond.
>>
>> v2: Use PCI_DEVFN in precomputed constant limit; drop unneeded includes
>>
>> Signed-off-by: Daniel J Blueman <daniel@numascale-asia.com>
>> ---
>>   arch/x86/include/asm/numachip/numachip.h |   20 +++++
>>   arch/x86/kernel/apic/apic_numachip.c     |    2 +
>>   arch/x86/pci/Makefile                    |    1 +
>>   arch/x86/pci/numachip.c                  |  134 ++++++++++++++++++++++++++++++
>>   4 files changed, 157 insertions(+)
>>   create mode 100644 arch/x86/include/asm/numachip/numachip.h
>>   create mode 100644 arch/x86/pci/numachip.c
>>
>> diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h
>> new file mode 100644
>> index 0000000..d35e71a
>> --- /dev/null
>> +++ b/arch/x86/include/asm/numachip/numachip.h
>> @@ -0,0 +1,20 @@
>> +/*
>> + * This file is subject to the terms and conditions of the GNU General Public
>> + * License.  See the file "COPYING" in the main directory of this archive
>> + * for more details.
>> + *
>> + * Numascale NumaConnect-specific header file
>> + *
>> + * Copyright (C) 2012 Numascale AS. All rights reserved.
>> + *
>> + * Send feedback to <support@numascale.com>
>> + *
>> + */
>> +
>> +#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H
>> +#define _ASM_X86_NUMACHIP_NUMACHIP_H
>> +
>> +extern int __init pci_numachip_init(void);
>> +
>> +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */
>> +
>> diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
>> index a65829a..9c2aa89 100644
>> --- a/arch/x86/kernel/apic/apic_numachip.c
>> +++ b/arch/x86/kernel/apic/apic_numachip.c
>> @@ -22,6 +22,7 @@
>>   #include <linux/hardirq.h>
>>   #include <linux/delay.h>
>>
>> +#include <asm/numachip/numachip.h>
>>   #include <asm/numachip/numachip_csr.h>
>>   #include <asm/smp.h>
>>   #include <asm/apic.h>
>> @@ -179,6 +180,7 @@ static int __init numachip_system_init(void)
>>                  return 0;
>>
>>          x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
>> +       x86_init.pci.arch_init = pci_numachip_init;
>>
>>          map_csrs();
>>
>> diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
>> index 3af5a1e..ee0af58 100644
>> --- a/arch/x86/pci/Makefile
>> +++ b/arch/x86/pci/Makefile
>> @@ -16,6 +16,7 @@ obj-$(CONFIG_STA2X11)           += sta2x11-fixup.o
>>   obj-$(CONFIG_X86_VISWS)                += visws.o
>>
>>   obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
>> +obj-$(CONFIG_X86_NUMACHIP)     += numachip.o
>
> It looks like this depends on CONFIG_PCI_MMCONFIG for
> pci_mmconfig_lookup().  Are there config constraints that force
> CONFIG_PCI_MMCONFIG=y when CONFIG_X86_NUMACHIP=y?

I'll revise the patch with this constraint after we work out the best 
approach for below.

>>   obj-$(CONFIG_X86_INTEL_MID)    += mrst.o
>>
>> diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
>> new file mode 100644
>> index 0000000..3773e05
>> --- /dev/null
>> +++ b/arch/x86/pci/numachip.c
>> @@ -0,0 +1,129 @@
>> +/*
>> + * This file is subject to the terms and conditions of the GNU General Public
>> + * License.  See the file "COPYING" in the main directory of this archive
>> + * for more details.
>> + *
>> + * Numascale NumaConnect-specific PCI code
>> + *
>> + * Copyright (C) 2012 Numascale AS. All rights reserved.
>> + *
>> + * Send feedback to <support@numascale.com>
>> + *
>> + * PCI accessor functions derived from mmconfig_64.c
>> + *
>> + */
>> +
>> +#include <linux/pci.h>
>> +#include <asm/pci_x86.h>
>> +
>> +static u8 limit __read_mostly;
>> +
>> +static inline char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
>> +{
>> +       struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
>> +
>> +       if (cfg && cfg->virt)
>> +               return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
>> +       return NULL;
>> +}
>
> Most of this file is copied directly from mmconfig_64.c (as you
> mentioned above).  I wonder if we could avoid the code duplication by
> making the pci_dev_base() implementation in mmconfig_64.c a weak
> definition.  Then you could just supply a non-weak pci_dev_base() here
> that would override that default version.  Your version would look
> something like:
>
>    char __iomem *pci_dev_base(unsigned int seg, unsigned int bus,
> unsigned int devfn)
>    {
>        struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
>
>        if (cfg && cfg->virt && devfn < limit)
>            return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
>        return NULL;
>    }
>
> That would be different from what you have in this patch because reads
> & writes to devices above "limit" would return -EINVAL rather than 0
> as you do here.  Would that be a problem?

That would work nicely (pointer lookup and inlining etc aside) if there 
was the runtime ability to override pci_dev_base only if the NumaChip 
signature was detected.

We could expose pci_dev_base via struct x86_init_pci; the extra 
complexity and performance tradeoff may not be worth it for a single 
case perhaps?

Thanks,
   Daniel
Bjorn Helgaas Nov. 30, 2012, 4:45 p.m. UTC | #3
On Thu, Nov 29, 2012 at 10:28 PM, Daniel J Blueman
<daniel@numascale-asia.com> wrote:
> Hi Bjorn,
>
>
> On 29/11/2012 07:08, Bjorn Helgaas wrote:
>>
>> On Wed, Nov 21, 2012 at 1:39 AM, Daniel J Blueman
>> <daniel@numascale-asia.com> wrote:
>>>
>>> Add NumaChip-specific PCI access mechanism via MMCONFIG cycles, but
>>> preventing access to AMD Northbridges which shouldn't respond.
>>>
>>> v2: Use PCI_DEVFN in precomputed constant limit; drop unneeded includes
>>>
>>> Signed-off-by: Daniel J Blueman <daniel@numascale-asia.com>
>>> ---
>>>   arch/x86/include/asm/numachip/numachip.h |   20 +++++
>>>   arch/x86/kernel/apic/apic_numachip.c     |    2 +
>>>   arch/x86/pci/Makefile                    |    1 +
>>>   arch/x86/pci/numachip.c                  |  134
>>> ++++++++++++++++++++++++++++++
>>>   4 files changed, 157 insertions(+)
>>>   create mode 100644 arch/x86/include/asm/numachip/numachip.h
>>>   create mode 100644 arch/x86/pci/numachip.c
>>>
>>> diff --git a/arch/x86/include/asm/numachip/numachip.h
>>> b/arch/x86/include/asm/numachip/numachip.h
>>> new file mode 100644
>>> index 0000000..d35e71a
>>> --- /dev/null
>>> +++ b/arch/x86/include/asm/numachip/numachip.h
>>> @@ -0,0 +1,20 @@
>>> +/*
>>> + * This file is subject to the terms and conditions of the GNU General
>>> Public
>>> + * License.  See the file "COPYING" in the main directory of this
>>> archive
>>> + * for more details.
>>> + *
>>> + * Numascale NumaConnect-specific header file
>>> + *
>>> + * Copyright (C) 2012 Numascale AS. All rights reserved.
>>> + *
>>> + * Send feedback to <support@numascale.com>
>>> + *
>>> + */
>>> +
>>> +#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H
>>> +#define _ASM_X86_NUMACHIP_NUMACHIP_H
>>> +
>>> +extern int __init pci_numachip_init(void);
>>> +
>>> +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */
>>> +
>>> diff --git a/arch/x86/kernel/apic/apic_numachip.c
>>> b/arch/x86/kernel/apic/apic_numachip.c
>>> index a65829a..9c2aa89 100644
>>> --- a/arch/x86/kernel/apic/apic_numachip.c
>>> +++ b/arch/x86/kernel/apic/apic_numachip.c
>>> @@ -22,6 +22,7 @@
>>>   #include <linux/hardirq.h>
>>>   #include <linux/delay.h>
>>>
>>> +#include <asm/numachip/numachip.h>
>>>   #include <asm/numachip/numachip_csr.h>
>>>   #include <asm/smp.h>
>>>   #include <asm/apic.h>
>>> @@ -179,6 +180,7 @@ static int __init numachip_system_init(void)
>>>                  return 0;
>>>
>>>          x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
>>> +       x86_init.pci.arch_init = pci_numachip_init;
>>>
>>>          map_csrs();
>>>
>>> diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
>>> index 3af5a1e..ee0af58 100644
>>> --- a/arch/x86/pci/Makefile
>>> +++ b/arch/x86/pci/Makefile
>>> @@ -16,6 +16,7 @@ obj-$(CONFIG_STA2X11)           += sta2x11-fixup.o
>>>   obj-$(CONFIG_X86_VISWS)                += visws.o
>>>
>>>   obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
>>> +obj-$(CONFIG_X86_NUMACHIP)     += numachip.o
>>
>>
>> It looks like this depends on CONFIG_PCI_MMCONFIG for
>> pci_mmconfig_lookup().  Are there config constraints that force
>> CONFIG_PCI_MMCONFIG=y when CONFIG_X86_NUMACHIP=y?
>
>
> I'll revise the patch with this constraint after we work out the best
> approach for below.
>
>
>>>   obj-$(CONFIG_X86_INTEL_MID)    += mrst.o
>>>
>>> diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
>>> new file mode 100644
>>> index 0000000..3773e05
>>> --- /dev/null
>>> +++ b/arch/x86/pci/numachip.c
>>> @@ -0,0 +1,129 @@
>>> +/*
>>> + * This file is subject to the terms and conditions of the GNU General
>>> Public
>>> + * License.  See the file "COPYING" in the main directory of this
>>> archive
>>> + * for more details.
>>> + *
>>> + * Numascale NumaConnect-specific PCI code
>>> + *
>>> + * Copyright (C) 2012 Numascale AS. All rights reserved.
>>> + *
>>> + * Send feedback to <support@numascale.com>
>>> + *
>>> + * PCI accessor functions derived from mmconfig_64.c
>>> + *
>>> + */
>>> +
>>> +#include <linux/pci.h>
>>> +#include <asm/pci_x86.h>
>>> +
>>> +static u8 limit __read_mostly;
>>> +
>>> +static inline char __iomem *pci_dev_base(unsigned int seg, unsigned int
>>> bus, unsigned int devfn)
>>> +{
>>> +       struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
>>> +
>>> +       if (cfg && cfg->virt)
>>> +               return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn <<
>>> 12));
>>> +       return NULL;
>>> +}
>>
>>
>> Most of this file is copied directly from mmconfig_64.c (as you
>> mentioned above).  I wonder if we could avoid the code duplication by
>> making the pci_dev_base() implementation in mmconfig_64.c a weak
>> definition.  Then you could just supply a non-weak pci_dev_base() here
>> that would override that default version.  Your version would look
>> something like:
>>
>>    char __iomem *pci_dev_base(unsigned int seg, unsigned int bus,
>> unsigned int devfn)
>>    {
>>        struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
>>
>>        if (cfg && cfg->virt && devfn < limit)
>>            return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
>>        return NULL;
>>    }
>>
>> That would be different from what you have in this patch because reads
>> & writes to devices above "limit" would return -EINVAL rather than 0
>> as you do here.  Would that be a problem?
>
>
> That would work nicely (pointer lookup and inlining etc aside) if there was
> the runtime ability to override pci_dev_base only if the NumaChip signature
> was detected.
>
> We could expose pci_dev_base via struct x86_init_pci; the extra complexity
> and performance tradeoff may not be worth it for a single case perhaps?

Oh, right, I forgot that you can't decide this at build-time.  This is
PCI config access, which is not a performance path, so I'm not really
concerned about it from that angle, but you make a good point about
the complexity.

The reason I'm interested in this is because MMCONFIG is a generic
PCIe feature but is currently done via several arch-specific
implementations, so I'm starting to think about how we can make parts
of it more generic.  From that perspective, it's nicer to parameterize
an existing implementation than to clone it because it makes
refactoring opportunities more obvious.

Backing up a bit, I'm curious about exactly why you need to check for
the limit to begin with.  The comment says "Ensure AMD Northbridges
don't decode reads to other devices," but that doesn't seem strictly
accurate.  You're not changing anything in the hardware to prevent it
from *decoding* a read, so it seems like you're actually just
preventing the read in the first place.

What happens without the limit check?  Do you get a response timeout
and a machine check?  Read from the wrong device?

As far as I can tell, you still describe your MMCONFIG area with an
MCFG table (since you use pci_mmconfig_lookup() to find the region).
That table only includes the starting and ending bus numbers, so the
assumption is that the MMCONFIG space is valid for every possible
device on those buses.  So it seems like your system is not really
compatible with the spec here.

Because the MCFG table can't describe finer granularity than start/end
bus numbers, we manage MMCONFIG regions as (segment, start_bus,
end_bus, address) tuples.  Maybe if we tracked it with slightly finer
granularity, e.g., (segment, start_bus, end_bus, end_bus_device,
address), you could have some sort of MCFG-parsing quirk that reduces
the size of the MMCONFIG region you register for bus 0.

Just brainstorming here; it's not obvious to me yet what the best solution is.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steffen Persvold Nov. 30, 2012, 5:41 p.m. UTC | #4
Hi Bjorn,

On 11/30/2012 17:45, Bjorn Helgaas wrote:
> On Thu, Nov 29, 2012 at 10:28 PM, Daniel J Blueman
[]
>> We could expose pci_dev_base via struct x86_init_pci; the extra complexity
>> and performance tradeoff may not be worth it for a single case perhaps?
>
> Oh, right, I forgot that you can't decide this at build-time.  This is
> PCI config access, which is not a performance path, so I'm not really
> concerned about it from that angle, but you make a good point about
> the complexity.
>
> The reason I'm interested in this is because MMCONFIG is a generic
> PCIe feature but is currently done via several arch-specific
> implementations, so I'm starting to think about how we can make parts
> of it more generic.  From that perspective, it's nicer to parameterize
> an existing implementation than to clone it because it makes
> refactoring opportunities more obvious.
>
> Backing up a bit, I'm curious about exactly why you need to check for
> the limit to begin with.  The comment says "Ensure AMD Northbridges
> don't decode reads to other devices," but that doesn't seem strictly
> accurate.  You're not changing anything in the hardware to prevent it
> from *decoding* a read, so it seems like you're actually just
> preventing the read in the first place.
>
> What happens without the limit check?  Do you get a response timeout
> and a machine check?  Read from the wrong device?'

The latter. I'm not sure how familiar you are with how pci config reads 
are decoded and handled on coherent hypertransport fabrics; The way it 
works *within* one coherent HT fabric is that the CPU will redirect all 
config space access above a configured max HT node (a setting in the AMD 
northbridge) to a specific I/O link (non-coherent link) which usually 
links up with a "southbridge" device that responds with a target abort 
(non-existing device).

However, this only works when a CPU core is accessing local HT devices. 
In our architecture, we "glue" together multiple HT fabrics and when a 
CPU core sends a pci config space request (mmconfig) to a remote machine 
(via our hardware) this re-direction is not applied anymore. The result 
is that when a mmconfig read comes in to a coherent HT device on bus00 
which is non-existent, one of the other HT nodes on that remote node 
will respond to the read, leading to "phantom" devices (i.e lspci will 
show more HT northbridges than what's really physically present) *or* 
worst case scenario will be that the transaction hangs (alternatively 
times out, leading to MCE and other bad things).

This is why we're checking accesses to bus0, device24-31 and returning a 
"fake" target abort scenario if the access was to a non-existing HT 
device. In other words, we're doing in software what a "normal" HT based 
platform would do in hardware.

>
> As far as I can tell, you still describe your MMCONFIG area with an
> MCFG table (since you use pci_mmconfig_lookup() to find the region).
> That table only includes the starting and ending bus numbers, so the
> assumption is that the MMCONFIG space is valid for every possible
> device on those buses.  So it seems like your system is not really
> compatible with the spec here.
>
> Because the MCFG table can't describe finer granularity than start/end
> bus numbers, we manage MMCONFIG regions as (segment, start_bus,
> end_bus, address) tuples.  Maybe if we tracked it with slightly finer
> granularity, e.g., (segment, start_bus, end_bus, end_bus_device,
> address), you could have some sort of MCFG-parsing quirk that reduces
> the size of the MMCONFIG region you register for bus 0.
>
> Just brainstorming here; it's not obvious to me yet what the best solution is.
>
> Bjorn
>

Kind regards,
Daniel J Blueman Dec. 6, 2012, 8:16 a.m. UTC | #5
On 01/12/2012 00:45, Bjorn Helgaas wrote:
> On Thu, Nov 29, 2012 at 10:28 PM, Daniel J Blueman
>> On 29/11/2012 07:08, Bjorn Helgaas wrote:
>>> On Wed, Nov 21, 2012 at 1:39 AM, Daniel J Blueman
>>> <daniel@numascale-asia.com> wrote:
>>>>
>>>> Add NumaChip-specific PCI access mechanism via MMCONFIG cycles, but
>>>> preventing access to AMD Northbridges which shouldn't respond.
>>>>
>>>> v2: Use PCI_DEVFN in precomputed constant limit; drop unneeded includes
>>>>
>>>> Signed-off-by: Daniel J Blueman <daniel@numascale-asia.com>
>>>> ---
>>>>    arch/x86/include/asm/numachip/numachip.h |   20 +++++
>>>>    arch/x86/kernel/apic/apic_numachip.c     |    2 +
>>>>    arch/x86/pci/Makefile                    |    1 +
>>>>    arch/x86/pci/numachip.c                  |  134
>>>> ++++++++++++++++++++++++++++++
>>>>    4 files changed, 157 insertions(+)
>>>>    create mode 100644 arch/x86/include/asm/numachip/numachip.h
>>>>    create mode 100644 arch/x86/pci/numachip.c
>>>>
>>>> diff --git a/arch/x86/include/asm/numachip/numachip.h
>>>> b/arch/x86/include/asm/numachip/numachip.h
>>>> new file mode 100644
>>>> index 0000000..d35e71a
>>>> --- /dev/null
>>>> +++ b/arch/x86/include/asm/numachip/numachip.h
>>>> @@ -0,0 +1,20 @@
>>>> +/*
>>>> + * This file is subject to the terms and conditions of the GNU General
>>>> Public
>>>> + * License.  See the file "COPYING" in the main directory of this
>>>> archive
>>>> + * for more details.
>>>> + *
>>>> + * Numascale NumaConnect-specific header file
>>>> + *
>>>> + * Copyright (C) 2012 Numascale AS. All rights reserved.
>>>> + *
>>>> + * Send feedback to <support@numascale.com>
>>>> + *
>>>> + */
>>>> +
>>>> +#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H
>>>> +#define _ASM_X86_NUMACHIP_NUMACHIP_H
>>>> +
>>>> +extern int __init pci_numachip_init(void);
>>>> +
>>>> +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */
>>>> +
>>>> diff --git a/arch/x86/kernel/apic/apic_numachip.c
>>>> b/arch/x86/kernel/apic/apic_numachip.c
>>>> index a65829a..9c2aa89 100644
>>>> --- a/arch/x86/kernel/apic/apic_numachip.c
>>>> +++ b/arch/x86/kernel/apic/apic_numachip.c
>>>> @@ -22,6 +22,7 @@
>>>>    #include <linux/hardirq.h>
>>>>    #include <linux/delay.h>
>>>>
>>>> +#include <asm/numachip/numachip.h>
>>>>    #include <asm/numachip/numachip_csr.h>
>>>>    #include <asm/smp.h>
>>>>    #include <asm/apic.h>
>>>> @@ -179,6 +180,7 @@ static int __init numachip_system_init(void)
>>>>                   return 0;
>>>>
>>>>           x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
>>>> +       x86_init.pci.arch_init = pci_numachip_init;
>>>>
>>>>           map_csrs();
>>>>
>>>> diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
>>>> index 3af5a1e..ee0af58 100644
>>>> --- a/arch/x86/pci/Makefile
>>>> +++ b/arch/x86/pci/Makefile
>>>> @@ -16,6 +16,7 @@ obj-$(CONFIG_STA2X11)           += sta2x11-fixup.o
>>>>    obj-$(CONFIG_X86_VISWS)                += visws.o
>>>>
>>>>    obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
>>>> +obj-$(CONFIG_X86_NUMACHIP)     += numachip.o
>>>
>>>
>>> It looks like this depends on CONFIG_PCI_MMCONFIG for
>>> pci_mmconfig_lookup().  Are there config constraints that force
>>> CONFIG_PCI_MMCONFIG=y when CONFIG_X86_NUMACHIP=y?
>>
>>
>> I'll revise the patch with this constraint after we work out the best
>> approach for below.
>>
>>
>>>>    obj-$(CONFIG_X86_INTEL_MID)    += mrst.o
>>>>
>>>> diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
>>>> new file mode 100644
>>>> index 0000000..3773e05
>>>> --- /dev/null
>>>> +++ b/arch/x86/pci/numachip.c
>>>> @@ -0,0 +1,129 @@
>>>> +/*
>>>> + * This file is subject to the terms and conditions of the GNU General
>>>> Public
>>>> + * License.  See the file "COPYING" in the main directory of this
>>>> archive
>>>> + * for more details.
>>>> + *
>>>> + * Numascale NumaConnect-specific PCI code
>>>> + *
>>>> + * Copyright (C) 2012 Numascale AS. All rights reserved.
>>>> + *
>>>> + * Send feedback to <support@numascale.com>
>>>> + *
>>>> + * PCI accessor functions derived from mmconfig_64.c
>>>> + *
>>>> + */
>>>> +
>>>> +#include <linux/pci.h>
>>>> +#include <asm/pci_x86.h>
>>>> +
>>>> +static u8 limit __read_mostly;
>>>> +
>>>> +static inline char __iomem *pci_dev_base(unsigned int seg, unsigned int
>>>> bus, unsigned int devfn)
>>>> +{
>>>> +       struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
>>>> +
>>>> +       if (cfg && cfg->virt)
>>>> +               return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn <<
>>>> 12));
>>>> +       return NULL;
>>>> +}
>>>
>>>
>>> Most of this file is copied directly from mmconfig_64.c (as you
>>> mentioned above).  I wonder if we could avoid the code duplication by
>>> making the pci_dev_base() implementation in mmconfig_64.c a weak
>>> definition.  Then you could just supply a non-weak pci_dev_base() here
>>> that would override that default version.  Your version would look
>>> something like:
>>>
>>>     char __iomem *pci_dev_base(unsigned int seg, unsigned int bus,
>>> unsigned int devfn)
>>>     {
>>>         struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
>>>
>>>         if (cfg && cfg->virt && devfn < limit)
>>>             return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
>>>         return NULL;
>>>     }
>>>
>>> That would be different from what you have in this patch because reads
>>> & writes to devices above "limit" would return -EINVAL rather than 0
>>> as you do here.  Would that be a problem?
>>
>>
>> That would work nicely (pointer lookup and inlining etc aside) if there was
>> the runtime ability to override pci_dev_base only if the NumaChip signature
>> was detected.
>>
>> We could expose pci_dev_base via struct x86_init_pci; the extra complexity
>> and performance tradeoff may not be worth it for a single case perhaps?
>
> Oh, right, I forgot that you can't decide this at build-time.  This is
> PCI config access, which is not a performance path, so I'm not really
> concerned about it from that angle, but you make a good point about
> the complexity.
>
> The reason I'm interested in this is because MMCONFIG is a generic
> PCIe feature but is currently done via several arch-specific
> implementations, so I'm starting to think about how we can make parts
> of it more generic.  From that perspective, it's nicer to parameterize
> an existing implementation than to clone it because it makes
> refactoring opportunities more obvious.
>
> Backing up a bit, I'm curious about exactly why you need to check for
> the limit to begin with.  The comment says "Ensure AMD Northbridges
> don't decode reads to other devices," but that doesn't seem strictly
> accurate.  You're not changing anything in the hardware to prevent it
> from *decoding* a read, so it seems like you're actually just
> preventing the read in the first place.
>
> What happens without the limit check?  Do you get a response timeout
> and a machine check?  Read from the wrong device?
>
> As far as I can tell, you still describe your MMCONFIG area with an
> MCFG table (since you use pci_mmconfig_lookup() to find the region).
> That table only includes the starting and ending bus numbers, so the
> assumption is that the MMCONFIG space is valid for every possible
> device on those buses.  So it seems like your system is not really
> compatible with the spec here.
>
> Because the MCFG table can't describe finer granularity than start/end
> bus numbers, we manage MMCONFIG regions as (segment, start_bus,
> end_bus, address) tuples.  Maybe if we tracked it with slightly finer
> granularity, e.g., (segment, start_bus, end_bus, end_bus_device,
> address), you could have some sort of MCFG-parsing quirk that reduces
> the size of the MMCONFIG region you register for bus 0.
>
> Just brainstorming here; it's not obvious to me yet what the best solution is.

The main intent with the approach I chose was to ensure zero additional 
code/overhead when CONFIG_NUMACHIP isn't defined, and as a side-effect, 
all changes are scoped within CONFIG_NUMACHIP, so there's no potential 
for side-effects.

It's possible to add a function pointer to struct x86_init_pci to 
abstract the base address calculation, but it's a pity to do this just 
for one need (ie NumaChip) and add indirection even when CONFIG_NUMACHIP 
is not defined.

I revised the patch with the constraints a few days back, so hope it 
looks good otherwise.

Let me know if abstracting the base address calculation via struct 
x86_init_pci sounds like a better plan (albeit missing the 3.8 merge 
window).

Thanks Bjorn,
   Daniel
diff mbox

Patch

diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h
new file mode 100644
index 0000000..d35e71a
--- /dev/null
+++ b/arch/x86/include/asm/numachip/numachip.h
@@ -0,0 +1,20 @@ 
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-specific header file
+ *
+ * Copyright (C) 2012 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H
+#define _ASM_X86_NUMACHIP_NUMACHIP_H
+
+extern int __init pci_numachip_init(void);
+
+#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */
+
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index a65829a..9c2aa89 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -22,6 +22,7 @@ 
 #include <linux/hardirq.h>
 #include <linux/delay.h>
 
+#include <asm/numachip/numachip.h>
 #include <asm/numachip/numachip_csr.h>
 #include <asm/smp.h>
 #include <asm/apic.h>
@@ -179,6 +180,7 @@  static int __init numachip_system_init(void)
 		return 0;
 
 	x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+	x86_init.pci.arch_init = pci_numachip_init;
 
 	map_csrs();
 
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 3af5a1e..ee0af58 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -16,6 +16,7 @@  obj-$(CONFIG_STA2X11)           += sta2x11-fixup.o
 obj-$(CONFIG_X86_VISWS)		+= visws.o
 
 obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
+obj-$(CONFIG_X86_NUMACHIP)	+= numachip.o
 
 obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
 
diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
new file mode 100644
index 0000000..3773e05
--- /dev/null
+++ b/arch/x86/pci/numachip.c
@@ -0,0 +1,129 @@ 
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-specific PCI code
+ *
+ * Copyright (C) 2012 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ * PCI accessor functions derived from mmconfig_64.c
+ *
+ */
+
+#include <linux/pci.h>
+#include <asm/pci_x86.h>
+
+static u8 limit __read_mostly;
+
+static inline char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
+{
+	struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
+
+	if (cfg && cfg->virt)
+		return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
+	return NULL;
+}
+
+static int pci_mmcfg_read_numachip(unsigned int seg, unsigned int bus,
+			  unsigned int devfn, int reg, int len, u32 *value)
+{
+	char __iomem *addr;
+
+	/* Why do we have this when nobody checks it. How about a BUG()!? -AK */
+	if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
+err:		*value = -1;
+		return -EINVAL;
+	}
+
+	/* Ensure AMD Northbridges don't decode reads to other devices */
+	if (unlikely(bus == 0 && devfn >= limit)) {
+		*value = -1;
+		return 0;
+	}
+
+	rcu_read_lock();
+	addr = pci_dev_base(seg, bus, devfn);
+	if (!addr) {
+		rcu_read_unlock();
+		goto err;
+	}
+
+	switch (len) {
+	case 1:
+		*value = mmio_config_readb(addr + reg);
+		break;
+	case 2:
+		*value = mmio_config_readw(addr + reg);
+		break;
+	case 4:
+		*value = mmio_config_readl(addr + reg);
+		break;
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus,
+			   unsigned int devfn, int reg, int len, u32 value)
+{
+	char __iomem *addr;
+
+	/* Why do we have this when nobody checks it. How about a BUG()!? -AK */
+	if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
+		return -EINVAL;
+
+	/* Ensure AMD Northbridges don't decode writes to other devices */
+	if (unlikely(bus == 0 && devfn >= limit))
+		return 0;
+
+	rcu_read_lock();
+	addr = pci_dev_base(seg, bus, devfn);
+	if (!addr) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	switch (len) {
+	case 1:
+		mmio_config_writeb(addr + reg, value);
+		break;
+	case 2:
+		mmio_config_writew(addr + reg, value);
+		break;
+	case 4:
+		mmio_config_writel(addr + reg, value);
+		break;
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+const struct pci_raw_ops pci_mmcfg_numachip = {
+	.read = pci_mmcfg_read_numachip,
+	.write = pci_mmcfg_write_numachip,
+};
+
+int __init pci_numachip_init(void)
+{
+	int ret = 0;
+	u32 val;
+
+	/* For remote I/O, restrict bus 0 access to the actual number of AMD
+	   Northbridges, which starts at device number 0x18 */
+	ret = raw_pci_read(0, 0, PCI_DEVFN(0x18, 0), 0x60, sizeof(val), &val);
+	if (ret)
+		goto out;
+
+	/* HyperTransport fabric size in bits 6:4 */
+	limit = PCI_DEVFN(0x18 + ((val >> 4) & 7) + 1, 0);
+
+	/* Use NumaChip PCI accessors for non-extended and extended access */
+	raw_pci_ops = raw_pci_ext_ops = &pci_mmcfg_numachip;
+out:
+	return ret;
+}