diff mbox

[V5,01/15] ACPI: MCFG: Move mmcfg_list management to drivers/acpi

Message ID 1455630825-27253-2-git-send-email-tn@semihalf.com
State Changes Requested
Headers show

Commit Message

Tomasz Nowicki Feb. 16, 2016, 1:53 p.m. UTC
From: Jayachandran C <jchandra@broadcom.com>

Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
to share the API and code with ARM64 later. The corresponding
declarations are moved from asm/pci_x86.h to linux/pci-acpi.h

As a part of this we introduce three functions that can be
implemented by the arch code: pci_mmconfig_map_resource() to map a
mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
unmap and pci_mmconfig_enabled to see if the arch setup of
mcfg entries was successful. We also provide weak implementations
of these, which will be used from ARM64. On x86, we retain the
old logic by providing platform specific implementation.

This patch is purely rearranging code, it should not have any
impact on the logic of MCFG parsing or list handling.

Signed-off-by: Jayachandran C <jchandra@broadcom.com>
[Xen parts:]
Acked-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/include/asm/pci_x86.h |  24 +---
 arch/x86/pci/mmconfig-shared.c | 269 +++++------------------------------
 arch/x86/pci/mmconfig_32.c     |   1 +
 arch/x86/pci/mmconfig_64.c     |   1 +
 arch/x86/pci/numachip.c        |   1 +
 drivers/acpi/Makefile          |   1 +
 drivers/acpi/pci_mcfg.c        | 312 +++++++++++++++++++++++++++++++++++++++++
 drivers/xen/pci.c              |   5 +-
 include/linux/pci-acpi.h       |  33 +++++
 9 files changed, 386 insertions(+), 261 deletions(-)
 create mode 100644 drivers/acpi/pci_mcfg.c

Comments

Lorenzo Pieralisi Feb. 17, 2016, 11 a.m. UTC | #1
On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
> From: Jayachandran C <jchandra@broadcom.com>
> 
> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
> to share the API and code with ARM64 later. The corresponding
> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
> 
> As a part of this we introduce three functions that can be
> implemented by the arch code: pci_mmconfig_map_resource() to map a
> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
> unmap and pci_mmconfig_enabled to see if the arch setup of
> mcfg entries was successful. We also provide weak implementations
> of these, which will be used from ARM64. On x86, we retain the
> old logic by providing platform specific implementation.
> 
> This patch is purely rearranging code, it should not have any
> impact on the logic of MCFG parsing or list handling.
> 
> Signed-off-by: Jayachandran C <jchandra@broadcom.com>
> [Xen parts:]
> Acked-by: David Vrabel <david.vrabel@citrix.com>
> ---
>  arch/x86/include/asm/pci_x86.h |  24 +---
>  arch/x86/pci/mmconfig-shared.c | 269 +++++------------------------------
>  arch/x86/pci/mmconfig_32.c     |   1 +
>  arch/x86/pci/mmconfig_64.c     |   1 +
>  arch/x86/pci/numachip.c        |   1 +
>  drivers/acpi/Makefile          |   1 +
>  drivers/acpi/pci_mcfg.c        | 312 +++++++++++++++++++++++++++++++++++++++++
>  drivers/xen/pci.c              |   5 +-
>  include/linux/pci-acpi.h       |  33 +++++
>  9 files changed, 386 insertions(+), 261 deletions(-)
>  create mode 100644 drivers/acpi/pci_mcfg.c

This patch makes perfect sense to me and manages to move MCFG handling
to common code in a seamless manner, it is basically a code move with
weak functions to cater for X86 specific legacy bits which are otherwise
pretty complex to untangle, so (apart from a few nits below):

Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>

[...]

> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
> new file mode 100644
> index 0000000..ea84365
> --- /dev/null
> +++ b/drivers/acpi/pci_mcfg.c
> @@ -0,0 +1,312 @@
> +/*
> + * pci_mcfg.c
> + *
> + * Common code to maintain the MCFG areas and mappings
> + *
> + * This has been extracted from arch/x86/pci/mmconfig-shared.c
> + * and moved here so that other architectures can use this code.
> + */
> +
> +#include <linux/pci.h>
> +#include <linux/init.h>
> +#include <linux/dmi.h>
> +#include <linux/pci-acpi.h>
> +#include <linux/sfi_acpi.h>
> +#include <linux/slab.h>
> +#include <linux/mutex.h>
> +#include <linux/rculist.h>

Nit: while at it order them alphabetically.

> +
> +#define PREFIX	"ACPI: "
> +
> +static DEFINE_MUTEX(pci_mmcfg_lock);
> +LIST_HEAD(pci_mmcfg_list);
> +
> +static void list_add_sorted(struct pci_mmcfg_region *new)
> +{
> +	struct pci_mmcfg_region *cfg;
> +
> +	/* keep list sorted by segment and starting bus number */
> +	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
> +		if (cfg->segment > new->segment ||
> +		    (cfg->segment == new->segment &&
> +		     cfg->start_bus >= new->start_bus)) {
> +			list_add_tail_rcu(&new->list, &cfg->list);
> +			return;
> +		}
> +	}
> +	list_add_tail_rcu(&new->list, &pci_mmcfg_list);
> +}
> +
> +static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
> +						   int end, u64 addr)
> +{
> +	struct pci_mmcfg_region *new;
> +	struct resource *res;
> +
> +	if (addr == 0)
> +		return NULL;
> +
> +	new = kzalloc(sizeof(*new), GFP_KERNEL);
> +	if (!new)
> +		return NULL;
> +
> +	new->address = addr;
> +	new->segment = segment;
> +	new->start_bus = start;
> +	new->end_bus = end;
> +
> +	res = &new->res;
> +	res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
> +	res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
> +	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> +	snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
> +		 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
> +	res->name = new->name;
> +
> +	return new;
> +}
> +
> +struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
> +							int end, u64 addr)
> +{
> +	struct pci_mmcfg_region *new;
> +
> +	new = pci_mmconfig_alloc(segment, start, end, addr);
> +	if (new) {
> +		mutex_lock(&pci_mmcfg_lock);
> +		list_add_sorted(new);
> +		mutex_unlock(&pci_mmcfg_lock);
> +
> +		pr_info(PREFIX
> +		       "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
> +		       "(base %#lx)\n",
> +		       segment, start, end, &new->res, (unsigned long)addr);
> +	}
> +
> +	return new;
> +}
> +
> +struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
> +{
> +	struct pci_mmcfg_region *cfg;
> +
> +	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
> +		if (cfg->segment == segment &&
> +		    cfg->start_bus <= bus && bus <= cfg->end_bus)
> +			return cfg;
> +
> +	return NULL;
> +}
> +
> +/*
> + * Map a pci_mmcfg_region, can be overrriden by arch

s/overrriden/overridden/

[...]

> +static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
> +					struct acpi_mcfg_allocation *cfg)
> +{
> +	int year;
> +
> +	if (!config_enabled(CONFIG_X86))
> +		return 0;

This check in generic code may ruffle someone's feathers, I even think we
can run this function safely on ARM64 but to prevent surprises we'd better
keep the X86 check, alternatives like adding a weak function just for a
quirk do not make much sense to me.

Lorenzo

> +
> +	if (cfg->address < 0xFFFFFFFF)
> +		return 0;
> +
> +	if (!strncmp(mcfg->header.oem_id, "SGI", 3))
> +		return 0;
> +
> +	if (mcfg->header.revision >= 1) {
> +		if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
> +		    year >= 2010)
> +			return 0;
> +	}
> +
> +	pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
> +	       "is above 4GB, ignored\n", cfg->pci_segment,
> +	       cfg->start_bus_number, cfg->end_bus_number, cfg->address);
> +	return -EINVAL;
> +}
> +
> +static int __init pci_parse_mcfg(struct acpi_table_header *header)
> +{
> +	struct acpi_table_mcfg *mcfg;
> +	struct acpi_mcfg_allocation *cfg_table, *cfg;
> +	unsigned long i;
> +	int entries;
> +
> +	if (!header)
> +		return -EINVAL;
> +
> +	mcfg = (struct acpi_table_mcfg *)header;
> +
> +	/* how many config structures do we have */
> +	entries = 0;
> +	i = header->length - sizeof(struct acpi_table_mcfg);
> +	while (i >= sizeof(struct acpi_mcfg_allocation)) {
> +		entries++;
> +		i -= sizeof(struct acpi_mcfg_allocation);
> +	}
> +	if (entries == 0) {
> +		pr_err(PREFIX "MMCONFIG has no entries\n");
> +		return -ENODEV;
> +	}
> +
> +	cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
> +	for (i = 0; i < entries; i++) {
> +		cfg = &cfg_table[i];
> +		if (acpi_mcfg_check_entry(mcfg, cfg))
> +			return -ENODEV;
> +
> +		if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
> +				   cfg->end_bus_number, cfg->address) == NULL) {
> +			pr_warn(PREFIX "no memory for MCFG entries\n");
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +int __init pci_mmconfig_parse_table(void)
> +{
> +	return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
> +}
> +
> +void __weak __init pci_mmcfg_late_init(void)
> +{
> +	int err, n = 0;
> +	struct pci_mmcfg_region *cfg;
> +
> +	err = pci_mmconfig_parse_table();
> +	if (err) {
> +		pr_err(PREFIX " Failed to parse MCFG (%d)\n", err);
> +		return;
> +	}
> +
> +	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
> +		pci_mmconfig_map_resource(NULL, cfg);
> +		n++;
> +	}
> +
> +	pr_info(PREFIX " MCFG table loaded %d entries\n", n);
> +}
> diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
> index 7494dbe..97aa9d3 100644
> --- a/drivers/xen/pci.c
> +++ b/drivers/xen/pci.c
> @@ -27,9 +27,6 @@
>  #include <asm/xen/hypervisor.h>
>  #include <asm/xen/hypercall.h>
>  #include "../pci/pci.h"
> -#ifdef CONFIG_PCI_MMCONFIG
> -#include <asm/pci_x86.h>
> -#endif
>  
>  static bool __read_mostly pci_seg_supported = true;
>  
> @@ -221,7 +218,7 @@ static int __init xen_mcfg_late(void)
>  	if (!xen_initial_domain())
>  		return 0;
>  
> -	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
> +	if (!pci_mmconfig_enabled())
>  		return 0;
>  
>  	if (list_empty(&pci_mmcfg_list))
> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
> index 89ab057..e9450ef 100644
> --- a/include/linux/pci-acpi.h
> +++ b/include/linux/pci-acpi.h
> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
>  #define RESET_DELAY_DSM		0x08
>  #define FUNCTION_DELAY_DSM	0x09
>  
> +/* common API to maintain list of MCFG regions */
> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
> +
> +struct pci_mmcfg_region {
> +	struct list_head list;
> +	struct resource res;
> +	u64 address;
> +	char __iomem *virt;
> +	u16 segment;
> +	u8 start_bus;
> +	u8 end_bus;
> +	char name[PCI_MMCFG_RESOURCE_NAME_LEN];
> +};
> +
> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
> +			       phys_addr_t addr);
> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
> +
> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
> +							int end, u64 addr);
> +extern int pci_mmconfig_map_resource(struct device *dev,
> +	struct pci_mmcfg_region *mcfg);
> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region *mcfg);
> +extern int pci_mmconfig_enabled(void);
> +extern int __init pci_mmconfig_parse_table(void);
> +
> +extern struct list_head pci_mmcfg_list;
> +
> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
> +
>  #else	/* CONFIG_ACPI */
>  static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
>  static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }
> -- 
> 1.9.1
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dongdong Liu Feb. 18, 2016, 12:25 p.m. UTC | #2
Hi Tomasz

在 2016/2/16 21:53, Tomasz Nowicki 写道:
> From: Jayachandran C <jchandra@broadcom.com>
>
> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
> to share the API and code with ARM64 later. The corresponding
> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
>
> As a part of this we introduce three functions that can be
> implemented by the arch code: pci_mmconfig_map_resource() to map a
> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
> unmap and pci_mmconfig_enabled to see if the arch setup of
> mcfg entries was successful. We also provide weak implementations
> of these, which will be used from ARM64. On x86, we retain the
> old logic by providing platform specific implementation.
>
> This patch is purely rearranging code, it should not have any
> impact on the logic of MCFG parsing or list handling.
>
> Signed-off-by: Jayachandran C <jchandra@broadcom.com>
> [Xen parts:]
> Acked-by: David Vrabel <david.vrabel@citrix.com>
> ---
>   arch/x86/include/asm/pci_x86.h |  24 +---
>   arch/x86/pci/mmconfig-shared.c | 269 +++++------------------------------
>   arch/x86/pci/mmconfig_32.c     |   1 +
>   arch/x86/pci/mmconfig_64.c     |   1 +
>   arch/x86/pci/numachip.c        |   1 +
>   drivers/acpi/Makefile          |   1 +
>   drivers/acpi/pci_mcfg.c        | 312 +++++++++++++++++++++++++++++++++++++++++
>   drivers/xen/pci.c              |   5 +-
>   include/linux/pci-acpi.h       |  33 +++++
>   9 files changed, 386 insertions(+), 261 deletions(-)
>   create mode 100644 drivers/acpi/pci_mcfg.c
>
> diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
> index 46873fb..7824626 100644
> --- a/arch/x86/include/asm/pci_x86.h
> +++ b/arch/x86/include/asm/pci_x86.h
> @@ -122,33 +122,11 @@ extern int pci_legacy_init(void);
>   extern void pcibios_fixup_irqs(void);
>
>   /* pci-mmconfig.c */
> -
> -/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
> -#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
> -
> -struct pci_mmcfg_region {
> -	struct list_head list;
> -	struct resource res;
> -	u64 address;
> -	char __iomem *virt;
> -	u16 segment;
> -	u8 start_bus;
> -	u8 end_bus;
> -	char name[PCI_MMCFG_RESOURCE_NAME_LEN];
> -};
> -
> +struct pci_mmcfg_region;
>   extern int __init pci_mmcfg_arch_init(void);
>   extern void __init pci_mmcfg_arch_free(void);
>   extern int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg);
>   extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg);
> -extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
> -			       phys_addr_t addr);
> -extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
> -extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
> -
> -extern struct list_head pci_mmcfg_list;
> -
> -#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
>
>   /*
>    * On AMD Fam10h CPUs, all PCI MMIO configuration space accesses must use
> diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
> index dd30b7e..626710b 100644
> --- a/arch/x86/pci/mmconfig-shared.c
> +++ b/arch/x86/pci/mmconfig-shared.c
> @@ -12,13 +12,12 @@
>
>   #include <linux/pci.h>
>   #include <linux/init.h>
> -#include <linux/sfi_acpi.h>
>   #include <linux/bitmap.h>
> -#include <linux/dmi.h>
>   #include <linux/slab.h>
>   #include <linux/mutex.h>
>   #include <linux/rculist.h>
>   #include <asm/e820.h>
> +#include <linux/pci-acpi.h>
>   #include <asm/pci_x86.h>
>   #include <asm/acpi.h>
>
> @@ -27,9 +26,6 @@
>   /* Indicate if the mmcfg resources have been placed into the resource table. */
>   static bool pci_mmcfg_running_state;
>   static bool pci_mmcfg_arch_init_failed;
> -static DEFINE_MUTEX(pci_mmcfg_lock);
> -
> -LIST_HEAD(pci_mmcfg_list);
>
>   static void __init pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
>   {
> @@ -48,83 +44,6 @@ static void __init free_all_mmcfg(void)
>   		pci_mmconfig_remove(cfg);
>   }
>
> -static void list_add_sorted(struct pci_mmcfg_region *new)
> -{
> -	struct pci_mmcfg_region *cfg;
> -
> -	/* keep list sorted by segment and starting bus number */
> -	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
> -		if (cfg->segment > new->segment ||
> -		    (cfg->segment == new->segment &&
> -		     cfg->start_bus >= new->start_bus)) {
> -			list_add_tail_rcu(&new->list, &cfg->list);
> -			return;
> -		}
> -	}
> -	list_add_tail_rcu(&new->list, &pci_mmcfg_list);
> -}
> -
> -static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
> -						   int end, u64 addr)
> -{
> -	struct pci_mmcfg_region *new;
> -	struct resource *res;
> -
> -	if (addr == 0)
> -		return NULL;
> -
> -	new = kzalloc(sizeof(*new), GFP_KERNEL);
> -	if (!new)
> -		return NULL;
> -
> -	new->address = addr;
> -	new->segment = segment;
> -	new->start_bus = start;
> -	new->end_bus = end;
> -
> -	res = &new->res;
> -	res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
> -	res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
> -	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> -	snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
> -		 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
> -	res->name = new->name;
> -
> -	return new;
> -}
> -
> -static struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
> -							int end, u64 addr)
> -{
> -	struct pci_mmcfg_region *new;
> -
> -	new = pci_mmconfig_alloc(segment, start, end, addr);
> -	if (new) {
> -		mutex_lock(&pci_mmcfg_lock);
> -		list_add_sorted(new);
> -		mutex_unlock(&pci_mmcfg_lock);
> -
> -		pr_info(PREFIX
> -		       "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
> -		       "(base %#lx)\n",
> -		       segment, start, end, &new->res, (unsigned long)addr);
> -	}
> -
> -	return new;
> -}
> -
> -struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
> -{
> -	struct pci_mmcfg_region *cfg;
> -
> -	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
> -		if (cfg->segment == segment &&
> -		    cfg->start_bus <= bus && bus <= cfg->end_bus)
> -			return cfg;
> -
> -	return NULL;
> -}
> -
>   static const char *__init pci_mmcfg_e7520(void)
>   {
>   	u32 win;
> @@ -543,73 +462,6 @@ static void __init pci_mmcfg_reject_broken(int early)
>   	}
>   }
>
> -static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
> -					struct acpi_mcfg_allocation *cfg)
> -{
> -	int year;
> -
> -	if (cfg->address < 0xFFFFFFFF)
> -		return 0;
> -
> -	if (!strncmp(mcfg->header.oem_id, "SGI", 3))
> -		return 0;
> -
> -	if (mcfg->header.revision >= 1) {
> -		if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
> -		    year >= 2010)
> -			return 0;
> -	}
> -
> -	pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
> -	       "is above 4GB, ignored\n", cfg->pci_segment,
> -	       cfg->start_bus_number, cfg->end_bus_number, cfg->address);
> -	return -EINVAL;
> -}
> -
> -static int __init pci_parse_mcfg(struct acpi_table_header *header)
> -{
> -	struct acpi_table_mcfg *mcfg;
> -	struct acpi_mcfg_allocation *cfg_table, *cfg;
> -	unsigned long i;
> -	int entries;
> -
> -	if (!header)
> -		return -EINVAL;
> -
> -	mcfg = (struct acpi_table_mcfg *)header;
> -
> -	/* how many config structures do we have */
> -	free_all_mmcfg();
> -	entries = 0;
> -	i = header->length - sizeof(struct acpi_table_mcfg);
> -	while (i >= sizeof(struct acpi_mcfg_allocation)) {
> -		entries++;
> -		i -= sizeof(struct acpi_mcfg_allocation);
> -	}
> -	if (entries == 0) {
> -		pr_err(PREFIX "MMCONFIG has no entries\n");
> -		return -ENODEV;
> -	}
> -
> -	cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
> -	for (i = 0; i < entries; i++) {
> -		cfg = &cfg_table[i];
> -		if (acpi_mcfg_check_entry(mcfg, cfg)) {
> -			free_all_mmcfg();
> -			return -ENODEV;
> -		}
> -
> -		if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
> -				   cfg->end_bus_number, cfg->address) == NULL) {
> -			pr_warn(PREFIX "no memory for MCFG entries\n");
> -			free_all_mmcfg();
> -			return -ENOMEM;
> -		}
> -	}
> -
> -	return 0;
> -}
> -
>   #ifdef CONFIG_ACPI_APEI
>   extern int (*arch_apei_filter_addr)(int (*func)(__u64 start, __u64 size,
>   				     void *data), void *data);
> @@ -662,13 +514,20 @@ static void __init __pci_mmcfg_init(int early)
>
>   static int __initdata known_bridge;
>
> +static void __init pci_mmcfg_list_setup(void)
> +{
> +	free_all_mmcfg();
> +	if (pci_mmconfig_parse_table())
> +		free_all_mmcfg();
> +}
> +
>   void __init pci_mmcfg_early_init(void)
>   {
>   	if (pci_probe & PCI_PROBE_MMCONF) {
>   		if (pci_mmcfg_check_hostbridge())
>   			known_bridge = 1;
>   		else
> -			acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
> +			pci_mmcfg_list_setup();
>   		__pci_mmcfg_init(1);
>
>   		set_apei_filter();
> @@ -686,7 +545,7 @@ void __init pci_mmcfg_late_init(void)
>
>   	/* MMCONFIG hasn't been enabled yet, try again */
>   	if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
> -		acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
> +		pci_mmcfg_list_setup();
>   		__pci_mmcfg_init(0);
>   	}
>   }
> @@ -720,99 +579,41 @@ static int __init pci_mmcfg_late_insert_resources(void)
>    */
>   late_initcall(pci_mmcfg_late_insert_resources);
>
> -/* Add MMCFG information for host bridges */
> -int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
> -			phys_addr_t addr)
> +int pci_mmconfig_map_resource(struct device *dev, struct pci_mmcfg_region *cfg)
>   {
> -	int rc;
> -	struct resource *tmp = NULL;
> -	struct pci_mmcfg_region *cfg;
> +	struct resource *tmp;
>
> -	if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
> -		return -ENODEV;
> -
> -	if (start > end)
> -		return -EINVAL;
> -
> -	mutex_lock(&pci_mmcfg_lock);
> -	cfg = pci_mmconfig_lookup(seg, start);
> -	if (cfg) {
> -		if (cfg->end_bus < end)
> -			dev_info(dev, FW_INFO
> -				 "MMCONFIG for "
> -				 "domain %04x [bus %02x-%02x] "
> -				 "only partially covers this bridge\n",
> -				  cfg->segment, cfg->start_bus, cfg->end_bus);
> -		mutex_unlock(&pci_mmcfg_lock);
> -		return -EEXIST;
> -	}
> -
> -	if (!addr) {
> -		mutex_unlock(&pci_mmcfg_lock);
> -		return -EINVAL;
> -	}
> -
> -	rc = -EBUSY;
> -	cfg = pci_mmconfig_alloc(seg, start, end, addr);
> -	if (cfg == NULL) {
> -		dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
> -		rc = -ENOMEM;
> -	} else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
> +	if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
>   		dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n",
>   			 &cfg->res);
> -	} else {
> -		/* Insert resource if it's not in boot stage */
> -		if (pci_mmcfg_running_state)
> -			tmp = insert_resource_conflict(&iomem_resource,
> -						       &cfg->res);
> -
> +		return -EBUSY;
> +	}
> +	/* Insert resource if it's not in boot stage */
> +	if (pci_mmcfg_running_state) {
> +		tmp = insert_resource_conflict(&iomem_resource,
> +					       &cfg->res);
>   		if (tmp) {
> -			dev_warn(dev,
> -				 "MMCONFIG %pR conflicts with "
> -				 "%s %pR\n",
> -				 &cfg->res, tmp->name, tmp);
> -		} else if (pci_mmcfg_arch_map(cfg)) {
> -			dev_warn(dev, "fail to map MMCONFIG %pR.\n",
> -				 &cfg->res);
> -		} else {
> -			list_add_sorted(cfg);
> -			dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
> -				 &cfg->res, (unsigned long)addr);
> -			cfg = NULL;
> -			rc = 0;
> +			dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
> +				&cfg->res, tmp->name, tmp);
> +			return -EBUSY;
>   		}
>   	}
> -
> -	if (cfg) {
> -		if (cfg->res.parent)
> -			release_resource(&cfg->res);
> -		kfree(cfg);
> +	if (pci_mmcfg_arch_map(cfg)) {
> +		dev_warn(dev, "fail to map MMCONFIG %pR.\n", &cfg->res);
> +		return -EBUSY;
>   	}
> -
> -	mutex_unlock(&pci_mmcfg_lock);
> -
> -	return rc;
> +	return 0;
>   }
>
> -/* Delete MMCFG information for host bridges */
> -int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
> +void pci_mmconfig_unmap_resource(struct pci_mmcfg_region *cfg)
>   {
> -	struct pci_mmcfg_region *cfg;
> -
> -	mutex_lock(&pci_mmcfg_lock);
> -	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
> -		if (cfg->segment == seg && cfg->start_bus == start &&
> -		    cfg->end_bus == end) {
> -			list_del_rcu(&cfg->list);
> -			synchronize_rcu();
> -			pci_mmcfg_arch_unmap(cfg);
> -			if (cfg->res.parent)
> -				release_resource(&cfg->res);
> -			mutex_unlock(&pci_mmcfg_lock);
> -			kfree(cfg);
> -			return 0;
> -		}
> -	mutex_unlock(&pci_mmcfg_lock);
> +	pci_mmcfg_arch_unmap(cfg);
> +	if (cfg->res.parent)
> +		release_resource(&cfg->res);
> +	cfg->res.parent = NULL;
> +}
>
> -	return -ENOENT;
> +int pci_mmconfig_enabled(void)
> +{
> +	return (pci_probe & PCI_PROBE_MMCONF) && !pci_mmcfg_arch_init_failed;
>   }
> diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
> index 43984bc..38a37f8 100644
> --- a/arch/x86/pci/mmconfig_32.c
> +++ b/arch/x86/pci/mmconfig_32.c
> @@ -12,6 +12,7 @@
>   #include <linux/pci.h>
>   #include <linux/init.h>
>   #include <linux/rcupdate.h>
> +#include <linux/pci-acpi.h>
>   #include <asm/e820.h>
>   #include <asm/pci_x86.h>
>
> diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
> index bea5249..29253ec 100644
> --- a/arch/x86/pci/mmconfig_64.c
> +++ b/arch/x86/pci/mmconfig_64.c
> @@ -10,6 +10,7 @@
>   #include <linux/acpi.h>
>   #include <linux/bitmap.h>
>   #include <linux/rcupdate.h>
> +#include <linux/pci-acpi.h>
>   #include <asm/e820.h>
>   #include <asm/pci_x86.h>
>
> diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
> index 2e565e6..c181eeb 100644
> --- a/arch/x86/pci/numachip.c
> +++ b/arch/x86/pci/numachip.c
> @@ -14,6 +14,7 @@
>    */
>
>   #include <linux/pci.h>
> +#include <linux/pci-acpi.h>
>   #include <asm/pci_x86.h>
>
>   static u8 limit __read_mostly;
> diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
> index 7ea903d..e5e4393 100644
> --- a/drivers/acpi/Makefile
> +++ b/drivers/acpi/Makefile
> @@ -40,6 +40,7 @@ acpi-$(CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC) += processor_pdc.o
>   acpi-y				+= ec.o
>   acpi-$(CONFIG_ACPI_DOCK)	+= dock.o
>   acpi-y				+= pci_root.o pci_link.o pci_irq.o
> +acpi-$(CONFIG_PCI_MMCONFIG)	+= pci_mcfg.o
>   acpi-y				+= acpi_lpss.o acpi_apd.o
>   acpi-y				+= acpi_platform.o
>   acpi-y				+= acpi_pnp.o
> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
> new file mode 100644
> index 0000000..ea84365
> --- /dev/null
> +++ b/drivers/acpi/pci_mcfg.c
> @@ -0,0 +1,312 @@
> +/*
> + * pci_mcfg.c
> + *
> + * Common code to maintain the MCFG areas and mappings
> + *
> + * This has been extracted from arch/x86/pci/mmconfig-shared.c
> + * and moved here so that other architectures can use this code.
> + */
> +
> +#include <linux/pci.h>
> +#include <linux/init.h>
> +#include <linux/dmi.h>
> +#include <linux/pci-acpi.h>
> +#include <linux/sfi_acpi.h>
> +#include <linux/slab.h>
> +#include <linux/mutex.h>
> +#include <linux/rculist.h>
> +
> +#define PREFIX	"ACPI: "
> +
> +static DEFINE_MUTEX(pci_mmcfg_lock);
> +LIST_HEAD(pci_mmcfg_list);
> +
> +static void list_add_sorted(struct pci_mmcfg_region *new)
> +{
> +	struct pci_mmcfg_region *cfg;
> +
> +	/* keep list sorted by segment and starting bus number */
> +	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
> +		if (cfg->segment > new->segment ||
> +		    (cfg->segment == new->segment &&
> +		     cfg->start_bus >= new->start_bus)) {
> +			list_add_tail_rcu(&new->list, &cfg->list);
> +			return;
> +		}
> +	}
> +	list_add_tail_rcu(&new->list, &pci_mmcfg_list);
> +}
> +
> +static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
> +						   int end, u64 addr)
> +{
> +	struct pci_mmcfg_region *new;
> +	struct resource *res;
> +
> +	if (addr == 0)
> +		return NULL;
> +
> +	new = kzalloc(sizeof(*new), GFP_KERNEL);
> +	if (!new)
> +		return NULL;
> +
> +	new->address = addr;
> +	new->segment = segment;
> +	new->start_bus = start;
> +	new->end_bus = end;
> +
> +	res = &new->res;
> +	res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
> +	res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
> +	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> +	snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
> +		 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
> +	res->name = new->name;
> +
> +	return new;
> +}
> +
> +struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
> +							int end, u64 addr)
> +{
> +	struct pci_mmcfg_region *new;
> +
> +	new = pci_mmconfig_alloc(segment, start, end, addr);
> +	if (new) {
> +		mutex_lock(&pci_mmcfg_lock);
> +		list_add_sorted(new);
> +		mutex_unlock(&pci_mmcfg_lock);
> +
> +		pr_info(PREFIX
> +		       "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
> +		       "(base %#lx)\n",
> +		       segment, start, end, &new->res, (unsigned long)addr);
> +	}
> +
> +	return new;
> +}
> +
> +struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
> +{
> +	struct pci_mmcfg_region *cfg;
> +
> +	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
> +		if (cfg->segment == segment &&
> +		    cfg->start_bus <= bus && bus <= cfg->end_bus)
> +			return cfg;
> +
> +	return NULL;
> +}
> +
> +/*
> + * Map a pci_mmcfg_region, can be overrriden by arch
> + */
> +int __weak pci_mmconfig_map_resource(struct device *dev,
> +	struct pci_mmcfg_region *mcfg)
> +{
> +	struct resource *tmp;
> +	void __iomem *vaddr;
> +
> +	tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
> +	if (tmp) {
> +		dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
> +			&mcfg->res, tmp->name, tmp);
> +		return -EBUSY;
> +	}
> +
> +	vaddr =  ioremap(mcfg->res.start, resource_size(&mcfg->res));
> +	if (!vaddr) {
> +		release_resource(&mcfg->res);
> +		return -ENOMEM;
> +	}
> +
> +	mcfg->virt = vaddr;
Here should be changed to
	mcfg->virt = vaddr - PCI_MMCFG_BUS_OFFSET(mcfg->start_bus);

or when pcie host "start_bus" is not 0, the configuraion access will be wrong.

See v3 or v4 patchset "addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus);"
static void __iomem *mcfg_ioremap(struct pci_mmcfg_region *cfg)
{
	void __iomem *addr;
	u64 start, size;
	int num_buses;

	start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
	num_buses = cfg->end_bus - cfg->start_bus + 1;
	size = PCI_MMCFG_BUS_OFFSET(num_buses);
	addr = ioremap_nocache(start, size);
	if (addr)
		addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
	return addr;
}

Dongdong
Thanks

> +	return 0;
> +}
> +
> +/*
> + * Unmap a pci_mmcfg_region, can be overrriden by arch
> + */
> +void __weak pci_mmconfig_unmap_resource(struct pci_mmcfg_region *mcfg)
> +{
> +	if (mcfg->virt) {
> +		iounmap(mcfg->virt);
> +		mcfg->virt = NULL;
> +	}
> +	if (mcfg->res.parent) {
> +		release_resource(&mcfg->res);
> +		mcfg->res.parent = NULL;
> +	}
> +}
> +
> +/*
> + * check if the mmconfig is enabled and configured
> + */
> +int __weak pci_mmconfig_enabled(void)
> +{
> +	return 1;
> +}
> +
> +/* Add MMCFG information for host bridges */
> +int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
> +			phys_addr_t addr)
> +{
> +	struct pci_mmcfg_region *cfg;
> +	int rc;
> +
> +	if (!pci_mmconfig_enabled())
> +		return -ENODEV;
> +	if (start > end)
> +		return -EINVAL;
> +
> +	mutex_lock(&pci_mmcfg_lock);
> +	cfg = pci_mmconfig_lookup(seg, start);
> +	if (cfg) {
> +		if (cfg->end_bus < end)
> +			dev_info(dev, FW_INFO
> +				 "MMCONFIG for "
> +				 "domain %04x [bus %02x-%02x] "
> +				 "only partially covers this bridge\n",
> +				  cfg->segment, cfg->start_bus, cfg->end_bus);
> +		rc = -EEXIST;
> +		goto err;
> +	}
> +
> +	if (!addr) {
> +		rc = -EINVAL;
> +		goto err;
> +	}
> +
> +	cfg = pci_mmconfig_alloc(seg, start, end, addr);
> +	if (cfg == NULL) {
> +		dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
> +		rc = -ENOMEM;
> +		goto err;
> +	}
> +	rc = pci_mmconfig_map_resource(dev, cfg);
> +	if (!rc) {
> +		list_add_sorted(cfg);
> +		dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
> +				 &cfg->res, (unsigned long)addr);
> +		return 0;
> +	} else {
> +		if (cfg->res.parent)
> +			release_resource(&cfg->res);
> +		kfree(cfg);
> +	}
> +
> +err:
> +	mutex_unlock(&pci_mmcfg_lock);
> +	return rc;
> +}
> +
> +/* Delete MMCFG information for host bridges */
> +int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
> +{
> +	struct pci_mmcfg_region *cfg;
> +
> +	mutex_lock(&pci_mmcfg_lock);
> +	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
> +		if (cfg->segment == seg && cfg->start_bus == start &&
> +		    cfg->end_bus == end) {
> +			list_del_rcu(&cfg->list);
> +			synchronize_rcu();
> +			pci_mmconfig_unmap_resource(cfg);
> +			mutex_unlock(&pci_mmcfg_lock);
> +			kfree(cfg);
> +			return 0;
> +		}
> +	mutex_unlock(&pci_mmcfg_lock);
> +
> +	return -ENOENT;
> +}
> +
> +static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
> +					struct acpi_mcfg_allocation *cfg)
> +{
> +	int year;
> +
> +	if (!config_enabled(CONFIG_X86))
> +		return 0;
> +
> +	if (cfg->address < 0xFFFFFFFF)
> +		return 0;
> +
> +	if (!strncmp(mcfg->header.oem_id, "SGI", 3))
> +		return 0;
> +
> +	if (mcfg->header.revision >= 1) {
> +		if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
> +		    year >= 2010)
> +			return 0;
> +	}
> +
> +	pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
> +	       "is above 4GB, ignored\n", cfg->pci_segment,
> +	       cfg->start_bus_number, cfg->end_bus_number, cfg->address);
> +	return -EINVAL;
> +}
> +
> +static int __init pci_parse_mcfg(struct acpi_table_header *header)
> +{
> +	struct acpi_table_mcfg *mcfg;
> +	struct acpi_mcfg_allocation *cfg_table, *cfg;
> +	unsigned long i;
> +	int entries;
> +
> +	if (!header)
> +		return -EINVAL;
> +
> +	mcfg = (struct acpi_table_mcfg *)header;
> +
> +	/* how many config structures do we have */
> +	entries = 0;
> +	i = header->length - sizeof(struct acpi_table_mcfg);
> +	while (i >= sizeof(struct acpi_mcfg_allocation)) {
> +		entries++;
> +		i -= sizeof(struct acpi_mcfg_allocation);
> +	}
> +	if (entries == 0) {
> +		pr_err(PREFIX "MMCONFIG has no entries\n");
> +		return -ENODEV;
> +	}
> +
> +	cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
> +	for (i = 0; i < entries; i++) {
> +		cfg = &cfg_table[i];
> +		if (acpi_mcfg_check_entry(mcfg, cfg))
> +			return -ENODEV;
> +
> +		if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
> +				   cfg->end_bus_number, cfg->address) == NULL) {
> +			pr_warn(PREFIX "no memory for MCFG entries\n");
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +int __init pci_mmconfig_parse_table(void)
> +{
> +	return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
> +}
> +
> +void __weak __init pci_mmcfg_late_init(void)
> +{
> +	int err, n = 0;
> +	struct pci_mmcfg_region *cfg;
> +
> +	err = pci_mmconfig_parse_table();
> +	if (err) {
> +		pr_err(PREFIX " Failed to parse MCFG (%d)\n", err);
> +		return;
> +	}
> +
> +	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
> +		pci_mmconfig_map_resource(NULL, cfg);
> +		n++;
> +	}
> +
> +	pr_info(PREFIX " MCFG table loaded %d entries\n", n);
> +}
> diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
> index 7494dbe..97aa9d3 100644
> --- a/drivers/xen/pci.c
> +++ b/drivers/xen/pci.c
> @@ -27,9 +27,6 @@
>   #include <asm/xen/hypervisor.h>
>   #include <asm/xen/hypercall.h>
>   #include "../pci/pci.h"
> -#ifdef CONFIG_PCI_MMCONFIG
> -#include <asm/pci_x86.h>
> -#endif
>
>   static bool __read_mostly pci_seg_supported = true;
>
> @@ -221,7 +218,7 @@ static int __init xen_mcfg_late(void)
>   	if (!xen_initial_domain())
>   		return 0;
>
> -	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
> +	if (!pci_mmconfig_enabled())
>   		return 0;
>
>   	if (list_empty(&pci_mmcfg_list))
> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
> index 89ab057..e9450ef 100644
> --- a/include/linux/pci-acpi.h
> +++ b/include/linux/pci-acpi.h
> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
>   #define RESET_DELAY_DSM		0x08
>   #define FUNCTION_DELAY_DSM	0x09
>
> +/* common API to maintain list of MCFG regions */
> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
> +
> +struct pci_mmcfg_region {
> +	struct list_head list;
> +	struct resource res;
> +	u64 address;
> +	char __iomem *virt;
> +	u16 segment;
> +	u8 start_bus;
> +	u8 end_bus;
> +	char name[PCI_MMCFG_RESOURCE_NAME_LEN];
> +};
> +
> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
> +			       phys_addr_t addr);
> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
> +
> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
> +							int end, u64 addr);
> +extern int pci_mmconfig_map_resource(struct device *dev,
> +	struct pci_mmcfg_region *mcfg);
> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region *mcfg);
> +extern int pci_mmconfig_enabled(void);
> +extern int __init pci_mmconfig_parse_table(void);
> +
> +extern struct list_head pci_mmcfg_list;
> +
> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
> +
>   #else	/* CONFIG_ACPI */
>   static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
>   static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lorenzo Pieralisi Feb. 18, 2016, 1:20 p.m. UTC | #3
On Thu, Feb 18, 2016 at 08:25:35PM +0800, liudongdong (C) wrote:

[...]

> >+/*
> >+ * Map a pci_mmcfg_region, can be overrriden by arch
> >+ */
> >+int __weak pci_mmconfig_map_resource(struct device *dev,
> >+	struct pci_mmcfg_region *mcfg)
> >+{
> >+	struct resource *tmp;
> >+	void __iomem *vaddr;
> >+
> >+	tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
> >+	if (tmp) {
> >+		dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
> >+			&mcfg->res, tmp->name, tmp);
> >+		return -EBUSY;
> >+	}
> >+
> >+	vaddr =  ioremap(mcfg->res.start, resource_size(&mcfg->res));
               ^^
	       while at it, stray white space

> >+	if (!vaddr) {
> >+		release_resource(&mcfg->res);
> >+		return -ENOMEM;
> >+	}
> >+
> >+	mcfg->virt = vaddr;
> Here should be changed to
> 	mcfg->virt = vaddr - PCI_MMCFG_BUS_OFFSET(mcfg->start_bus);
> 
> or when pcie host "start_bus" is not 0, the configuraion access will be wrong.
Well spotted, thanks.

Lorenzo
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas March 3, 2016, 10:51 p.m. UTC | #4
Hi Tomasz, Jayachandran, et al,

On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
> From: Jayachandran C <jchandra@broadcom.com>
> 
> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
> to share the API and code with ARM64 later. The corresponding
> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
> 
> As a part of this we introduce three functions that can be
> implemented by the arch code: pci_mmconfig_map_resource() to map a
> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
> unmap and pci_mmconfig_enabled to see if the arch setup of
> mcfg entries was successful. We also provide weak implementations
> of these, which will be used from ARM64. On x86, we retain the
> old logic by providing platform specific implementation.
> 
> This patch is purely rearranging code, it should not have any
> impact on the logic of MCFG parsing or list handling.

I definitely want to figure out how to make this work well on ARM64.
I need to ponder this some more, so these are just some initial
thoughts.

My first impression is that (a) the x86 MCFG code is an unmitigated
disaster, and (b) we're trying a little too hard to make that mess
generic.  I think we might be better served if we came up with some
cleaner, more generic code that we can use for ARM64 today, and
migrate x86 toward that over time.

My concern is that if we elevate the current x86 code to be
"arch-independent", we will be perpetuating some interfaces and
designs that shouldn't be allowed to escape arch/x86.

Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
ACPI-specific, and could potentially be used for non-ACPI bridges that
support ECAM.  I'd like to see that sort of code moved to a new file
like drivers/pci/ecam.c.

There's a little bit of overlap here with the ECAM code in
pci-host-generic.c.  I'm not sure whether or how to include that, but
it's a very good example of how simple this *should* be: probe the
host bridge, discover the ECAM region, request the region, ioremap it,
done.

> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
> new file mode 100644
> index 0000000..ea84365
> --- /dev/null
> +++ b/drivers/acpi/pci_mcfg.c
> ...
> +int __weak pci_mmconfig_map_resource(struct device *dev,
> +	struct pci_mmcfg_region *mcfg)
> +{
> +	struct resource *tmp;
> +	void __iomem *vaddr;
> +
> +	tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
> +	if (tmp) {
> +		dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
> +			&mcfg->res, tmp->name, tmp);
> +		return -EBUSY;
> +	}

I think this is a mistake in the x86 MCFG support that we should not
carry over to a generic implementation.  We should not use the MCFG
table for resource reservation because MCFG is not defined by the ACPI
spec and an OS need not include support for it.  The platform must
indicate in some other, more generic way, that ECAM space is reserved.
This probably means ECAM space should be declared in a PNP0C02 _CRS
method (see the PCI Firmware Spec r3.0, sec 4.1.2, note 2).

We might need some kind of x86-specific quirk that does this, or a
pcibios hook or something here; I just don't think it should be
generic.

> +int __init pci_mmconfig_parse_table(void)
> +{
> +	return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
> +}

I don't like the fact that we parse the entire MCFG table at once
here.  I think we should look for the information we need when we are
claiming a PCI host bridge, e.g., in acpi_pci_root_add().  This might
not be a great fit for the way ACPI table management works, but I
think it's better to do things on-demand rather than just-in-case.

> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
> index 89ab057..e9450ef 100644
> --- a/include/linux/pci-acpi.h
> +++ b/include/linux/pci-acpi.h
> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
>  #define RESET_DELAY_DSM		0x08
>  #define FUNCTION_DELAY_DSM	0x09
>  
> +/* common API to maintain list of MCFG regions */
> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
> +
> +struct pci_mmcfg_region {
> +	struct list_head list;
> +	struct resource res;
> +	u64 address;
> +	char __iomem *virt;
> +	u16 segment;
> +	u8 start_bus;
> +	u8 end_bus;
> +	char name[PCI_MMCFG_RESOURCE_NAME_LEN];
> +};
> +
> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
> +			       phys_addr_t addr);
> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
> +
> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
> +							int end, u64 addr);
> +extern int pci_mmconfig_map_resource(struct device *dev,
> +	struct pci_mmcfg_region *mcfg);
> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region *mcfg);
> +extern int pci_mmconfig_enabled(void);
> +extern int __init pci_mmconfig_parse_table(void);
> +
> +extern struct list_head pci_mmcfg_list;
> +
> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
> +

With the exception of pci_mmconfig_parse_table(), nothing here is
ACPI-specific.  I'd like to see the PCI ECAM-related interfaces
(hopefully not these exact ones, but a more rational set) put in
something like include/linux/pci-ecam.h.

>  #else	/* CONFIG_ACPI */
>  static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
>  static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jayachandran Chandrashekaran Nair March 4, 2016, 8:35 a.m. UTC | #5
On Fri, Mar 4, 2016 at 4:21 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> Hi Tomasz, Jayachandran, et al,
>
> On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
>> From: Jayachandran C <jchandra@broadcom.com>
>>
>> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
>> to share the API and code with ARM64 later. The corresponding
>> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
>>
>> As a part of this we introduce three functions that can be
>> implemented by the arch code: pci_mmconfig_map_resource() to map a
>> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
>> unmap and pci_mmconfig_enabled to see if the arch setup of
>> mcfg entries was successful. We also provide weak implementations
>> of these, which will be used from ARM64. On x86, we retain the
>> old logic by providing platform specific implementation.
>>
>> This patch is purely rearranging code, it should not have any
>> impact on the logic of MCFG parsing or list handling.
>
> I definitely want to figure out how to make this work well on ARM64.
> I need to ponder this some more, so these are just some initial
> thoughts.
>
> My first impression is that (a) the x86 MCFG code is an unmitigated
> disaster, and (b) we're trying a little too hard to make that mess
> generic.  I think we might be better served if we came up with some
> cleaner, more generic code that we can use for ARM64 today, and
> migrate x86 toward that over time.
>
> My concern is that if we elevate the current x86 code to be
> "arch-independent", we will be perpetuating some interfaces and
> designs that shouldn't be allowed to escape arch/x86.

I think the major decision is whether to maintain the pci_mmcfg_list
for all architectures or not. My initial plan was not to do this because
of the mess (basically the ECAM region info should be attached to
the pci root and not maintained in a separate list that needs locking),
The patch I posted initially https://patchwork.ozlabs.org/patch/553464/
had a much simpler way of handling the MCFG table without using
the list.

In x86 case it is not feasible to remove using the pci_mmcfg_list.
The only use of it outside is in xen that can be fixed up.

> Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
> ACPI-specific, and could potentially be used for non-ACPI bridges that
> support ECAM.  I'd like to see that sort of code moved to a new file
> like drivers/pci/ecam.c.
>
> There's a little bit of overlap here with the ECAM code in
> pci-host-generic.c.  I'm not sure whether or how to include that, but
> it's a very good example of how simple this *should* be: probe the
> host bridge, discover the ECAM region, request the region, ioremap it,
> done.

I had a similar approach in my initial patchset, please see the patch
above. The resource for ECAM is mapped similar to the the way
pci-host-generic.c handled it. An additional step I could do was to
move the common code (ioremap and mapbus) into a common
file and share the code with pci-host-generic.c

>> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
>> new file mode 100644
>> index 0000000..ea84365
>> --- /dev/null
>> +++ b/drivers/acpi/pci_mcfg.c
>> ...
>> +int __weak pci_mmconfig_map_resource(struct device *dev,
>> +     struct pci_mmcfg_region *mcfg)
>> +{
>> +     struct resource *tmp;
>> +     void __iomem *vaddr;
>> +
>> +     tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
>> +     if (tmp) {
>> +             dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
>> +                     &mcfg->res, tmp->name, tmp);
>> +             return -EBUSY;
>> +     }
>
> I think this is a mistake in the x86 MCFG support that we should not
> carry over to a generic implementation.  We should not use the MCFG
> table for resource reservation because MCFG is not defined by the ACPI
> spec and an OS need not include support for it.  The platform must
> indicate in some other, more generic way, that ECAM space is reserved.
> This probably means ECAM space should be declared in a PNP0C02 _CRS
> method (see the PCI Firmware Spec r3.0, sec 4.1.2, note 2).
>
> We might need some kind of x86-specific quirk that does this, or a
> pcibios hook or something here; I just don't think it should be
> generic.
>
>> +int __init pci_mmconfig_parse_table(void)
>> +{
>> +     return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
>> +}
>
> I don't like the fact that we parse the entire MCFG table at once
> here.  I think we should look for the information we need when we are
> claiming a PCI host bridge, e.g., in acpi_pci_root_add().  This might
> not be a great fit for the way ACPI table management works, but I
> think it's better to do things on-demand rather than just-in-case.

There is an overhead of looking up this table, and the information
available there is very limited (i.e, segment, start_bus, end_bus
and address). My approach in the above patch is to save this info
into an array at boot time and avoid multiple lookups.

>> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
>> index 89ab057..e9450ef 100644
>> --- a/include/linux/pci-acpi.h
>> +++ b/include/linux/pci-acpi.h
>> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
>>  #define RESET_DELAY_DSM              0x08
>>  #define FUNCTION_DELAY_DSM   0x09
>>
>> +/* common API to maintain list of MCFG regions */
>> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
>> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
>> +
>> +struct pci_mmcfg_region {
>> +     struct list_head list;
>> +     struct resource res;
>> +     u64 address;
>> +     char __iomem *virt;
>> +     u16 segment;
>> +     u8 start_bus;
>> +     u8 end_bus;
>> +     char name[PCI_MMCFG_RESOURCE_NAME_LEN];
>> +};
>> +
>> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
>> +                            phys_addr_t addr);
>> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
>> +
>> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
>> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
>> +                                                     int end, u64 addr);
>> +extern int pci_mmconfig_map_resource(struct device *dev,
>> +     struct pci_mmcfg_region *mcfg);
>> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region *mcfg);
>> +extern int pci_mmconfig_enabled(void);
>> +extern int __init pci_mmconfig_parse_table(void);
>> +
>> +extern struct list_head pci_mmcfg_list;
>> +
>> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
>> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
>> +
>
> With the exception of pci_mmconfig_parse_table(), nothing here is
> ACPI-specific.  I'd like to see the PCI ECAM-related interfaces
> (hopefully not these exact ones, but a more rational set) put in
> something like include/linux/pci-ecam.h.
>
>>  #else        /* CONFIG_ACPI */
>>  static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
>>  static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }

I can update this patch to
 -  drop the pci_mmcfg_list handling from generic case
 -  move common ECAM code so that it can be shared with
    pci-host-generic.c
if that is what you are looking for. The code will end up looking much
simpler.

Thanks,
JC.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tomasz Nowicki March 4, 2016, 9:27 a.m. UTC | #6
Hi Bjorn,

On 03.03.2016 23:51, Bjorn Helgaas wrote:
> Hi Tomasz, Jayachandran, et al,
>
> On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
>> From: Jayachandran C <jchandra@broadcom.com>
>>
>> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
>> to share the API and code with ARM64 later. The corresponding
>> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
>>
>> As a part of this we introduce three functions that can be
>> implemented by the arch code: pci_mmconfig_map_resource() to map a
>> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
>> unmap and pci_mmconfig_enabled to see if the arch setup of
>> mcfg entries was successful. We also provide weak implementations
>> of these, which will be used from ARM64. On x86, we retain the
>> old logic by providing platform specific implementation.
>>
>> This patch is purely rearranging code, it should not have any
>> impact on the logic of MCFG parsing or list handling.
>
> I definitely want to figure out how to make this work well on ARM64.
> I need to ponder this some more, so these are just some initial
> thoughts.
>
> My first impression is that (a) the x86 MCFG code is an unmitigated
> disaster, and (b) we're trying a little too hard to make that mess
> generic.  I think we might be better served if we came up with some
> cleaner, more generic code that we can use for ARM64 today, and
> migrate x86 toward that over time.
>
> My concern is that if we elevate the current x86 code to be
> "arch-independent", we will be perpetuating some interfaces and
> designs that shouldn't be allowed to escape arch/x86.
>
> Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
> ACPI-specific, and could potentially be used for non-ACPI bridges that
> support ECAM.  I'd like to see that sort of code moved to a new file
> like drivers/pci/ecam.c.

Actually I split it as you suggested in the previous patch set. Please 
have a look at:
https://lkml.org/lkml/2016/2/4/646

Especially patches [0-6] which handle MMCONFIG refactoring.

Thanks,
Tomasz
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas March 5, 2016, 4:14 a.m. UTC | #7
On Fri, Mar 04, 2016 at 02:05:56PM +0530, Jayachandran Chandrashekaran Nair wrote:
> On Fri, Mar 4, 2016 at 4:21 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> > Hi Tomasz, Jayachandran, et al,
> >
> > On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
> >> From: Jayachandran C <jchandra@broadcom.com>
> >>
> >> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
> >> to share the API and code with ARM64 later. The corresponding
> >> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
> >>
> >> As a part of this we introduce three functions that can be
> >> implemented by the arch code: pci_mmconfig_map_resource() to map a
> >> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
> >> unmap and pci_mmconfig_enabled to see if the arch setup of
> >> mcfg entries was successful. We also provide weak implementations
> >> of these, which will be used from ARM64. On x86, we retain the
> >> old logic by providing platform specific implementation.
> >>
> >> This patch is purely rearranging code, it should not have any
> >> impact on the logic of MCFG parsing or list handling.
> >
> > I definitely want to figure out how to make this work well on ARM64.
> > I need to ponder this some more, so these are just some initial
> > thoughts.
> >
> > My first impression is that (a) the x86 MCFG code is an unmitigated
> > disaster, and (b) we're trying a little too hard to make that mess
> > generic.  I think we might be better served if we came up with some
> > cleaner, more generic code that we can use for ARM64 today, and
> > migrate x86 toward that over time.
> >
> > My concern is that if we elevate the current x86 code to be
> > "arch-independent", we will be perpetuating some interfaces and
> > designs that shouldn't be allowed to escape arch/x86.
> 
> I think the major decision is whether to maintain the pci_mmcfg_list
> for all architectures or not. My initial plan was not to do this because
> of the mess (basically the ECAM region info should be attached to
> the pci root and not maintained in a separate list that needs locking),
> The patch I posted initially https://patchwork.ozlabs.org/patch/553464/
> had a much simpler way of handling the MCFG table without using
> the list.

I agree that ECAM info should be attached to the PCI host controller.
That should simplify locking and hot-add and hot-removal of host
controllers.

I think pci_mmcfg_list is an implementation detail that may not need
to be generic.  I certainly don't think it needs to be part of the
interface.

> In x86 case it is not feasible to remove using the pci_mmcfg_list.
> The only use of it outside is in xen that can be fixed up.
> 
> > Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
> > ACPI-specific, and could potentially be used for non-ACPI bridges that
> > support ECAM.  I'd like to see that sort of code moved to a new file
> > like drivers/pci/ecam.c.
> >
> > There's a little bit of overlap here with the ECAM code in
> > pci-host-generic.c.  I'm not sure whether or how to include that, but
> > it's a very good example of how simple this *should* be: probe the
> > host bridge, discover the ECAM region, request the region, ioremap it,
> > done.
> 
> I had a similar approach in my initial patchset, please see the patch
> above. The resource for ECAM is mapped similar to the the way
> pci-host-generic.c handled it. An additional step I could do was to
> move the common code (ioremap and mapbus) into a common
> file and share the code with pci-host-generic.c
> 
> >> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
> >> new file mode 100644
> >> index 0000000..ea84365
> >> --- /dev/null
> >> +++ b/drivers/acpi/pci_mcfg.c
> >> ...
> >> +int __weak pci_mmconfig_map_resource(struct device *dev,
> >> +     struct pci_mmcfg_region *mcfg)
> >> +{
> >> +     struct resource *tmp;
> >> +     void __iomem *vaddr;
> >> +
> >> +     tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
> >> +     if (tmp) {
> >> +             dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
> >> +                     &mcfg->res, tmp->name, tmp);
> >> +             return -EBUSY;
> >> +     }
> >
> > I think this is a mistake in the x86 MCFG support that we should not
> > carry over to a generic implementation.  We should not use the MCFG
> > table for resource reservation because MCFG is not defined by the ACPI
> > spec and an OS need not include support for it.  The platform must
> > indicate in some other, more generic way, that ECAM space is reserved.
> > This probably means ECAM space should be declared in a PNP0C02 _CRS
> > method (see the PCI Firmware Spec r3.0, sec 4.1.2, note 2).
> >
> > We might need some kind of x86-specific quirk that does this, or a
> > pcibios hook or something here; I just don't think it should be
> > generic.
> >
> >> +int __init pci_mmconfig_parse_table(void)
> >> +{
> >> +     return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
> >> +}
> >
> > I don't like the fact that we parse the entire MCFG table at once
> > here.  I think we should look for the information we need when we are
> > claiming a PCI host bridge, e.g., in acpi_pci_root_add().  This might
> > not be a great fit for the way ACPI table management works, but I
> > think it's better to do things on-demand rather than just-in-case.
> 
> There is an overhead of looking up this table, and the information
> available there is very limited (i.e, segment, start_bus, end_bus
> and address). My approach in the above patch is to save this info
> into an array at boot time and avoid multiple lookups.

We need to look up MCFG info once per host bridge, so I don't think
there's any performance issue here.  But we do use acpi_table_parse(),
which is __init, and *that* is a reason why we might need to parse the
entire MCFG at boot-time.  But this is the least of our worries in any
case.

> >> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
> >> index 89ab057..e9450ef 100644
> >> --- a/include/linux/pci-acpi.h
> >> +++ b/include/linux/pci-acpi.h
> >> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
> >>  #define RESET_DELAY_DSM              0x08
> >>  #define FUNCTION_DELAY_DSM   0x09
> >>
> >> +/* common API to maintain list of MCFG regions */
> >> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
> >> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
> >> +
> >> +struct pci_mmcfg_region {
> >> +     struct list_head list;
> >> +     struct resource res;
> >> +     u64 address;
> >> +     char __iomem *virt;
> >> +     u16 segment;
> >> +     u8 start_bus;
> >> +     u8 end_bus;
> >> +     char name[PCI_MMCFG_RESOURCE_NAME_LEN];
> >> +};
> >> +
> >> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
> >> +                            phys_addr_t addr);
> >> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
> >> +
> >> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
> >> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
> >> +                                                     int end, u64 addr);
> >> +extern int pci_mmconfig_map_resource(struct device *dev,
> >> +     struct pci_mmcfg_region *mcfg);
> >> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region *mcfg);
> >> +extern int pci_mmconfig_enabled(void);
> >> +extern int __init pci_mmconfig_parse_table(void);
> >> +
> >> +extern struct list_head pci_mmcfg_list;
> >> +
> >> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
> >> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
> >> +
> >
> > With the exception of pci_mmconfig_parse_table(), nothing here is
> > ACPI-specific.  I'd like to see the PCI ECAM-related interfaces
> > (hopefully not these exact ones, but a more rational set) put in
> > something like include/linux/pci-ecam.h.
> >
> >>  #else        /* CONFIG_ACPI */
> >>  static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
> >>  static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }
> 
> I can update this patch to
>  -  drop the pci_mmcfg_list handling from generic case
>  -  move common ECAM code so that it can be shared with
>     pci-host-generic.c
> if that is what you are looking for. The code will end up looking much
> simpler.

I think we should ignore x86 mmconfig for now.  It is absurdly
complicated and I'm not sure it's fixable.  I *do* want to keep
drivers/acpi/pci_root.c for all ACPI host bridges, including x86,
ia64, and arm64.

So I think we should write generic MCFG and ECAM support from scratch
for arm64.  Something like this:

  - Add an acpi_mcfg_init(), maybe in drivers/acpi/pci_mcfg.c, to be
    called from acpi_init() to copy MCFG info to something we can
    access after __init.  This would not reserve resources, but
    probably does have to ioremap() the regions to support
    raw_pci_read().

  - Implement raw_pci_read(), which is "special" because ACPI needs it
    for PCI config access from AML.  It's supposed to be "always
    accessible" and we don't have a struct pci_bus *, so this probably
    has to use the MCFG copy and the ioremap done above.  Maybe it
    should go in the same file.  This is completely independent of
    the PCI core and PCI data structures.

  - Implement arm64 pci_acpi_scan_root() that calls
    acpi_pci_root_create() with an .init_info() function that calls
    acpi_pci_root_get_mcfg_addr() to read _CBA, and if that fails,
    looks up the bus range in the MCFG copy from above.  It should
    call request_mem_region().  For a region from _CBA, it should call
    ioremap().  For regions from MCFG it can probably use the ioremap
    done by acpi_mcfg_init().
    
    I know acpi_pci_root_add() calls acpi_pci_root_get_mcfg_addr()
    before calling pci_acpi_scan_root(), but I think that's wrong
    because (a) some arches, e.g., ia64, don't use ECAM and (b) _CBA
    and MCFG should be handled in the same place.
    
    I know calling request_mem_region() here will probably be an
    ordering problem because the PNP0C02 driver hasn't reserved
    resources yet.  But the host bridge driver is using the region and
    it should reserve it.

  - If we store the ECAM mapped base address in the sysdata or struct
    pci_host_bridge, the normal config accessors can use
    pci_generic_config_read() with a new generic .map_bus() function.

    On x86, the normal config access path is:

      pci_read(struct pci_bus *, ...)
        raw_pci_read(seg, bus#, ...)
	  raw_pci_ext_ops->read(seg, bus#, ...)
	    pci_mmcfg_read(seg, bus#, ...)
	      pci_dev_base
	        pci_mmconfig_lookup(seg, bus#)

    I think this is somewhat backwards because we start with a pci_bus
    pointer, so we *could* have a nice simple bus-specific accessor,
    but we throw that pointer away, so pci_mmcfg_read() has to start
    over and look up the ECAM offset from scratch, which makes it all
    unnecessarily complicated.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tomasz Nowicki March 9, 2016, 9:13 a.m. UTC | #8
Hi Bjorn,

Thanks for your pointers! See my comments inline. Aslo, can you please 
have a look at my previous patch set v4 and check how many of your 
comments are already addressed there. We may want to back to it then.

https://lkml.org/lkml/2016/2/4/646
Especially patches [0-6] which handle MMCONFIG refactoring.


On 05.03.2016 05:14, Bjorn Helgaas wrote:
> On Fri, Mar 04, 2016 at 02:05:56PM +0530, Jayachandran Chandrashekaran Nair wrote:
>> On Fri, Mar 4, 2016 at 4:21 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>>> Hi Tomasz, Jayachandran, et al,
>>>
>>> On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
>>>> From: Jayachandran C <jchandra@broadcom.com>
>>>>
>>>> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
>>>> to share the API and code with ARM64 later. The corresponding
>>>> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
>>>>
>>>> As a part of this we introduce three functions that can be
>>>> implemented by the arch code: pci_mmconfig_map_resource() to map a
>>>> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
>>>> unmap and pci_mmconfig_enabled to see if the arch setup of
>>>> mcfg entries was successful. We also provide weak implementations
>>>> of these, which will be used from ARM64. On x86, we retain the
>>>> old logic by providing platform specific implementation.
>>>>
>>>> This patch is purely rearranging code, it should not have any
>>>> impact on the logic of MCFG parsing or list handling.
>>>
>>> I definitely want to figure out how to make this work well on ARM64.
>>> I need to ponder this some more, so these are just some initial
>>> thoughts.
>>>
>>> My first impression is that (a) the x86 MCFG code is an unmitigated
>>> disaster, and (b) we're trying a little too hard to make that mess
>>> generic.  I think we might be better served if we came up with some
>>> cleaner, more generic code that we can use for ARM64 today, and
>>> migrate x86 toward that over time.
>>>
>>> My concern is that if we elevate the current x86 code to be
>>> "arch-independent", we will be perpetuating some interfaces and
>>> designs that shouldn't be allowed to escape arch/x86.
>>
>> I think the major decision is whether to maintain the pci_mmcfg_list
>> for all architectures or not. My initial plan was not to do this because
>> of the mess (basically the ECAM region info should be attached to
>> the pci root and not maintained in a separate list that needs locking),
>> The patch I posted initially https://patchwork.ozlabs.org/patch/553464/
>> had a much simpler way of handling the MCFG table without using
>> the list.
>
> I agree that ECAM info should be attached to the PCI host controller.
> That should simplify locking and hot-add and hot-removal of host
> controllers.
>
> I think pci_mmcfg_list is an implementation detail that may not need
> to be generic.  I certainly don't think it needs to be part of the
> interface.
>
>> In x86 case it is not feasible to remove using the pci_mmcfg_list.
>> The only use of it outside is in xen that can be fixed up.
>>
>>> Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
>>> ACPI-specific, and could potentially be used for non-ACPI bridges that
>>> support ECAM.  I'd like to see that sort of code moved to a new file
>>> like drivers/pci/ecam.c.
>>>
>>> There's a little bit of overlap here with the ECAM code in
>>> pci-host-generic.c.  I'm not sure whether or how to include that, but
>>> it's a very good example of how simple this *should* be: probe the
>>> host bridge, discover the ECAM region, request the region, ioremap it,
>>> done.
>>
>> I had a similar approach in my initial patchset, please see the patch
>> above. The resource for ECAM is mapped similar to the the way
>> pci-host-generic.c handled it. An additional step I could do was to
>> move the common code (ioremap and mapbus) into a common
>> file and share the code with pci-host-generic.c
>>
>>>> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
>>>> new file mode 100644
>>>> index 0000000..ea84365
>>>> --- /dev/null
>>>> +++ b/drivers/acpi/pci_mcfg.c
>>>> ...
>>>> +int __weak pci_mmconfig_map_resource(struct device *dev,
>>>> +     struct pci_mmcfg_region *mcfg)
>>>> +{
>>>> +     struct resource *tmp;
>>>> +     void __iomem *vaddr;
>>>> +
>>>> +     tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
>>>> +     if (tmp) {
>>>> +             dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
>>>> +                     &mcfg->res, tmp->name, tmp);
>>>> +             return -EBUSY;
>>>> +     }
>>>
>>> I think this is a mistake in the x86 MCFG support that we should not
>>> carry over to a generic implementation.  We should not use the MCFG
>>> table for resource reservation because MCFG is not defined by the ACPI
>>> spec and an OS need not include support for it.  The platform must
>>> indicate in some other, more generic way, that ECAM space is reserved.
>>> This probably means ECAM space should be declared in a PNP0C02 _CRS
>>> method (see the PCI Firmware Spec r3.0, sec 4.1.2, note 2).
>>>
>>> We might need some kind of x86-specific quirk that does this, or a
>>> pcibios hook or something here; I just don't think it should be
>>> generic.
>>>
>>>> +int __init pci_mmconfig_parse_table(void)
>>>> +{
>>>> +     return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
>>>> +}
>>>
>>> I don't like the fact that we parse the entire MCFG table at once
>>> here.  I think we should look for the information we need when we are
>>> claiming a PCI host bridge, e.g., in acpi_pci_root_add().  This might
>>> not be a great fit for the way ACPI table management works, but I
>>> think it's better to do things on-demand rather than just-in-case.
>>
>> There is an overhead of looking up this table, and the information
>> available there is very limited (i.e, segment, start_bus, end_bus
>> and address). My approach in the above patch is to save this info
>> into an array at boot time and avoid multiple lookups.
>
> We need to look up MCFG info once per host bridge, so I don't think
> there's any performance issue here.  But we do use acpi_table_parse(),
> which is __init, and *that* is a reason why we might need to parse the
> entire MCFG at boot-time.  But this is the least of our worries in any
> case.
>
>>>> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
>>>> index 89ab057..e9450ef 100644
>>>> --- a/include/linux/pci-acpi.h
>>>> +++ b/include/linux/pci-acpi.h
>>>> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
>>>>   #define RESET_DELAY_DSM              0x08
>>>>   #define FUNCTION_DELAY_DSM   0x09
>>>>
>>>> +/* common API to maintain list of MCFG regions */
>>>> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
>>>> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
>>>> +
>>>> +struct pci_mmcfg_region {
>>>> +     struct list_head list;
>>>> +     struct resource res;
>>>> +     u64 address;
>>>> +     char __iomem *virt;
>>>> +     u16 segment;
>>>> +     u8 start_bus;
>>>> +     u8 end_bus;
>>>> +     char name[PCI_MMCFG_RESOURCE_NAME_LEN];
>>>> +};
>>>> +
>>>> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
>>>> +                            phys_addr_t addr);
>>>> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
>>>> +
>>>> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
>>>> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
>>>> +                                                     int end, u64 addr);
>>>> +extern int pci_mmconfig_map_resource(struct device *dev,
>>>> +     struct pci_mmcfg_region *mcfg);
>>>> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region *mcfg);
>>>> +extern int pci_mmconfig_enabled(void);
>>>> +extern int __init pci_mmconfig_parse_table(void);
>>>> +
>>>> +extern struct list_head pci_mmcfg_list;
>>>> +
>>>> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
>>>> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
>>>> +
>>>
>>> With the exception of pci_mmconfig_parse_table(), nothing here is
>>> ACPI-specific.  I'd like to see the PCI ECAM-related interfaces
>>> (hopefully not these exact ones, but a more rational set) put in
>>> something like include/linux/pci-ecam.h.
>>>
>>>>   #else        /* CONFIG_ACPI */
>>>>   static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
>>>>   static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }
>>
>> I can update this patch to
>>   -  drop the pci_mmcfg_list handling from generic case
>>   -  move common ECAM code so that it can be shared with
>>      pci-host-generic.c
>> if that is what you are looking for. The code will end up looking much
>> simpler.
>
> I think we should ignore x86 mmconfig for now.  It is absurdly
> complicated and I'm not sure it's fixable.  I *do* want to keep
> drivers/acpi/pci_root.c for all ACPI host bridges, including x86,
> ia64, and arm64.
>
> So I think we should write generic MCFG and ECAM support from scratch
> for arm64.  Something like this:
>
>    - Add an acpi_mcfg_init(), maybe in drivers/acpi/pci_mcfg.c, to be
>      called from acpi_init() to copy MCFG info to something we can
>      access after __init.  This would not reserve resources, but
>      probably does have to ioremap() the regions to support
>      raw_pci_read().

As said, ECAM and ACPI specific code was isolated in previous patch. 
There, I tried to leave x86 complication in arch/x86/ and extract 
generic functionalities to driver/pci/ecam.c as the library.

>
>    - Implement raw_pci_read(), which is "special" because ACPI needs it
>      for PCI config access from AML.  It's supposed to be "always
>      accessible" and we don't have a struct pci_bus *, so this probably
>      has to use the MCFG copy and the ioremap done above.  Maybe it
>      should go in the same file.  This is completely independent of
>      the PCI core and PCI data structures.
We were looking for the answer which would justify RAW PCI config 
accessors being for ARM64 world. Unfortunately, nobody was able to show 
real use case for ARM64. Do you see the reason we need this? Our 
conclusion was to leave it empty for ARM64 which in turn makes code 
simpler. I am not ASWG member while that was under discussion so I will 
ask Lorenzo to elaborate more on this.

>
>    - Implement arm64 pci_acpi_scan_root() that calls
>      acpi_pci_root_create() with an .init_info() function that calls
>      acpi_pci_root_get_mcfg_addr() to read _CBA, and if that fails,
>      looks up the bus range in the MCFG copy from above.  It should
>      call request_mem_region().  For a region from _CBA, it should call
>      ioremap().  For regions from MCFG it can probably use the ioremap
>      done by acpi_mcfg_init().
Yes, Expanding .init_info() to check for _CBA is good point.

>
>      I know acpi_pci_root_add() calls acpi_pci_root_get_mcfg_addr()
>      before calling pci_acpi_scan_root(), but I think that's wrong
>      because (a) some arches, e.g., ia64, don't use ECAM and (b) _CBA
>      and MCFG should be handled in the same place.
>
>      I know calling request_mem_region() here will probably be an
>      ordering problem because the PNP0C02 driver hasn't reserved
>      resources yet.  But the host bridge driver is using the region and
>      it should reserve it.
>
>    - If we store the ECAM mapped base address in the sysdata or struct
>      pci_host_bridge, the normal config accessors can use
>      pci_generic_config_read() with a new generic .map_bus() function.

pci_generic_config_{read|write}() is what we want to use, actually we do 
now, but ECAM region and sysdata association will remove ECAM region 
lookup step (see patch 09/15 of this series).

>
>      On x86, the normal config access path is:
>
>        pci_read(struct pci_bus *, ...)
>          raw_pci_read(seg, bus#, ...)
> 	  raw_pci_ext_ops->read(seg, bus#, ...)
> 	    pci_mmcfg_read(seg, bus#, ...)
> 	      pci_dev_base
> 	        pci_mmconfig_lookup(seg, bus#)
>
>      I think this is somewhat backwards because we start with a pci_bus
>      pointer, so we *could* have a nice simple bus-specific accessor,
>      but we throw that pointer away, so pci_mmcfg_read() has to start
>      over and look up the ECAM offset from scratch, which makes it all
>      unnecessarily complicated.
>

As you pointed out raw_pci_{read|write} make things complicated, so IMO 
we should either say they are absolutely necessary (and then think how 
to simplify it) or just use simple bus-specific accessor (patch 02/15) 
e.g. for ARM64.

Any comments appreciated.

Thanks,
Tomasz
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tomasz Nowicki March 9, 2016, 9:14 a.m. UTC | #9
On 09.03.2016 10:13, Tomasz Nowicki wrote:
>> So I think we should write generic MCFG and ECAM support from scratch
>> for arm64.  Something like this:
>>
>>    - Add an acpi_mcfg_init(), maybe in drivers/acpi/pci_mcfg.c, to be
>>      called from acpi_init() to copy MCFG info to something we can
>>      access after __init.  This would not reserve resources, but
>>      probably does have to ioremap() the regions to support
>>      raw_pci_read().
>
> As said, ECAM and ACPI specific code was isolated in previous patch.

I meant to say, in my previous patch set (V4), sorry.

Tomasz
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jayachandran Chandrashekaran Nair March 9, 2016, 10:10 a.m. UTC | #10
Hi Tomasz,

On Wed, Mar 9, 2016 at 2:43 PM, Tomasz Nowicki <tn@semihalf.com> wrote:
> Hi Bjorn,
>
> Thanks for your pointers! See my comments inline. Aslo, can you please have
> a look at my previous patch set v4 and check how many of your comments are
> already addressed there. We may want to back to it then.
>
> https://lkml.org/lkml/2016/2/4/646
> Especially patches [0-6] which handle MMCONFIG refactoring.
>
>
> On 05.03.2016 05:14, Bjorn Helgaas wrote:
>>
>> On Fri, Mar 04, 2016 at 02:05:56PM +0530, Jayachandran Chandrashekaran
>> Nair wrote:
>>>
>>> On Fri, Mar 4, 2016 at 4:21 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>>>>
>>>> Hi Tomasz, Jayachandran, et al,
>>>>
>>>> On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
>>>>>
>>>>> From: Jayachandran C <jchandra@broadcom.com>
>>>>>
>>>>> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
>>>>> to share the API and code with ARM64 later. The corresponding
>>>>> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
>>>>>
>>>>> As a part of this we introduce three functions that can be
>>>>> implemented by the arch code: pci_mmconfig_map_resource() to map a
>>>>> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
>>>>> unmap and pci_mmconfig_enabled to see if the arch setup of
>>>>> mcfg entries was successful. We also provide weak implementations
>>>>> of these, which will be used from ARM64. On x86, we retain the
>>>>> old logic by providing platform specific implementation.
>>>>>
>>>>> This patch is purely rearranging code, it should not have any
>>>>> impact on the logic of MCFG parsing or list handling.
>>>>
>>>>
>>>> I definitely want to figure out how to make this work well on ARM64.
>>>> I need to ponder this some more, so these are just some initial
>>>> thoughts.
>>>>
>>>> My first impression is that (a) the x86 MCFG code is an unmitigated
>>>> disaster, and (b) we're trying a little too hard to make that mess
>>>> generic.  I think we might be better served if we came up with some
>>>> cleaner, more generic code that we can use for ARM64 today, and
>>>> migrate x86 toward that over time.
>>>>
>>>> My concern is that if we elevate the current x86 code to be
>>>> "arch-independent", we will be perpetuating some interfaces and
>>>> designs that shouldn't be allowed to escape arch/x86.
>>>
>>>
>>> I think the major decision is whether to maintain the pci_mmcfg_list
>>> for all architectures or not. My initial plan was not to do this because
>>> of the mess (basically the ECAM region info should be attached to
>>> the pci root and not maintained in a separate list that needs locking),
>>> The patch I posted initially https://patchwork.ozlabs.org/patch/553464/
>>> had a much simpler way of handling the MCFG table without using
>>> the list.
>>
>>
>> I agree that ECAM info should be attached to the PCI host controller.
>> That should simplify locking and hot-add and hot-removal of host
>> controllers.
>>
>> I think pci_mmcfg_list is an implementation detail that may not need
>> to be generic.  I certainly don't think it needs to be part of the
>> interface.
>>
>>> In x86 case it is not feasible to remove using the pci_mmcfg_list.
>>> The only use of it outside is in xen that can be fixed up.
>>>
>>>> Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
>>>> ACPI-specific, and could potentially be used for non-ACPI bridges that
>>>> support ECAM.  I'd like to see that sort of code moved to a new file
>>>> like drivers/pci/ecam.c.
>>>>
>>>> There's a little bit of overlap here with the ECAM code in
>>>> pci-host-generic.c.  I'm not sure whether or how to include that, but
>>>> it's a very good example of how simple this *should* be: probe the
>>>> host bridge, discover the ECAM region, request the region, ioremap it,
>>>> done.
>>>
>>>
>>> I had a similar approach in my initial patchset, please see the patch
>>> above. The resource for ECAM is mapped similar to the the way
>>> pci-host-generic.c handled it. An additional step I could do was to
>>> move the common code (ioremap and mapbus) into a common
>>> file and share the code with pci-host-generic.c
>>>
>>>>> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
>>>>> new file mode 100644
>>>>> index 0000000..ea84365
>>>>> --- /dev/null
>>>>> +++ b/drivers/acpi/pci_mcfg.c
>>>>> ...
>>>>> +int __weak pci_mmconfig_map_resource(struct device *dev,
>>>>> +     struct pci_mmcfg_region *mcfg)
>>>>> +{
>>>>> +     struct resource *tmp;
>>>>> +     void __iomem *vaddr;
>>>>> +
>>>>> +     tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
>>>>> +     if (tmp) {
>>>>> +             dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
>>>>> +                     &mcfg->res, tmp->name, tmp);
>>>>> +             return -EBUSY;
>>>>> +     }
>>>>
>>>>
>>>> I think this is a mistake in the x86 MCFG support that we should not
>>>> carry over to a generic implementation.  We should not use the MCFG
>>>> table for resource reservation because MCFG is not defined by the ACPI
>>>> spec and an OS need not include support for it.  The platform must
>>>> indicate in some other, more generic way, that ECAM space is reserved.
>>>> This probably means ECAM space should be declared in a PNP0C02 _CRS
>>>> method (see the PCI Firmware Spec r3.0, sec 4.1.2, note 2).
>>>>
>>>> We might need some kind of x86-specific quirk that does this, or a
>>>> pcibios hook or something here; I just don't think it should be
>>>> generic.
>>>>
>>>>> +int __init pci_mmconfig_parse_table(void)
>>>>> +{
>>>>> +     return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
>>>>> +}
>>>>
>>>>
>>>> I don't like the fact that we parse the entire MCFG table at once
>>>> here.  I think we should look for the information we need when we are
>>>> claiming a PCI host bridge, e.g., in acpi_pci_root_add().  This might
>>>> not be a great fit for the way ACPI table management works, but I
>>>> think it's better to do things on-demand rather than just-in-case.
>>>
>>>
>>> There is an overhead of looking up this table, and the information
>>> available there is very limited (i.e, segment, start_bus, end_bus
>>> and address). My approach in the above patch is to save this info
>>> into an array at boot time and avoid multiple lookups.
>>
>>
>> We need to look up MCFG info once per host bridge, so I don't think
>> there's any performance issue here.  But we do use acpi_table_parse(),
>> which is __init, and *that* is a reason why we might need to parse the
>> entire MCFG at boot-time.  But this is the least of our worries in any
>> case.
>>
>>>>> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
>>>>> index 89ab057..e9450ef 100644
>>>>> --- a/include/linux/pci-acpi.h
>>>>> +++ b/include/linux/pci-acpi.h
>>>>> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
>>>>>   #define RESET_DELAY_DSM              0x08
>>>>>   #define FUNCTION_DELAY_DSM   0x09
>>>>>
>>>>> +/* common API to maintain list of MCFG regions */
>>>>> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
>>>>> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
>>>>> +
>>>>> +struct pci_mmcfg_region {
>>>>> +     struct list_head list;
>>>>> +     struct resource res;
>>>>> +     u64 address;
>>>>> +     char __iomem *virt;
>>>>> +     u16 segment;
>>>>> +     u8 start_bus;
>>>>> +     u8 end_bus;
>>>>> +     char name[PCI_MMCFG_RESOURCE_NAME_LEN];
>>>>> +};
>>>>> +
>>>>> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start,
>>>>> u8 end,
>>>>> +                            phys_addr_t addr);
>>>>> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
>>>>> +
>>>>> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int
>>>>> bus);
>>>>> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int
>>>>> start,
>>>>> +                                                     int end, u64
>>>>> addr);
>>>>> +extern int pci_mmconfig_map_resource(struct device *dev,
>>>>> +     struct pci_mmcfg_region *mcfg);
>>>>> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region
>>>>> *mcfg);
>>>>> +extern int pci_mmconfig_enabled(void);
>>>>> +extern int __init pci_mmconfig_parse_table(void);
>>>>> +
>>>>> +extern struct list_head pci_mmcfg_list;
>>>>> +
>>>>> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
>>>>> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
>>>>> +
>>>>
>>>>
>>>> With the exception of pci_mmconfig_parse_table(), nothing here is
>>>> ACPI-specific.  I'd like to see the PCI ECAM-related interfaces
>>>> (hopefully not these exact ones, but a more rational set) put in
>>>> something like include/linux/pci-ecam.h.
>>>>
>>>>>   #else        /* CONFIG_ACPI */
>>>>>   static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
>>>>>   static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }
>>>
>>>
>>> I can update this patch to
>>>   -  drop the pci_mmcfg_list handling from generic case
>>>   -  move common ECAM code so that it can be shared with
>>>      pci-host-generic.c
>>> if that is what you are looking for. The code will end up looking much
>>> simpler.
>>
>>
>> I think we should ignore x86 mmconfig for now.  It is absurdly
>> complicated and I'm not sure it's fixable.  I *do* want to keep
>> drivers/acpi/pci_root.c for all ACPI host bridges, including x86,
>> ia64, and arm64.
>>
>> So I think we should write generic MCFG and ECAM support from scratch
>> for arm64.  Something like this:
>>
>>    - Add an acpi_mcfg_init(), maybe in drivers/acpi/pci_mcfg.c, to be
>>      called from acpi_init() to copy MCFG info to something we can
>>      access after __init.  This would not reserve resources, but
>>      probably does have to ioremap() the regions to support
>>      raw_pci_read().
>
>
> As said, ECAM and ACPI specific code was isolated in previous patch. There,
> I tried to leave x86 complication in arch/x86/ and extract generic
> functionalities to driver/pci/ecam.c as the library.
>
>>
>>    - Implement raw_pci_read(), which is "special" because ACPI needs it
>>      for PCI config access from AML.  It's supposed to be "always
>>      accessible" and we don't have a struct pci_bus *, so this probably
>>      has to use the MCFG copy and the ioremap done above.  Maybe it
>>      should go in the same file.  This is completely independent of
>>      the PCI core and PCI data structures.
>
> We were looking for the answer which would justify RAW PCI config accessors
> being for ARM64 world. Unfortunately, nobody was able to show real use case
> for ARM64. Do you see the reason we need this? Our conclusion was to leave
> it empty for ARM64 which in turn makes code simpler. I am not ASWG member
> while that was under discussion so I will ask Lorenzo to elaborate more on
> this.
>
>>
>>    - Implement arm64 pci_acpi_scan_root() that calls
>>      acpi_pci_root_create() with an .init_info() function that calls
>>      acpi_pci_root_get_mcfg_addr() to read _CBA, and if that fails,
>>      looks up the bus range in the MCFG copy from above.  It should
>>      call request_mem_region().  For a region from _CBA, it should call
>>      ioremap().  For regions from MCFG it can probably use the ioremap
>>      done by acpi_mcfg_init().
>
> Yes, Expanding .init_info() to check for _CBA is good point.
>
>>
>>      I know acpi_pci_root_add() calls acpi_pci_root_get_mcfg_addr()
>>      before calling pci_acpi_scan_root(), but I think that's wrong
>>      because (a) some arches, e.g., ia64, don't use ECAM and (b) _CBA
>>      and MCFG should be handled in the same place.
>>
>>      I know calling request_mem_region() here will probably be an
>>      ordering problem because the PNP0C02 driver hasn't reserved
>>      resources yet.  But the host bridge driver is using the region and
>>      it should reserve it.
>>
>>    - If we store the ECAM mapped base address in the sysdata or struct
>>      pci_host_bridge, the normal config accessors can use
>>      pci_generic_config_read() with a new generic .map_bus() function.
>
>
> pci_generic_config_{read|write}() is what we want to use, actually we do
> now, but ECAM region and sysdata association will remove ECAM region lookup
> step (see patch 09/15 of this series).
>
>>
>>      On x86, the normal config access path is:
>>
>>        pci_read(struct pci_bus *, ...)
>>          raw_pci_read(seg, bus#, ...)
>>           raw_pci_ext_ops->read(seg, bus#, ...)
>>             pci_mmcfg_read(seg, bus#, ...)
>>               pci_dev_base
>>                 pci_mmconfig_lookup(seg, bus#)
>>
>>      I think this is somewhat backwards because we start with a pci_bus
>>      pointer, so we *could* have a nice simple bus-specific accessor,
>>      but we throw that pointer away, so pci_mmcfg_read() has to start
>>      over and look up the ECAM offset from scratch, which makes it all
>>      unnecessarily complicated.
>>
>
> As you pointed out raw_pci_{read|write} make things complicated, so IMO we
> should either say they are absolutely necessary (and then think how to
> simplify it) or just use simple bus-specific accessor (patch 02/15) e.g. for
> ARM64.

Both raw_pci_read/write and a host controller with pci_generic_read/write can
be done without much trouble, please see the patch I had at:
https://patchwork.ozlabs.org/patch/575526/

I have been looking at Bjorn's suggestions and trying to see if I can update
my MCFG patch taking care of them. I will post an updated patchset soon,
unless you want to take this up.

JC.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tomasz Nowicki March 9, 2016, 10:50 a.m. UTC | #11
Hi Jayachandran,

On 09.03.2016 11:10, Jayachandran Chandrashekaran Nair wrote:
> Hi Tomasz,
>
> On Wed, Mar 9, 2016 at 2:43 PM, Tomasz Nowicki <tn@semihalf.com> wrote:
>> Hi Bjorn,
>>
>> Thanks for your pointers! See my comments inline. Aslo, can you please have
>> a look at my previous patch set v4 and check how many of your comments are
>> already addressed there. We may want to back to it then.
>>
>> https://lkml.org/lkml/2016/2/4/646
>> Especially patches [0-6] which handle MMCONFIG refactoring.
>>
>>
>> On 05.03.2016 05:14, Bjorn Helgaas wrote:
>>>
>>> On Fri, Mar 04, 2016 at 02:05:56PM +0530, Jayachandran Chandrashekaran
>>> Nair wrote:
>>>>
>>>> On Fri, Mar 4, 2016 at 4:21 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>>>>>
>>>>> Hi Tomasz, Jayachandran, et al,
>>>>>
>>>>> On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
>>>>>>
>>>>>> From: Jayachandran C <jchandra@broadcom.com>
>>>>>>
>>>>>> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
>>>>>> to share the API and code with ARM64 later. The corresponding
>>>>>> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
>>>>>>
>>>>>> As a part of this we introduce three functions that can be
>>>>>> implemented by the arch code: pci_mmconfig_map_resource() to map a
>>>>>> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
>>>>>> unmap and pci_mmconfig_enabled to see if the arch setup of
>>>>>> mcfg entries was successful. We also provide weak implementations
>>>>>> of these, which will be used from ARM64. On x86, we retain the
>>>>>> old logic by providing platform specific implementation.
>>>>>>
>>>>>> This patch is purely rearranging code, it should not have any
>>>>>> impact on the logic of MCFG parsing or list handling.
>>>>>
>>>>>
>>>>> I definitely want to figure out how to make this work well on ARM64.
>>>>> I need to ponder this some more, so these are just some initial
>>>>> thoughts.
>>>>>
>>>>> My first impression is that (a) the x86 MCFG code is an unmitigated
>>>>> disaster, and (b) we're trying a little too hard to make that mess
>>>>> generic.  I think we might be better served if we came up with some
>>>>> cleaner, more generic code that we can use for ARM64 today, and
>>>>> migrate x86 toward that over time.
>>>>>
>>>>> My concern is that if we elevate the current x86 code to be
>>>>> "arch-independent", we will be perpetuating some interfaces and
>>>>> designs that shouldn't be allowed to escape arch/x86.
>>>>
>>>>
>>>> I think the major decision is whether to maintain the pci_mmcfg_list
>>>> for all architectures or not. My initial plan was not to do this because
>>>> of the mess (basically the ECAM region info should be attached to
>>>> the pci root and not maintained in a separate list that needs locking),
>>>> The patch I posted initially https://patchwork.ozlabs.org/patch/553464/
>>>> had a much simpler way of handling the MCFG table without using
>>>> the list.
>>>
>>>
>>> I agree that ECAM info should be attached to the PCI host controller.
>>> That should simplify locking and hot-add and hot-removal of host
>>> controllers.
>>>
>>> I think pci_mmcfg_list is an implementation detail that may not need
>>> to be generic.  I certainly don't think it needs to be part of the
>>> interface.
>>>
>>>> In x86 case it is not feasible to remove using the pci_mmcfg_list.
>>>> The only use of it outside is in xen that can be fixed up.
>>>>
>>>>> Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
>>>>> ACPI-specific, and could potentially be used for non-ACPI bridges that
>>>>> support ECAM.  I'd like to see that sort of code moved to a new file
>>>>> like drivers/pci/ecam.c.
>>>>>
>>>>> There's a little bit of overlap here with the ECAM code in
>>>>> pci-host-generic.c.  I'm not sure whether or how to include that, but
>>>>> it's a very good example of how simple this *should* be: probe the
>>>>> host bridge, discover the ECAM region, request the region, ioremap it,
>>>>> done.
>>>>
>>>>
>>>> I had a similar approach in my initial patchset, please see the patch
>>>> above. The resource for ECAM is mapped similar to the the way
>>>> pci-host-generic.c handled it. An additional step I could do was to
>>>> move the common code (ioremap and mapbus) into a common
>>>> file and share the code with pci-host-generic.c
>>>>
>>>>>> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
>>>>>> new file mode 100644
>>>>>> index 0000000..ea84365
>>>>>> --- /dev/null
>>>>>> +++ b/drivers/acpi/pci_mcfg.c
>>>>>> ...
>>>>>> +int __weak pci_mmconfig_map_resource(struct device *dev,
>>>>>> +     struct pci_mmcfg_region *mcfg)
>>>>>> +{
>>>>>> +     struct resource *tmp;
>>>>>> +     void __iomem *vaddr;
>>>>>> +
>>>>>> +     tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
>>>>>> +     if (tmp) {
>>>>>> +             dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
>>>>>> +                     &mcfg->res, tmp->name, tmp);
>>>>>> +             return -EBUSY;
>>>>>> +     }
>>>>>
>>>>>
>>>>> I think this is a mistake in the x86 MCFG support that we should not
>>>>> carry over to a generic implementation.  We should not use the MCFG
>>>>> table for resource reservation because MCFG is not defined by the ACPI
>>>>> spec and an OS need not include support for it.  The platform must
>>>>> indicate in some other, more generic way, that ECAM space is reserved.
>>>>> This probably means ECAM space should be declared in a PNP0C02 _CRS
>>>>> method (see the PCI Firmware Spec r3.0, sec 4.1.2, note 2).
>>>>>
>>>>> We might need some kind of x86-specific quirk that does this, or a
>>>>> pcibios hook or something here; I just don't think it should be
>>>>> generic.
>>>>>
>>>>>> +int __init pci_mmconfig_parse_table(void)
>>>>>> +{
>>>>>> +     return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
>>>>>> +}
>>>>>
>>>>>
>>>>> I don't like the fact that we parse the entire MCFG table at once
>>>>> here.  I think we should look for the information we need when we are
>>>>> claiming a PCI host bridge, e.g., in acpi_pci_root_add().  This might
>>>>> not be a great fit for the way ACPI table management works, but I
>>>>> think it's better to do things on-demand rather than just-in-case.
>>>>
>>>>
>>>> There is an overhead of looking up this table, and the information
>>>> available there is very limited (i.e, segment, start_bus, end_bus
>>>> and address). My approach in the above patch is to save this info
>>>> into an array at boot time and avoid multiple lookups.
>>>
>>>
>>> We need to look up MCFG info once per host bridge, so I don't think
>>> there's any performance issue here.  But we do use acpi_table_parse(),
>>> which is __init, and *that* is a reason why we might need to parse the
>>> entire MCFG at boot-time.  But this is the least of our worries in any
>>> case.
>>>
>>>>>> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
>>>>>> index 89ab057..e9450ef 100644
>>>>>> --- a/include/linux/pci-acpi.h
>>>>>> +++ b/include/linux/pci-acpi.h
>>>>>> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
>>>>>>    #define RESET_DELAY_DSM              0x08
>>>>>>    #define FUNCTION_DELAY_DSM   0x09
>>>>>>
>>>>>> +/* common API to maintain list of MCFG regions */
>>>>>> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
>>>>>> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
>>>>>> +
>>>>>> +struct pci_mmcfg_region {
>>>>>> +     struct list_head list;
>>>>>> +     struct resource res;
>>>>>> +     u64 address;
>>>>>> +     char __iomem *virt;
>>>>>> +     u16 segment;
>>>>>> +     u8 start_bus;
>>>>>> +     u8 end_bus;
>>>>>> +     char name[PCI_MMCFG_RESOURCE_NAME_LEN];
>>>>>> +};
>>>>>> +
>>>>>> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start,
>>>>>> u8 end,
>>>>>> +                            phys_addr_t addr);
>>>>>> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
>>>>>> +
>>>>>> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int
>>>>>> bus);
>>>>>> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int
>>>>>> start,
>>>>>> +                                                     int end, u64
>>>>>> addr);
>>>>>> +extern int pci_mmconfig_map_resource(struct device *dev,
>>>>>> +     struct pci_mmcfg_region *mcfg);
>>>>>> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region
>>>>>> *mcfg);
>>>>>> +extern int pci_mmconfig_enabled(void);
>>>>>> +extern int __init pci_mmconfig_parse_table(void);
>>>>>> +
>>>>>> +extern struct list_head pci_mmcfg_list;
>>>>>> +
>>>>>> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
>>>>>> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
>>>>>> +
>>>>>
>>>>>
>>>>> With the exception of pci_mmconfig_parse_table(), nothing here is
>>>>> ACPI-specific.  I'd like to see the PCI ECAM-related interfaces
>>>>> (hopefully not these exact ones, but a more rational set) put in
>>>>> something like include/linux/pci-ecam.h.
>>>>>
>>>>>>    #else        /* CONFIG_ACPI */
>>>>>>    static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
>>>>>>    static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }
>>>>
>>>>
>>>> I can update this patch to
>>>>    -  drop the pci_mmcfg_list handling from generic case
>>>>    -  move common ECAM code so that it can be shared with
>>>>       pci-host-generic.c
>>>> if that is what you are looking for. The code will end up looking much
>>>> simpler.
>>>
>>>
>>> I think we should ignore x86 mmconfig for now.  It is absurdly
>>> complicated and I'm not sure it's fixable.  I *do* want to keep
>>> drivers/acpi/pci_root.c for all ACPI host bridges, including x86,
>>> ia64, and arm64.
>>>
>>> So I think we should write generic MCFG and ECAM support from scratch
>>> for arm64.  Something like this:
>>>
>>>     - Add an acpi_mcfg_init(), maybe in drivers/acpi/pci_mcfg.c, to be
>>>       called from acpi_init() to copy MCFG info to something we can
>>>       access after __init.  This would not reserve resources, but
>>>       probably does have to ioremap() the regions to support
>>>       raw_pci_read().
>>
>>
>> As said, ECAM and ACPI specific code was isolated in previous patch. There,
>> I tried to leave x86 complication in arch/x86/ and extract generic
>> functionalities to driver/pci/ecam.c as the library.
>>
>>>
>>>     - Implement raw_pci_read(), which is "special" because ACPI needs it
>>>       for PCI config access from AML.  It's supposed to be "always
>>>       accessible" and we don't have a struct pci_bus *, so this probably
>>>       has to use the MCFG copy and the ioremap done above.  Maybe it
>>>       should go in the same file.  This is completely independent of
>>>       the PCI core and PCI data structures.
>>
>> We were looking for the answer which would justify RAW PCI config accessors
>> being for ARM64 world. Unfortunately, nobody was able to show real use case
>> for ARM64. Do you see the reason we need this? Our conclusion was to leave
>> it empty for ARM64 which in turn makes code simpler. I am not ASWG member
>> while that was under discussion so I will ask Lorenzo to elaborate more on
>> this.
>>
>>>
>>>     - Implement arm64 pci_acpi_scan_root() that calls
>>>       acpi_pci_root_create() with an .init_info() function that calls
>>>       acpi_pci_root_get_mcfg_addr() to read _CBA, and if that fails,
>>>       looks up the bus range in the MCFG copy from above.  It should
>>>       call request_mem_region().  For a region from _CBA, it should call
>>>       ioremap().  For regions from MCFG it can probably use the ioremap
>>>       done by acpi_mcfg_init().
>>
>> Yes, Expanding .init_info() to check for _CBA is good point.
>>
>>>
>>>       I know acpi_pci_root_add() calls acpi_pci_root_get_mcfg_addr()
>>>       before calling pci_acpi_scan_root(), but I think that's wrong
>>>       because (a) some arches, e.g., ia64, don't use ECAM and (b) _CBA
>>>       and MCFG should be handled in the same place.
>>>
>>>       I know calling request_mem_region() here will probably be an
>>>       ordering problem because the PNP0C02 driver hasn't reserved
>>>       resources yet.  But the host bridge driver is using the region and
>>>       it should reserve it.
>>>
>>>     - If we store the ECAM mapped base address in the sysdata or struct
>>>       pci_host_bridge, the normal config accessors can use
>>>       pci_generic_config_read() with a new generic .map_bus() function.
>>
>>
>> pci_generic_config_{read|write}() is what we want to use, actually we do
>> now, but ECAM region and sysdata association will remove ECAM region lookup
>> step (see patch 09/15 of this series).
>>
>>>
>>>       On x86, the normal config access path is:
>>>
>>>         pci_read(struct pci_bus *, ...)
>>>           raw_pci_read(seg, bus#, ...)
>>>            raw_pci_ext_ops->read(seg, bus#, ...)
>>>              pci_mmcfg_read(seg, bus#, ...)
>>>                pci_dev_base
>>>                  pci_mmconfig_lookup(seg, bus#)
>>>
>>>       I think this is somewhat backwards because we start with a pci_bus
>>>       pointer, so we *could* have a nice simple bus-specific accessor,
>>>       but we throw that pointer away, so pci_mmcfg_read() has to start
>>>       over and look up the ECAM offset from scratch, which makes it all
>>>       unnecessarily complicated.
>>>
>>
>> As you pointed out raw_pci_{read|write} make things complicated, so IMO we
>> should either say they are absolutely necessary (and then think how to
>> simplify it) or just use simple bus-specific accessor (patch 02/15) e.g. for
>> ARM64.
>
> Both raw_pci_read/write and a host controller with pci_generic_read/write can
> be done without much trouble, please see the patch I had at:
> https://patchwork.ozlabs.org/patch/575526/

Yes it is doable, I implemented it in the same way in one of my initial 
patch series, about year ago. I'm questioning raw accessors presence on 
per-arch basis. If it is really needed for all archs, then we definitely 
should implement it. If ARM64 does not care for it, there is no point to 
complicate it. Especially, I mean all kind of PCI config space quirk we 
will need to handle right after this patch got merged, see:
[PATCH V5 13/15] pci, acpi: Match PCI config space accessors against 
platfrom specific quirks.

and

https://lkml.org/lkml/2016/2/9/627
https://lkml.org/lkml/2016/2/8/967

Giving these quirks, raw accessors are not so easy.

>
> I have been looking at Bjorn's suggestions and trying to see if I can update
> my MCFG patch taking care of them. I will post an updated patch set soon,
> unless you want to take this up.
>

Yes, I want to post next version and keep this patch set together, if 
you and Bjorn are okay. I am feeling that my previous patch set is close 
to what Bjorn suggested, modulo the way we keep MCFG regions. Lets 
discuss it here, then I will post it as next version. I am looking 
forward to hear Bjorn's comment on my previous patch set.

Tomasz
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jayachandran Chandrashekaran Nair March 10, 2016, 1:08 p.m. UTC | #12
Hi Tomasz,

On Wed, Mar 9, 2016 at 4:20 PM, Tomasz Nowicki <tn@semihalf.com> wrote:
> Hi Jayachandran,
>
>
> On 09.03.2016 11:10, Jayachandran Chandrashekaran Nair wrote:
>>
>> Hi Tomasz,
>>
>> On Wed, Mar 9, 2016 at 2:43 PM, Tomasz Nowicki <tn@semihalf.com> wrote:
>>>
>>> Hi Bjorn,
>>>
>>> Thanks for your pointers! See my comments inline. Aslo, can you please
>>> have
>>> a look at my previous patch set v4 and check how many of your comments
>>> are
>>> already addressed there. We may want to back to it then.
>>>
>>> https://lkml.org/lkml/2016/2/4/646
>>> Especially patches [0-6] which handle MMCONFIG refactoring.
>>>
>>>
>>> On 05.03.2016 05:14, Bjorn Helgaas wrote:
>>>>
>>>>
>>>> On Fri, Mar 04, 2016 at 02:05:56PM +0530, Jayachandran Chandrashekaran
>>>> Nair wrote:
>>>>>
>>>>>
>>>>> On Fri, Mar 4, 2016 at 4:21 AM, Bjorn Helgaas <helgaas@kernel.org>
>>>>> wrote:
>>>>>>
>>>>>>
>>>>>> Hi Tomasz, Jayachandran, et al,
>>>>>>
>>>>>> On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
>>>>>>>
>>>>>>>
>>>>>>> From: Jayachandran C <jchandra@broadcom.com>
>>>>>>>
>>>>>>> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
>>>>>>> to share the API and code with ARM64 later. The corresponding
>>>>>>> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
>>>>>>>
>>>>>>> As a part of this we introduce three functions that can be
>>>>>>> implemented by the arch code: pci_mmconfig_map_resource() to map a
>>>>>>> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
>>>>>>> unmap and pci_mmconfig_enabled to see if the arch setup of
>>>>>>> mcfg entries was successful. We also provide weak implementations
>>>>>>> of these, which will be used from ARM64. On x86, we retain the
>>>>>>> old logic by providing platform specific implementation.
>>>>>>>
>>>>>>> This patch is purely rearranging code, it should not have any
>>>>>>> impact on the logic of MCFG parsing or list handling.
>>>>>>
>>>>>>
>>>>>>
>>>>>> I definitely want to figure out how to make this work well on ARM64.
>>>>>> I need to ponder this some more, so these are just some initial
>>>>>> thoughts.
>>>>>>
>>>>>> My first impression is that (a) the x86 MCFG code is an unmitigated
>>>>>> disaster, and (b) we're trying a little too hard to make that mess
>>>>>> generic.  I think we might be better served if we came up with some
>>>>>> cleaner, more generic code that we can use for ARM64 today, and
>>>>>> migrate x86 toward that over time.
>>>>>>
>>>>>> My concern is that if we elevate the current x86 code to be
>>>>>> "arch-independent", we will be perpetuating some interfaces and
>>>>>> designs that shouldn't be allowed to escape arch/x86.
>>>>>
>>>>>
>>>>>
>>>>> I think the major decision is whether to maintain the pci_mmcfg_list
>>>>> for all architectures or not. My initial plan was not to do this
>>>>> because
>>>>> of the mess (basically the ECAM region info should be attached to
>>>>> the pci root and not maintained in a separate list that needs locking),
>>>>> The patch I posted initially https://patchwork.ozlabs.org/patch/553464/
>>>>> had a much simpler way of handling the MCFG table without using
>>>>> the list.
>>>>
>>>>
>>>>
>>>> I agree that ECAM info should be attached to the PCI host controller.
>>>> That should simplify locking and hot-add and hot-removal of host
>>>> controllers.
>>>>
>>>> I think pci_mmcfg_list is an implementation detail that may not need
>>>> to be generic.  I certainly don't think it needs to be part of the
>>>> interface.
>>>>
>>>>> In x86 case it is not feasible to remove using the pci_mmcfg_list.
>>>>> The only use of it outside is in xen that can be fixed up.
>>>>>
>>>>>> Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
>>>>>> ACPI-specific, and could potentially be used for non-ACPI bridges that
>>>>>> support ECAM.  I'd like to see that sort of code moved to a new file
>>>>>> like drivers/pci/ecam.c.
>>>>>>
>>>>>> There's a little bit of overlap here with the ECAM code in
>>>>>> pci-host-generic.c.  I'm not sure whether or how to include that, but
>>>>>> it's a very good example of how simple this *should* be: probe the
>>>>>> host bridge, discover the ECAM region, request the region, ioremap it,
>>>>>> done.
>>>>>
>>>>>
>>>>>
>>>>> I had a similar approach in my initial patchset, please see the patch
>>>>> above. The resource for ECAM is mapped similar to the the way
>>>>> pci-host-generic.c handled it. An additional step I could do was to
>>>>> move the common code (ioremap and mapbus) into a common
>>>>> file and share the code with pci-host-generic.c
>>>>>
>>>>>>> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
>>>>>>> new file mode 100644
>>>>>>> index 0000000..ea84365
>>>>>>> --- /dev/null
>>>>>>> +++ b/drivers/acpi/pci_mcfg.c
>>>>>>> ...
>>>>>>> +int __weak pci_mmconfig_map_resource(struct device *dev,
>>>>>>> +     struct pci_mmcfg_region *mcfg)
>>>>>>> +{
>>>>>>> +     struct resource *tmp;
>>>>>>> +     void __iomem *vaddr;
>>>>>>> +
>>>>>>> +     tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
>>>>>>> +     if (tmp) {
>>>>>>> +             dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
>>>>>>> +                     &mcfg->res, tmp->name, tmp);
>>>>>>> +             return -EBUSY;
>>>>>>> +     }
>>>>>>
>>>>>>
>>>>>>
>>>>>> I think this is a mistake in the x86 MCFG support that we should not
>>>>>> carry over to a generic implementation.  We should not use the MCFG
>>>>>> table for resource reservation because MCFG is not defined by the ACPI
>>>>>> spec and an OS need not include support for it.  The platform must
>>>>>> indicate in some other, more generic way, that ECAM space is reserved.
>>>>>> This probably means ECAM space should be declared in a PNP0C02 _CRS
>>>>>> method (see the PCI Firmware Spec r3.0, sec 4.1.2, note 2).
>>>>>>
>>>>>> We might need some kind of x86-specific quirk that does this, or a
>>>>>> pcibios hook or something here; I just don't think it should be
>>>>>> generic.
>>>>>>
>>>>>>> +int __init pci_mmconfig_parse_table(void)
>>>>>>> +{
>>>>>>> +     return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
>>>>>>> +}
>>>>>>
>>>>>>
>>>>>>
>>>>>> I don't like the fact that we parse the entire MCFG table at once
>>>>>> here.  I think we should look for the information we need when we are
>>>>>> claiming a PCI host bridge, e.g., in acpi_pci_root_add().  This might
>>>>>> not be a great fit for the way ACPI table management works, but I
>>>>>> think it's better to do things on-demand rather than just-in-case.
>>>>>
>>>>>
>>>>>
>>>>> There is an overhead of looking up this table, and the information
>>>>> available there is very limited (i.e, segment, start_bus, end_bus
>>>>> and address). My approach in the above patch is to save this info
>>>>> into an array at boot time and avoid multiple lookups.
>>>>
>>>>
>>>>
>>>> We need to look up MCFG info once per host bridge, so I don't think
>>>> there's any performance issue here.  But we do use acpi_table_parse(),
>>>> which is __init, and *that* is a reason why we might need to parse the
>>>> entire MCFG at boot-time.  But this is the least of our worries in any
>>>> case.
>>>>
>>>>>>> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
>>>>>>> index 89ab057..e9450ef 100644
>>>>>>> --- a/include/linux/pci-acpi.h
>>>>>>> +++ b/include/linux/pci-acpi.h
>>>>>>> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
>>>>>>>    #define RESET_DELAY_DSM              0x08
>>>>>>>    #define FUNCTION_DELAY_DSM   0x09
>>>>>>>
>>>>>>> +/* common API to maintain list of MCFG regions */
>>>>>>> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
>>>>>>> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
>>>>>>> +
>>>>>>> +struct pci_mmcfg_region {
>>>>>>> +     struct list_head list;
>>>>>>> +     struct resource res;
>>>>>>> +     u64 address;
>>>>>>> +     char __iomem *virt;
>>>>>>> +     u16 segment;
>>>>>>> +     u8 start_bus;
>>>>>>> +     u8 end_bus;
>>>>>>> +     char name[PCI_MMCFG_RESOURCE_NAME_LEN];
>>>>>>> +};
>>>>>>> +
>>>>>>> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8
>>>>>>> start,
>>>>>>> u8 end,
>>>>>>> +                            phys_addr_t addr);
>>>>>>> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
>>>>>>> +
>>>>>>> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int
>>>>>>> bus);
>>>>>>> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int
>>>>>>> start,
>>>>>>> +                                                     int end, u64
>>>>>>> addr);
>>>>>>> +extern int pci_mmconfig_map_resource(struct device *dev,
>>>>>>> +     struct pci_mmcfg_region *mcfg);
>>>>>>> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region
>>>>>>> *mcfg);
>>>>>>> +extern int pci_mmconfig_enabled(void);
>>>>>>> +extern int __init pci_mmconfig_parse_table(void);
>>>>>>> +
>>>>>>> +extern struct list_head pci_mmcfg_list;
>>>>>>> +
>>>>>>> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
>>>>>>> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
>>>>>>> +
>>>>>>
>>>>>>
>>>>>>
>>>>>> With the exception of pci_mmconfig_parse_table(), nothing here is
>>>>>> ACPI-specific.  I'd like to see the PCI ECAM-related interfaces
>>>>>> (hopefully not these exact ones, but a more rational set) put in
>>>>>> something like include/linux/pci-ecam.h.
>>>>>>
>>>>>>>    #else        /* CONFIG_ACPI */
>>>>>>>    static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
>>>>>>>    static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }
>>>>>
>>>>>
>>>>>
>>>>> I can update this patch to
>>>>>    -  drop the pci_mmcfg_list handling from generic case
>>>>>    -  move common ECAM code so that it can be shared with
>>>>>       pci-host-generic.c
>>>>> if that is what you are looking for. The code will end up looking much
>>>>> simpler.
>>>>
>>>>
>>>>
>>>> I think we should ignore x86 mmconfig for now.  It is absurdly
>>>> complicated and I'm not sure it's fixable.  I *do* want to keep
>>>> drivers/acpi/pci_root.c for all ACPI host bridges, including x86,
>>>> ia64, and arm64.
>>>>
>>>> So I think we should write generic MCFG and ECAM support from scratch
>>>> for arm64.  Something like this:
>>>>
>>>>     - Add an acpi_mcfg_init(), maybe in drivers/acpi/pci_mcfg.c, to be
>>>>       called from acpi_init() to copy MCFG info to something we can
>>>>       access after __init.  This would not reserve resources, but
>>>>       probably does have to ioremap() the regions to support
>>>>       raw_pci_read().
>>>
>>>
>>>
>>> As said, ECAM and ACPI specific code was isolated in previous patch.
>>> There,
>>> I tried to leave x86 complication in arch/x86/ and extract generic
>>> functionalities to driver/pci/ecam.c as the library.
>>>
>>>>
>>>>     - Implement raw_pci_read(), which is "special" because ACPI needs it
>>>>       for PCI config access from AML.  It's supposed to be "always
>>>>       accessible" and we don't have a struct pci_bus *, so this probably
>>>>       has to use the MCFG copy and the ioremap done above.  Maybe it
>>>>       should go in the same file.  This is completely independent of
>>>>       the PCI core and PCI data structures.
>>>
>>>
>>> We were looking for the answer which would justify RAW PCI config
>>> accessors
>>> being for ARM64 world. Unfortunately, nobody was able to show real use
>>> case
>>> for ARM64. Do you see the reason we need this? Our conclusion was to
>>> leave
>>> it empty for ARM64 which in turn makes code simpler. I am not ASWG member
>>> while that was under discussion so I will ask Lorenzo to elaborate more
>>> on
>>> this.
>>>
>>>>
>>>>     - Implement arm64 pci_acpi_scan_root() that calls
>>>>       acpi_pci_root_create() with an .init_info() function that calls
>>>>       acpi_pci_root_get_mcfg_addr() to read _CBA, and if that fails,
>>>>       looks up the bus range in the MCFG copy from above.  It should
>>>>       call request_mem_region().  For a region from _CBA, it should call
>>>>       ioremap().  For regions from MCFG it can probably use the ioremap
>>>>       done by acpi_mcfg_init().
>>>
>>>
>>> Yes, Expanding .init_info() to check for _CBA is good point.
>>>
>>>>
>>>>       I know acpi_pci_root_add() calls acpi_pci_root_get_mcfg_addr()
>>>>       before calling pci_acpi_scan_root(), but I think that's wrong
>>>>       because (a) some arches, e.g., ia64, don't use ECAM and (b) _CBA
>>>>       and MCFG should be handled in the same place.
>>>>
>>>>       I know calling request_mem_region() here will probably be an
>>>>       ordering problem because the PNP0C02 driver hasn't reserved
>>>>       resources yet.  But the host bridge driver is using the region and
>>>>       it should reserve it.
>>>>
>>>>     - If we store the ECAM mapped base address in the sysdata or struct
>>>>       pci_host_bridge, the normal config accessors can use
>>>>       pci_generic_config_read() with a new generic .map_bus() function.
>>>
>>>
>>>
>>> pci_generic_config_{read|write}() is what we want to use, actually we do
>>> now, but ECAM region and sysdata association will remove ECAM region
>>> lookup
>>> step (see patch 09/15 of this series).
>>>
>>>>
>>>>       On x86, the normal config access path is:
>>>>
>>>>         pci_read(struct pci_bus *, ...)
>>>>           raw_pci_read(seg, bus#, ...)
>>>>            raw_pci_ext_ops->read(seg, bus#, ...)
>>>>              pci_mmcfg_read(seg, bus#, ...)
>>>>                pci_dev_base
>>>>                  pci_mmconfig_lookup(seg, bus#)
>>>>
>>>>       I think this is somewhat backwards because we start with a pci_bus
>>>>       pointer, so we *could* have a nice simple bus-specific accessor,
>>>>       but we throw that pointer away, so pci_mmcfg_read() has to start
>>>>       over and look up the ECAM offset from scratch, which makes it all
>>>>       unnecessarily complicated.
>>>>
>>>
>>> As you pointed out raw_pci_{read|write} make things complicated, so IMO
>>> we
>>> should either say they are absolutely necessary (and then think how to
>>> simplify it) or just use simple bus-specific accessor (patch 02/15) e.g.
>>> for
>>> ARM64.
>>
>>
>> Both raw_pci_read/write and a host controller with pci_generic_read/write
>> can
>> be done without much trouble, please see the patch I had at:
>> https://patchwork.ozlabs.org/patch/575526/
>
>
> Yes it is doable, I implemented it in the same way in one of my initial
> patch series, about year ago. I'm questioning raw accessors presence on
> per-arch basis. If it is really needed for all archs, then we definitely
> should implement it. If ARM64 does not care for it, there is no point to
> complicate it. Especially, I mean all kind of PCI config space quirk we will
> need to handle right after this patch got merged, see:
> [PATCH V5 13/15] pci, acpi: Match PCI config space accessors against
> platfrom specific quirks.
>
> and
>
> https://lkml.org/lkml/2016/2/9/627
> https://lkml.org/lkml/2016/2/8/967
>
> Giving these quirks, raw accessors are not so easy.

The whole quirk handling infrastructure seems to be an overkill to me.
I will leave it to maintainers to comment further.

>>
>> I have been looking at Bjorn's suggestions and trying to see if I can
>> update
>> my MCFG patch taking care of them. I will post an updated patch set soon,
>> unless you want to take this up.
>>
>
> Yes, I want to post next version and keep this patch set together, if you
> and Bjorn are okay. I am feeling that my previous patch set is close to what
> Bjorn suggested, modulo the way we keep MCFG regions. Lets discuss it here,
> then I will post it as next version. I am looking forward to hear Bjorn's
> comment on my previous patch set.

I have been looking thru the code, and I have a reasonable implementation
which updates this one patch. This pulls in common code from pci-host-generic.c
as well.  I will post it by next week and you can decide whether to use it
to update your patchset.

Thanks,
JC.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jayachandran C March 17, 2016, 8:18 p.m. UTC | #13
Hi Bjorn,

Here is a new patchset for the ACPI PCI controller driver based on the
earlier discussion[1].

The first two patches in the patchset implements pci/ecam.c for generic
config space access and uses it in pci-host-generic.c and related files.

The third patch implements the ACPI PCI host driver using the same ecam
access functions. The fourth patch adds the implementation of raw
operations.

I have not used the pci_mmcfg_list or the region definitions from x86,
but have used a much simpler approach here.

This should apply cleanly on top of the current pci next tree, and
can be reviewed as a patchset.  To use it on ARM64, we need to pull
in about 7 patches more from Tomasz patchset that fixes various
issues (like stub code in arm64 pci.c, ACPI companion setup,
domain number assignment, IO resources fixup etc.).

If you are okay with this approach, I will work with Tomasz and
post the full patchset.

This has been tested on qemu with OVMF for the ACPI part and with
device tree for pci-host-generic code.

Thanks,
JC.

[1] https://lkml.org/lkml/2016/3/3/921

Jayachandran C (4):
  PCI: Provide generic ECAM mapping functions
  PCI: generic,thunder: Use generic config functions
  ACPI: PCI: Add generic PCI host controller
  ACPI: PCI: Add raw_pci_read/write operations

 drivers/acpi/Kconfig                |   9 +
 drivers/acpi/Makefile               |   1 +
 drivers/acpi/pci_gen_host.c         | 334 ++++++++++++++++++++++++++++++++++++
 drivers/pci/Kconfig                 |   3 +
 drivers/pci/Makefile                |   2 +
 drivers/pci/ecam.c                  | 127 ++++++++++++++
 drivers/pci/host/Kconfig            |   1 +
 drivers/pci/host/pci-host-common.c  |  68 ++++----
 drivers/pci/host/pci-host-common.h  |  25 +--
 drivers/pci/host/pci-host-generic.c |  51 +-----
 drivers/pci/host/pci-thunder-ecam.c |  33 +---
 drivers/pci/host/pci-thunder-pem.c  |  41 ++---
 include/linux/pci.h                 |  10 ++
 13 files changed, 560 insertions(+), 145 deletions(-)
 create mode 100644 drivers/acpi/pci_gen_host.c
 create mode 100644 drivers/pci/ecam.c
Jayachandran C March 18, 2016, 5:47 p.m. UTC | #14
On Fri, Mar 18, 2016 at 1:48 AM, Jayachandran C <jchandra@broadcom.com> wrote:
> Hi Bjorn,
>
> Here is a new patchset for the ACPI PCI controller driver based on the
> earlier discussion[1].
>
> The first two patches in the patchset implements pci/ecam.c for generic
> config space access and uses it in pci-host-generic.c and related files.
>
> The third patch implements the ACPI PCI host driver using the same ecam
> access functions. The fourth patch adds the implementation of raw
> operations.
>
> I have not used the pci_mmcfg_list or the region definitions from x86,
> but have used a much simpler approach here.
>
> This should apply cleanly on top of the current pci next tree, and
> can be reviewed as a patchset.  To use it on ARM64, we need to pull
> in about 7 patches more from Tomasz patchset that fixes various
> issues (like stub code in arm64 pci.c, ACPI companion setup,
> domain number assignment, IO resources fixup etc.).
>
> If you are okay with this approach, I will work with Tomasz and
> post the full patchset.
>
> This has been tested on qemu with OVMF for the ACPI part and with
> device tree for pci-host-generic code.

The full patchset is available at https://github.com/jchandra-brcm/linux.git on
branch arm64-acpi-pci, if anyone wants to try it.

Comments, suggestions and testing would be welcome.

Thanks,
JC.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gabriele Paoloni March 23, 2016, 10:22 a.m. UTC | #15
Hi Jayachandran

> -----Original Message-----

> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-

> owner@vger.kernel.org] On Behalf Of Jayachandran C

> Sent: 18 March 2016 17:48

> To: Bjorn Helgaas; Tomasz Nowicki; rafael@kernel.org

> Cc: Jayachandran C; Arnd Bergmann; Will Deacon; Catalin Marinas; Hanjun

> Guo; Lorenzo Pieralisi; okaya@codeaurora.org;

> jiang.liu@linux.intel.com; Stefano Stabellini;

> robert.richter@caviumnetworks.com; Marcin Wojtas; Liviu.Dudau@arm.com;

> David Daney; Wangyijing; Suravee.Suthikulpanit@amd.com;

> msalter@redhat.com; linux-pci@vger.kernel.org; linux-arm-

> kernel@lists.infradead.org; linux-acpi@vger.kernel.org; linux-

> kernel@vger.kernel.org; linaro-acpi@lists.linaro.org; Jon Masters

> Subject: Re: [RFC PATCH 0/4] ACPI based PCI host driver with generic

> ECAM

> 

> On Fri, Mar 18, 2016 at 1:48 AM, Jayachandran C <jchandra@broadcom.com>

> wrote:

> > Hi Bjorn,

> >

> > Here is a new patchset for the ACPI PCI controller driver based on

> the

> > earlier discussion[1].

> >

> > The first two patches in the patchset implements pci/ecam.c for

> generic

> > config space access and uses it in pci-host-generic.c and related

> files.

> >

> > The third patch implements the ACPI PCI host driver using the same

> ecam

> > access functions. The fourth patch adds the implementation of raw

> > operations.

> >

> > I have not used the pci_mmcfg_list or the region definitions from

> x86,

> > but have used a much simpler approach here.

> >

> > This should apply cleanly on top of the current pci next tree, and

> > can be reviewed as a patchset.  To use it on ARM64, we need to pull

> > in about 7 patches more from Tomasz patchset that fixes various

> > issues (like stub code in arm64 pci.c, ACPI companion setup,

> > domain number assignment, IO resources fixup etc.).

> >

> > If you are okay with this approach, I will work with Tomasz and

> > post the full patchset.

> >

> > This has been tested on qemu with OVMF for the ACPI part and with

> > device tree for pci-host-generic code.

> 

> The full patchset is available at https://github.com/jchandra-

> brcm/linux.git on

> branch arm64-acpi-pci, if anyone wants to try it.


I had a look at your patchset and also in your git repo at the other
patches that you ported over from Tomasz; it seems that now we miss
a quirk mechanism to enable controller that are not fully ECAM.

This was provided before by Tomasz in:
https://lkml.org/lkml/2016/2/16/410

I think we should put something like that back in...

Thanks

Gab

> 

> Comments, suggestions and testing would be welcome.

> 

> Thanks,

> JC.
Sinan Kaya March 28, 2016, 1:42 p.m. UTC | #16
Hi,

On 3/23/2016 6:22 AM, Gabriele Paoloni wrote:
> I had a look at your patchset and also in your git repo at the other
> patches that you ported over from Tomasz; it seems that now we miss
> a quirk mechanism to enable controller that are not fully ECAM.
> 
> This was provided before by Tomasz in:
> https://lkml.org/lkml/2016/2/16/410
> 
> I think we should put something like that back in...
> 
> Thanks
> 
> Gab

I was requested to test your patchset. I'll need this mechanism before
I can start.

Sinan
Jayachandran C March 28, 2016, 6:01 p.m. UTC | #17
On Mon, Mar 28, 2016 at 7:12 PM, Sinan Kaya <okaya@codeaurora.org> wrote:
> Hi,
>
> On 3/23/2016 6:22 AM, Gabriele Paoloni wrote:
>> I had a look at your patchset and also in your git repo at the other
>> patches that you ported over from Tomasz; it seems that now we miss
>> a quirk mechanism to enable controller that are not fully ECAM.
>>
>> This was provided before by Tomasz in:
>> https://lkml.org/lkml/2016/2/16/410
>>
>> I think we should put something like that back in...

Like Tomasz mentioned in his mail, his approach does not work
for raw operations. I have added raw operations in may patchset,
so we have to come up with a new approach or decide that raw
operations can be dropped.

I am waiting for the overall acceptance of the patch set before
going further along this path

>>
>> Thanks
>>
>> Gab
>
> I was requested to test your patchset. I'll need this mechanism before
> I can start.

Please see above, we will need to look at the quirks again.

> Sinan

JC.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tomasz Nowicki April 5, 2016, 2:11 p.m. UTC | #18
On 09.03.2016 10:13, Tomasz Nowicki wrote:
> Hi Bjorn,
>
> Thanks for your pointers! See my comments inline. Aslo, can you please
> have a look at my previous patch set v4 and check how many of your
> comments are already addressed there. We may want to back to it then.
>
> https://lkml.org/lkml/2016/2/4/646
> Especially patches [0-6] which handle MMCONFIG refactoring.
>
>
> On 05.03.2016 05:14, Bjorn Helgaas wrote:
>> On Fri, Mar 04, 2016 at 02:05:56PM +0530, Jayachandran Chandrashekaran
>> Nair wrote:
>>> On Fri, Mar 4, 2016 at 4:21 AM, Bjorn Helgaas <helgaas@kernel.org>
>>> wrote:
>>>> Hi Tomasz, Jayachandran, et al,
>>>>
>>>> On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
>>>>> From: Jayachandran C <jchandra@broadcom.com>
>>>>>
>>>>> Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
>>>>> to share the API and code with ARM64 later. The corresponding
>>>>> declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
>>>>>
>>>>> As a part of this we introduce three functions that can be
>>>>> implemented by the arch code: pci_mmconfig_map_resource() to map a
>>>>> mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
>>>>> unmap and pci_mmconfig_enabled to see if the arch setup of
>>>>> mcfg entries was successful. We also provide weak implementations
>>>>> of these, which will be used from ARM64. On x86, we retain the
>>>>> old logic by providing platform specific implementation.
>>>>>
>>>>> This patch is purely rearranging code, it should not have any
>>>>> impact on the logic of MCFG parsing or list handling.
>>>>
>>>> I definitely want to figure out how to make this work well on ARM64.
>>>> I need to ponder this some more, so these are just some initial
>>>> thoughts.
>>>>
>>>> My first impression is that (a) the x86 MCFG code is an unmitigated
>>>> disaster, and (b) we're trying a little too hard to make that mess
>>>> generic.  I think we might be better served if we came up with some
>>>> cleaner, more generic code that we can use for ARM64 today, and
>>>> migrate x86 toward that over time.
>>>>
>>>> My concern is that if we elevate the current x86 code to be
>>>> "arch-independent", we will be perpetuating some interfaces and
>>>> designs that shouldn't be allowed to escape arch/x86.
>>>
>>> I think the major decision is whether to maintain the pci_mmcfg_list
>>> for all architectures or not. My initial plan was not to do this because
>>> of the mess (basically the ECAM region info should be attached to
>>> the pci root and not maintained in a separate list that needs locking),
>>> The patch I posted initially https://patchwork.ozlabs.org/patch/553464/
>>> had a much simpler way of handling the MCFG table without using
>>> the list.
>>
>> I agree that ECAM info should be attached to the PCI host controller.
>> That should simplify locking and hot-add and hot-removal of host
>> controllers.
>>
>> I think pci_mmcfg_list is an implementation detail that may not need
>> to be generic.  I certainly don't think it needs to be part of the
>> interface.
>>
>>> In x86 case it is not feasible to remove using the pci_mmcfg_list.
>>> The only use of it outside is in xen that can be fixed up.
>>>
>>>> Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
>>>> ACPI-specific, and could potentially be used for non-ACPI bridges that
>>>> support ECAM.  I'd like to see that sort of code moved to a new file
>>>> like drivers/pci/ecam.c.
>>>>
>>>> There's a little bit of overlap here with the ECAM code in
>>>> pci-host-generic.c.  I'm not sure whether or how to include that, but
>>>> it's a very good example of how simple this *should* be: probe the
>>>> host bridge, discover the ECAM region, request the region, ioremap it,
>>>> done.
>>>
>>> I had a similar approach in my initial patchset, please see the patch
>>> above. The resource for ECAM is mapped similar to the the way
>>> pci-host-generic.c handled it. An additional step I could do was to
>>> move the common code (ioremap and mapbus) into a common
>>> file and share the code with pci-host-generic.c
>>>
>>>>> diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
>>>>> new file mode 100644
>>>>> index 0000000..ea84365
>>>>> --- /dev/null
>>>>> +++ b/drivers/acpi/pci_mcfg.c
>>>>> ...
>>>>> +int __weak pci_mmconfig_map_resource(struct device *dev,
>>>>> +     struct pci_mmcfg_region *mcfg)
>>>>> +{
>>>>> +     struct resource *tmp;
>>>>> +     void __iomem *vaddr;
>>>>> +
>>>>> +     tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
>>>>> +     if (tmp) {
>>>>> +             dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
>>>>> +                     &mcfg->res, tmp->name, tmp);
>>>>> +             return -EBUSY;
>>>>> +     }
>>>>
>>>> I think this is a mistake in the x86 MCFG support that we should not
>>>> carry over to a generic implementation.  We should not use the MCFG
>>>> table for resource reservation because MCFG is not defined by the ACPI
>>>> spec and an OS need not include support for it.  The platform must
>>>> indicate in some other, more generic way, that ECAM space is reserved.
>>>> This probably means ECAM space should be declared in a PNP0C02 _CRS
>>>> method (see the PCI Firmware Spec r3.0, sec 4.1.2, note 2).
>>>>
>>>> We might need some kind of x86-specific quirk that does this, or a
>>>> pcibios hook or something here; I just don't think it should be
>>>> generic.
>>>>
>>>>> +int __init pci_mmconfig_parse_table(void)
>>>>> +{
>>>>> +     return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
>>>>> +}
>>>>
>>>> I don't like the fact that we parse the entire MCFG table at once
>>>> here.  I think we should look for the information we need when we are
>>>> claiming a PCI host bridge, e.g., in acpi_pci_root_add().  This might
>>>> not be a great fit for the way ACPI table management works, but I
>>>> think it's better to do things on-demand rather than just-in-case.
>>>
>>> There is an overhead of looking up this table, and the information
>>> available there is very limited (i.e, segment, start_bus, end_bus
>>> and address). My approach in the above patch is to save this info
>>> into an array at boot time and avoid multiple lookups.
>>
>> We need to look up MCFG info once per host bridge, so I don't think
>> there's any performance issue here.  But we do use acpi_table_parse(),
>> which is __init, and *that* is a reason why we might need to parse the
>> entire MCFG at boot-time.  But this is the least of our worries in any
>> case.
>>
>>>>> diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
>>>>> index 89ab057..e9450ef 100644
>>>>> --- a/include/linux/pci-acpi.h
>>>>> +++ b/include/linux/pci-acpi.h
>>>>> @@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
>>>>>   #define RESET_DELAY_DSM              0x08
>>>>>   #define FUNCTION_DELAY_DSM   0x09
>>>>>
>>>>> +/* common API to maintain list of MCFG regions */
>>>>> +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
>>>>> +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
>>>>> +
>>>>> +struct pci_mmcfg_region {
>>>>> +     struct list_head list;
>>>>> +     struct resource res;
>>>>> +     u64 address;
>>>>> +     char __iomem *virt;
>>>>> +     u16 segment;
>>>>> +     u8 start_bus;
>>>>> +     u8 end_bus;
>>>>> +     char name[PCI_MMCFG_RESOURCE_NAME_LEN];
>>>>> +};
>>>>> +
>>>>> +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8
>>>>> start, u8 end,
>>>>> +                            phys_addr_t addr);
>>>>> +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
>>>>> +
>>>>> +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment,
>>>>> int bus);
>>>>> +extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int
>>>>> start,
>>>>> +                                                     int end, u64
>>>>> addr);
>>>>> +extern int pci_mmconfig_map_resource(struct device *dev,
>>>>> +     struct pci_mmcfg_region *mcfg);
>>>>> +extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region
>>>>> *mcfg);
>>>>> +extern int pci_mmconfig_enabled(void);
>>>>> +extern int __init pci_mmconfig_parse_table(void);
>>>>> +
>>>>> +extern struct list_head pci_mmcfg_list;
>>>>> +
>>>>> +#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
>>>>> +#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
>>>>> +
>>>>
>>>> With the exception of pci_mmconfig_parse_table(), nothing here is
>>>> ACPI-specific.  I'd like to see the PCI ECAM-related interfaces
>>>> (hopefully not these exact ones, but a more rational set) put in
>>>> something like include/linux/pci-ecam.h.
>>>>
>>>>>   #else        /* CONFIG_ACPI */
>>>>>   static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
>>>>>   static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }
>>>
>>> I can update this patch to
>>>   -  drop the pci_mmcfg_list handling from generic case
>>>   -  move common ECAM code so that it can be shared with
>>>      pci-host-generic.c
>>> if that is what you are looking for. The code will end up looking much
>>> simpler.
>>
>> I think we should ignore x86 mmconfig for now.  It is absurdly
>> complicated and I'm not sure it's fixable.  I *do* want to keep
>> drivers/acpi/pci_root.c for all ACPI host bridges, including x86,
>> ia64, and arm64.
>>
>> So I think we should write generic MCFG and ECAM support from scratch
>> for arm64.  Something like this:
>>
>>    - Add an acpi_mcfg_init(), maybe in drivers/acpi/pci_mcfg.c, to be
>>      called from acpi_init() to copy MCFG info to something we can
>>      access after __init.  This would not reserve resources, but
>>      probably does have to ioremap() the regions to support
>>      raw_pci_read().
>
> As said, ECAM and ACPI specific code was isolated in previous patch.
> There, I tried to leave x86 complication in arch/x86/ and extract
> generic functionalities to driver/pci/ecam.c as the library.
>
>>
>>    - Implement raw_pci_read(), which is "special" because ACPI needs it
>>      for PCI config access from AML.  It's supposed to be "always
>>      accessible" and we don't have a struct pci_bus *, so this probably
>>      has to use the MCFG copy and the ioremap done above.  Maybe it
>>      should go in the same file.  This is completely independent of
>>      the PCI core and PCI data structures.
> We were looking for the answer which would justify RAW PCI config
> accessors being for ARM64 world. Unfortunately, nobody was able to show
> real use case for ARM64. Do you see the reason we need this? Our
> conclusion was to leave it empty for ARM64 which in turn makes code
> simpler. I am not ASWG member while that was under discussion so I will
> ask Lorenzo to elaborate more on this.
>
>>
>>    - Implement arm64 pci_acpi_scan_root() that calls
>>      acpi_pci_root_create() with an .init_info() function that calls
>>      acpi_pci_root_get_mcfg_addr() to read _CBA, and if that fails,
>>      looks up the bus range in the MCFG copy from above.  It should
>>      call request_mem_region().  For a region from _CBA, it should call
>>      ioremap().  For regions from MCFG it can probably use the ioremap
>>      done by acpi_mcfg_init().
> Yes, Expanding .init_info() to check for _CBA is good point.
>
>>
>>      I know acpi_pci_root_add() calls acpi_pci_root_get_mcfg_addr()
>>      before calling pci_acpi_scan_root(), but I think that's wrong
>>      because (a) some arches, e.g., ia64, don't use ECAM and (b) _CBA
>>      and MCFG should be handled in the same place.
>>
>>      I know calling request_mem_region() here will probably be an
>>      ordering problem because the PNP0C02 driver hasn't reserved
>>      resources yet.  But the host bridge driver is using the region and
>>      it should reserve it.
>>
>>    - If we store the ECAM mapped base address in the sysdata or struct
>>      pci_host_bridge, the normal config accessors can use
>>      pci_generic_config_read() with a new generic .map_bus() function.
>
> pci_generic_config_{read|write}() is what we want to use, actually we do
> now, but ECAM region and sysdata association will remove ECAM region
> lookup step (see patch 09/15 of this series).
>
>>
>>      On x86, the normal config access path is:
>>
>>        pci_read(struct pci_bus *, ...)
>>          raw_pci_read(seg, bus#, ...)
>>       raw_pci_ext_ops->read(seg, bus#, ...)
>>         pci_mmcfg_read(seg, bus#, ...)
>>           pci_dev_base
>>             pci_mmconfig_lookup(seg, bus#)
>>
>>      I think this is somewhat backwards because we start with a pci_bus
>>      pointer, so we *could* have a nice simple bus-specific accessor,
>>      but we throw that pointer away, so pci_mmcfg_read() has to start
>>      over and look up the ECAM offset from scratch, which makes it all
>>      unnecessarily complicated.
>>
>
> As you pointed out raw_pci_{read|write} make things complicated, so IMO
> we should either say they are absolutely necessary (and then think how
> to simplify it) or just use simple bus-specific accessor (patch 02/15)
> e.g. for ARM64.
>
> Any comments appreciated.
>

Hi Bjorn,

Kindly reminder. I would like to move on with this patch set. Can you 
please comments on it so that we could decide which way to go.

Regards,
Tomasz
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas April 5, 2016, 4:41 p.m. UTC | #19
Hi Tomasz,

On Tue, Apr 05, 2016 at 04:11:55PM +0200, Tomasz Nowicki wrote:
> On 09.03.2016 10:13, Tomasz Nowicki wrote:
> >Hi Bjorn,
> >
> >Thanks for your pointers! See my comments inline. Aslo, can you please
> >have a look at my previous patch set v4 and check how many of your
> >comments are already addressed there. We may want to back to it then.
> >
> >https://lkml.org/lkml/2016/2/4/646
> >Especially patches [0-6] which handle MMCONFIG refactoring.
> >
> >
> >On 05.03.2016 05:14, Bjorn Helgaas wrote:
> >>On Fri, Mar 04, 2016 at 02:05:56PM +0530, Jayachandran Chandrashekaran
> >>Nair wrote:
> >>>On Fri, Mar 4, 2016 at 4:21 AM, Bjorn Helgaas <helgaas@kernel.org>
> >>>wrote:
> >>>>Hi Tomasz, Jayachandran, et al,
> >>>>
> >>>>On Tue, Feb 16, 2016 at 02:53:31PM +0100, Tomasz Nowicki wrote:
> >>>>>From: Jayachandran C <jchandra@broadcom.com>
> >>>>>
> >>>>>Move pci_mmcfg_list handling to a drivers/acpi/pci_mcfg.c. This is
> >>>>>to share the API and code with ARM64 later. The corresponding
> >>>>>declarations are moved from asm/pci_x86.h to linux/pci-acpi.h
> >>>>>
> >>>>>As a part of this we introduce three functions that can be
> >>>>>implemented by the arch code: pci_mmconfig_map_resource() to map a
> >>>>>mcfg entry, pci_mmconfig_unmap_resource to do the corresponding
> >>>>>unmap and pci_mmconfig_enabled to see if the arch setup of
> >>>>>mcfg entries was successful. We also provide weak implementations
> >>>>>of these, which will be used from ARM64. On x86, we retain the
> >>>>>old logic by providing platform specific implementation.
> >>>>>
> >>>>>This patch is purely rearranging code, it should not have any
> >>>>>impact on the logic of MCFG parsing or list handling.
> >>>>
> >>>>I definitely want to figure out how to make this work well on ARM64.
> >>>>I need to ponder this some more, so these are just some initial
> >>>>thoughts.
> >>>>
> >>>>My first impression is that (a) the x86 MCFG code is an unmitigated
> >>>>disaster, and (b) we're trying a little too hard to make that mess
> >>>>generic.  I think we might be better served if we came up with some
> >>>>cleaner, more generic code that we can use for ARM64 today, and
> >>>>migrate x86 toward that over time.
> >>>>
> >>>>My concern is that if we elevate the current x86 code to be
> >>>>"arch-independent", we will be perpetuating some interfaces and
> >>>>designs that shouldn't be allowed to escape arch/x86.
> >>>
> >>>I think the major decision is whether to maintain the pci_mmcfg_list
> >>>for all architectures or not. My initial plan was not to do this because
> >>>of the mess (basically the ECAM region info should be attached to
> >>>the pci root and not maintained in a separate list that needs locking),
> >>>The patch I posted initially https://patchwork.ozlabs.org/patch/553464/
> >>>had a much simpler way of handling the MCFG table without using
> >>>the list.
> >>
> >>I agree that ECAM info should be attached to the PCI host controller.
> >>That should simplify locking and hot-add and hot-removal of host
> >>controllers.
> >>
> >>I think pci_mmcfg_list is an implementation detail that may not need
> >>to be generic.  I certainly don't think it needs to be part of the
> >>interface.
> >>
> >>>In x86 case it is not feasible to remove using the pci_mmcfg_list.
> >>>The only use of it outside is in xen that can be fixed up.
> >>>
> >>>>Some of the code that moved to drivers/acpi/pci_mcfg.c is not really
> >>>>ACPI-specific, and could potentially be used for non-ACPI bridges that
> >>>>support ECAM.  I'd like to see that sort of code moved to a new file
> >>>>like drivers/pci/ecam.c.
> >>>>
> >>>>There's a little bit of overlap here with the ECAM code in
> >>>>pci-host-generic.c.  I'm not sure whether or how to include that, but
> >>>>it's a very good example of how simple this *should* be: probe the
> >>>>host bridge, discover the ECAM region, request the region, ioremap it,
> >>>>done.
> >>>
> >>>I had a similar approach in my initial patchset, please see the patch
> >>>above. The resource for ECAM is mapped similar to the the way
> >>>pci-host-generic.c handled it. An additional step I could do was to
> >>>move the common code (ioremap and mapbus) into a common
> >>>file and share the code with pci-host-generic.c
> >>>
> >>>>>diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
> >>>>>new file mode 100644
> >>>>>index 0000000..ea84365
> >>>>>--- /dev/null
> >>>>>+++ b/drivers/acpi/pci_mcfg.c
> >>>>>...
> >>>>>+int __weak pci_mmconfig_map_resource(struct device *dev,
> >>>>>+     struct pci_mmcfg_region *mcfg)
> >>>>>+{
> >>>>>+     struct resource *tmp;
> >>>>>+     void __iomem *vaddr;
> >>>>>+
> >>>>>+     tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
> >>>>>+     if (tmp) {
> >>>>>+             dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
> >>>>>+                     &mcfg->res, tmp->name, tmp);
> >>>>>+             return -EBUSY;
> >>>>>+     }
> >>>>
> >>>>I think this is a mistake in the x86 MCFG support that we should not
> >>>>carry over to a generic implementation.  We should not use the MCFG
> >>>>table for resource reservation because MCFG is not defined by the ACPI
> >>>>spec and an OS need not include support for it.  The platform must
> >>>>indicate in some other, more generic way, that ECAM space is reserved.
> >>>>This probably means ECAM space should be declared in a PNP0C02 _CRS
> >>>>method (see the PCI Firmware Spec r3.0, sec 4.1.2, note 2).
> >>>>
> >>>>We might need some kind of x86-specific quirk that does this, or a
> >>>>pcibios hook or something here; I just don't think it should be
> >>>>generic.
> >>>>
> >>>>>+int __init pci_mmconfig_parse_table(void)
> >>>>>+{
> >>>>>+     return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
> >>>>>+}
> >>>>
> >>>>I don't like the fact that we parse the entire MCFG table at once
> >>>>here.  I think we should look for the information we need when we are
> >>>>claiming a PCI host bridge, e.g., in acpi_pci_root_add().  This might
> >>>>not be a great fit for the way ACPI table management works, but I
> >>>>think it's better to do things on-demand rather than just-in-case.
> >>>
> >>>There is an overhead of looking up this table, and the information
> >>>available there is very limited (i.e, segment, start_bus, end_bus
> >>>and address). My approach in the above patch is to save this info
> >>>into an array at boot time and avoid multiple lookups.
> >>
> >>We need to look up MCFG info once per host bridge, so I don't think
> >>there's any performance issue here.  But we do use acpi_table_parse(),
> >>which is __init, and *that* is a reason why we might need to parse the
> >>entire MCFG at boot-time.  But this is the least of our worries in any
> >>case.
> >>
> >>>>>diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
> >>>>>index 89ab057..e9450ef 100644
> >>>>>--- a/include/linux/pci-acpi.h
> >>>>>+++ b/include/linux/pci-acpi.h
> >>>>>@@ -106,6 +106,39 @@ extern const u8 pci_acpi_dsm_uuid[];
> >>>>>  #define RESET_DELAY_DSM              0x08
> >>>>>  #define FUNCTION_DELAY_DSM   0x09
> >>>>>
> >>>>>+/* common API to maintain list of MCFG regions */
> >>>>>+/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
> >>>>>+#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
> >>>>>+
> >>>>>+struct pci_mmcfg_region {
> >>>>>+     struct list_head list;
> >>>>>+     struct resource res;
> >>>>>+     u64 address;
> >>>>>+     char __iomem *virt;
> >>>>>+     u16 segment;
> >>>>>+     u8 start_bus;
> >>>>>+     u8 end_bus;
> >>>>>+     char name[PCI_MMCFG_RESOURCE_NAME_LEN];
> >>>>>+};
> >>>>>+
> >>>>>+extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8
> >>>>>start, u8 end,
> >>>>>+                            phys_addr_t addr);
> >>>>>+extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
> >>>>>+
> >>>>>+extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment,
> >>>>>int bus);
> >>>>>+extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int
> >>>>>start,
> >>>>>+                                                     int end, u64
> >>>>>addr);
> >>>>>+extern int pci_mmconfig_map_resource(struct device *dev,
> >>>>>+     struct pci_mmcfg_region *mcfg);
> >>>>>+extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region
> >>>>>*mcfg);
> >>>>>+extern int pci_mmconfig_enabled(void);
> >>>>>+extern int __init pci_mmconfig_parse_table(void);
> >>>>>+
> >>>>>+extern struct list_head pci_mmcfg_list;
> >>>>>+
> >>>>>+#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
> >>>>>+#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
> >>>>>+
> >>>>
> >>>>With the exception of pci_mmconfig_parse_table(), nothing here is
> >>>>ACPI-specific.  I'd like to see the PCI ECAM-related interfaces
> >>>>(hopefully not these exact ones, but a more rational set) put in
> >>>>something like include/linux/pci-ecam.h.
> >>>>
> >>>>>  #else        /* CONFIG_ACPI */
> >>>>>  static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
> >>>>>  static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }
> >>>
> >>>I can update this patch to
> >>>  -  drop the pci_mmcfg_list handling from generic case
> >>>  -  move common ECAM code so that it can be shared with
> >>>     pci-host-generic.c
> >>>if that is what you are looking for. The code will end up looking much
> >>>simpler.
> >>
> >>I think we should ignore x86 mmconfig for now.  It is absurdly
> >>complicated and I'm not sure it's fixable.  I *do* want to keep
> >>drivers/acpi/pci_root.c for all ACPI host bridges, including x86,
> >>ia64, and arm64.
> >>
> >>So I think we should write generic MCFG and ECAM support from scratch
> >>for arm64.  Something like this:
> >>
> >>   - Add an acpi_mcfg_init(), maybe in drivers/acpi/pci_mcfg.c, to be
> >>     called from acpi_init() to copy MCFG info to something we can
> >>     access after __init.  This would not reserve resources, but
> >>     probably does have to ioremap() the regions to support
> >>     raw_pci_read().
> >
> >As said, ECAM and ACPI specific code was isolated in previous patch.
> >There, I tried to leave x86 complication in arch/x86/ and extract
> >generic functionalities to driver/pci/ecam.c as the library.
> >
> >>
> >>   - Implement raw_pci_read(), which is "special" because ACPI needs it
> >>     for PCI config access from AML.  It's supposed to be "always
> >>     accessible" and we don't have a struct pci_bus *, so this probably
> >>     has to use the MCFG copy and the ioremap done above.  Maybe it
> >>     should go in the same file.  This is completely independent of
> >>     the PCI core and PCI data structures.
> >We were looking for the answer which would justify RAW PCI config
> >accessors being for ARM64 world. Unfortunately, nobody was able to show
> >real use case for ARM64. Do you see the reason we need this? Our
> >conclusion was to leave it empty for ARM64 which in turn makes code
> >simpler. I am not ASWG member while that was under discussion so I will
> >ask Lorenzo to elaborate more on this.
> >
> >>
> >>   - Implement arm64 pci_acpi_scan_root() that calls
> >>     acpi_pci_root_create() with an .init_info() function that calls
> >>     acpi_pci_root_get_mcfg_addr() to read _CBA, and if that fails,
> >>     looks up the bus range in the MCFG copy from above.  It should
> >>     call request_mem_region().  For a region from _CBA, it should call
> >>     ioremap().  For regions from MCFG it can probably use the ioremap
> >>     done by acpi_mcfg_init().
> >Yes, Expanding .init_info() to check for _CBA is good point.
> >
> >>
> >>     I know acpi_pci_root_add() calls acpi_pci_root_get_mcfg_addr()
> >>     before calling pci_acpi_scan_root(), but I think that's wrong
> >>     because (a) some arches, e.g., ia64, don't use ECAM and (b) _CBA
> >>     and MCFG should be handled in the same place.
> >>
> >>     I know calling request_mem_region() here will probably be an
> >>     ordering problem because the PNP0C02 driver hasn't reserved
> >>     resources yet.  But the host bridge driver is using the region and
> >>     it should reserve it.
> >>
> >>   - If we store the ECAM mapped base address in the sysdata or struct
> >>     pci_host_bridge, the normal config accessors can use
> >>     pci_generic_config_read() with a new generic .map_bus() function.
> >
> >pci_generic_config_{read|write}() is what we want to use, actually we do
> >now, but ECAM region and sysdata association will remove ECAM region
> >lookup step (see patch 09/15 of this series).
> >
> >>
> >>     On x86, the normal config access path is:
> >>
> >>       pci_read(struct pci_bus *, ...)
> >>         raw_pci_read(seg, bus#, ...)
> >>      raw_pci_ext_ops->read(seg, bus#, ...)
> >>        pci_mmcfg_read(seg, bus#, ...)
> >>          pci_dev_base
> >>            pci_mmconfig_lookup(seg, bus#)
> >>
> >>     I think this is somewhat backwards because we start with a pci_bus
> >>     pointer, so we *could* have a nice simple bus-specific accessor,
> >>     but we throw that pointer away, so pci_mmcfg_read() has to start
> >>     over and look up the ECAM offset from scratch, which makes it all
> >>     unnecessarily complicated.
> >>
> >
> >As you pointed out raw_pci_{read|write} make things complicated, so IMO
> >we should either say they are absolutely necessary (and then think how
> >to simplify it) or just use simple bus-specific accessor (patch 02/15)
> >e.g. for ARM64.
> >
> >Any comments appreciated.
> 
> Kindly reminder. I would like to move on with this patch set. Can
> you please comments on it so that we could decide which way to go.

Can you repost your current proposal with a version number higher than
any previous ones?  It's OK if the content is the same as v4; I just
think it's confusing if we resurrect v4 and have to follow discussion
from v3 to v4 to v5 and back to v4.  The archives would be a bit of a
muddle.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tomasz Nowicki April 5, 2016, 6:07 p.m. UTC | #20
Hi Bjorn,

On 05.04.2016 18:41, Bjorn Helgaas wrote:
> Hi Tomasz,
>

[...]

>>>>
>>>
>>> As you pointed out raw_pci_{read|write} make things complicated, so IMO
>>> we should either say they are absolutely necessary (and then think how
>>> to simplify it) or just use simple bus-specific accessor (patch 02/15)
>>> e.g. for ARM64.
>>>
>>> Any comments appreciated.
>>
>> Kindly reminder. I would like to move on with this patch set. Can
>> you please comments on it so that we could decide which way to go.
>
> Can you repost your current proposal with a version number higher than
> any previous ones?  It's OK if the content is the same as v4; I just
> think it's confusing if we resurrect v4 and have to follow discussion
> from v3 to v4 to v5 and back to v4.  The archives would be a bit of a
> muddle.
>

Sure I will repost ASAP.

Thanks!
Tomasz
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jayachandran C April 5, 2016, 6:50 p.m. UTC | #21
Hi Bjorn,

On Tue, Apr 5, 2016 at 10:11 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> Hi Tomasz,
>
> On Tue, Apr 05, 2016 at 04:11:55PM +0200, Tomasz Nowicki wrote:
> > On 09.03.2016 10:13, Tomasz Nowicki wrote:
> > >Hi Bjorn,
> > >
> > >Thanks for your pointers! See my comments inline. Aslo, can you please
> > >have a look at my previous patch set v4 and check how many of your
> > >comments are already addressed there. We may want to back to it then.
> > >
> > >https://lkml.org/lkml/2016/2/4/646
> > >Especially patches [0-6] which handle MMCONFIG refactoring.
> > >
> > >
> > >On 05.03.2016 05:14, Bjorn Helgaas wrote:
> [...]
> > >
> > >As you pointed out raw_pci_{read|write} make things complicated, so IMO
> > >we should either say they are absolutely necessary (and then think how
> > >to simplify it) or just use simple bus-specific accessor (patch 02/15)
> > >e.g. for ARM64.
> > >
> > >Any comments appreciated.
>
> > Kindly reminder. I would like to move on with this patch set. Can
> > you please comments on it so that we could decide which way to go.
>
> Can you repost your current proposal with a version number higher than
> any previous ones?  It's OK if the content is the same as v4; I just
> think it's confusing if we resurrect v4 and have to follow discussion
> from v3 to v4 to v5 and back to v4.  The archives would be a bit of a
> muddle.

I had posted a patchset based on your suggestions in this thread
https://lkml.org/lkml/2016/3/17/621

Would appreciate any comments on that. Like I said in the earlier
mail, if this is a reasonable approach, I can combine this with
Tomasz patchset to provide the full patchset for ACPI support.

Thanks,
JC.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 46873fb..7824626 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -122,33 +122,11 @@  extern int pci_legacy_init(void);
 extern void pcibios_fixup_irqs(void);
 
 /* pci-mmconfig.c */
-
-/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
-#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
-
-struct pci_mmcfg_region {
-	struct list_head list;
-	struct resource res;
-	u64 address;
-	char __iomem *virt;
-	u16 segment;
-	u8 start_bus;
-	u8 end_bus;
-	char name[PCI_MMCFG_RESOURCE_NAME_LEN];
-};
-
+struct pci_mmcfg_region;
 extern int __init pci_mmcfg_arch_init(void);
 extern void __init pci_mmcfg_arch_free(void);
 extern int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg);
 extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg);
-extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
-			       phys_addr_t addr);
-extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
-extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
-
-extern struct list_head pci_mmcfg_list;
-
-#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
 
 /*
  * On AMD Fam10h CPUs, all PCI MMIO configuration space accesses must use
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index dd30b7e..626710b 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -12,13 +12,12 @@ 
 
 #include <linux/pci.h>
 #include <linux/init.h>
-#include <linux/sfi_acpi.h>
 #include <linux/bitmap.h>
-#include <linux/dmi.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/rculist.h>
 #include <asm/e820.h>
+#include <linux/pci-acpi.h>
 #include <asm/pci_x86.h>
 #include <asm/acpi.h>
 
@@ -27,9 +26,6 @@ 
 /* Indicate if the mmcfg resources have been placed into the resource table. */
 static bool pci_mmcfg_running_state;
 static bool pci_mmcfg_arch_init_failed;
-static DEFINE_MUTEX(pci_mmcfg_lock);
-
-LIST_HEAD(pci_mmcfg_list);
 
 static void __init pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
 {
@@ -48,83 +44,6 @@  static void __init free_all_mmcfg(void)
 		pci_mmconfig_remove(cfg);
 }
 
-static void list_add_sorted(struct pci_mmcfg_region *new)
-{
-	struct pci_mmcfg_region *cfg;
-
-	/* keep list sorted by segment and starting bus number */
-	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
-		if (cfg->segment > new->segment ||
-		    (cfg->segment == new->segment &&
-		     cfg->start_bus >= new->start_bus)) {
-			list_add_tail_rcu(&new->list, &cfg->list);
-			return;
-		}
-	}
-	list_add_tail_rcu(&new->list, &pci_mmcfg_list);
-}
-
-static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
-						   int end, u64 addr)
-{
-	struct pci_mmcfg_region *new;
-	struct resource *res;
-
-	if (addr == 0)
-		return NULL;
-
-	new = kzalloc(sizeof(*new), GFP_KERNEL);
-	if (!new)
-		return NULL;
-
-	new->address = addr;
-	new->segment = segment;
-	new->start_bus = start;
-	new->end_bus = end;
-
-	res = &new->res;
-	res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
-	res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
-	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-	snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
-		 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
-	res->name = new->name;
-
-	return new;
-}
-
-static struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
-							int end, u64 addr)
-{
-	struct pci_mmcfg_region *new;
-
-	new = pci_mmconfig_alloc(segment, start, end, addr);
-	if (new) {
-		mutex_lock(&pci_mmcfg_lock);
-		list_add_sorted(new);
-		mutex_unlock(&pci_mmcfg_lock);
-
-		pr_info(PREFIX
-		       "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
-		       "(base %#lx)\n",
-		       segment, start, end, &new->res, (unsigned long)addr);
-	}
-
-	return new;
-}
-
-struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
-{
-	struct pci_mmcfg_region *cfg;
-
-	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
-		if (cfg->segment == segment &&
-		    cfg->start_bus <= bus && bus <= cfg->end_bus)
-			return cfg;
-
-	return NULL;
-}
-
 static const char *__init pci_mmcfg_e7520(void)
 {
 	u32 win;
@@ -543,73 +462,6 @@  static void __init pci_mmcfg_reject_broken(int early)
 	}
 }
 
-static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
-					struct acpi_mcfg_allocation *cfg)
-{
-	int year;
-
-	if (cfg->address < 0xFFFFFFFF)
-		return 0;
-
-	if (!strncmp(mcfg->header.oem_id, "SGI", 3))
-		return 0;
-
-	if (mcfg->header.revision >= 1) {
-		if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
-		    year >= 2010)
-			return 0;
-	}
-
-	pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
-	       "is above 4GB, ignored\n", cfg->pci_segment,
-	       cfg->start_bus_number, cfg->end_bus_number, cfg->address);
-	return -EINVAL;
-}
-
-static int __init pci_parse_mcfg(struct acpi_table_header *header)
-{
-	struct acpi_table_mcfg *mcfg;
-	struct acpi_mcfg_allocation *cfg_table, *cfg;
-	unsigned long i;
-	int entries;
-
-	if (!header)
-		return -EINVAL;
-
-	mcfg = (struct acpi_table_mcfg *)header;
-
-	/* how many config structures do we have */
-	free_all_mmcfg();
-	entries = 0;
-	i = header->length - sizeof(struct acpi_table_mcfg);
-	while (i >= sizeof(struct acpi_mcfg_allocation)) {
-		entries++;
-		i -= sizeof(struct acpi_mcfg_allocation);
-	}
-	if (entries == 0) {
-		pr_err(PREFIX "MMCONFIG has no entries\n");
-		return -ENODEV;
-	}
-
-	cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
-	for (i = 0; i < entries; i++) {
-		cfg = &cfg_table[i];
-		if (acpi_mcfg_check_entry(mcfg, cfg)) {
-			free_all_mmcfg();
-			return -ENODEV;
-		}
-
-		if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
-				   cfg->end_bus_number, cfg->address) == NULL) {
-			pr_warn(PREFIX "no memory for MCFG entries\n");
-			free_all_mmcfg();
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-
 #ifdef CONFIG_ACPI_APEI
 extern int (*arch_apei_filter_addr)(int (*func)(__u64 start, __u64 size,
 				     void *data), void *data);
@@ -662,13 +514,20 @@  static void __init __pci_mmcfg_init(int early)
 
 static int __initdata known_bridge;
 
+static void __init pci_mmcfg_list_setup(void)
+{
+	free_all_mmcfg();
+	if (pci_mmconfig_parse_table())
+		free_all_mmcfg();
+}
+
 void __init pci_mmcfg_early_init(void)
 {
 	if (pci_probe & PCI_PROBE_MMCONF) {
 		if (pci_mmcfg_check_hostbridge())
 			known_bridge = 1;
 		else
-			acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+			pci_mmcfg_list_setup();
 		__pci_mmcfg_init(1);
 
 		set_apei_filter();
@@ -686,7 +545,7 @@  void __init pci_mmcfg_late_init(void)
 
 	/* MMCONFIG hasn't been enabled yet, try again */
 	if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
-		acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+		pci_mmcfg_list_setup();
 		__pci_mmcfg_init(0);
 	}
 }
@@ -720,99 +579,41 @@  static int __init pci_mmcfg_late_insert_resources(void)
  */
 late_initcall(pci_mmcfg_late_insert_resources);
 
-/* Add MMCFG information for host bridges */
-int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
-			phys_addr_t addr)
+int pci_mmconfig_map_resource(struct device *dev, struct pci_mmcfg_region *cfg)
 {
-	int rc;
-	struct resource *tmp = NULL;
-	struct pci_mmcfg_region *cfg;
+	struct resource *tmp;
 
-	if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
-		return -ENODEV;
-
-	if (start > end)
-		return -EINVAL;
-
-	mutex_lock(&pci_mmcfg_lock);
-	cfg = pci_mmconfig_lookup(seg, start);
-	if (cfg) {
-		if (cfg->end_bus < end)
-			dev_info(dev, FW_INFO
-				 "MMCONFIG for "
-				 "domain %04x [bus %02x-%02x] "
-				 "only partially covers this bridge\n",
-				  cfg->segment, cfg->start_bus, cfg->end_bus);
-		mutex_unlock(&pci_mmcfg_lock);
-		return -EEXIST;
-	}
-
-	if (!addr) {
-		mutex_unlock(&pci_mmcfg_lock);
-		return -EINVAL;
-	}
-
-	rc = -EBUSY;
-	cfg = pci_mmconfig_alloc(seg, start, end, addr);
-	if (cfg == NULL) {
-		dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
-		rc = -ENOMEM;
-	} else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
+	if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
 		dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n",
 			 &cfg->res);
-	} else {
-		/* Insert resource if it's not in boot stage */
-		if (pci_mmcfg_running_state)
-			tmp = insert_resource_conflict(&iomem_resource,
-						       &cfg->res);
-
+		return -EBUSY;
+	}
+	/* Insert resource if it's not in boot stage */
+	if (pci_mmcfg_running_state) {
+		tmp = insert_resource_conflict(&iomem_resource,
+					       &cfg->res);
 		if (tmp) {
-			dev_warn(dev,
-				 "MMCONFIG %pR conflicts with "
-				 "%s %pR\n",
-				 &cfg->res, tmp->name, tmp);
-		} else if (pci_mmcfg_arch_map(cfg)) {
-			dev_warn(dev, "fail to map MMCONFIG %pR.\n",
-				 &cfg->res);
-		} else {
-			list_add_sorted(cfg);
-			dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
-				 &cfg->res, (unsigned long)addr);
-			cfg = NULL;
-			rc = 0;
+			dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
+				&cfg->res, tmp->name, tmp);
+			return -EBUSY;
 		}
 	}
-
-	if (cfg) {
-		if (cfg->res.parent)
-			release_resource(&cfg->res);
-		kfree(cfg);
+	if (pci_mmcfg_arch_map(cfg)) {
+		dev_warn(dev, "fail to map MMCONFIG %pR.\n", &cfg->res);
+		return -EBUSY;
 	}
-
-	mutex_unlock(&pci_mmcfg_lock);
-
-	return rc;
+	return 0;
 }
 
-/* Delete MMCFG information for host bridges */
-int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
+void pci_mmconfig_unmap_resource(struct pci_mmcfg_region *cfg)
 {
-	struct pci_mmcfg_region *cfg;
-
-	mutex_lock(&pci_mmcfg_lock);
-	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
-		if (cfg->segment == seg && cfg->start_bus == start &&
-		    cfg->end_bus == end) {
-			list_del_rcu(&cfg->list);
-			synchronize_rcu();
-			pci_mmcfg_arch_unmap(cfg);
-			if (cfg->res.parent)
-				release_resource(&cfg->res);
-			mutex_unlock(&pci_mmcfg_lock);
-			kfree(cfg);
-			return 0;
-		}
-	mutex_unlock(&pci_mmcfg_lock);
+	pci_mmcfg_arch_unmap(cfg);
+	if (cfg->res.parent)
+		release_resource(&cfg->res);
+	cfg->res.parent = NULL;
+}
 
-	return -ENOENT;
+int pci_mmconfig_enabled(void)
+{
+	return (pci_probe & PCI_PROBE_MMCONF) && !pci_mmcfg_arch_init_failed;
 }
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 43984bc..38a37f8 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -12,6 +12,7 @@ 
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/rcupdate.h>
+#include <linux/pci-acpi.h>
 #include <asm/e820.h>
 #include <asm/pci_x86.h>
 
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index bea5249..29253ec 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -10,6 +10,7 @@ 
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
 #include <linux/rcupdate.h>
+#include <linux/pci-acpi.h>
 #include <asm/e820.h>
 #include <asm/pci_x86.h>
 
diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
index 2e565e6..c181eeb 100644
--- a/arch/x86/pci/numachip.c
+++ b/arch/x86/pci/numachip.c
@@ -14,6 +14,7 @@ 
  */
 
 #include <linux/pci.h>
+#include <linux/pci-acpi.h>
 #include <asm/pci_x86.h>
 
 static u8 limit __read_mostly;
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 7ea903d..e5e4393 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -40,6 +40,7 @@  acpi-$(CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC) += processor_pdc.o
 acpi-y				+= ec.o
 acpi-$(CONFIG_ACPI_DOCK)	+= dock.o
 acpi-y				+= pci_root.o pci_link.o pci_irq.o
+acpi-$(CONFIG_PCI_MMCONFIG)	+= pci_mcfg.o
 acpi-y				+= acpi_lpss.o acpi_apd.o
 acpi-y				+= acpi_platform.o
 acpi-y				+= acpi_pnp.o
diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
new file mode 100644
index 0000000..ea84365
--- /dev/null
+++ b/drivers/acpi/pci_mcfg.c
@@ -0,0 +1,312 @@ 
+/*
+ * pci_mcfg.c
+ *
+ * Common code to maintain the MCFG areas and mappings
+ *
+ * This has been extracted from arch/x86/pci/mmconfig-shared.c
+ * and moved here so that other architectures can use this code.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/pci-acpi.h>
+#include <linux/sfi_acpi.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+
+#define PREFIX	"ACPI: "
+
+static DEFINE_MUTEX(pci_mmcfg_lock);
+LIST_HEAD(pci_mmcfg_list);
+
+static void list_add_sorted(struct pci_mmcfg_region *new)
+{
+	struct pci_mmcfg_region *cfg;
+
+	/* keep list sorted by segment and starting bus number */
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
+		if (cfg->segment > new->segment ||
+		    (cfg->segment == new->segment &&
+		     cfg->start_bus >= new->start_bus)) {
+			list_add_tail_rcu(&new->list, &cfg->list);
+			return;
+		}
+	}
+	list_add_tail_rcu(&new->list, &pci_mmcfg_list);
+}
+
+static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
+						   int end, u64 addr)
+{
+	struct pci_mmcfg_region *new;
+	struct resource *res;
+
+	if (addr == 0)
+		return NULL;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	new->address = addr;
+	new->segment = segment;
+	new->start_bus = start;
+	new->end_bus = end;
+
+	res = &new->res;
+	res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
+	res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
+	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
+		 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
+	res->name = new->name;
+
+	return new;
+}
+
+struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
+							int end, u64 addr)
+{
+	struct pci_mmcfg_region *new;
+
+	new = pci_mmconfig_alloc(segment, start, end, addr);
+	if (new) {
+		mutex_lock(&pci_mmcfg_lock);
+		list_add_sorted(new);
+		mutex_unlock(&pci_mmcfg_lock);
+
+		pr_info(PREFIX
+		       "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
+		       "(base %#lx)\n",
+		       segment, start, end, &new->res, (unsigned long)addr);
+	}
+
+	return new;
+}
+
+struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
+{
+	struct pci_mmcfg_region *cfg;
+
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
+		if (cfg->segment == segment &&
+		    cfg->start_bus <= bus && bus <= cfg->end_bus)
+			return cfg;
+
+	return NULL;
+}
+
+/*
+ * Map a pci_mmcfg_region, can be overrriden by arch
+ */
+int __weak pci_mmconfig_map_resource(struct device *dev,
+	struct pci_mmcfg_region *mcfg)
+{
+	struct resource *tmp;
+	void __iomem *vaddr;
+
+	tmp = insert_resource_conflict(&iomem_resource, &mcfg->res);
+	if (tmp) {
+		dev_warn(dev, "MMCONFIG %pR conflicts with %s %pR\n",
+			&mcfg->res, tmp->name, tmp);
+		return -EBUSY;
+	}
+
+	vaddr =  ioremap(mcfg->res.start, resource_size(&mcfg->res));
+	if (!vaddr) {
+		release_resource(&mcfg->res);
+		return -ENOMEM;
+	}
+
+	mcfg->virt = vaddr;
+	return 0;
+}
+
+/*
+ * Unmap a pci_mmcfg_region, can be overrriden by arch
+ */
+void __weak pci_mmconfig_unmap_resource(struct pci_mmcfg_region *mcfg)
+{
+	if (mcfg->virt) {
+		iounmap(mcfg->virt);
+		mcfg->virt = NULL;
+	}
+	if (mcfg->res.parent) {
+		release_resource(&mcfg->res);
+		mcfg->res.parent = NULL;
+	}
+}
+
+/*
+ * check if the mmconfig is enabled and configured
+ */
+int __weak pci_mmconfig_enabled(void)
+{
+	return 1;
+}
+
+/* Add MMCFG information for host bridges */
+int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
+			phys_addr_t addr)
+{
+	struct pci_mmcfg_region *cfg;
+	int rc;
+
+	if (!pci_mmconfig_enabled())
+		return -ENODEV;
+	if (start > end)
+		return -EINVAL;
+
+	mutex_lock(&pci_mmcfg_lock);
+	cfg = pci_mmconfig_lookup(seg, start);
+	if (cfg) {
+		if (cfg->end_bus < end)
+			dev_info(dev, FW_INFO
+				 "MMCONFIG for "
+				 "domain %04x [bus %02x-%02x] "
+				 "only partially covers this bridge\n",
+				  cfg->segment, cfg->start_bus, cfg->end_bus);
+		rc = -EEXIST;
+		goto err;
+	}
+
+	if (!addr) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	cfg = pci_mmconfig_alloc(seg, start, end, addr);
+	if (cfg == NULL) {
+		dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
+		rc = -ENOMEM;
+		goto err;
+	}
+	rc = pci_mmconfig_map_resource(dev, cfg);
+	if (!rc) {
+		list_add_sorted(cfg);
+		dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
+				 &cfg->res, (unsigned long)addr);
+		return 0;
+	} else {
+		if (cfg->res.parent)
+			release_resource(&cfg->res);
+		kfree(cfg);
+	}
+
+err:
+	mutex_unlock(&pci_mmcfg_lock);
+	return rc;
+}
+
+/* Delete MMCFG information for host bridges */
+int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
+{
+	struct pci_mmcfg_region *cfg;
+
+	mutex_lock(&pci_mmcfg_lock);
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
+		if (cfg->segment == seg && cfg->start_bus == start &&
+		    cfg->end_bus == end) {
+			list_del_rcu(&cfg->list);
+			synchronize_rcu();
+			pci_mmconfig_unmap_resource(cfg);
+			mutex_unlock(&pci_mmcfg_lock);
+			kfree(cfg);
+			return 0;
+		}
+	mutex_unlock(&pci_mmcfg_lock);
+
+	return -ENOENT;
+}
+
+static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
+					struct acpi_mcfg_allocation *cfg)
+{
+	int year;
+
+	if (!config_enabled(CONFIG_X86))
+		return 0;
+
+	if (cfg->address < 0xFFFFFFFF)
+		return 0;
+
+	if (!strncmp(mcfg->header.oem_id, "SGI", 3))
+		return 0;
+
+	if (mcfg->header.revision >= 1) {
+		if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
+		    year >= 2010)
+			return 0;
+	}
+
+	pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
+	       "is above 4GB, ignored\n", cfg->pci_segment,
+	       cfg->start_bus_number, cfg->end_bus_number, cfg->address);
+	return -EINVAL;
+}
+
+static int __init pci_parse_mcfg(struct acpi_table_header *header)
+{
+	struct acpi_table_mcfg *mcfg;
+	struct acpi_mcfg_allocation *cfg_table, *cfg;
+	unsigned long i;
+	int entries;
+
+	if (!header)
+		return -EINVAL;
+
+	mcfg = (struct acpi_table_mcfg *)header;
+
+	/* how many config structures do we have */
+	entries = 0;
+	i = header->length - sizeof(struct acpi_table_mcfg);
+	while (i >= sizeof(struct acpi_mcfg_allocation)) {
+		entries++;
+		i -= sizeof(struct acpi_mcfg_allocation);
+	}
+	if (entries == 0) {
+		pr_err(PREFIX "MMCONFIG has no entries\n");
+		return -ENODEV;
+	}
+
+	cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
+	for (i = 0; i < entries; i++) {
+		cfg = &cfg_table[i];
+		if (acpi_mcfg_check_entry(mcfg, cfg))
+			return -ENODEV;
+
+		if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
+				   cfg->end_bus_number, cfg->address) == NULL) {
+			pr_warn(PREFIX "no memory for MCFG entries\n");
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+int __init pci_mmconfig_parse_table(void)
+{
+	return acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+}
+
+void __weak __init pci_mmcfg_late_init(void)
+{
+	int err, n = 0;
+	struct pci_mmcfg_region *cfg;
+
+	err = pci_mmconfig_parse_table();
+	if (err) {
+		pr_err(PREFIX " Failed to parse MCFG (%d)\n", err);
+		return;
+	}
+
+	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+		pci_mmconfig_map_resource(NULL, cfg);
+		n++;
+	}
+
+	pr_info(PREFIX " MCFG table loaded %d entries\n", n);
+}
diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
index 7494dbe..97aa9d3 100644
--- a/drivers/xen/pci.c
+++ b/drivers/xen/pci.c
@@ -27,9 +27,6 @@ 
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 #include "../pci/pci.h"
-#ifdef CONFIG_PCI_MMCONFIG
-#include <asm/pci_x86.h>
-#endif
 
 static bool __read_mostly pci_seg_supported = true;
 
@@ -221,7 +218,7 @@  static int __init xen_mcfg_late(void)
 	if (!xen_initial_domain())
 		return 0;
 
-	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+	if (!pci_mmconfig_enabled())
 		return 0;
 
 	if (list_empty(&pci_mmcfg_list))
diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 89ab057..e9450ef 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -106,6 +106,39 @@  extern const u8 pci_acpi_dsm_uuid[];
 #define RESET_DELAY_DSM		0x08
 #define FUNCTION_DELAY_DSM	0x09
 
+/* common API to maintain list of MCFG regions */
+/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
+#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
+
+struct pci_mmcfg_region {
+	struct list_head list;
+	struct resource res;
+	u64 address;
+	char __iomem *virt;
+	u16 segment;
+	u8 start_bus;
+	u8 end_bus;
+	char name[PCI_MMCFG_RESOURCE_NAME_LEN];
+};
+
+extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
+			       phys_addr_t addr);
+extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
+
+extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
+extern struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
+							int end, u64 addr);
+extern int pci_mmconfig_map_resource(struct device *dev,
+	struct pci_mmcfg_region *mcfg);
+extern void pci_mmconfig_unmap_resource(struct pci_mmcfg_region *mcfg);
+extern int pci_mmconfig_enabled(void);
+extern int __init pci_mmconfig_parse_table(void);
+
+extern struct list_head pci_mmcfg_list;
+
+#define PCI_MMCFG_BUS_OFFSET(bus)      ((bus) << 20)
+#define PCI_MMCFG_OFFSET(bus, devfn)   ((bus) << 20 | (devfn) << 12)
+
 #else	/* CONFIG_ACPI */
 static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
 static inline void acpi_pci_remove_bus(struct pci_bus *bus) { }