diff mbox

[16/23] pci: pcie host and mmcfg support.

Message ID 1254737223-16129-17-git-send-email-yamahata@valinux.co.jp
State Superseded
Headers show

Commit Message

Isaku Yamahata Oct. 5, 2009, 10:06 a.m. UTC
This patch adds common routines for pcie host bridge and pcie mmcfg.
This will be used by q35 based chipset emulation.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
---
 hw/hw.h       |   12 +++
 hw/pci.c      |  247 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 hw/pci.h      |   36 ++++++++-
 hw/pci_host.h |   22 +++++
 4 files changed, 285 insertions(+), 32 deletions(-)

Comments

Michael S. Tsirkin Oct. 5, 2009, 11:01 a.m. UTC | #1
On Mon, Oct 05, 2009 at 07:06:56PM +0900, Isaku Yamahata wrote:
> This patch adds common routines for pcie host bridge and pcie mmcfg.
> This will be used by q35 based chipset emulation.
> 
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>

Looks like most my comments on this patch were ignored.
I started repeating them here but then I grew tired,
so only partially repeated them. Are you going to address this?

Generally, pci express code isn't well separated from pci, here.  Please
try to generalize pci code, then express support would only touch a
couple of places.  Maybe even a separate file like msix.  Also, pci
express spec requires pci express capability to be present. I expected
to see this code using capability machinery.

> ---
>  hw/hw.h       |   12 +++
>  hw/pci.c      |  247 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
>  hw/pci.h      |   36 ++++++++-
>  hw/pci_host.h |   22 +++++
>  4 files changed, 285 insertions(+), 32 deletions(-)
> 
> diff --git a/hw/hw.h b/hw/hw.h
> index cf266b3..478b0b2 100644
> --- a/hw/hw.h
> +++ b/hw/hw.h
> @@ -428,6 +428,18 @@ extern const VMStateDescription vmstate_pci_device;
>              + type_check(PCIDevice,typeof_field(_state, _field))     \
>  }
>  
> +extern const VMStateDescription vmstate_pcie_device;
> +
> +#define VMSTATE_PCIE_DEVICE(_field, _state) {                        \
> +    .name       = (stringify(_field)),                               \
> +    .version_id = 2,                                                 \
> +    .size       = sizeof(PCIDevice),                                 \
> +    .vmsd       = &vmstate_pcie_device,                              \
> +    .flags      = VMS_STRUCT,                                        \
> +    .offset     = offsetof(_state, _field)                           \
> +            + type_check(PCIDevice,typeof_field(_state, _field))     \
> +}
> +
>  /* _f : field name
>     _f_n : num of elements field_name
>     _n : num of elements
> diff --git a/hw/pci.c b/hw/pci.c
> index 5f808ff..12260da 100644
> --- a/hw/pci.c
> +++ b/hw/pci.c
> @@ -23,6 +23,7 @@
>   */
>  #include "hw.h"
>  #include "pci.h"
> +#include "pci_host.h"
>  #include "monitor.h"
>  #include "net.h"
>  #include "sysemu.h"
> @@ -184,9 +185,12 @@ int pci_bus_num(PCIBus *s)
>  static int get_pci_config_device(QEMUFile *f, void *pv, size_t size)
>  {
>      PCIDevice *s = container_of(pv, PCIDevice, config);
> -    uint8_t config[size];
> +    uint8_t *config;
>      int i;
>  
> +    assert(size == pcie_config_size(s));
> +    config = qemu_malloc(size * sizeof(config[0]));
> +
>      qemu_get_buffer(f, config, size);
>      for (i = 0; i < size; ++i)
>          if ((config[i] ^ s->config[i]) & s->cmask[i] & ~s->wmask[i])
> @@ -195,6 +199,7 @@ static int get_pci_config_device(QEMUFile *f, void *pv, size_t size)
>  
>      pci_update_mappings(s);
>  
> +    qemu_free(config);
>      return 0;
>  }
>  
> @@ -202,6 +207,7 @@ static int get_pci_config_device(QEMUFile *f, void *pv, size_t size)
>  static void put_pci_config_device(QEMUFile *f, const void *pv, size_t size)
>  {
>      const uint8_t *v = pv;
> +    assert(size == pcie_config_size(container_of(pv, PCIDevice, config)));
>      qemu_put_buffer(f, v, size);
>  }
>  
> @@ -211,6 +217,17 @@ static VMStateInfo vmstate_info_pci_config = {
>      .put  = put_pci_config_device,
>  };
>  
> +#define VMSTATE_PCI_CONFIG(_field, _state, _version, _info, _type,   \
> +                           _size) {                                  \
> +    .name       = (stringify(_field)),                               \
> +    .version_id = (_version),                                        \
> +    .size       = (_size),                                           \
> +    .info       = &(_info),                                          \
> +    .flags      = VMS_SINGLE | VMS_POINTER,                          \
> +    .offset     = offsetof(_state, _field)                           \
> +            + type_check(_type,typeof_field(_state, _field))         \
> +}
> +
>  const VMStateDescription vmstate_pci_device = {
>      .name = "PCIDevice",
>      .version_id = 2,
> @@ -218,21 +235,46 @@ const VMStateDescription vmstate_pci_device = {
>      .minimum_version_id_old = 1,
>      .fields      = (VMStateField []) {
>          VMSTATE_INT32_LE(version_id, PCIDevice),
> -        VMSTATE_SINGLE(config, PCIDevice, 0, vmstate_info_pci_config,
> -                       typeof_field(PCIDevice,config)),
> +        VMSTATE_PCI_CONFIG(config, PCIDevice, 0, vmstate_info_pci_config,
> +                           typeof_field(PCIDevice, config),
> +                           PCI_CONFIG_SPACE_SIZE),
> +        VMSTATE_INT32_ARRAY_V(irq_state, PCIDevice, PCI_NUM_PINS, 2),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +const VMStateDescription vmstate_pcie_device = {
> +    .name = "PCIDevice",
> +    .version_id = 2,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .fields      = (VMStateField []) {
> +        VMSTATE_INT32_LE(version_id, PCIDevice),
> +        VMSTATE_PCI_CONFIG(config, PCIDevice, 0, vmstate_info_pci_config,
> +                           typeof_field(PCIDevice, config),
> +                           PCIE_CONFIG_SPACE_SIZE),
>          VMSTATE_INT32_ARRAY_V(irq_state, PCIDevice, PCI_NUM_PINS, 2),
>          VMSTATE_END_OF_LIST()
>      }
>  };
>  
> +static const VMStateDescription *pci_get_vmstate(PCIDevice *s)
> +{
> +    if (pci_is_pcie(s)) {
> +        return &vmstate_pcie_device;
> +    }
> +
> +    return &vmstate_pci_device;
> +}
> +
>  void pci_device_save(PCIDevice *s, QEMUFile *f)
>  {
> -    vmstate_save_state(f, &vmstate_pci_device, s);
> +    vmstate_save_state(f, pci_get_vmstate(s), s);
>  }
>  
>  int pci_device_load(PCIDevice *s, QEMUFile *f)
>  {
> -    return vmstate_load_state(f, &vmstate_pci_device, s, s->version_id);
> +    return vmstate_load_state(f, pci_get_vmstate(s), s, s->version_id);
>  }
>  
>  static int pci_set_default_subsystem_id(PCIDevice *pci_dev)
> @@ -341,14 +383,31 @@ static void pci_init_cmask(PCIDevice *dev)
>  static void pci_init_wmask(PCIDevice *dev)
>  {
>      int i;
> +    uint32_t config_size = pcie_config_size(dev);
> +
>      dev->wmask[PCI_CACHE_LINE_SIZE] = 0xff;
>      dev->wmask[PCI_INTERRUPT_LINE] = 0xff;
>      dev->wmask[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY
>                                | PCI_COMMAND_MASTER;
> -    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> +    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
>          dev->wmask[i] = 0xff;
>  }
>  
> +static void pci_config_alloc(PCIDevice *pci_dev)
> +{
> +    int config_size = pcie_config_size(pci_dev);
> +#define PCI_CONFIG_ALLOC(d, member, size)                               \
> +    do {                                                                \
> +        (d)->member =                                                   \
> +            (typeof((d)->member))qemu_mallocz(sizeof((d)->member[0]) *  \
> +                                              size);                    \
> +    } while (0)
> +    PCI_CONFIG_ALLOC(pci_dev, config, config_size);
> +    PCI_CONFIG_ALLOC(pci_dev, cmask, config_size);
> +    PCI_CONFIG_ALLOC(pci_dev, wmask, config_size);
> +    PCI_CONFIG_ALLOC(pci_dev, used, config_size);

Please don't abuse macros in this way.
And sizeof is a single byte anyway, so just qemu_mallocz(config_size)
is all you need.

Also, I don't see this memory freed anywhere.

> +}
> +
>  /* -1 for devfn means auto assign */
>  static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>                                           const char *name, int devfn,
> @@ -369,6 +428,7 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>      pci_dev->devfn = devfn;
>      pstrcpy(pci_dev->name, sizeof(pci_dev->name), name);
>      memset(pci_dev->irq_state, 0, sizeof(pci_dev->irq_state));
> +    pci_config_alloc(pci_dev);
>      pci_set_default_subsystem_id(pci_dev);
>      pci_init_cmask(pci_dev);
>      pci_init_wmask(pci_dev);
> @@ -586,40 +646,48 @@ static void pci_update_mappings(PCIDevice *d)
>      }
>  }
>  
> +static uint8_t pcie_config_get_byte(PCIDevice *d, uint32_t addr)
> +{
> +    uint8_t *conf = &d->config[addr];
> +    if (conf != NULL)
> +        return *conf;
> +    return 0;
> +}
> +

config is never NULL here.

> +static uint32_t pcie_config_get(PCIDevice *d, uint32_t addr, int len)
> +{
> +    int i;
> +    union {
> +        uint8_t val8[4];
> +        uint32_t val32;
> +    } v = { .val32 = 0 };
> +
> +    for (i = 0; i < len; i++) {
> +        v.val8[i] = pcie_config_get_byte(d, addr + i);
> +    }
> +
> +    return le32_to_cpu(v.val32);
> +}
> +

You seem to displike memcpy :) It's more readbable and generates better
code than an open loop. I already posted a single-line replacement for
this function:
uint32_t val = 0;
memcpy(&val, d->config + addr, len);
return le32_to_cpu(val);

which is now small enough to be open-coded.
Also, names starting with pcie are inappropriate for
generic pci code.


>  uint32_t pci_default_read_config(PCIDevice *d,
>                                   uint32_t address, int len)
>  {
> -    uint32_t val;
> +    uint32_t config_size = pcie_config_size(d);
>  
> -    switch(len) {
> -    default:
> -    case 4:
> -	if (address <= 0xfc) {
> -            val = pci_get_long(d->config + address);
> -	    break;
> -	}
> -	/* fall through */
> -    case 2:
> -        if (address <= 0xfe) {
> -            val = pci_get_word(d->config + address);
> -	    break;
> -	}
> -	/* fall through */
> -    case 1:
> -        val = pci_get_byte(d->config + address);
> -        break;
> -    }
> -    return val;
> +    assert(len == 1 || len == 2 || len == 4);
> +    len = MIN(len, config_size - MIN(config_size, address));
> +    return pcie_config_get(d, address, len);
>  }
>  
>  void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
>  {
>      uint8_t orig[PCI_CONFIG_SPACE_SIZE];
>      int i;
> +    uint32_t config_size = pcie_config_size(d);
>  
>      /* not efficient, but simple */
>      memcpy(orig, d->config, PCI_CONFIG_SPACE_SIZE);
> -    for(i = 0; i < l && addr < PCI_CONFIG_SPACE_SIZE; val >>= 8, ++i, ++addr) {
> +    for(i = 0; i < l && addr < config_size; val >>= 8, ++i, ++addr) {
>          uint8_t wmask = d->wmask[addr];
>          d->config[addr] = (d->config[addr] & ~wmask) | (val & wmask);
>      }
> @@ -703,6 +771,128 @@ uint32_t pci_data_read(void *opaque, uint32_t addr, int len)
>      return pci_data_read_common(pci_addr_to_dev(s, addr),
>                                  pci_addr_to_config(addr), len);
>  }
> +
> +#define PCIE_MASK(val, hi_bit, low_bit)                 \
> +    (((val) & (((1ULL << (hi_bit)) - 1))) >> (low_bit))

Why long long? Should be 1UL?

> +#define PCIE_VAL(VAL, val)                                              \
> +    PCIE_MASK((val), PCIE_MMCFG_ ## VAL ## _HI, PCIE_MMCFG_ ## VAL ## _LOW)

Please kill these macros.

> +#define PCIE_MMCFG_BUS_HI               28
> +#define PCIE_MMCFG_BUS_LOW              20
> +#define PCIE_MMCFG_DEV_HI               19
> +#define PCIE_MMCFG_DEV_LOW              15
> +#define PCIE_MMCFG_FUNC_HI              14
> +#define PCIE_MMCFG_FUNC_LOW             12
> +#define PCIE_MMCFG_CONFADDR_HI          11
> +#define PCIE_MMCFG_CONFADDR_LOW         0

HI/LOW are confusing: you don't know whether HI is last bit or one after
last. _BIT and _MASK are unambigious and would be better.

> +#define PCIE_MMCFG_BUS(addr)            PCIE_VAL(BUS, (addr))
> +#define PCIE_MMCFG_DEV(addr)            PCIE_VAL(DEV, (addr))
> +#define PCIE_MMCFG_FUNC(addr)           PCIE_VAL(FUNC, (addr))
> +#define PCIE_MMCFG_CONFADDR(addr)       PCIE_VAL(CONFADDR, (addr))
> +
> +void pcie_data_write(void *opaque, uint32_t addr, uint32_t val, int len)
> +{
> +    PCIBus *s = opaque;
> +    pci_data_write_common(pci_find_device(s, PCIE_MMCFG_BUS(addr),
> +                                          PCIE_MMCFG_DEV(addr),
> +                                          PCIE_MMCFG_FUNC(addr)),
> +                          PCIE_MMCFG_CONFADDR(addr),
> +                          val, len);
> +}
> +
> +uint32_t pcie_data_read(void *opaque, uint32_t addr, int len)
> +{
> +    PCIBus *s = opaque;
> +    return pci_data_read_common(pci_find_device(s, PCIE_MMCFG_BUS(addr),
> +                                                PCIE_MMCFG_DEV(addr),
> +                                                PCIE_MMCFG_FUNC(addr)),
> +                                PCIE_MMCFG_CONFADDR(addr),
> +                                len);
> +}

These are only for mmcfg, right? So please name these
pcie_mmcfg_data_read.

> +
> +#define DEFINE_PCIE_HOST_DATA_READ(len)                         \
> +    static uint32_t pcie_host_data_read_ ## len (               \
> +        void *opaque, target_phys_addr_t addr)                  \
> +    {                                                           \
> +        PCIExpressHost *e = (PCIExpressHost *)opaque;           \


Do not cast void * values. This way if type is changed
at some point, you get compiler error.

> +        return pcie_data_read(e->pci.bus,                       \
> +                              addr - e->base_addr, (len));      \
> +    }
> +
> +#define DEFINE_PCIE_HOST_DATA_WRITE(len)                        \
> +    static void pcie_host_data_write_ ## len (                  \
> +        void *opaque, target_phys_addr_t addr, uint32_t value)  \
> +    {                                                           \
> +        PCIExpressHost *e = (PCIExpressHost *)opaque;           \
> +        pcie_data_write(e->pci.bus,                             \
> +                        addr - e->base_addr, value, (len));     \
> +    }
> +
> +#define DEFINE_PCIE_HOST_DATA_MMIO(len)      \
> +        DEFINE_PCIE_HOST_DATA_READ(len)      \
> +        DEFINE_PCIE_HOST_DATA_WRITE(len)
> +
> +DEFINE_PCIE_HOST_DATA_MMIO(1)
> +DEFINE_PCIE_HOST_DATA_MMIO(2)
> +DEFINE_PCIE_HOST_DATA_MMIO(4)
> +
> +#define DEFINE_PCIE_MEMORY_FUNCS(Type, type)                            \
> +    static CPU ## Type ## MemoryFunc *pcie_host_data_ ## type [] =      \
> +    {                                                                   \
> +        &pcie_host_data_ ## type ## _1,                                 \
> +        &pcie_host_data_ ## type ## _2,                                 \
> +        &pcie_host_data_ ## type ## _4,                                 \
> +    };
> +
> +DEFINE_PCIE_MEMORY_FUNCS(Read, read)
> +DEFINE_PCIE_MEMORY_FUNCS(Write, write)

Please just init structures in the plain way.  Then editors, taggers
have a chance in hell to figure out how a function gets called.  These
macros obfuscate code terribly, and do not really buy us anything.

> +
> +int pcie_host_init(PCIExpressHost *e,
> +                   CPUReadMemoryFunc **mmcfg_read,
> +                   CPUWriteMemoryFunc **mmcfg_write)
> +{
> +    e->base_addr = PCIE_BASE_ADDR_INVALID;
> +
> +    if (mmcfg_read == NULL)

Replace == NULL with !

> +        mmcfg_read = pcie_host_data_read;
> +    if (mmcfg_write == NULL)
> +        mmcfg_write = pcie_host_data_write;

Do you already need special casing reads/writes?
It seems unlikely that anyone would override these.

> +    e->mmio_index = cpu_register_io_memory(mmcfg_read, mmcfg_write, e);
> +    if (e->mmio_index < 0) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +void pcie_host_mmcfg_unmap(PCIExpressHost *e)
> +{
> +    if (e->base_addr != PCIE_BASE_ADDR_INVALID) {
> +        cpu_register_physical_memory(e->base_addr, e->size, IO_MEM_UNASSIGNED);
> +    }
> +}
> +
> +void pcie_host_mmcfg_map(PCIExpressHost *e,
> +                         target_phys_addr_t addr, uint32_t size)
> +{
> +    assert((size & (size - 1)) == 0); /* power of 2 */

Replace == 0 with !

> +    assert(size >= (1ULL << PCIE_MMCFG_BUS_LOW));
> +    assert(size <= (1ULL << PCIE_MMCFG_BUS_HI));
> +
> +    e->base_addr = addr;
> +    e->size = size;
> +    cpu_register_physical_memory(e->base_addr, e->size, e->mmio_index);
> +}
> +
> +void pcie_host_mmcfg_update(PCIExpressHost *e,
> +                            int enable,
> +                            target_phys_addr_t addr, uint32_t size)
> +{
> +    pcie_host_mmcfg_unmap(e);
> +    if (enable) {
> +        pcie_host_mmcfg_map(e, addr, size);
> +    }
> +}
> +
>  /***********************************************************/
>  /* generic PCI irq support */
>  
> @@ -1052,9 +1242,10 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
>  
>  static int pci_find_space(PCIDevice *pdev, uint8_t size)
>  {
> +    int config_size = pcie_config_size(pdev);
>      int offset = PCI_CONFIG_HEADER_SIZE;
>      int i;
> -    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> +    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
>          if (pdev->used[i])
>              offset = i + 1;
>          else if (i - offset + 1 == size)
> diff --git a/hw/pci.h b/hw/pci.h
> index 00f2b78..1f402d2 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -175,20 +175,26 @@ enum {
>      QEMU_PCI_CAP_MSIX = 0x1,
>  };
>  
> +/* Size of the standart PCIe config space: 4KB */
> +#define PCIE_CONFIG_SPACE_SIZE  0x1000
> +#define PCIE_EXT_CONFIG_SPACE_SIZE                      \
> +    (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE)
> +

where's this used?

>  struct PCIDevice {
>      DeviceState qdev;
> +
>      /* PCI config space */
> -    uint8_t config[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t *config;
>  
>      /* Used to enable config checks on load. Note that writeable bits are
>       * never checked even if set in cmask. */
> -    uint8_t cmask[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t *cmask;
>  
>      /* Used to implement R/W bytes */
> -    uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t *wmask;
>  
>      /* Used to allocate config space for capabilities. */
> -    uint8_t used[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t *used;
>  
>      /* the following fields are read only */
>      PCIBus *bus;
> @@ -270,6 +276,8 @@ PCIDevice *pci_nic_init(NICInfo *nd, const char *default_model,
>                          const char *default_devaddr);
>  void pci_data_write(void *opaque, uint32_t addr, uint32_t val, int len);
>  uint32_t pci_data_read(void *opaque, uint32_t addr, int len);
> +void pcie_data_write(void *opaque, uint32_t addr, uint32_t val, int len);
> +uint32_t pcie_data_read(void *opaque, uint32_t addr, int len);
>  int pci_bus_num(PCIBus *s);
>  void pci_for_each_device(PCIBus *bus, int bus_num, void (*fn)(PCIBus *bus, PCIDevice *d));
>  PCIBus *pci_find_host_bus(int domain);
> @@ -356,6 +364,9 @@ typedef struct {
>      pci_qdev_initfn init;
>      PCIConfigReadFunc *config_read;
>      PCIConfigWriteFunc *config_write;
> +
> +    /* pcie stuff */
> +    int pcie;
>  } PCIDeviceInfo;
>  
>  void pci_qdev_register(PCIDeviceInfo *info);
> @@ -365,6 +376,23 @@ PCIDevice *pci_create(const char *name, const char *devaddr);
>  PCIDevice *pci_create_noinit(PCIBus *bus, int devfn, const char *name);
>  PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name);
>  
> +static inline int pci_is_pcie(PCIDevice *d)
> +{
> +    /*
> +     * At the moment, all the pci devices aren't qdevfied. So
> +     * d->qdev.info might be NULL.
> +     * Given that pcie device emulator hasn't exist, we conclude that
> +     * such a device isn't pcie.
> +     */
> +    return d->qdev.info != NULL &&

kill != NULL

> +        container_of(d->qdev.info, PCIDeviceInfo, qdev)->pcie;
> +}
> +

I think this should use capabilities just like msix does.
So you would have a bit set in the cap_present field
instead of adding and using an extra field.


> +static inline uint32_t pcie_config_size(PCIDevice *d)
> +{
> +    return pci_is_pcie(d)? PCIE_CONFIG_SPACE_SIZE: PCI_CONFIG_SPACE_SIZE;

space before ?

> +}
> +

Should be pci_config_size. Generally pcie_xxx should be for
express only, generic stuff should be pci_xxx.

>  /* lsi53c895a.c */
>  #define LSI_MAX_DEVS 7
>  
> diff --git a/hw/pci_host.h b/hw/pci_host.h
> index ea98ed2..59147cf 100644
> --- a/hw/pci_host.h
> +++ b/hw/pci_host.h
> @@ -48,4 +48,26 @@ uint32_t pci_host_data_readb_ioport(void* opaque, uint32_t addr);
>  uint32_t pci_host_data_readw_ioport(void* opaque, uint32_t addr);
>  uint32_t pci_host_data_readl_ioport(void* opaque, uint32_t addr);
>  
> +typedef struct {
> +    PCIHostState pci;
> +
> +    /* express part */
> +    target_phys_addr_t  base_addr;
> +#define PCIE_BASE_ADDR_INVALID  ((target_phys_addr_t)-1ULL)

Isn't this the same as PCI? Let's reuse it?

> +    target_phys_addr_t  size;
> +    int bus_num_order;
> +    int mmio_index;
> +} PCIExpressHost;
> +
> +int pcie_host_init(PCIExpressHost *e,
> +                   CPUReadMemoryFunc **mmcfg_read,
> +                   CPUWriteMemoryFunc **mmcfg_write);
> +
> +void pcie_host_mmcfg_unmap(PCIExpressHost *e);
> +void pcie_host_mmcfg_map(PCIExpressHost *e,
> +                         target_phys_addr_t addr, uint32_t size);
> +void pcie_host_mmcfg_update(PCIExpressHost *e,
> +                            int enable,
> +                            target_phys_addr_t addr, uint32_t size);
> +
>  #endif /* PCI_HOST_H */
> -- 
> 1.6.0.2
Michael S. Tsirkin Oct. 5, 2009, 11:41 a.m. UTC | #2
On Mon, Oct 05, 2009 at 07:06:56PM +0900, Isaku Yamahata wrote:
> @@ -1052,9 +1242,10 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
>  
>  static int pci_find_space(PCIDevice *pdev, uint8_t size)
>  {
> +    int config_size = pcie_config_size(pdev);
>      int offset = PCI_CONFIG_HEADER_SIZE;
>      int i;
> -    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> +    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
>          if (pdev->used[i])
>              offset = i + 1;
>          else if (i - offset + 1 == size)
> diff --git a/hw/pci.h b/hw/pci.h
> index 00f2b78..1f402d2 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -175,20 +175,26 @@ enum {
>      QEMU_PCI_CAP_MSIX = 0x1,
>  };
>  
> +/* Size of the standart PCIe config space: 4KB */
> +#define PCIE_CONFIG_SPACE_SIZE  0x1000
> +#define PCIE_EXT_CONFIG_SPACE_SIZE                      \
> +    (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE)
> +
>  struct PCIDevice {
>      DeviceState qdev;
> +
>      /* PCI config space */
> -    uint8_t config[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t *config;
>  
>      /* Used to enable config checks on load. Note that writeable bits are
>       * never checked even if set in cmask. */
> -    uint8_t cmask[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t *cmask;
>  
>      /* Used to implement R/W bytes */
> -    uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t *wmask;
>  
>      /* Used to allocate config space for capabilities. */
> -    uint8_t used[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t *used;
>  
>      /* the following fields are read only */
>      PCIBus *bus;


So I thought about this some more, and I think that this change
in unnecessary. PCI Express adds support for extended 4K
configuration space, but the only thing that must reside
there is the optional advanced error reporting capability,
which I don't think we'll need to implement, ever.

Everything else can reside in the first 256 bytes, just as for regular
PCI. And pci code already returns 0 for accesses outside the first 256
bytes, so express specific code is necessary.

If we do this, we can keep memcpy for the full original
space (it's just 4 cache lines) and simple memcmp
with no overlap checks will be enough to figure out
if something changed. To make that also possible for
bridges, we'll then just put the orig array inside
PCIDevice. Not sure I explain this last point clearly -
would you like to see a patch?
Isaku Yamahata Oct. 6, 2009, 8:48 a.m. UTC | #3
On Mon, Oct 05, 2009 at 01:01:43PM +0200, Michael S. Tsirkin wrote:
> On Mon, Oct 05, 2009 at 07:06:56PM +0900, Isaku Yamahata wrote:
> > This patch adds common routines for pcie host bridge and pcie mmcfg.
> > This will be used by q35 based chipset emulation.
> > 
> > Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
> 
> Looks like most my comments on this patch were ignored.
> I started repeating them here but then I grew tired,
> so only partially repeated them. Are you going to address this?

Which mail? Could you please care to resent the previous mail to me?
I'm willing to address your review, and I appreciated your review.

I'm afraid that your mail was lost somewhere in the net.
I tried the qemu.org ML archive, however I couldn't find it there.
And I also wasn't able to find it in gmane.org neither.
Michael S. Tsirkin Oct. 6, 2009, 9:30 a.m. UTC | #4
On Tue, Oct 06, 2009 at 05:48:22PM +0900, Isaku Yamahata wrote:
> On Mon, Oct 05, 2009 at 01:01:43PM +0200, Michael S. Tsirkin wrote:
> > On Mon, Oct 05, 2009 at 07:06:56PM +0900, Isaku Yamahata wrote:
> > > This patch adds common routines for pcie host bridge and pcie mmcfg.
> > > This will be used by q35 based chipset emulation.
> > > 
> > > Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
> > 
> > Looks like most my comments on this patch were ignored.
> > I started repeating them here but then I grew tired,
> > so only partially repeated them. Are you going to address this?
> 
> Which mail? Could you please care to resent the previous mail to me?
> I'm willing to address your review, and I appreciated your review.
> 
> I'm afraid that your mail was lost somewhere in the net.

Ugh, looks like I never sent it.  No wonder comments weren't addressed
:) Sending out now, sorry about the noise.
Isaku Yamahata Oct. 6, 2009, 10:02 a.m. UTC | #5
On Mon, Oct 05, 2009 at 01:41:21PM +0200, Michael S. Tsirkin wrote:
> On Mon, Oct 05, 2009 at 07:06:56PM +0900, Isaku Yamahata wrote:
> > @@ -1052,9 +1242,10 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
> >  
> >  static int pci_find_space(PCIDevice *pdev, uint8_t size)
> >  {
> > +    int config_size = pcie_config_size(pdev);
> >      int offset = PCI_CONFIG_HEADER_SIZE;
> >      int i;
> > -    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> > +    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
> >          if (pdev->used[i])
> >              offset = i + 1;
> >          else if (i - offset + 1 == size)
> > diff --git a/hw/pci.h b/hw/pci.h
> > index 00f2b78..1f402d2 100644
> > --- a/hw/pci.h
> > +++ b/hw/pci.h
> > @@ -175,20 +175,26 @@ enum {
> >      QEMU_PCI_CAP_MSIX = 0x1,
> >  };
> >  
> > +/* Size of the standart PCIe config space: 4KB */
> > +#define PCIE_CONFIG_SPACE_SIZE  0x1000
> > +#define PCIE_EXT_CONFIG_SPACE_SIZE                      \
> > +    (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE)
> > +
> >  struct PCIDevice {
> >      DeviceState qdev;
> > +
> >      /* PCI config space */
> > -    uint8_t config[PCI_CONFIG_SPACE_SIZE];
> > +    uint8_t *config;
> >  
> >      /* Used to enable config checks on load. Note that writeable bits are
> >       * never checked even if set in cmask. */
> > -    uint8_t cmask[PCI_CONFIG_SPACE_SIZE];
> > +    uint8_t *cmask;
> >  
> >      /* Used to implement R/W bytes */
> > -    uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
> > +    uint8_t *wmask;
> >  
> >      /* Used to allocate config space for capabilities. */
> > -    uint8_t used[PCI_CONFIG_SPACE_SIZE];
> > +    uint8_t *used;
> >  
> >      /* the following fields are read only */
> >      PCIBus *bus;
> 
> 
> So I thought about this some more, and I think that this change
> in unnecessary. PCI Express adds support for extended 4K
> configuration space, but the only thing that must reside
> there is the optional advanced error reporting capability,
> which I don't think we'll need to implement, ever.
>
> Everything else can reside in the first 256 bytes, just as for regular
> PCI. And pci code already returns 0 for accesses outside the first 256
> bytes, so express specific code is necessary.

I agree with you for emulated PCI express device
(which doesn't exist for now). However I oppose it for other reason.

My purpose is to direct attach PCIe device to a guest including
AER emulation somehow.
When direct attaching PCIe native device to a guest,
we don't have any control on how its configuration space is used.
When an error is reported on host via AER, I'd like to pass 
the error to guest in some manner. So I want AER too in a sense.
Michael S. Tsirkin Oct. 6, 2009, 10:57 a.m. UTC | #6
On Tue, Oct 06, 2009 at 07:02:59PM +0900, Isaku Yamahata wrote:
> On Mon, Oct 05, 2009 at 01:41:21PM +0200, Michael S. Tsirkin wrote:
> > On Mon, Oct 05, 2009 at 07:06:56PM +0900, Isaku Yamahata wrote:
> > > @@ -1052,9 +1242,10 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
> > >  
> > >  static int pci_find_space(PCIDevice *pdev, uint8_t size)
> > >  {
> > > +    int config_size = pcie_config_size(pdev);
> > >      int offset = PCI_CONFIG_HEADER_SIZE;
> > >      int i;
> > > -    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> > > +    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
> > >          if (pdev->used[i])
> > >              offset = i + 1;
> > >          else if (i - offset + 1 == size)
> > > diff --git a/hw/pci.h b/hw/pci.h
> > > index 00f2b78..1f402d2 100644
> > > --- a/hw/pci.h
> > > +++ b/hw/pci.h
> > > @@ -175,20 +175,26 @@ enum {
> > >      QEMU_PCI_CAP_MSIX = 0x1,
> > >  };
> > >  
> > > +/* Size of the standart PCIe config space: 4KB */
> > > +#define PCIE_CONFIG_SPACE_SIZE  0x1000
> > > +#define PCIE_EXT_CONFIG_SPACE_SIZE                      \
> > > +    (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE)
> > > +
> > >  struct PCIDevice {
> > >      DeviceState qdev;
> > > +
> > >      /* PCI config space */
> > > -    uint8_t config[PCI_CONFIG_SPACE_SIZE];
> > > +    uint8_t *config;
> > >  
> > >      /* Used to enable config checks on load. Note that writeable bits are
> > >       * never checked even if set in cmask. */
> > > -    uint8_t cmask[PCI_CONFIG_SPACE_SIZE];
> > > +    uint8_t *cmask;
> > >  
> > >      /* Used to implement R/W bytes */
> > > -    uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
> > > +    uint8_t *wmask;
> > >  
> > >      /* Used to allocate config space for capabilities. */
> > > -    uint8_t used[PCI_CONFIG_SPACE_SIZE];
> > > +    uint8_t *used;
> > >  
> > >      /* the following fields are read only */
> > >      PCIBus *bus;
> > 
> > 
> > So I thought about this some more, and I think that this change
> > in unnecessary. PCI Express adds support for extended 4K
> > configuration space, but the only thing that must reside
> > there is the optional advanced error reporting capability,
> > which I don't think we'll need to implement, ever.
> >
> > Everything else can reside in the first 256 bytes, just as for regular
> > PCI. And pci code already returns 0 for accesses outside the first 256
> > bytes, so express specific code is necessary.
> 
> I agree with you for emulated PCI express device
> (which doesn't exist for now). However I oppose it for other reason.
> 
> My purpose is to direct attach PCIe device to a guest including
> AER emulation somehow.

For now, if I were you, I would just ignore AER.

> When direct attaching PCIe native device to a guest,
> we don't have any control on how its configuration space is used.
> When an error is reported on host via AER, I'd like to pass 
> the error to guest in some manner. So I want AER too in a sense.

Since what you will want to do is forward stuff to a physical device,
you likely won't need to keep anything in memory at all.
So express code might just do
	if (offset > 256)
		write_to_physical_device();
But, let's take these things one at a time.
For one, device assignment is not upstream at all.

By the way, a general question: how was this tested?  Did you manage to
make guest generate mmconfig transactions?  Which guests do this?

> -- 
> yamahata
Michael S. Tsirkin Oct. 6, 2009, 1:21 p.m. UTC | #7
On Tue, Oct 06, 2009 at 12:57:01PM +0200, Michael S. Tsirkin wrote:
> On Tue, Oct 06, 2009 at 07:02:59PM +0900, Isaku Yamahata wrote:
> > On Mon, Oct 05, 2009 at 01:41:21PM +0200, Michael S. Tsirkin wrote:
> > > On Mon, Oct 05, 2009 at 07:06:56PM +0900, Isaku Yamahata wrote:
> > > > @@ -1052,9 +1242,10 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
> > > >  
> > > >  static int pci_find_space(PCIDevice *pdev, uint8_t size)
> > > >  {
> > > > +    int config_size = pcie_config_size(pdev);
> > > >      int offset = PCI_CONFIG_HEADER_SIZE;
> > > >      int i;
> > > > -    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> > > > +    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
> > > >          if (pdev->used[i])
> > > >              offset = i + 1;
> > > >          else if (i - offset + 1 == size)
> > > > diff --git a/hw/pci.h b/hw/pci.h
> > > > index 00f2b78..1f402d2 100644
> > > > --- a/hw/pci.h
> > > > +++ b/hw/pci.h
> > > > @@ -175,20 +175,26 @@ enum {
> > > >      QEMU_PCI_CAP_MSIX = 0x1,
> > > >  };
> > > >  
> > > > +/* Size of the standart PCIe config space: 4KB */
> > > > +#define PCIE_CONFIG_SPACE_SIZE  0x1000
> > > > +#define PCIE_EXT_CONFIG_SPACE_SIZE                      \
> > > > +    (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE)
> > > > +
> > > >  struct PCIDevice {
> > > >      DeviceState qdev;
> > > > +
> > > >      /* PCI config space */
> > > > -    uint8_t config[PCI_CONFIG_SPACE_SIZE];
> > > > +    uint8_t *config;
> > > >  
> > > >      /* Used to enable config checks on load. Note that writeable bits are
> > > >       * never checked even if set in cmask. */
> > > > -    uint8_t cmask[PCI_CONFIG_SPACE_SIZE];
> > > > +    uint8_t *cmask;
> > > >  
> > > >      /* Used to implement R/W bytes */
> > > > -    uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
> > > > +    uint8_t *wmask;
> > > >  
> > > >      /* Used to allocate config space for capabilities. */
> > > > -    uint8_t used[PCI_CONFIG_SPACE_SIZE];
> > > > +    uint8_t *used;
> > > >  
> > > >      /* the following fields are read only */
> > > >      PCIBus *bus;
> > > 
> > > 
> > > So I thought about this some more, and I think that this change
> > > in unnecessary. PCI Express adds support for extended 4K
> > > configuration space, but the only thing that must reside
> > > there is the optional advanced error reporting capability,
> > > which I don't think we'll need to implement, ever.
> > >
> > > Everything else can reside in the first 256 bytes, just as for regular
> > > PCI. And pci code already returns 0 for accesses outside the first 256
> > > bytes, so express specific code is necessary.
> > 
> > I agree with you for emulated PCI express device
> > (which doesn't exist for now). However I oppose it for other reason.
> > 
> > My purpose is to direct attach PCIe device to a guest including
> > AER emulation somehow.
> 
> For now, if I were you, I would just ignore AER.
> 
> > When direct attaching PCIe native device to a guest,
> > we don't have any control on how its configuration space is used.
> > When an error is reported on host via AER, I'd like to pass 
> > the error to guest in some manner. So I want AER too in a sense.
> 
> Since what you will want to do is forward stuff to a physical device,
> you likely won't need to keep anything in memory at all.
> So express code might just do
> 	if (offset > 256)
> 		write_to_physical_device();
> But, let's take these things one at a time.
> For one, device assignment is not upstream at all.

The point being, we'll know what's the best way to implement
extended memory space when we see some code using it.


> By the way, a general question: how was this tested?  Did you manage to
> make guest generate mmconfig transactions?  Which guests do this?
> 
> > -- 
> > yamahata
Isaku Yamahata Oct. 7, 2009, 2:25 a.m. UTC | #8
On Tue, Oct 06, 2009 at 03:21:38PM +0200, Michael S. Tsirkin wrote:
> On Tue, Oct 06, 2009 at 12:57:01PM +0200, Michael S. Tsirkin wrote:
> > On Tue, Oct 06, 2009 at 07:02:59PM +0900, Isaku Yamahata wrote:
> > > On Mon, Oct 05, 2009 at 01:41:21PM +0200, Michael S. Tsirkin wrote:
> > > > On Mon, Oct 05, 2009 at 07:06:56PM +0900, Isaku Yamahata wrote:
> > > > > @@ -1052,9 +1242,10 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
> > > > >  
> > > > >  static int pci_find_space(PCIDevice *pdev, uint8_t size)
> > > > >  {
> > > > > +    int config_size = pcie_config_size(pdev);
> > > > >      int offset = PCI_CONFIG_HEADER_SIZE;
> > > > >      int i;
> > > > > -    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> > > > > +    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
> > > > >          if (pdev->used[i])
> > > > >              offset = i + 1;
> > > > >          else if (i - offset + 1 == size)
> > > > > diff --git a/hw/pci.h b/hw/pci.h
> > > > > index 00f2b78..1f402d2 100644
> > > > > --- a/hw/pci.h
> > > > > +++ b/hw/pci.h
> > > > > @@ -175,20 +175,26 @@ enum {
> > > > >      QEMU_PCI_CAP_MSIX = 0x1,
> > > > >  };
> > > > >  
> > > > > +/* Size of the standart PCIe config space: 4KB */
> > > > > +#define PCIE_CONFIG_SPACE_SIZE  0x1000
> > > > > +#define PCIE_EXT_CONFIG_SPACE_SIZE                      \
> > > > > +    (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE)
> > > > > +
> > > > >  struct PCIDevice {
> > > > >      DeviceState qdev;
> > > > > +
> > > > >      /* PCI config space */
> > > > > -    uint8_t config[PCI_CONFIG_SPACE_SIZE];
> > > > > +    uint8_t *config;
> > > > >  
> > > > >      /* Used to enable config checks on load. Note that writeable bits are
> > > > >       * never checked even if set in cmask. */
> > > > > -    uint8_t cmask[PCI_CONFIG_SPACE_SIZE];
> > > > > +    uint8_t *cmask;
> > > > >  
> > > > >      /* Used to implement R/W bytes */
> > > > > -    uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
> > > > > +    uint8_t *wmask;
> > > > >  
> > > > >      /* Used to allocate config space for capabilities. */
> > > > > -    uint8_t used[PCI_CONFIG_SPACE_SIZE];
> > > > > +    uint8_t *used;
> > > > >  
> > > > >      /* the following fields are read only */
> > > > >      PCIBus *bus;
> > > > 
> > > > 
> > > > So I thought about this some more, and I think that this change
> > > > in unnecessary. PCI Express adds support for extended 4K
> > > > configuration space, but the only thing that must reside
> > > > there is the optional advanced error reporting capability,
> > > > which I don't think we'll need to implement, ever.
> > > >
> > > > Everything else can reside in the first 256 bytes, just as for regular
> > > > PCI. And pci code already returns 0 for accesses outside the first 256
> > > > bytes, so express specific code is necessary.
> > > 
> > > I agree with you for emulated PCI express device
> > > (which doesn't exist for now). However I oppose it for other reason.
> > > 
> > > My purpose is to direct attach PCIe device to a guest including
> > > AER emulation somehow.
> > 
> > For now, if I were you, I would just ignore AER.
>
> > > When direct attaching PCIe native device to a guest,
> > > we don't have any control on how its configuration space is used.
> > > When an error is reported on host via AER, I'd like to pass 
> > > the error to guest in some manner. So I want AER too in a sense.
> > 
> > Since what you will want to do is forward stuff to a physical device,
> > you likely won't need to keep anything in memory at all.
> > So express code might just do
> > 	if (offset > 256)
> > 		write_to_physical_device();
> > But, let's take these things one at a time.
> > For one, device assignment is not upstream at all.
> 
> The point being, we'll know what's the best way to implement
> extended memory space when we see some code using it.

The above code doesn't work.
Some bits in configuration space must be virtualized, it means that
virtualized value must be stored in memory.
Limiting 256 bytes would just make unnecessary complication.
We have to add many hooks like if (offset > 256) call_express_code()
and the express code would be almost same as pci one.
We've already had direct attached code. The difference for pci express 
direct attach from pci direct attach is just that the code receives offset
in configuration space which might be greater than 255.


> > By the way, a general question: how was this tested?  Did you manage to
> > make guest generate mmconfig transactions?  Which guests do this?

Yes, I've tested MMCONFIG access by patching linux kernel as stated in
patch description in PATCH 0.
Akio Takebe Oct. 7, 2009, 11:30 a.m. UTC | #9
>> So I thought about this some more, and I think that this change
>> in unnecessary. PCI Express adds support for extended 4K
>> configuration space, but the only thing that must reside
>> there is the optional advanced error reporting capability,
>> which I don't think we'll need to implement, ever.
>>
>> Everything else can reside in the first 256 bytes, just as for regular
>> PCI. And pci code already returns 0 for accesses outside the first 256
>> bytes, so express specific code is necessary.
>
>I agree with you for emulated PCI express device
>(which doesn't exist for now). However I oppose it for other reason.
>
>My purpose is to direct attach PCIe device to a guest including
>AER emulation somehow.
>When direct attaching PCIe native device to a guest,
>we don't have any control on how its configuration space is used.
>When an error is reported on host via AER, I'd like to pass 
>the error to guest in some manner. So I want AER too in a sense.
I also want AER feature. In this PCIe emulator codes,
PCIe features like AER is highlight.
If we can use AER, this pcie emulator is very worthwhile, I think.

Best Regards,

Akio Takebe
Michael S. Tsirkin Oct. 7, 2009, 12:17 p.m. UTC | #10
On Wed, Oct 07, 2009 at 11:25:46AM +0900, Isaku Yamahata wrote:
> The above code doesn't work.
> Some bits in configuration space must be virtualized, it means that
> virtualized value must be stored in memory.
> Limiting 256 bytes would just make unnecessary complication.
> We have to add many hooks like if (offset > 256) call_express_code()
> and the express code would be almost same as pci one.
> We've already had direct attached code. The difference for pci express 
> direct attach from pci direct attach is just that the code receives offset
> in configuration space which might be greater than 255.

Okay. What I mainly disliked is the complex update routines.  But I
found a way to get rid of them without limiting ourselves to 256 bytes
size: in fact, we only need to save the first 64 bytes for default
config callback, whatever the full config space size is.

Since this is a single cache line, it won't get any more efficient,
and we can use simple memcmp to check for bar changes etc.

Patch soon, please review and let me know.
diff mbox

Patch

diff --git a/hw/hw.h b/hw/hw.h
index cf266b3..478b0b2 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -428,6 +428,18 @@  extern const VMStateDescription vmstate_pci_device;
             + type_check(PCIDevice,typeof_field(_state, _field))     \
 }
 
+extern const VMStateDescription vmstate_pcie_device;
+
+#define VMSTATE_PCIE_DEVICE(_field, _state) {                        \
+    .name       = (stringify(_field)),                               \
+    .version_id = 2,                                                 \
+    .size       = sizeof(PCIDevice),                                 \
+    .vmsd       = &vmstate_pcie_device,                              \
+    .flags      = VMS_STRUCT,                                        \
+    .offset     = offsetof(_state, _field)                           \
+            + type_check(PCIDevice,typeof_field(_state, _field))     \
+}
+
 /* _f : field name
    _f_n : num of elements field_name
    _n : num of elements
diff --git a/hw/pci.c b/hw/pci.c
index 5f808ff..12260da 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -23,6 +23,7 @@ 
  */
 #include "hw.h"
 #include "pci.h"
+#include "pci_host.h"
 #include "monitor.h"
 #include "net.h"
 #include "sysemu.h"
@@ -184,9 +185,12 @@  int pci_bus_num(PCIBus *s)
 static int get_pci_config_device(QEMUFile *f, void *pv, size_t size)
 {
     PCIDevice *s = container_of(pv, PCIDevice, config);
-    uint8_t config[size];
+    uint8_t *config;
     int i;
 
+    assert(size == pcie_config_size(s));
+    config = qemu_malloc(size * sizeof(config[0]));
+
     qemu_get_buffer(f, config, size);
     for (i = 0; i < size; ++i)
         if ((config[i] ^ s->config[i]) & s->cmask[i] & ~s->wmask[i])
@@ -195,6 +199,7 @@  static int get_pci_config_device(QEMUFile *f, void *pv, size_t size)
 
     pci_update_mappings(s);
 
+    qemu_free(config);
     return 0;
 }
 
@@ -202,6 +207,7 @@  static int get_pci_config_device(QEMUFile *f, void *pv, size_t size)
 static void put_pci_config_device(QEMUFile *f, const void *pv, size_t size)
 {
     const uint8_t *v = pv;
+    assert(size == pcie_config_size(container_of(pv, PCIDevice, config)));
     qemu_put_buffer(f, v, size);
 }
 
@@ -211,6 +217,17 @@  static VMStateInfo vmstate_info_pci_config = {
     .put  = put_pci_config_device,
 };
 
+#define VMSTATE_PCI_CONFIG(_field, _state, _version, _info, _type,   \
+                           _size) {                                  \
+    .name       = (stringify(_field)),                               \
+    .version_id = (_version),                                        \
+    .size       = (_size),                                           \
+    .info       = &(_info),                                          \
+    .flags      = VMS_SINGLE | VMS_POINTER,                          \
+    .offset     = offsetof(_state, _field)                           \
+            + type_check(_type,typeof_field(_state, _field))         \
+}
+
 const VMStateDescription vmstate_pci_device = {
     .name = "PCIDevice",
     .version_id = 2,
@@ -218,21 +235,46 @@  const VMStateDescription vmstate_pci_device = {
     .minimum_version_id_old = 1,
     .fields      = (VMStateField []) {
         VMSTATE_INT32_LE(version_id, PCIDevice),
-        VMSTATE_SINGLE(config, PCIDevice, 0, vmstate_info_pci_config,
-                       typeof_field(PCIDevice,config)),
+        VMSTATE_PCI_CONFIG(config, PCIDevice, 0, vmstate_info_pci_config,
+                           typeof_field(PCIDevice, config),
+                           PCI_CONFIG_SPACE_SIZE),
+        VMSTATE_INT32_ARRAY_V(irq_state, PCIDevice, PCI_NUM_PINS, 2),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+const VMStateDescription vmstate_pcie_device = {
+    .name = "PCIDevice",
+    .version_id = 2,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField []) {
+        VMSTATE_INT32_LE(version_id, PCIDevice),
+        VMSTATE_PCI_CONFIG(config, PCIDevice, 0, vmstate_info_pci_config,
+                           typeof_field(PCIDevice, config),
+                           PCIE_CONFIG_SPACE_SIZE),
         VMSTATE_INT32_ARRAY_V(irq_state, PCIDevice, PCI_NUM_PINS, 2),
         VMSTATE_END_OF_LIST()
     }
 };
 
+static const VMStateDescription *pci_get_vmstate(PCIDevice *s)
+{
+    if (pci_is_pcie(s)) {
+        return &vmstate_pcie_device;
+    }
+
+    return &vmstate_pci_device;
+}
+
 void pci_device_save(PCIDevice *s, QEMUFile *f)
 {
-    vmstate_save_state(f, &vmstate_pci_device, s);
+    vmstate_save_state(f, pci_get_vmstate(s), s);
 }
 
 int pci_device_load(PCIDevice *s, QEMUFile *f)
 {
-    return vmstate_load_state(f, &vmstate_pci_device, s, s->version_id);
+    return vmstate_load_state(f, pci_get_vmstate(s), s, s->version_id);
 }
 
 static int pci_set_default_subsystem_id(PCIDevice *pci_dev)
@@ -341,14 +383,31 @@  static void pci_init_cmask(PCIDevice *dev)
 static void pci_init_wmask(PCIDevice *dev)
 {
     int i;
+    uint32_t config_size = pcie_config_size(dev);
+
     dev->wmask[PCI_CACHE_LINE_SIZE] = 0xff;
     dev->wmask[PCI_INTERRUPT_LINE] = 0xff;
     dev->wmask[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY
                               | PCI_COMMAND_MASTER;
-    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
+    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
         dev->wmask[i] = 0xff;
 }
 
+static void pci_config_alloc(PCIDevice *pci_dev)
+{
+    int config_size = pcie_config_size(pci_dev);
+#define PCI_CONFIG_ALLOC(d, member, size)                               \
+    do {                                                                \
+        (d)->member =                                                   \
+            (typeof((d)->member))qemu_mallocz(sizeof((d)->member[0]) *  \
+                                              size);                    \
+    } while (0)
+    PCI_CONFIG_ALLOC(pci_dev, config, config_size);
+    PCI_CONFIG_ALLOC(pci_dev, cmask, config_size);
+    PCI_CONFIG_ALLOC(pci_dev, wmask, config_size);
+    PCI_CONFIG_ALLOC(pci_dev, used, config_size);
+}
+
 /* -1 for devfn means auto assign */
 static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
                                          const char *name, int devfn,
@@ -369,6 +428,7 @@  static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
     pci_dev->devfn = devfn;
     pstrcpy(pci_dev->name, sizeof(pci_dev->name), name);
     memset(pci_dev->irq_state, 0, sizeof(pci_dev->irq_state));
+    pci_config_alloc(pci_dev);
     pci_set_default_subsystem_id(pci_dev);
     pci_init_cmask(pci_dev);
     pci_init_wmask(pci_dev);
@@ -586,40 +646,48 @@  static void pci_update_mappings(PCIDevice *d)
     }
 }
 
+static uint8_t pcie_config_get_byte(PCIDevice *d, uint32_t addr)
+{
+    uint8_t *conf = &d->config[addr];
+    if (conf != NULL)
+        return *conf;
+    return 0;
+}
+
+static uint32_t pcie_config_get(PCIDevice *d, uint32_t addr, int len)
+{
+    int i;
+    union {
+        uint8_t val8[4];
+        uint32_t val32;
+    } v = { .val32 = 0 };
+
+    for (i = 0; i < len; i++) {
+        v.val8[i] = pcie_config_get_byte(d, addr + i);
+    }
+
+    return le32_to_cpu(v.val32);
+}
+
 uint32_t pci_default_read_config(PCIDevice *d,
                                  uint32_t address, int len)
 {
-    uint32_t val;
+    uint32_t config_size = pcie_config_size(d);
 
-    switch(len) {
-    default:
-    case 4:
-	if (address <= 0xfc) {
-            val = pci_get_long(d->config + address);
-	    break;
-	}
-	/* fall through */
-    case 2:
-        if (address <= 0xfe) {
-            val = pci_get_word(d->config + address);
-	    break;
-	}
-	/* fall through */
-    case 1:
-        val = pci_get_byte(d->config + address);
-        break;
-    }
-    return val;
+    assert(len == 1 || len == 2 || len == 4);
+    len = MIN(len, config_size - MIN(config_size, address));
+    return pcie_config_get(d, address, len);
 }
 
 void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
 {
     uint8_t orig[PCI_CONFIG_SPACE_SIZE];
     int i;
+    uint32_t config_size = pcie_config_size(d);
 
     /* not efficient, but simple */
     memcpy(orig, d->config, PCI_CONFIG_SPACE_SIZE);
-    for(i = 0; i < l && addr < PCI_CONFIG_SPACE_SIZE; val >>= 8, ++i, ++addr) {
+    for(i = 0; i < l && addr < config_size; val >>= 8, ++i, ++addr) {
         uint8_t wmask = d->wmask[addr];
         d->config[addr] = (d->config[addr] & ~wmask) | (val & wmask);
     }
@@ -703,6 +771,128 @@  uint32_t pci_data_read(void *opaque, uint32_t addr, int len)
     return pci_data_read_common(pci_addr_to_dev(s, addr),
                                 pci_addr_to_config(addr), len);
 }
+
+#define PCIE_MASK(val, hi_bit, low_bit)                 \
+    (((val) & (((1ULL << (hi_bit)) - 1))) >> (low_bit))
+#define PCIE_VAL(VAL, val)                                              \
+    PCIE_MASK((val), PCIE_MMCFG_ ## VAL ## _HI, PCIE_MMCFG_ ## VAL ## _LOW)
+#define PCIE_MMCFG_BUS_HI               28
+#define PCIE_MMCFG_BUS_LOW              20
+#define PCIE_MMCFG_DEV_HI               19
+#define PCIE_MMCFG_DEV_LOW              15
+#define PCIE_MMCFG_FUNC_HI              14
+#define PCIE_MMCFG_FUNC_LOW             12
+#define PCIE_MMCFG_CONFADDR_HI          11
+#define PCIE_MMCFG_CONFADDR_LOW         0
+#define PCIE_MMCFG_BUS(addr)            PCIE_VAL(BUS, (addr))
+#define PCIE_MMCFG_DEV(addr)            PCIE_VAL(DEV, (addr))
+#define PCIE_MMCFG_FUNC(addr)           PCIE_VAL(FUNC, (addr))
+#define PCIE_MMCFG_CONFADDR(addr)       PCIE_VAL(CONFADDR, (addr))
+
+void pcie_data_write(void *opaque, uint32_t addr, uint32_t val, int len)
+{
+    PCIBus *s = opaque;
+    pci_data_write_common(pci_find_device(s, PCIE_MMCFG_BUS(addr),
+                                          PCIE_MMCFG_DEV(addr),
+                                          PCIE_MMCFG_FUNC(addr)),
+                          PCIE_MMCFG_CONFADDR(addr),
+                          val, len);
+}
+
+uint32_t pcie_data_read(void *opaque, uint32_t addr, int len)
+{
+    PCIBus *s = opaque;
+    return pci_data_read_common(pci_find_device(s, PCIE_MMCFG_BUS(addr),
+                                                PCIE_MMCFG_DEV(addr),
+                                                PCIE_MMCFG_FUNC(addr)),
+                                PCIE_MMCFG_CONFADDR(addr),
+                                len);
+}
+
+#define DEFINE_PCIE_HOST_DATA_READ(len)                         \
+    static uint32_t pcie_host_data_read_ ## len (               \
+        void *opaque, target_phys_addr_t addr)                  \
+    {                                                           \
+        PCIExpressHost *e = (PCIExpressHost *)opaque;           \
+        return pcie_data_read(e->pci.bus,                       \
+                              addr - e->base_addr, (len));      \
+    }
+
+#define DEFINE_PCIE_HOST_DATA_WRITE(len)                        \
+    static void pcie_host_data_write_ ## len (                  \
+        void *opaque, target_phys_addr_t addr, uint32_t value)  \
+    {                                                           \
+        PCIExpressHost *e = (PCIExpressHost *)opaque;           \
+        pcie_data_write(e->pci.bus,                             \
+                        addr - e->base_addr, value, (len));     \
+    }
+
+#define DEFINE_PCIE_HOST_DATA_MMIO(len)      \
+        DEFINE_PCIE_HOST_DATA_READ(len)      \
+        DEFINE_PCIE_HOST_DATA_WRITE(len)
+
+DEFINE_PCIE_HOST_DATA_MMIO(1)
+DEFINE_PCIE_HOST_DATA_MMIO(2)
+DEFINE_PCIE_HOST_DATA_MMIO(4)
+
+#define DEFINE_PCIE_MEMORY_FUNCS(Type, type)                            \
+    static CPU ## Type ## MemoryFunc *pcie_host_data_ ## type [] =      \
+    {                                                                   \
+        &pcie_host_data_ ## type ## _1,                                 \
+        &pcie_host_data_ ## type ## _2,                                 \
+        &pcie_host_data_ ## type ## _4,                                 \
+    };
+
+DEFINE_PCIE_MEMORY_FUNCS(Read, read)
+DEFINE_PCIE_MEMORY_FUNCS(Write, write)
+
+int pcie_host_init(PCIExpressHost *e,
+                   CPUReadMemoryFunc **mmcfg_read,
+                   CPUWriteMemoryFunc **mmcfg_write)
+{
+    e->base_addr = PCIE_BASE_ADDR_INVALID;
+
+    if (mmcfg_read == NULL)
+        mmcfg_read = pcie_host_data_read;
+    if (mmcfg_write == NULL)
+        mmcfg_write = pcie_host_data_write;
+    e->mmio_index = cpu_register_io_memory(mmcfg_read, mmcfg_write, e);
+    if (e->mmio_index < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+void pcie_host_mmcfg_unmap(PCIExpressHost *e)
+{
+    if (e->base_addr != PCIE_BASE_ADDR_INVALID) {
+        cpu_register_physical_memory(e->base_addr, e->size, IO_MEM_UNASSIGNED);
+    }
+}
+
+void pcie_host_mmcfg_map(PCIExpressHost *e,
+                         target_phys_addr_t addr, uint32_t size)
+{
+    assert((size & (size - 1)) == 0); /* power of 2 */
+    assert(size >= (1ULL << PCIE_MMCFG_BUS_LOW));
+    assert(size <= (1ULL << PCIE_MMCFG_BUS_HI));
+
+    e->base_addr = addr;
+    e->size = size;
+    cpu_register_physical_memory(e->base_addr, e->size, e->mmio_index);
+}
+
+void pcie_host_mmcfg_update(PCIExpressHost *e,
+                            int enable,
+                            target_phys_addr_t addr, uint32_t size)
+{
+    pcie_host_mmcfg_unmap(e);
+    if (enable) {
+        pcie_host_mmcfg_map(e, addr, size);
+    }
+}
+
 /***********************************************************/
 /* generic PCI irq support */
 
@@ -1052,9 +1242,10 @@  PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
 
 static int pci_find_space(PCIDevice *pdev, uint8_t size)
 {
+    int config_size = pcie_config_size(pdev);
     int offset = PCI_CONFIG_HEADER_SIZE;
     int i;
-    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
+    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
         if (pdev->used[i])
             offset = i + 1;
         else if (i - offset + 1 == size)
diff --git a/hw/pci.h b/hw/pci.h
index 00f2b78..1f402d2 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -175,20 +175,26 @@  enum {
     QEMU_PCI_CAP_MSIX = 0x1,
 };
 
+/* Size of the standart PCIe config space: 4KB */
+#define PCIE_CONFIG_SPACE_SIZE  0x1000
+#define PCIE_EXT_CONFIG_SPACE_SIZE                      \
+    (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE)
+
 struct PCIDevice {
     DeviceState qdev;
+
     /* PCI config space */
-    uint8_t config[PCI_CONFIG_SPACE_SIZE];
+    uint8_t *config;
 
     /* Used to enable config checks on load. Note that writeable bits are
      * never checked even if set in cmask. */
-    uint8_t cmask[PCI_CONFIG_SPACE_SIZE];
+    uint8_t *cmask;
 
     /* Used to implement R/W bytes */
-    uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
+    uint8_t *wmask;
 
     /* Used to allocate config space for capabilities. */
-    uint8_t used[PCI_CONFIG_SPACE_SIZE];
+    uint8_t *used;
 
     /* the following fields are read only */
     PCIBus *bus;
@@ -270,6 +276,8 @@  PCIDevice *pci_nic_init(NICInfo *nd, const char *default_model,
                         const char *default_devaddr);
 void pci_data_write(void *opaque, uint32_t addr, uint32_t val, int len);
 uint32_t pci_data_read(void *opaque, uint32_t addr, int len);
+void pcie_data_write(void *opaque, uint32_t addr, uint32_t val, int len);
+uint32_t pcie_data_read(void *opaque, uint32_t addr, int len);
 int pci_bus_num(PCIBus *s);
 void pci_for_each_device(PCIBus *bus, int bus_num, void (*fn)(PCIBus *bus, PCIDevice *d));
 PCIBus *pci_find_host_bus(int domain);
@@ -356,6 +364,9 @@  typedef struct {
     pci_qdev_initfn init;
     PCIConfigReadFunc *config_read;
     PCIConfigWriteFunc *config_write;
+
+    /* pcie stuff */
+    int pcie;
 } PCIDeviceInfo;
 
 void pci_qdev_register(PCIDeviceInfo *info);
@@ -365,6 +376,23 @@  PCIDevice *pci_create(const char *name, const char *devaddr);
 PCIDevice *pci_create_noinit(PCIBus *bus, int devfn, const char *name);
 PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name);
 
+static inline int pci_is_pcie(PCIDevice *d)
+{
+    /*
+     * At the moment, all the pci devices aren't qdevfied. So
+     * d->qdev.info might be NULL.
+     * Given that pcie device emulator hasn't exist, we conclude that
+     * such a device isn't pcie.
+     */
+    return d->qdev.info != NULL &&
+        container_of(d->qdev.info, PCIDeviceInfo, qdev)->pcie;
+}
+
+static inline uint32_t pcie_config_size(PCIDevice *d)
+{
+    return pci_is_pcie(d)? PCIE_CONFIG_SPACE_SIZE: PCI_CONFIG_SPACE_SIZE;
+}
+
 /* lsi53c895a.c */
 #define LSI_MAX_DEVS 7
 
diff --git a/hw/pci_host.h b/hw/pci_host.h
index ea98ed2..59147cf 100644
--- a/hw/pci_host.h
+++ b/hw/pci_host.h
@@ -48,4 +48,26 @@  uint32_t pci_host_data_readb_ioport(void* opaque, uint32_t addr);
 uint32_t pci_host_data_readw_ioport(void* opaque, uint32_t addr);
 uint32_t pci_host_data_readl_ioport(void* opaque, uint32_t addr);
 
+typedef struct {
+    PCIHostState pci;
+
+    /* express part */
+    target_phys_addr_t  base_addr;
+#define PCIE_BASE_ADDR_INVALID  ((target_phys_addr_t)-1ULL)
+    target_phys_addr_t  size;
+    int bus_num_order;
+    int mmio_index;
+} PCIExpressHost;
+
+int pcie_host_init(PCIExpressHost *e,
+                   CPUReadMemoryFunc **mmcfg_read,
+                   CPUWriteMemoryFunc **mmcfg_write);
+
+void pcie_host_mmcfg_unmap(PCIExpressHost *e);
+void pcie_host_mmcfg_map(PCIExpressHost *e,
+                         target_phys_addr_t addr, uint32_t size);
+void pcie_host_mmcfg_update(PCIExpressHost *e,
+                            int enable,
+                            target_phys_addr_t addr, uint32_t size);
+
 #endif /* PCI_HOST_H */