Patchwork [V3,08/10] Introduce Xen PCI Passthrough, PCI config space helpers (2/3)

login
register
mail settings
Submitter Anthony PERARD
Date Oct. 28, 2011, 3:07 p.m.
Message ID <1319814456-8158-9-git-send-email-anthony.perard@citrix.com>
Download mbox | patch
Permalink /patch/122434/
State New
Headers show

Comments

Anthony PERARD - Oct. 28, 2011, 3:07 p.m.
From: Allen Kay <allen.m.kay@intel.com>

Signed-off-by: Allen Kay <allen.m.kay@intel.com>
Signed-off-by: Guy Zana <guy@neocleus.com>
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 Makefile.target                      |    1 +
 hw/xen_pci_passthrough.h             |    2 +
 hw/xen_pci_passthrough_config_init.c | 2068 ++++++++++++++++++++++++++++++++++
 3 files changed, 2071 insertions(+), 0 deletions(-)
 create mode 100644 hw/xen_pci_passthrough_config_init.c
Stefano Stabellini - Nov. 8, 2011, 12:57 p.m.
Obviously passthrough cannot work without this patch, but qemu should be
able to compile anyway. Please add to the previous patch empty stub
implementations for all the exported functions that you are going to
implement here.

I see that the timer is allocated here.
In that case it would make sense to move the timer update to this patch.

On Fri, 28 Oct 2011, Anthony PERARD wrote:
> From: Allen Kay <allen.m.kay@intel.com>
> 
> Signed-off-by: Allen Kay <allen.m.kay@intel.com>
> Signed-off-by: Guy Zana <guy@neocleus.com>
> Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
> ---
>  Makefile.target                      |    1 +
>  hw/xen_pci_passthrough.h             |    2 +
>  hw/xen_pci_passthrough_config_init.c | 2068 ++++++++++++++++++++++++++++++++++
>  3 files changed, 2071 insertions(+), 0 deletions(-)
>  create mode 100644 hw/xen_pci_passthrough_config_init.c
> 
> diff --git a/Makefile.target b/Makefile.target
> index 36ea47d..c32c688 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -219,6 +219,7 @@ obj-i386-$(CONFIG_XEN) += xen_platform.o
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_helpers.o
> +obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_config_init.o
> 
>  # Inter-VM PCI shared memory
>  CONFIG_IVSHMEM =
> diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
> index 2d1979d..ebc04fd 100644
> --- a/hw/xen_pci_passthrough.h
> +++ b/hw/xen_pci_passthrough.h
> @@ -61,6 +61,8 @@ typedef int (*conf_byte_restore)
>  /* power state transition */
>  #define PT_FLAG_TRANSITING 0x0001
> 
> +#define PT_BAR_ALLF        0xFFFFFFFF  /* BAR ALLF value */
> +
> 
>  typedef enum {
>      GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
> diff --git a/hw/xen_pci_passthrough_config_init.c b/hw/xen_pci_passthrough_config_init.c
> new file mode 100644
> index 0000000..4103b59
> --- /dev/null
> +++ b/hw/xen_pci_passthrough_config_init.c
> @@ -0,0 +1,2068 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + * Copyright (c) 2007, Intel Corporation.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Alex Novik <alex@neocleus.com>
> + * Allen Kay <allen.m.kay@intel.com>
> + * Guy Zana <guy@neocleus.com>
> + *
> + * This file implements direct PCI assignment to a HVM guest
> + */
> +
> +#include "qemu-timer.h"
> +#include "xen_backend.h"
> +#include "xen_pci_passthrough.h"
> +
> +#define PT_MERGE_VALUE(value, data, val_mask) \
> +    (((value) & (val_mask)) | ((data) & ~(val_mask)))
> +
> +#define PT_INVALID_REG          0xFFFFFFFF      /* invalid register value */
> +
> +/* prototype */
> +
> +static uint32_t pt_ptr_reg_init(XenPCIPassthroughState *s, XenPTRegInfo *reg,
> +                                uint32_t real_offset);
> +static int pt_init_pci_config(XenPCIPassthroughState *s);
> +
> +
> +/* helper */
> +
> +/* A return value of 1 means the capability should NOT be exposed to guest. */
> +static int pt_hide_dev_cap(const HostPCIDevice *d, uint8_t grp_id)
> +{
> +    switch (grp_id) {
> +    case PCI_CAP_ID_EXP:
> +        /* The PCI Express Capability Structure of the VF of Intel 82599 10GbE
> +         * Controller looks trivial, e.g., the PCI Express Capabilities
> +         * Register is 0. We should not try to expose it to guest.
> +         */
> +        if (d->vendor_id == PCI_VENDOR_ID_INTEL &&
> +                d->device_id == PCI_DEVICE_ID_INTEL_82599_VF) {
> +            return 1;
> +        }
> +        break;
> +    }
> +    return 0;
> +}
> +
> +/*   find emulate register group entry */
> +XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address)
> +{
> +    XenPTRegGroup *entry = NULL;
> +
> +    /* find register group entry */
> +    QLIST_FOREACH(entry, &s->reg_grp_tbl, entries) {
> +        /* check address */
> +        if ((entry->base_offset <= address)
> +            && ((entry->base_offset + entry->size) > address)) {
> +            return entry;
> +        }
> +    }
> +
> +    /* group entry not found */
> +    return NULL;
> +}
> +
> +/* find emulate register entry */
> +XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address)
> +{
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegInfo *reg = NULL;
> +    uint32_t real_offset = 0;
> +
> +    /* find register entry */
> +    QLIST_FOREACH(reg_entry, &reg_grp->reg_tbl_list, entries) {
> +        reg = reg_entry->reg;
> +        real_offset = reg_grp->base_offset + reg->offset;
> +        /* check address */
> +        if ((real_offset <= address)
> +            && ((real_offset + reg->size) > address)) {
> +            return reg_entry;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +/* parse BAR */
> +static PTBarFlag pt_bar_reg_parse(XenPCIPassthroughState *s, XenPTRegInfo *reg)
> +{
> +    PCIDevice *d = &s->dev;
> +    XenPTRegion *region = NULL;
> +    PCIIORegion *r;
> +    int index = 0;
> +
> +    /* check 64bit BAR */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if ((0 < index) && (index < PCI_ROM_SLOT)) {
> +        int flags = s->real_device->io_regions[index - 1].flags;
> +
> +        if ((flags & IORESOURCE_MEM) && (flags & IORESOURCE_MEM_64)) {
> +            region = &s->bases[index - 1];
> +            if (region->bar_flag != PT_BAR_FLAG_UPPER) {
> +                return PT_BAR_FLAG_UPPER;
> +            }
> +        }
> +    }
> +
> +    /* check unused BAR */
> +    r = &d->io_regions[index];
> +    if (r->size == 0) {
> +        return PT_BAR_FLAG_UNUSED;
> +    }
> +
> +    /* for ExpROM BAR */
> +    if (index == PCI_ROM_SLOT) {
> +        return PT_BAR_FLAG_MEM;
> +    }
> +
> +    /* check BAR I/O indicator */
> +    if (s->real_device->io_regions[index].flags & IORESOURCE_IO) {
> +        return PT_BAR_FLAG_IO;
> +    } else {
> +        return PT_BAR_FLAG_MEM;
> +    }
> +}
> +
> +
> +/****************
> + * general register functions
> + */
> +
> +/* register initialization function */
> +
> +static uint32_t pt_common_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    return reg->init_val;
> +}
> +
> +/* Read register functions */
> +
> +static int pt_byte_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint8_t *value, uint8_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint8_t valid_emu_mask = 0;
> +
> +    /* emulate byte register */
> +    valid_emu_mask = reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +    return 0;
> +}
> +static int pt_word_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint16_t *value, uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t valid_emu_mask = 0;
> +
> +    /* emulate word register */
> +    valid_emu_mask = reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +    return 0;
> +}
> +static int pt_long_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint32_t *value, uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t valid_emu_mask = 0;
> +
> +    /* emulate long register */
> +    valid_emu_mask = reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +   return 0;
> +}
> +
> +/* Write register functions */
> +
> +static int pt_byte_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                             uint8_t *value, uint8_t dev_value,
> +                             uint8_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint8_t writable_mask = 0;
> +    uint8_t throughable_mask = 0;
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    return 0;
> +}
> +static int pt_word_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                             uint16_t *value, uint16_t dev_value,
> +                             uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t writable_mask = 0;
> +    uint16_t throughable_mask = 0;
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    return 0;
> +}
> +static int pt_long_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                             uint32_t *value, uint32_t dev_value,
> +                             uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t writable_mask = 0;
> +    uint32_t throughable_mask = 0;
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    return 0;
> +}
> +
> +/* common restore register fonctions */
> +static int pt_byte_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                               uint32_t real_offset, uint8_t dev_value,
> +                               uint8_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    PCIDevice *d = &s->dev;
> +
> +    /* use I/O device register's value as restore value */
> +    *value = pci_get_byte(d->config + real_offset);
> +
> +    /* create value for restoring to I/O device register */
> +    *value = PT_MERGE_VALUE(*value, dev_value, reg->emu_mask);
> +
> +    return 0;
> +}
> +static int pt_word_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                               uint32_t real_offset, uint16_t dev_value,
> +                               uint16_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    PCIDevice *d = &s->dev;
> +
> +    /* use I/O device register's value as restore value */
> +    *value = pci_get_word(d->config + real_offset);
> +
> +    /* create value for restoring to I/O device register */
> +    *value = PT_MERGE_VALUE(*value, dev_value, reg->emu_mask);
> +
> +    return 0;
> +}
> +
> +
> +/* XenPTRegInfo declaration
> + * - only for emulated register (either a part or whole bit).
> + * - for passthrough register that need special behavior (like interacting with
> + *   other component), set emu_mask to all 0 and specify r/w func properly.
> + * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
> + */
> +
> +/********************
> + * Header Type0
> + */
> +
> +static uint32_t pt_vendor_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    return s->real_device->vendor_id;
> +}
> +static uint32_t pt_device_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    return s->real_device->device_id;
> +}
> +static uint32_t pt_status_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    int reg_field = 0;
> +
> +    /* find Header register group */
> +    reg_grp_entry = pt_find_reg_grp(s, PCI_CAPABILITY_LIST);
> +    if (reg_grp_entry) {
> +        /* find Capabilities Pointer register */
> +        reg_entry = pt_find_reg(reg_grp_entry, PCI_CAPABILITY_LIST);
> +        if (reg_entry) {
> +            /* check Capabilities Pointer register */
> +            if (reg_entry->data) {
> +                reg_field |= PCI_STATUS_CAP_LIST;
> +            } else {
> +                reg_field &= ~PCI_STATUS_CAP_LIST;
> +            }
> +        } else {
> +            hw_error("Internal error: Couldn't find pt_reg_tbl for "
> +                     "Capabilities Pointer register. I/O emulator exit.\n");
> +        }
> +    } else {
> +        hw_error("Internal error: Couldn't find pt_reg_grp_tbl for Header. "
> +                 "I/O emulator exit.\n");
> +    }
> +
> +    return reg_field;
> +}
> +static uint32_t pt_header_type_reg_init(XenPCIPassthroughState *s,
> +                                        XenPTRegInfo *reg,
> +                                        uint32_t real_offset)
> +{
> +    /* read PCI_HEADER_TYPE */
> +    return reg->init_val | 0x80;
> +}
> +
> +/* initialize Interrupt Pin register */
> +static uint32_t pt_irqpin_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    return pci_read_intx(s);
> +}
> +
> +/* Command register */
> +static int pt_cmd_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                           uint16_t *value, uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t valid_emu_mask = 0;
> +    uint16_t emu_mask = reg->emu_mask;
> +
> +    if (s->is_virtfn) {
> +        emu_mask |= PCI_COMMAND_MEMORY;
> +    }
> +
> +    /* emulate word register */
> +    valid_emu_mask = emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +    return 0;
> +}
> +static int pt_cmd_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint16_t *value, uint16_t dev_value,
> +                            uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t writable_mask = 0;
> +    uint16_t throughable_mask = 0;
> +    uint16_t wr_value = *value;
> +    uint16_t emu_mask = reg->emu_mask;
> +
> +    if (s->is_virtfn) {
> +        emu_mask |= PCI_COMMAND_MEMORY;
> +    }
> +
> +    /* modify emulate register */
> +    writable_mask = ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~emu_mask & valid_mask;
> +
> +    if (*value & PCI_COMMAND_INTX_DISABLE) {
> +        throughable_mask |= PCI_COMMAND_INTX_DISABLE;
> +    } else {
> +        if (s->machine_irq) {
> +            throughable_mask |= PCI_COMMAND_INTX_DISABLE;
> +        }
> +    }
> +
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* mapping BAR */
> +    pt_bar_mapping(s, wr_value & PCI_COMMAND_IO,
> +                   wr_value & PCI_COMMAND_MEMORY);
> +
> +    return 0;
> +}
> +static int pt_cmd_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                              uint32_t real_offset, uint16_t dev_value,
> +                              uint16_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    PCIDevice *d = &s->dev;
> +    uint16_t restorable_mask = 0;
> +
> +    /* use I/O device register's value as restore value */
> +    *value = pci_get_word(d->config + real_offset);
> +
> +    /* create value for restoring to I/O device register
> +     * but do not include Fast Back-to-Back Enable bit.
> +     */
> +    restorable_mask = reg->emu_mask & ~PCI_COMMAND_FAST_BACK;
> +    *value = PT_MERGE_VALUE(*value, dev_value, restorable_mask);
> +
> +    if (!s->machine_irq) {
> +        *value |= PCI_COMMAND_INTX_DISABLE;
> +    } else {
> +        *value &= ~PCI_COMMAND_INTX_DISABLE;
> +    }
> +
> +    return 0;
> +}
> +
> +/* BAR */
> +#define PT_BAR_MEM_RO_MASK      0x0000000F      /* BAR ReadOnly mask(Memory) */
> +#define PT_BAR_MEM_EMU_MASK     0xFFFFFFF0      /* BAR emul mask(Memory) */
> +#define PT_BAR_IO_RO_MASK       0x00000003      /* BAR ReadOnly mask(I/O) */
> +#define PT_BAR_IO_EMU_MASK      0xFFFFFFFC      /* BAR emul mask(I/O) */
> +
> +static inline uint32_t base_address_with_flags(HostPCIIORegion *hr)
> +{
> +    if ((hr->flags & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
> +        return hr->base_addr | (hr->flags & ~PCI_BASE_ADDRESS_IO_MASK);
> +    } else {
> +        return hr->base_addr | (hr->flags & ~PCI_BASE_ADDRESS_MEM_MASK);
> +    }
> +}
> +
> +static uint32_t pt_bar_reg_init(XenPCIPassthroughState *s, XenPTRegInfo *reg,
> +                                uint32_t real_offset)
> +{
> +    int reg_field = 0;
> +    int index;
> +
> +    /* get BAR index */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if (index < 0) {
> +        hw_error("Internal error: Invalid BAR index[%d]. "
> +                 "I/O emulator exit.\n", index);
> +    }
> +
> +    /* set initial guest physical base address to -1 */
> +    s->bases[index].e_physbase = -1;
> +
> +    /* set BAR flag */
> +    s->bases[index].bar_flag = pt_bar_reg_parse(s, reg);
> +    if (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED) {
> +        reg_field = PT_INVALID_REG;
> +    }
> +
> +    return reg_field;
> +}
> +static int pt_bar_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                           uint32_t *value, uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t valid_emu_mask = 0;
> +    uint32_t bar_emu_mask = 0;
> +    int index;
> +
> +    /* get BAR index */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if (index < 0) {
> +        hw_error("Internal error: Invalid BAR index[%d]. "
> +                 "I/O emulator exit.\n", index);
> +    }
> +
> +    /* use fixed-up value from kernel sysfs */
> +    *value = base_address_with_flags(&s->real_device->io_regions[index]);
> +
> +    /* set emulate mask depend on BAR flag */
> +    switch (s->bases[index].bar_flag) {
> +    case PT_BAR_FLAG_MEM:
> +        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
> +        break;
> +    case PT_BAR_FLAG_IO:
> +        bar_emu_mask = PT_BAR_IO_EMU_MASK;
> +        break;
> +    case PT_BAR_FLAG_UPPER:
> +        bar_emu_mask = PT_BAR_ALLF;
> +        break;
> +    default:
> +        break;
> +    }
> +
> +    /* emulate BAR */
> +    valid_emu_mask = bar_emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +   return 0;
> +}
> +static int pt_bar_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint32_t *value, uint32_t dev_value,
> +                            uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegion *base = NULL;
> +    PCIDevice *d = &s->dev;
> +    PCIIORegion *r;
> +    uint32_t writable_mask = 0;
> +    uint32_t throughable_mask = 0;
> +    uint32_t bar_emu_mask = 0;
> +    uint32_t bar_ro_mask = 0;
> +    uint32_t new_addr, last_addr;
> +    uint32_t prev_offset;
> +    uint32_t r_size = 0;
> +    int index = 0;
> +
> +    /* get BAR index */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if (index < 0) {
> +        hw_error("Internal error: Invalid BAR index[%d]. "
> +                 "I/O emulator exit.\n", index);
> +    }
> +
> +    r = &d->io_regions[index];
> +    base = &s->bases[index];
> +    r_size = pt_get_emul_size(base->bar_flag, r->size);
> +
> +    /* set emulate mask and read-only mask depend on BAR flag */
> +    switch (s->bases[index].bar_flag) {
> +    case PT_BAR_FLAG_MEM:
> +        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
> +        bar_ro_mask = PT_BAR_MEM_RO_MASK | (r_size - 1);
> +        break;
> +    case PT_BAR_FLAG_IO:
> +        bar_emu_mask = PT_BAR_IO_EMU_MASK;
> +        bar_ro_mask = PT_BAR_IO_RO_MASK | (r_size - 1);
> +        break;
> +    case PT_BAR_FLAG_UPPER:
> +        bar_emu_mask = PT_BAR_ALLF;
> +        bar_ro_mask = 0;    /* all upper 32bit are R/W */
> +        break;
> +    default:
> +        break;
> +    }
> +
> +    /* modify emulate register */
> +    writable_mask = bar_emu_mask & ~bar_ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* check whether we need to update the virtual region address or not */
> +    switch (s->bases[index].bar_flag) {
> +    case PT_BAR_FLAG_MEM:
> +        /* nothing to do */
> +        break;
> +    case PT_BAR_FLAG_IO:
> +        new_addr = cfg_entry->data;
> +        last_addr = new_addr + r_size - 1;
> +        /* check invalid address */
> +        if (last_addr <= new_addr || !new_addr || last_addr >= 0x10000) {
> +            /* check 64K range */
> +            if ((last_addr >= 0x10000) &&
> +                (cfg_entry->data != (PT_BAR_ALLF & ~bar_ro_mask))) {
> +                PT_LOG("Warning: Guest attempt to set Base Address "
> +                       "over the 64KB. [%02x:%02x.%x][Offset:%02xh]"
> +                       "[Address:%08xh][Size:%08xh]\n",
> +                       pci_bus_num(d->bus), PCI_SLOT(d->devfn),
> +                       PCI_FUNC(d->devfn),
> +                       reg->offset, new_addr, r_size);
> +            }
> +            /* just remove mapping */
> +            r->addr = -1;
> +            goto exit;
> +        }
> +        break;
> +    case PT_BAR_FLAG_UPPER:
> +        if (cfg_entry->data) {
> +            if (cfg_entry->data != (PT_BAR_ALLF & ~bar_ro_mask)) {
> +                PT_LOG("Warning: Guest attempt to set high MMIO Base Address. "
> +                       "Ignore mapping. "
> +                       "[%02x:%02x.%x][Offset:%02xh][High Address:%08xh]\n",
> +                       pci_bus_num(d->bus), PCI_SLOT(d->devfn),
> +                       PCI_FUNC(d->devfn), reg->offset, cfg_entry->data);
> +            }
> +            /* clear lower address */
> +            d->io_regions[index-1].addr = -1;
> +        } else {
> +            /* find lower 32bit BAR */
> +            prev_offset = (reg->offset - 4);
> +            reg_grp_entry = pt_find_reg_grp(s, prev_offset);
> +            if (reg_grp_entry) {
> +                reg_entry = pt_find_reg(reg_grp_entry, prev_offset);
> +                if (reg_entry) {
> +                    /* restore lower address */
> +                    d->io_regions[index-1].addr = reg_entry->data;
> +                } else {
> +                    return -1;
> +                }
> +            } else {
> +                return -1;
> +            }
> +        }
> +
> +        /* never mapping the 'empty' upper region,
> +         * because we'll do it enough for the lower region.
> +         */
> +        r->addr = -1;
> +        goto exit;
> +    default:
> +        break;
> +    }
> +
> +    /* update the corresponding virtual region address */
> +    /*
> +     * When guest code tries to get block size of mmio, it will write all "1"s
> +     * into pci bar register. In this case, cfg_entry->data == writable_mask.
> +     * Especially for devices with large mmio, the value of writable_mask
> +     * is likely to be a guest physical address that has been mapped to ram
> +     * rather than mmio. Remapping this value to mmio should be prevented.
> +     */
> +
> +    if (cfg_entry->data != writable_mask) {
> +        r->addr = cfg_entry->data;
> +    }
> +
> +exit:
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~bar_emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* After BAR reg update, we need to remap BAR */
> +    reg_grp_entry = pt_find_reg_grp(s, PCI_COMMAND);
> +    if (reg_grp_entry) {
> +        reg_entry = pt_find_reg(reg_grp_entry, PCI_COMMAND);
> +        if (reg_entry) {
> +            pt_bar_mapping_one(s, index, reg_entry->data & PCI_COMMAND_IO,
> +                               reg_entry->data & PCI_COMMAND_MEMORY);
> +        }
> +    }
> +
> +    return 0;
> +}
> +static int pt_bar_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                              uint32_t real_offset, uint32_t dev_value,
> +                              uint32_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t bar_emu_mask = 0;
> +    int index = 0;
> +
> +    /* get BAR index */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if (index < 0) {
> +        hw_error("Internal error: Invalid BAR index[%d]. "
> +                 "I/O emulator exit.\n", index);
> +    }
> +
> +    /* use value from kernel sysfs */
> +    if (s->bases[index].bar_flag == PT_BAR_FLAG_UPPER) {
> +        *value = s->real_device->io_regions[index - 1].base_addr >> 32;
> +    } else {
> +        *value = base_address_with_flags(&s->real_device->io_regions[index]);
> +    }
> +
> +    /* set emulate mask depend on BAR flag */
> +    switch (s->bases[index].bar_flag) {
> +    case PT_BAR_FLAG_MEM:
> +        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
> +        break;
> +    case PT_BAR_FLAG_IO:
> +        bar_emu_mask = PT_BAR_IO_EMU_MASK;
> +        break;
> +    case PT_BAR_FLAG_UPPER:
> +        bar_emu_mask = PT_BAR_ALLF;
> +        break;
> +    default:
> +        break;
> +    }
> +
> +    /* create value for restoring to I/O device register */
> +    *value = PT_MERGE_VALUE(*value, dev_value, bar_emu_mask);
> +
> +    return 0;
> +}
> +
> +/* write Exp ROM BAR */
> +static int pt_exp_rom_bar_reg_write(XenPCIPassthroughState *s,
> +                                    XenPTReg *cfg_entry, uint32_t *value,
> +                                    uint32_t dev_value, uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegion *base = NULL;
> +    PCIDevice *d = (PCIDevice *)&s->dev;
> +    PCIIORegion *r;
> +    uint32_t writable_mask = 0;
> +    uint32_t throughable_mask = 0;
> +    pcibus_t r_size = 0;
> +    uint32_t bar_emu_mask = 0;
> +    uint32_t bar_ro_mask = 0;
> +
> +    r = &d->io_regions[PCI_ROM_SLOT];
> +    r_size = r->size;
> +    base = &s->bases[PCI_ROM_SLOT];
> +    /* align memory type resource size */
> +    pt_get_emul_size(base->bar_flag, r_size);
> +
> +    /* set emulate mask and read-only mask */
> +    bar_emu_mask = reg->emu_mask;
> +    bar_ro_mask = (reg->ro_mask | (r_size - 1)) & ~PCI_ROM_ADDRESS_ENABLE;
> +
> +    /* modify emulate register */
> +    writable_mask = ~bar_ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* update the corresponding virtual region address */
> +    /*
> +     * When guest code tries to get block size of mmio, it will write all "1"s
> +     * into pci bar register. In this case, cfg_entry->data == writable_mask.
> +     * Especially for devices with large mmio, the value of writable_mask
> +     * is likely to be a guest physical address that has been mapped to ram
> +     * rather than mmio. Remapping this value to mmio should be prevented.
> +     */
> +
> +    if (cfg_entry->data != writable_mask) {
> +        r->addr = cfg_entry->data;
> +    }
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~bar_emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* After BAR reg update, we need to remap BAR*/
> +    reg_grp_entry = pt_find_reg_grp(s, PCI_COMMAND);
> +    if (reg_grp_entry) {
> +        reg_entry = pt_find_reg(reg_grp_entry, PCI_COMMAND);
> +        if (reg_entry) {
> +            pt_bar_mapping_one(s, PCI_ROM_SLOT,
> +                               reg_entry->data & PCI_COMMAND_IO,
> +                               reg_entry->data & PCI_COMMAND_MEMORY);
> +        }
> +    }
> +
> +    return 0;
> +}
> +/* restore ROM BAR */
> +static int pt_exp_rom_bar_reg_restore(XenPCIPassthroughState *s,
> +                                      XenPTReg *cfg_entry,
> +                                      uint32_t real_offset,
> +                                      uint32_t dev_value, uint32_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +
> +    /* use value from kernel sysfs */
> +    *value =
> +        PT_MERGE_VALUE(host_pci_get_long(s->real_device, PCI_ROM_ADDRESS),
> +                       dev_value, reg->emu_mask);
> +    return 0;
> +}
> +
> +/* Header Type0 reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_header0_tbl[] = {
> +    /* Vendor ID reg */
> +    {
> +        .offset     = PCI_VENDOR_ID,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFFFF,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_vendor_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* Device ID reg */
> +    {
> +        .offset     = PCI_DEVICE_ID,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFFFF,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_device_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* Command reg */
> +    {
> +        .offset     = PCI_COMMAND,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xF880,
> +        .emu_mask   = 0x0740,
> +        .init       = pt_common_reg_init,
> +        .u.w.read   = pt_cmd_reg_read,
> +        .u.w.write  = pt_cmd_reg_write,
> +        .u.w.restore  = pt_cmd_reg_restore,
> +    },
> +    /* Capabilities Pointer reg */
> +    {
> +        .offset     = PCI_CAPABILITY_LIST,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Status reg */
> +    /* use emulated Cap Ptr value to initialize,
> +     * so need to be declared after Cap Ptr reg
> +     */
> +    {
> +        .offset     = PCI_STATUS,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0x06FF,
> +        .emu_mask   = 0x0010,
> +        .init       = pt_status_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* Cache Line Size reg */
> +    {
> +        .offset     = PCI_CACHE_LINE_SIZE,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0x00,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_common_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = pt_byte_reg_restore,
> +    },
> +    /* Latency Timer reg */
> +    {
> +        .offset     = PCI_LATENCY_TIMER,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0x00,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_common_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = pt_byte_reg_restore,
> +    },
> +    /* Header Type reg */
> +    {
> +        .offset     = PCI_HEADER_TYPE,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0x00,
> +        .init       = pt_header_type_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Interrupt Line reg */
> +    {
> +        .offset     = PCI_INTERRUPT_LINE,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0x00,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_common_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Interrupt Pin reg */
> +    {
> +        .offset     = PCI_INTERRUPT_PIN,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_irqpin_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* BAR 0 reg */
> +    /* mask of BAR need to be decided later, depends on IO/MEM type */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_0,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 1 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_1,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 2 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_2,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 3 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_3,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 4 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_4,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 5 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_5,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* Expansion ROM BAR reg */
> +    {
> +        .offset     = PCI_ROM_ADDRESS,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .ro_mask    = 0x000007FE,
> +        .emu_mask   = 0xFFFFF800,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_long_reg_read,
> +        .u.dw.write = pt_exp_rom_bar_reg_write,
> +        .u.dw.restore = pt_exp_rom_bar_reg_restore,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/*********************************
> + * Vital Product Data Capability
> + */
> +
> +/* Vital Product Data Capability Structure reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_vpd_tbl[] = {
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/**************************************
> + * Vendor Specific Capability
> + */
> +
> +/* Vendor Specific Capability Structure reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_vendor_tbl[] = {
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/*****************************
> + * PCI Express Capability
> + */
> +
> +/* initialize Link Control register */
> +static uint32_t pt_linkctrl_reg_init(XenPCIPassthroughState *s,
> +                                     XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    uint8_t cap_ver = 0;
> +    uint8_t dev_type = 0;
> +
> +    /* TODO maybe better to use fonction from hw/pcie.c */
> +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> +                           + PCI_EXP_FLAGS)
> +        & PCI_EXP_FLAGS_VERS;
> +    dev_type = (pci_get_byte(s->dev.config + real_offset - reg->offset
> +                             + PCI_EXP_FLAGS)
> +                & PCI_EXP_FLAGS_TYPE) >> 4;
> +
> +    /* no need to initialize in case of Root Complex Integrated Endpoint
> +     * with cap_ver 1.x
> +     */
> +    if ((dev_type == PCI_EXP_TYPE_RC_END) && (cap_ver == 1)) {
> +        return PT_INVALID_REG;
> +    }
> +
> +    return reg->init_val;
> +}
> +/* initialize Device Control 2 register */
> +static uint32_t pt_devctrl2_reg_init(XenPCIPassthroughState *s,
> +                                     XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    uint8_t cap_ver = 0;
> +
> +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> +                           + PCI_EXP_FLAGS)
> +        & PCI_EXP_FLAGS_VERS;
> +
> +    /* no need to initialize in case of cap_ver 1.x */
> +    if (cap_ver == 1) {
> +        return PT_INVALID_REG;
> +    }
> +
> +    return reg->init_val;
> +}
> +/* initialize Link Control 2 register */
> +static uint32_t pt_linkctrl2_reg_init(XenPCIPassthroughState *s,
> +                                      XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    int reg_field = 0;
> +    uint8_t cap_ver = 0;
> +
> +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> +                           + PCI_EXP_FLAGS)
> +        & PCI_EXP_FLAGS_VERS;
> +
> +    /* no need to initialize in case of cap_ver 1.x */
> +    if (cap_ver == 1) {
> +        return PT_INVALID_REG;
> +    }
> +
> +    /* set Supported Link Speed */
> +    reg_field |= PCI_EXP_LNKCAP_SLS &
> +        pci_get_byte(s->dev.config + real_offset - reg->offset
> +                     + PCI_EXP_LNKCAP);
> +
> +    return reg_field;
> +}
> +
> +/* PCI Express Capability Structure reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_pcie_tbl[] = {
> +    /* Next Pointer reg */
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Device Capabilities reg */
> +    {
> +        .offset     = PCI_EXP_DEVCAP,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .ro_mask    = 0x1FFCFFFF,
> +        .emu_mask   = 0x10000000,
> +        .init       = pt_common_reg_init,
> +        .u.dw.read  = pt_long_reg_read,
> +        .u.dw.write = pt_long_reg_write,
> +        .u.dw.restore = NULL,
> +    },
> +    /* Device Control reg */
> +    {
> +        .offset     = PCI_EXP_DEVCTL,
> +        .size       = 2,
> +        .init_val   = 0x2810,
> +        .ro_mask    = 0x8400,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_common_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = pt_word_reg_restore,
> +    },
> +    /* Link Control reg */
> +    {
> +        .offset     = PCI_EXP_LNKCTL,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFC34,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_linkctrl_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = pt_word_reg_restore,
> +    },
> +    /* Device Control 2 reg */
> +    {
> +        .offset     = 0x28,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFFE0,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_devctrl2_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = pt_word_reg_restore,
> +    },
> +    /* Link Control 2 reg */
> +    {
> +        .offset     = 0x30,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xE040,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_linkctrl2_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = pt_word_reg_restore,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/*********************************
> + * Power Management Capability
> + */
> +
> +/* initialize Power Management Capabilities register */
> +static uint32_t pt_pmc_reg_init(XenPCIPassthroughState *s,
> +                                XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    PCIDevice *d = &s->dev;
> +
> +    if (!s->power_mgmt) {
> +        return reg->init_val;
> +    }
> +
> +    /* set Power Management Capabilities register */
> +    s->pm_state->pmc_field = pci_get_word(d->config + real_offset);
> +
> +    return reg->init_val;
> +}
> +/* initialize PCI Power Management Control/Status register */
> +static uint32_t pt_pmcsr_reg_init(XenPCIPassthroughState *s,
> +                                  XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint16_t cap_ver  = 0;
> +
> +    if (!s->power_mgmt) {
> +        return reg->init_val;
> +    }
> +
> +    /* check PCI Power Management support version */
> +    cap_ver = s->pm_state->pmc_field & PCI_PM_CAP_VER_MASK;
> +
> +    if (cap_ver > 2) {
> +        /* set No Soft Reset */
> +        s->pm_state->no_soft_reset =
> +            pci_get_byte(d->config + real_offset) & PCI_PM_CTRL_NO_SOFT_RESET;
> +    }
> +
> +    /* wake up real physical device */
> +    switch (host_pci_get_word(s->real_device, real_offset)
> +            & PCI_PM_CTRL_STATE_MASK) {
> +    case 0:
> +        break;
> +    case 1:
> +        PT_LOG("Power state transition D1 -> D0active\n");
> +        host_pci_set_word(s->real_device, real_offset, 0);
> +        break;
> +    case 2:
> +        PT_LOG("Power state transition D2 -> D0active\n");
> +        host_pci_set_word(s->real_device, real_offset, 0);
> +        usleep(200);
> +        break;
> +    case 3:
> +        PT_LOG("Power state transition D3hot -> D0active\n");
> +        host_pci_set_word(s->real_device, real_offset, 0);
> +        usleep(10 * 1000);
> +        pt_init_pci_config(s);
> +        break;
> +    }
> +
> +    return reg->init_val;
> +}
> +/* read Power Management Control/Status register */
> +static int pt_pmcsr_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                             uint16_t *value, uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t valid_emu_mask = reg->emu_mask;
> +
> +    if (!s->power_mgmt) {
> +        valid_emu_mask |= PCI_PM_CTRL_STATE_MASK | PCI_PM_CTRL_NO_SOFT_RESET;
> +    }
> +
> +    valid_emu_mask = valid_emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +    return 0;
> +}
> +/* reset Interrupt and I/O resource  */
> +static void pt_reset_interrupt_and_io_mapping(XenPCIPassthroughState *s)
> +{
> +    PCIDevice *d = &s->dev;
> +    PCIIORegion *r;
> +    int i = 0;
> +    uint8_t e_device = 0;
> +    uint8_t e_intx = 0;
> +
> +    /* unbind INTx */
> +    e_device = PCI_SLOT(s->dev.devfn);
> +    e_intx = pci_intx(s);
> +
> +    if (s->machine_irq) {
> +        if (xc_domain_unbind_pt_irq(xen_xc, xen_domid, s->machine_irq,
> +                                    PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 0)) {
> +            PT_LOG("Error: Unbinding of interrupt failed!\n");
> +        }
> +    }
> +
> +    /* clear all virtual region address */
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        r = &d->io_regions[i];
> +        r->addr = -1;
> +    }
> +
> +    /* unmapping BAR */
> +    pt_bar_mapping(s, 0, 0);
> +}
> +/* check power state transition */
> +static int check_power_state(XenPCIPassthroughState *s)
> +{
> +    XenPTPM *pm_state = s->pm_state;
> +    PCIDevice *d = &s->dev;
> +    uint16_t read_val = 0;
> +    uint16_t cur_state = 0;
> +
> +    /* get current power state */
> +    read_val = host_pci_get_word(s->real_device,
> +                                 pm_state->pm_base + PCI_PM_CTRL);
> +    cur_state = read_val & PCI_PM_CTRL_STATE_MASK;
> +
> +    if (pm_state->req_state != cur_state) {
> +        PT_LOG("Error: Failed to change power state. "
> +               "[%02x:%02x.%x][requested state:%d][current state:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               pm_state->req_state, cur_state);
> +        return -1;
> +    }
> +    return 0;
> +}
> +/* write Power Management Control/Status register */
> +static void pt_from_d3hot_to_d0_with_reset(void *opaque)
> +{
> +    XenPCIPassthroughState *s = opaque;
> +    XenPTPM *pm_state = s->pm_state;
> +    int ret = 0;
> +
> +    /* check power state */
> +    ret = check_power_state(s);
> +
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    pt_init_pci_config(s);
> +
> +out:
> +    /* power state transition flags off */
> +    pm_state->flags &= ~PT_FLAG_TRANSITING;
> +
> +    qemu_free_timer(pm_state->pm_timer);
> +    pm_state->pm_timer = NULL;
> +}
> +static void pt_default_power_transition(void *opaque)
> +{
> +    XenPCIPassthroughState *ptdev = opaque;
> +    XenPTPM *pm_state = ptdev->pm_state;
> +
> +    /* check power state */
> +    check_power_state(ptdev);
> +
> +    /* power state transition flags off */
> +    pm_state->flags &= ~PT_FLAG_TRANSITING;
> +
> +    qemu_free_timer(pm_state->pm_timer);
> +    pm_state->pm_timer = NULL;
> +}
> +static int pt_pmcsr_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                              uint16_t *value, uint16_t dev_value,
> +                              uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    PCIDevice *d = &s->dev;
> +    uint16_t emu_mask = reg->emu_mask;
> +    uint16_t writable_mask = 0;
> +    uint16_t throughable_mask = 0;
> +    XenPTPM *pm_state = s->pm_state;
> +
> +    if (!s->power_mgmt) {
> +        emu_mask |= PCI_PM_CTRL_STATE_MASK | PCI_PM_CTRL_NO_SOFT_RESET;
> +    }
> +
> +    /* modify emulate register */
> +    writable_mask = emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    if (!s->power_mgmt) {
> +        return 0;
> +    }
> +
> +    /* set I/O device power state */
> +    pm_state->cur_state = dev_value & PCI_PM_CTRL_STATE_MASK;
> +
> +    /* set Guest requested PowerState */
> +    pm_state->req_state = *value & PCI_PM_CTRL_STATE_MASK;
> +
> +    /* check power state transition or not */
> +    if (pm_state->cur_state == pm_state->req_state) {
> +        /* not power state transition */
> +        return 0;
> +    }
> +
> +    /* check enable power state transition */
> +    if ((pm_state->req_state != 0) &&
> +        (pm_state->cur_state > pm_state->req_state)) {
> +        PT_LOG("Error: Invalid power transition. "
> +               "[%02x:%02x.%x][requested state:%d][current state:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               pm_state->req_state, pm_state->cur_state);
> +
> +        return 0;
> +    }
> +
> +    /* check if this device supports the requested power state */
> +    if (((pm_state->req_state == 1) && !(pm_state->pmc_field & PCI_PM_CAP_D1))
> +        || ((pm_state->req_state == 2) &&
> +            !(pm_state->pmc_field & PCI_PM_CAP_D2))) {
> +        PT_LOG("Error: Invalid power transition. "
> +               "[%02x:%02x.%x][requested state:%d][current state:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               pm_state->req_state, pm_state->cur_state);
> +
> +        return 0;
> +    }
> +
> +    /* in case of transition related to D3hot, it's necessary to wait 10 ms.
> +     * But because writing to register will be performed later on actually,
> +     * don't start QEMUTimer right now, just alloc and init QEMUTimer here.
> +     */
> +    if ((pm_state->cur_state == 3) || (pm_state->req_state == 3)) {
> +        if (pm_state->req_state == 0) {
> +            /* alloc and init QEMUTimer */
> +            if (!pm_state->no_soft_reset) {
> +                pm_state->pm_timer = qemu_new_timer_ms(rt_clock,
> +                    pt_from_d3hot_to_d0_with_reset, s);
> +
> +                /* reset Interrupt and I/O resource mapping */
> +                pt_reset_interrupt_and_io_mapping(s);
> +            } else {
> +                pm_state->pm_timer = qemu_new_timer_ms(rt_clock,
> +                                        pt_default_power_transition, s);
> +            }
> +        } else {
> +            /* alloc and init QEMUTimer */
> +            pm_state->pm_timer = qemu_new_timer_ms(rt_clock,
> +                pt_default_power_transition, s);
> +        }
> +
> +        /* set power state transition delay */
> +        pm_state->pm_delay = 10;
> +
> +        /* power state transition flags on */
> +        pm_state->flags |= PT_FLAG_TRANSITING;
> +    }
> +    /* in case of transition related to D0, D1 and D2,
> +     * no need to use QEMUTimer.
> +     * So, we perfom writing to register here and then read it back.
> +     */
> +    else {
> +        /* write power state to I/O device register */
> +        host_pci_set_word(s->real_device, pm_state->pm_base + PCI_PM_CTRL,
> +                          *value);
> +
> +        /* in case of transition related to D2,
> +         * it's necessary to wait 200 usec.
> +         * But because QEMUTimer do not support microsec unit right now,
> +         * so we do wait ourself here.
> +         */
> +        if ((pm_state->cur_state == 2) || (pm_state->req_state == 2)) {
> +            usleep(200);
> +        }
> +
> +        /* check power state */
> +        check_power_state(s);
> +
> +        /* recreate value for writing to I/O device register */
> +        *value = host_pci_get_word(s->real_device,
> +                                   pm_state->pm_base + PCI_PM_CTRL);
> +    }
> +
> +    return 0;
> +}
> +
> +/* restore Power Management Control/Status register */
> +static int pt_pmcsr_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                                uint32_t real_offset, uint16_t dev_value,
> +                                uint16_t *value)
> +{
> +    /* create value for restoring to I/O device register
> +     * No need to restore, just clear PME Enable and PME Status bit
> +     * Note: register type of PME Status bit is RW1C, so clear by writing 1b
> +     */
> +    *value = (dev_value & ~PCI_PM_CTRL_PME_ENABLE) | PCI_PM_CTRL_PME_STATUS;
> +
> +    return 0;
> +}
> +
> +
> +/* Power Management Capability reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_pm_tbl[] = {
> +    /* Next Pointer reg */
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Power Management Capabilities reg */
> +    {
> +        .offset     = PCI_CAP_FLAGS,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFFFF,
> +        .emu_mask   = 0xF9C8,
> +        .init       = pt_pmc_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* PCI Power Management Control/Status reg */
> +    {
> +        .offset     = PCI_PM_CTRL,
> +        .size       = 2,
> +        .init_val   = 0x0008,
> +        .ro_mask    = 0xE1FC,
> +        .emu_mask   = 0x8100,
> +        .init       = pt_pmcsr_reg_init,
> +        .u.w.read   = pt_pmcsr_reg_read,
> +        .u.w.write  = pt_pmcsr_reg_write,
> +        .u.w.restore  = pt_pmcsr_reg_restore,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/****************************
> + * Capabilities
> + */
> +
> +/* AER register operations */
> +
> +static void aer_save_one_register(XenPCIPassthroughState *s, int offset)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint32_t aer_base = s->pm_state->aer_base;
> +    uint32_t val = 0;
> +
> +    val = host_pci_get_long(s->real_device, aer_base + offset);
> +    pci_set_long(d->config + aer_base + offset, val);
> +}
> +static void pt_aer_reg_save(XenPCIPassthroughState *s)
> +{
> +    /* after reset, following register values should be restored.
> +     * So, save them.
> +     */
> +    aer_save_one_register(s, PCI_ERR_UNCOR_MASK);
> +    aer_save_one_register(s, PCI_ERR_UNCOR_SEVER);
> +    aer_save_one_register(s, PCI_ERR_COR_MASK);
> +    aer_save_one_register(s, PCI_ERR_CAP);
> +}
> +static void aer_restore_one_register(XenPCIPassthroughState *s, int offset)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint32_t aer_base = s->pm_state->aer_base;
> +    uint32_t config = 0;
> +
> +    config = pci_get_long(d->config + aer_base + offset);
> +    host_pci_set_long(s->real_device, aer_base + offset, config);
> +}
> +static void pt_aer_reg_restore(XenPCIPassthroughState *s)
> +{
> +    /* the following registers should be reconfigured to correct values
> +     * after reset. restore them.
> +     * other registers should not be reconfigured after reset
> +     * if there is no reason
> +     */
> +    aer_restore_one_register(s, PCI_ERR_UNCOR_MASK);
> +    aer_restore_one_register(s, PCI_ERR_UNCOR_SEVER);
> +    aer_restore_one_register(s, PCI_ERR_COR_MASK);
> +    aer_restore_one_register(s, PCI_ERR_CAP);
> +}
> +
> +/* capability structure register group size functions */
> +
> +static uint8_t pt_reg_grp_size_init(XenPCIPassthroughState *s,
> +                                    const XenPTRegGroupInfo *grp_reg,
> +                                    uint32_t base_offset)
> +{
> +    return grp_reg->grp_size;
> +}
> +/* get Power Management Capability Structure register group size */
> +static uint8_t pt_pm_size_init(XenPCIPassthroughState *s,
> +                               const XenPTRegGroupInfo *grp_reg,
> +                               uint32_t base_offset)
> +{
> +    if (!s->power_mgmt) {
> +        return grp_reg->grp_size;
> +    }
> +
> +    s->pm_state = g_malloc0(sizeof (XenPTPM));
> +
> +    /* set Power Management Capability base offset */
> +    s->pm_state->pm_base = base_offset;
> +
> +    /* find AER register and set AER Capability base offset */
> +    s->pm_state->aer_base = host_pci_find_ext_cap_offset(s->real_device,
> +                                                         PCI_EXT_CAP_ID_ERR);
> +
> +    /* save AER register */
> +    if (s->pm_state->aer_base) {
> +        pt_aer_reg_save(s);
> +    }
> +
> +    return grp_reg->grp_size;
> +}
> +/* get Vendor Specific Capability Structure register group size */
> +static uint8_t pt_vendor_size_init(XenPCIPassthroughState *s,
> +                                   const XenPTRegGroupInfo *grp_reg,
> +                                   uint32_t base_offset)
> +{
> +    return pci_get_byte(s->dev.config + base_offset + 0x02);
> +}
> +/* get PCI Express Capability Structure register group size */
> +static uint8_t pt_pcie_size_init(XenPCIPassthroughState *s,
> +                                 const XenPTRegGroupInfo *grp_reg,
> +                                 uint32_t base_offset)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint16_t exp_flag = 0;
> +    uint16_t type = 0;
> +    uint16_t version = 0;
> +    uint8_t pcie_size = 0;
> +
> +    exp_flag = pci_get_word(d->config + base_offset + PCI_EXP_FLAGS);
> +    type = (exp_flag & PCI_EXP_FLAGS_TYPE) >> 4;
> +    version = exp_flag & PCI_EXP_FLAGS_VERS;
> +
> +    /* calculate size depend on capability version and device/port type */
> +    /* in case of PCI Express Base Specification Rev 1.x */
> +    if (version == 1) {
> +        /* The PCI Express Capabilities, Device Capabilities, and Device
> +         * Status/Control registers are required for all PCI Express devices.
> +         * The Link Capabilities and Link Status/Control are required for all
> +         * Endpoints that are not Root Complex Integrated Endpoints. Endpoints
> +         * are not required to implement registers other than those listed
> +         * above and terminate the capability structure.
> +         */
> +        switch (type) {
> +        case PCI_EXP_TYPE_ENDPOINT:
> +        case PCI_EXP_TYPE_LEG_END:
> +            pcie_size = 0x14;
> +            break;
> +        case PCI_EXP_TYPE_RC_END:
> +            /* has no link */
> +            pcie_size = 0x0C;
> +            break;
> +        /* only EndPoint passthrough is supported */
> +        case PCI_EXP_TYPE_ROOT_PORT:
> +        case PCI_EXP_TYPE_UPSTREAM:
> +        case PCI_EXP_TYPE_DOWNSTREAM:
> +        case PCI_EXP_TYPE_PCI_BRIDGE:
> +        case PCI_EXP_TYPE_PCIE_BRIDGE:
> +        case PCI_EXP_TYPE_RC_EC:
> +        default:
> +            hw_error("Internal error: Unsupported device/port type[%d]. "
> +                     "I/O emulator exit.\n", type);
> +        }
> +    }
> +    /* in case of PCI Express Base Specification Rev 2.0 */
> +    else if (version == 2) {
> +        switch (type) {
> +        case PCI_EXP_TYPE_ENDPOINT:
> +        case PCI_EXP_TYPE_LEG_END:
> +        case PCI_EXP_TYPE_RC_END:
> +            /* For Functions that do not implement the registers,
> +             * these spaces must be hardwired to 0b.
> +             */
> +            pcie_size = 0x3C;
> +            break;
> +        /* only EndPoint passthrough is supported */
> +        case PCI_EXP_TYPE_ROOT_PORT:
> +        case PCI_EXP_TYPE_UPSTREAM:
> +        case PCI_EXP_TYPE_DOWNSTREAM:
> +        case PCI_EXP_TYPE_PCI_BRIDGE:
> +        case PCI_EXP_TYPE_PCIE_BRIDGE:
> +        case PCI_EXP_TYPE_RC_EC:
> +        default:
> +            hw_error("Internal error: Unsupported device/port type[%d]. "
> +                     "I/O emulator exit.\n", type);
> +        }
> +    } else {
> +        hw_error("Internal error: Unsupported capability version[%d]. "
> +                 "I/O emulator exit.\n", version);
> +    }
> +
> +    return pcie_size;
> +}
> +
> +static const XenPTRegGroupInfo pt_emu_reg_grp_tbl[] = {
> +    /* Header Type0 reg group */
> +    {
> +        .grp_id      = 0xFF,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0x40,
> +        .size_init   = pt_reg_grp_size_init,
> +        .emu_reg_tbl = pt_emu_reg_header0_tbl,
> +    },
> +    /* PCI PowerManagement Capability reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_PM,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = PCI_PM_SIZEOF,
> +        .size_init   = pt_pm_size_init,
> +        .emu_reg_tbl = pt_emu_reg_pm_tbl,
> +    },
> +    /* AGP Capability Structure reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_AGP,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x30,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* Vital Product Data Capability Structure reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_VPD,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0x08,
> +        .size_init   = pt_reg_grp_size_init,
> +        .emu_reg_tbl = pt_emu_reg_vpd_tbl,
> +    },
> +    /* Slot Identification reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_SLOTID,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x04,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* PCI-X Capabilities List Item reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_PCIX,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x18,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* Vendor Specific Capability Structure reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_VNDR,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0xFF,
> +        .size_init   = pt_vendor_size_init,
> +        .emu_reg_tbl = pt_emu_reg_vendor_tbl,
> +    },
> +    /* SHPC Capability List Item reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_SHPC,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x08,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* Subsystem ID and Subsystem Vendor ID Capability List Item reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_SSVID,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x08,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* AGP 8x Capability Structure reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_AGP3,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x30,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* PCI Express Capability Structure reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_EXP,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0xFF,
> +        .size_init   = pt_pcie_size_init,
> +        .emu_reg_tbl = pt_emu_reg_pcie_tbl,
> +    },
> +    {
> +        .grp_size = 0,
> +    },
> +};
> +
> +/* initialize Capabilities Pointer or Next Pointer register */
> +static uint32_t pt_ptr_reg_init(XenPCIPassthroughState *s,
> +                                XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    /* uint32_t reg_field = (uint32_t)s->dev.config[real_offset]; */
> +    uint32_t reg_field = pci_get_byte(s->dev.config + real_offset);
> +    int i;
> +
> +    /* find capability offset */
> +    while (reg_field) {
> +        for (i = 0; pt_emu_reg_grp_tbl[i].grp_size != 0; i++) {
> +            if (pt_hide_dev_cap(s->real_device,
> +                                pt_emu_reg_grp_tbl[i].grp_id)) {
> +                continue;
> +            }
> +            if (pt_emu_reg_grp_tbl[i].grp_id == s->dev.config[reg_field]) {
> +                if (pt_emu_reg_grp_tbl[i].grp_type == GRP_TYPE_EMU) {
> +                    goto out;
> +                }
> +                /* ignore the 0 hardwired capability, find next one */
> +                break;
> +            }
> +        }
> +        /* next capability */
> +        /* reg_field = (uint32_t)s->dev.config[reg_field + 1]; */
> +        reg_field = pci_get_byte(s->dev.config + reg_field + 1);
> +    }
> +
> +out:
> +    return reg_field;
> +}
> +
> +
> +/*************
> + * Main
> + */
> +
> +/* restore a part of I/O device register */
> +static void pt_config_restore(XenPCIPassthroughState *s)
> +{
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegInfo *reg = NULL;
> +    uint32_t real_offset = 0;
> +    uint32_t read_val = 0;
> +    uint32_t val = 0;
> +    int ret = 0;
> +
> +    /* find emulate register group entry */
> +    QLIST_FOREACH(reg_grp_entry, &s->reg_grp_tbl, entries) {
> +        /* find emulate register entry */
> +        QLIST_FOREACH(reg_entry, &reg_grp_entry->reg_tbl_list, entries) {
> +            reg = reg_entry->reg;
> +
> +            /* check whether restoring is needed */
> +            if (!reg->u.b.restore) {
> +                continue;
> +            }
> +
> +            real_offset = reg_grp_entry->base_offset + reg->offset;
> +
> +            /* read I/O device register value */
> +            ret = host_pci_get_block(s->real_device, real_offset,
> +                                     (uint8_t *)&read_val, reg->size);
> +
> +            if (!ret) {
> +                PT_LOG("Error: pci_read_block failed. "
> +                       "return value[%d].\n", ret);
> +                memset(&read_val, 0xff, reg->size);
> +            }
> +
> +            val = 0;
> +
> +            /* restore based on register size */
> +            switch (reg->size) {
> +            case 1:
> +                /* byte register */
> +                ret = reg->u.b.restore(s, reg_entry, real_offset,
> +                                       (uint8_t)read_val, (uint8_t *)&val);
> +                break;
> +            case 2:
> +                /* word register */
> +                ret = reg->u.w.restore(s, reg_entry, real_offset,
> +                                       (uint16_t)read_val, (uint16_t *)&val);
> +                break;
> +            case 4:
> +                /* double word register */
> +                ret = reg->u.dw.restore(s, reg_entry, real_offset,
> +                                        (uint32_t)read_val, (uint32_t *)&val);
> +                break;
> +            }
> +
> +            /* restoring error */
> +            if (ret < 0) {
> +                hw_error("Internal error: Invalid restoring "
> +                         "return value[%d]. I/O emulator exit.\n", ret);
> +            }
> +
> +            PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
> +                          pci_bus_num(s->dev.bus), PCI_SLOT(s->dev.devfn),
> +                          PCI_FUNC(s->dev.devfn),
> +                          real_offset, val, reg->size);
> +
> +            ret = host_pci_set_block(s->real_device, real_offset,
> +                                     (uint8_t *)&val, reg->size);
> +
> +            if (!ret) {
> +                PT_LOG("Error: pci_write_block failed. "
> +                       "return value[%d].\n", ret);
> +            }
> +        }
> +    }
> +
> +    /* if AER supported, restore it */
> +    if (s->pm_state->aer_base) {
> +        pt_aer_reg_restore(s);
> +    }
> +}
> +/* reinitialize all emulate registers */
> +static void pt_config_reinit(XenPCIPassthroughState *s)
> +{
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegInfo *reg = NULL;
> +
> +    /* find emulate register group entry */
> +    QLIST_FOREACH(reg_grp_entry, &s->reg_grp_tbl, entries) {
> +        /* find emulate register entry */
> +        QLIST_FOREACH(reg_entry, &reg_grp_entry->reg_tbl_list, entries) {
> +            reg = reg_entry->reg;
> +            if (reg->init) {
> +                /* initialize emulate register */
> +                reg_entry->data =
> +                    reg->init(s, reg_entry->reg,
> +                              reg_grp_entry->base_offset + reg->offset);
> +            }
> +        }
> +    }
> +}
> +
> +static int pt_init_pci_config(XenPCIPassthroughState *s)
> +{
> +    PCIDevice *d = &s->dev;
> +    int ret = 0;
> +
> +    PT_LOG("Reinitialize PCI configuration registers due to power state"
> +           " transition with internal reset. [%02x:%02x.%x]\n",
> +           pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn));
> +
> +    /* restore a part of I/O device register */
> +    pt_config_restore(s);
> +
> +    /* reinitialize all emulate register */
> +    pt_config_reinit(s);
> +
> +    /* rebind machine_irq to device */
> +    if (s->machine_irq != 0) {
> +        uint8_t e_device = PCI_SLOT(s->dev.devfn);
> +        uint8_t e_intx = pci_intx(s);
> +
> +        ret = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, s->machine_irq, 0,
> +                                        e_device, e_intx);
> +        if (ret < 0) {
> +            PT_LOG("Error: Rebinding of interrupt failed! ret=%d\n", ret);
> +        }
> +    }
> +
> +    return ret;
> +}
> +
> +static uint8_t find_cap_offset(XenPCIPassthroughState *s, uint8_t cap)
> +{
> +    int id;
> +    int max_cap = 48;
> +    int pos = PCI_CAPABILITY_LIST;
> +    int status;
> +
> +    status = host_pci_get_byte(s->real_device, PCI_STATUS);
> +    if ((status & PCI_STATUS_CAP_LIST) == 0) {
> +        return 0;
> +    }
> +
> +    while (max_cap--) {
> +        pos = host_pci_get_byte(s->real_device, pos);
> +        if (pos < 0x40) {
> +            break;
> +        }
> +
> +        pos &= ~3;
> +        id = host_pci_get_byte(s->real_device, pos + PCI_CAP_LIST_ID);
> +
> +        if (id == 0xff) {
> +            break;
> +        }
> +        if (id == cap) {
> +            return pos;
> +        }
> +
> +        pos += PCI_CAP_LIST_NEXT;
> +    }
> +    return 0;
> +}
> +
> +static void pt_config_reg_init(XenPCIPassthroughState *s,
> +                               XenPTRegGroup *reg_grp, XenPTRegInfo *reg)
> +{
> +    XenPTReg *reg_entry;
> +    uint32_t data = 0;
> +
> +    reg_entry = g_malloc0(sizeof (XenPTReg));
> +
> +    reg_entry->reg = reg;
> +    reg_entry->data = 0;
> +
> +    if (reg->init) {
> +        /* initialize emulate register */
> +        data = reg->init(s, reg_entry->reg,
> +                         reg_grp->base_offset + reg->offset);
> +        if (data == PT_INVALID_REG) {
> +            /* free unused BAR register entry */
> +            free(reg_entry);
> +            return;
> +        }
> +        /* set register value */
> +        reg_entry->data = data;
> +    }
> +    /* list add register entry */
> +    QLIST_INSERT_HEAD(&reg_grp->reg_tbl_list, reg_entry, entries);
> +
> +    return;
> +}
> +
> +void pt_config_init(XenPCIPassthroughState *s)
> +{
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    uint32_t reg_grp_offset = 0;
> +    XenPTRegInfo *reg_tbl = NULL;
> +    int i, j;
> +
> +    QLIST_INIT(&s->reg_grp_tbl);
> +
> +    for (i = 0; pt_emu_reg_grp_tbl[i].grp_size != 0; i++) {
> +        if (pt_emu_reg_grp_tbl[i].grp_id != 0xFF) {
> +            if (pt_hide_dev_cap(s->real_device,
> +                                pt_emu_reg_grp_tbl[i].grp_id)) {
> +                continue;
> +            }
> +
> +            reg_grp_offset = find_cap_offset(s, pt_emu_reg_grp_tbl[i].grp_id);
> +
> +            if (!reg_grp_offset) {
> +                continue;
> +            }
> +        }
> +
> +        reg_grp_entry = g_malloc0(sizeof (XenPTRegGroup));
> +        QLIST_INIT(&reg_grp_entry->reg_tbl_list);
> +        QLIST_INSERT_HEAD(&s->reg_grp_tbl, reg_grp_entry, entries);
> +
> +        reg_grp_entry->base_offset = reg_grp_offset;
> +        reg_grp_entry->reg_grp = pt_emu_reg_grp_tbl + i;
> +        if (pt_emu_reg_grp_tbl[i].size_init) {
> +            /* get register group size */
> +            reg_grp_entry->size =
> +                pt_emu_reg_grp_tbl[i].size_init(s, reg_grp_entry->reg_grp,
> +                                                reg_grp_offset);
> +        }
> +
> +        if (pt_emu_reg_grp_tbl[i].grp_type == GRP_TYPE_EMU) {
> +            if (pt_emu_reg_grp_tbl[i].emu_reg_tbl) {
> +                reg_tbl = pt_emu_reg_grp_tbl[i].emu_reg_tbl;
> +                /* initialize capability register */
> +                for (j = 0; reg_tbl->size != 0; j++, reg_tbl++) {
> +                    /* initialize capability register */
> +                    pt_config_reg_init(s, reg_grp_entry, reg_tbl);
> +                }
> +            }
> +        }
> +        reg_grp_offset = 0;
> +    }
> +
> +    return;
> +}
> +
> +/* delete all emulate register */
> +void pt_config_delete(XenPCIPassthroughState *s)
> +{
> +    struct XenPTRegGroup *reg_group, *next_grp;
> +    struct XenPTReg *reg, *next_reg;
> +
> +    /* free Power Management info table */
> +    if (s->pm_state) {
> +        if (s->pm_state->pm_timer) {
> +            qemu_del_timer(s->pm_state->pm_timer);
> +            qemu_free_timer(s->pm_state->pm_timer);
> +            s->pm_state->pm_timer = NULL;
> +        }
> +
> +        g_free(s->pm_state);
> +    }
> +
> +    /* free all register group entry */
> +    QLIST_FOREACH_SAFE(reg_group, &s->reg_grp_tbl, entries, next_grp) {
> +        /* free all register entry */
> +        QLIST_FOREACH_SAFE(reg, &reg_group->reg_tbl_list, entries, next_reg) {
> +            QLIST_REMOVE(reg, entries);
> +            g_free(reg);
> +        }
> +
> +        QLIST_REMOVE(reg_group, entries);
> +        g_free(reg_group);
> +    }
> +}
> --
> Anthony PERARD
>
Anthony PERARD - Nov. 9, 2011, 5:05 p.m.
On Tue, Nov 8, 2011 at 12:57, Stefano Stabellini
<stefano.stabellini@eu.citrix.com> wrote:
> Obviously passthrough cannot work without this patch, but qemu should be
> able to compile anyway. Please add to the previous patch empty stub
> implementations for all the exported functions that you are going to
> implement here.
>
> I see that the timer is allocated here.
> In that case it would make sense to move the timer update to this patch.

Ok, I will do that.
Konrad Rzeszutek Wilk - Nov. 10, 2011, 9:53 p.m.
On Fri, Oct 28, 2011 at 04:07:34PM +0100, Anthony PERARD wrote:
> From: Allen Kay <allen.m.kay@intel.com>
> 
> Signed-off-by: Allen Kay <allen.m.kay@intel.com>
> Signed-off-by: Guy Zana <guy@neocleus.com>
> Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
> ---
>  Makefile.target                      |    1 +
>  hw/xen_pci_passthrough.h             |    2 +
>  hw/xen_pci_passthrough_config_init.c | 2068 ++++++++++++++++++++++++++++++++++
>  3 files changed, 2071 insertions(+), 0 deletions(-)
>  create mode 100644 hw/xen_pci_passthrough_config_init.c
> 
> diff --git a/Makefile.target b/Makefile.target
> index 36ea47d..c32c688 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -219,6 +219,7 @@ obj-i386-$(CONFIG_XEN) += xen_platform.o
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_helpers.o
> +obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_config_init.o
>  
>  # Inter-VM PCI shared memory
>  CONFIG_IVSHMEM =
> diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
> index 2d1979d..ebc04fd 100644
> --- a/hw/xen_pci_passthrough.h
> +++ b/hw/xen_pci_passthrough.h
> @@ -61,6 +61,8 @@ typedef int (*conf_byte_restore)
>  /* power state transition */
>  #define PT_FLAG_TRANSITING 0x0001
>  
> +#define PT_BAR_ALLF        0xFFFFFFFF  /* BAR ALLF value */
> +
>  
>  typedef enum {
>      GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
> diff --git a/hw/xen_pci_passthrough_config_init.c b/hw/xen_pci_passthrough_config_init.c
> new file mode 100644
> index 0000000..4103b59
> --- /dev/null
> +++ b/hw/xen_pci_passthrough_config_init.c
> @@ -0,0 +1,2068 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + * Copyright (c) 2007, Intel Corporation.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Alex Novik <alex@neocleus.com>
> + * Allen Kay <allen.m.kay@intel.com>
> + * Guy Zana <guy@neocleus.com>
> + *
> + * This file implements direct PCI assignment to a HVM guest
> + */
> +
> +#include "qemu-timer.h"
> +#include "xen_backend.h"
> +#include "xen_pci_passthrough.h"
> +
> +#define PT_MERGE_VALUE(value, data, val_mask) \
> +    (((value) & (val_mask)) | ((data) & ~(val_mask)))
> +
> +#define PT_INVALID_REG          0xFFFFFFFF      /* invalid register value */
> +
> +/* prototype */
> +
> +static uint32_t pt_ptr_reg_init(XenPCIPassthroughState *s, XenPTRegInfo *reg,
> +                                uint32_t real_offset);
> +static int pt_init_pci_config(XenPCIPassthroughState *s);
> +
> +
> +/* helper */
> +
> +/* A return value of 1 means the capability should NOT be exposed to guest. */
> +static int pt_hide_dev_cap(const HostPCIDevice *d, uint8_t grp_id)
> +{
> +    switch (grp_id) {
> +    case PCI_CAP_ID_EXP:
> +        /* The PCI Express Capability Structure of the VF of Intel 82599 10GbE
> +         * Controller looks trivial, e.g., the PCI Express Capabilities
> +         * Register is 0. We should not try to expose it to guest.

Why not?
> +         */
> +        if (d->vendor_id == PCI_VENDOR_ID_INTEL &&
> +                d->device_id == PCI_DEVICE_ID_INTEL_82599_VF) {
> +            return 1;
> +        }
> +        break;
> +    }
> +    return 0;
> +}
> +
> +/*   find emulate register group entry */
> +XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address)
> +{
> +    XenPTRegGroup *entry = NULL;
> +
> +    /* find register group entry */
> +    QLIST_FOREACH(entry, &s->reg_grp_tbl, entries) {
> +        /* check address */
> +        if ((entry->base_offset <= address)
> +            && ((entry->base_offset + entry->size) > address)) {
> +            return entry;
> +        }
> +    }
> +
> +    /* group entry not found */
> +    return NULL;
> +}
> +
> +/* find emulate register entry */
> +XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address)
> +{
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegInfo *reg = NULL;
> +    uint32_t real_offset = 0;
> +
> +    /* find register entry */
> +    QLIST_FOREACH(reg_entry, &reg_grp->reg_tbl_list, entries) {
> +        reg = reg_entry->reg;
> +        real_offset = reg_grp->base_offset + reg->offset;
> +        /* check address */
> +        if ((real_offset <= address)
> +            && ((real_offset + reg->size) > address)) {
> +            return reg_entry;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +/* parse BAR */
> +static PTBarFlag pt_bar_reg_parse(XenPCIPassthroughState *s, XenPTRegInfo *reg)
> +{
> +    PCIDevice *d = &s->dev;
> +    XenPTRegion *region = NULL;
> +    PCIIORegion *r;
> +    int index = 0;
> +
> +    /* check 64bit BAR */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if ((0 < index) && (index < PCI_ROM_SLOT)) {

This is  a bit confusing. Can you make the index be on the same
side, like

if ((0 < index) && (PCI_ROM_SLOT > index)

or better:

if ((index < 0) && (index < PCI_ROM_SLOT))

um, which looks wrong. Should it be 'index > 0' ?

> +        int flags = s->real_device->io_regions[index - 1].flags;

Do we want to check the index - 1 to make sure it is not negative?

> +
> +        if ((flags & IORESOURCE_MEM) && (flags & IORESOURCE_MEM_64)) {
> +            region = &s->bases[index - 1];
> +            if (region->bar_flag != PT_BAR_FLAG_UPPER) {
> +                return PT_BAR_FLAG_UPPER;
> +            }
> +        }
> +    }
> +
> +    /* check unused BAR */
> +    r = &d->io_regions[index];
> +    if (r->size == 0) {
> +        return PT_BAR_FLAG_UNUSED;
> +    }
> +
> +    /* for ExpROM BAR */
> +    if (index == PCI_ROM_SLOT) {
> +        return PT_BAR_FLAG_MEM;
> +    }
> +
> +    /* check BAR I/O indicator */
> +    if (s->real_device->io_regions[index].flags & IORESOURCE_IO) {
> +        return PT_BAR_FLAG_IO;
> +    } else {
> +        return PT_BAR_FLAG_MEM;
> +    }
> +}
> +
> +
> +/****************
> + * general register functions
> + */
> +
> +/* register initialization function */
> +
> +static uint32_t pt_common_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    return reg->init_val;
> +}
> +
> +/* Read register functions */
> +
> +static int pt_byte_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint8_t *value, uint8_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint8_t valid_emu_mask = 0;
> +
> +    /* emulate byte register */
> +    valid_emu_mask = reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +    return 0;
> +}
> +static int pt_word_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint16_t *value, uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t valid_emu_mask = 0;
> +
> +    /* emulate word register */
> +    valid_emu_mask = reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +    return 0;
> +}
> +static int pt_long_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint32_t *value, uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t valid_emu_mask = 0;
> +
> +    /* emulate long register */
> +    valid_emu_mask = reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +   return 0;
> +}
> +
> +/* Write register functions */
> +
> +static int pt_byte_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                             uint8_t *value, uint8_t dev_value,
> +                             uint8_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint8_t writable_mask = 0;
> +    uint8_t throughable_mask = 0;
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    return 0;
> +}
> +static int pt_word_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                             uint16_t *value, uint16_t dev_value,
> +                             uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t writable_mask = 0;
> +    uint16_t throughable_mask = 0;
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    return 0;
> +}
> +static int pt_long_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                             uint32_t *value, uint32_t dev_value,
> +                             uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t writable_mask = 0;
> +    uint32_t throughable_mask = 0;
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    return 0;
> +}
> +
> +/* common restore register fonctions */
> +static int pt_byte_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                               uint32_t real_offset, uint8_t dev_value,
> +                               uint8_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    PCIDevice *d = &s->dev;
> +
> +    /* use I/O device register's value as restore value */
> +    *value = pci_get_byte(d->config + real_offset);
> +
> +    /* create value for restoring to I/O device register */
> +    *value = PT_MERGE_VALUE(*value, dev_value, reg->emu_mask);
> +
> +    return 0;
> +}
> +static int pt_word_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                               uint32_t real_offset, uint16_t dev_value,
> +                               uint16_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    PCIDevice *d = &s->dev;
> +
> +    /* use I/O device register's value as restore value */
> +    *value = pci_get_word(d->config + real_offset);
> +
> +    /* create value for restoring to I/O device register */
> +    *value = PT_MERGE_VALUE(*value, dev_value, reg->emu_mask);
> +
> +    return 0;
> +}
> +
> +
> +/* XenPTRegInfo declaration
> + * - only for emulated register (either a part or whole bit).
> + * - for passthrough register that need special behavior (like interacting with
> + *   other component), set emu_mask to all 0 and specify r/w func properly.
> + * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
> + */
> +
> +/********************
> + * Header Type0
> + */
> +
> +static uint32_t pt_vendor_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    return s->real_device->vendor_id;
> +}
> +static uint32_t pt_device_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    return s->real_device->device_id;
> +}
> +static uint32_t pt_status_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    int reg_field = 0;
> +
> +    /* find Header register group */
> +    reg_grp_entry = pt_find_reg_grp(s, PCI_CAPABILITY_LIST);
> +    if (reg_grp_entry) {
> +        /* find Capabilities Pointer register */
> +        reg_entry = pt_find_reg(reg_grp_entry, PCI_CAPABILITY_LIST);
> +        if (reg_entry) {
> +            /* check Capabilities Pointer register */
> +            if (reg_entry->data) {
> +                reg_field |= PCI_STATUS_CAP_LIST;
> +            } else {
> +                reg_field &= ~PCI_STATUS_CAP_LIST;
> +            }
> +        } else {
> +            hw_error("Internal error: Couldn't find pt_reg_tbl for "
> +                     "Capabilities Pointer register. I/O emulator exit.\n");

Yikes. abort here? Um, can we just return a fault code instead?

> +        }
> +    } else {
> +        hw_error("Internal error: Couldn't find pt_reg_grp_tbl for Header. "
> +                 "I/O emulator exit.\n");
> +    }
> +
> +    return reg_field;
> +}
> +static uint32_t pt_header_type_reg_init(XenPCIPassthroughState *s,
> +                                        XenPTRegInfo *reg,
> +                                        uint32_t real_offset)
> +{
> +    /* read PCI_HEADER_TYPE */
> +    return reg->init_val | 0x80;
> +}
> +
> +/* initialize Interrupt Pin register */
> +static uint32_t pt_irqpin_reg_init(XenPCIPassthroughState *s,
> +                                   XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    return pci_read_intx(s);
> +}
> +
> +/* Command register */
> +static int pt_cmd_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                           uint16_t *value, uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t valid_emu_mask = 0;
> +    uint16_t emu_mask = reg->emu_mask;
> +
> +    if (s->is_virtfn) {
> +        emu_mask |= PCI_COMMAND_MEMORY;
> +    }
> +
> +    /* emulate word register */
> +    valid_emu_mask = emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +    return 0;
> +}
> +static int pt_cmd_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint16_t *value, uint16_t dev_value,
> +                            uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t writable_mask = 0;
> +    uint16_t throughable_mask = 0;
> +    uint16_t wr_value = *value;
> +    uint16_t emu_mask = reg->emu_mask;
> +
> +    if (s->is_virtfn) {
> +        emu_mask |= PCI_COMMAND_MEMORY;
> +    }
> +
> +    /* modify emulate register */
> +    writable_mask = ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~emu_mask & valid_mask;
> +
> +    if (*value & PCI_COMMAND_INTX_DISABLE) {
> +        throughable_mask |= PCI_COMMAND_INTX_DISABLE;
> +    } else {
> +        if (s->machine_irq) {
> +            throughable_mask |= PCI_COMMAND_INTX_DISABLE;
> +        }
> +    }
> +
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* mapping BAR */
> +    pt_bar_mapping(s, wr_value & PCI_COMMAND_IO,
> +                   wr_value & PCI_COMMAND_MEMORY);
> +
> +    return 0;
> +}
> +static int pt_cmd_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                              uint32_t real_offset, uint16_t dev_value,
> +                              uint16_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    PCIDevice *d = &s->dev;
> +    uint16_t restorable_mask = 0;
> +
> +    /* use I/O device register's value as restore value */
> +    *value = pci_get_word(d->config + real_offset);
> +
> +    /* create value for restoring to I/O device register
> +     * but do not include Fast Back-to-Back Enable bit.
> +     */
> +    restorable_mask = reg->emu_mask & ~PCI_COMMAND_FAST_BACK;
> +    *value = PT_MERGE_VALUE(*value, dev_value, restorable_mask);
> +
> +    if (!s->machine_irq) {
> +        *value |= PCI_COMMAND_INTX_DISABLE;
> +    } else {
> +        *value &= ~PCI_COMMAND_INTX_DISABLE;
> +    }
> +
> +    return 0;
> +}
> +
> +/* BAR */
> +#define PT_BAR_MEM_RO_MASK      0x0000000F      /* BAR ReadOnly mask(Memory) */
> +#define PT_BAR_MEM_EMU_MASK     0xFFFFFFF0      /* BAR emul mask(Memory) */
> +#define PT_BAR_IO_RO_MASK       0x00000003      /* BAR ReadOnly mask(I/O) */
> +#define PT_BAR_IO_EMU_MASK      0xFFFFFFFC      /* BAR emul mask(I/O) */
> +
> +static inline uint32_t base_address_with_flags(HostPCIIORegion *hr)
> +{
> +    if ((hr->flags & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
> +        return hr->base_addr | (hr->flags & ~PCI_BASE_ADDRESS_IO_MASK);
> +    } else {
> +        return hr->base_addr | (hr->flags & ~PCI_BASE_ADDRESS_MEM_MASK);
> +    }
> +}
> +
> +static uint32_t pt_bar_reg_init(XenPCIPassthroughState *s, XenPTRegInfo *reg,
> +                                uint32_t real_offset)
> +{
> +    int reg_field = 0;
> +    int index;
> +
> +    /* get BAR index */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if (index < 0) {
> +        hw_error("Internal error: Invalid BAR index[%d]. "
> +                 "I/O emulator exit.\n", index);
> +    }
> +
> +    /* set initial guest physical base address to -1 */
> +    s->bases[index].e_physbase = -1;

Um, use that define PCI_.. something macro.
> +
> +    /* set BAR flag */
> +    s->bases[index].bar_flag = pt_bar_reg_parse(s, reg);
> +    if (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED) {
> +        reg_field = PT_INVALID_REG;
> +    }
> +
> +    return reg_field;
> +}
> +static int pt_bar_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                           uint32_t *value, uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t valid_emu_mask = 0;
> +    uint32_t bar_emu_mask = 0;
> +    int index;
> +
> +    /* get BAR index */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if (index < 0) {
> +        hw_error("Internal error: Invalid BAR index[%d]. "
> +                 "I/O emulator exit.\n", index);
> +    }
> +
> +    /* use fixed-up value from kernel sysfs */
> +    *value = base_address_with_flags(&s->real_device->io_regions[index]);
> +
> +    /* set emulate mask depend on BAR flag */
> +    switch (s->bases[index].bar_flag) {
> +    case PT_BAR_FLAG_MEM:
> +        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
> +        break;
> +    case PT_BAR_FLAG_IO:
> +        bar_emu_mask = PT_BAR_IO_EMU_MASK;
> +        break;
> +    case PT_BAR_FLAG_UPPER:
> +        bar_emu_mask = PT_BAR_ALLF;
> +        break;
> +    default:
> +        break;
> +    }
> +
> +    /* emulate BAR */
> +    valid_emu_mask = bar_emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +   return 0;
> +}
> +static int pt_bar_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                            uint32_t *value, uint32_t dev_value,
> +                            uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegion *base = NULL;
> +    PCIDevice *d = &s->dev;
> +    PCIIORegion *r;
> +    uint32_t writable_mask = 0;
> +    uint32_t throughable_mask = 0;
> +    uint32_t bar_emu_mask = 0;
> +    uint32_t bar_ro_mask = 0;
> +    uint32_t new_addr, last_addr;
> +    uint32_t prev_offset;
> +    uint32_t r_size = 0;
> +    int index = 0;
> +
> +    /* get BAR index */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if (index < 0) {
> +        hw_error("Internal error: Invalid BAR index[%d]. "
> +                 "I/O emulator exit.\n", index);
> +    }
> +
> +    r = &d->io_regions[index];
> +    base = &s->bases[index];
> +    r_size = pt_get_emul_size(base->bar_flag, r->size);
> +
> +    /* set emulate mask and read-only mask depend on BAR flag */
> +    switch (s->bases[index].bar_flag) {
> +    case PT_BAR_FLAG_MEM:
> +        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
> +        bar_ro_mask = PT_BAR_MEM_RO_MASK | (r_size - 1);
> +        break;
> +    case PT_BAR_FLAG_IO:
> +        bar_emu_mask = PT_BAR_IO_EMU_MASK;
> +        bar_ro_mask = PT_BAR_IO_RO_MASK | (r_size - 1);
> +        break;
> +    case PT_BAR_FLAG_UPPER:
> +        bar_emu_mask = PT_BAR_ALLF;
> +        bar_ro_mask = 0;    /* all upper 32bit are R/W */
> +        break;
> +    default:
> +        break;
> +    }
> +
> +    /* modify emulate register */
> +    writable_mask = bar_emu_mask & ~bar_ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* check whether we need to update the virtual region address or not */
> +    switch (s->bases[index].bar_flag) {
> +    case PT_BAR_FLAG_MEM:
> +        /* nothing to do */
> +        break;
> +    case PT_BAR_FLAG_IO:
> +        new_addr = cfg_entry->data;
> +        last_addr = new_addr + r_size - 1;
> +        /* check invalid address */
> +        if (last_addr <= new_addr || !new_addr || last_addr >= 0x10000) {

Make a #define for 0x10000.. 

> +            /* check 64K range */
> +            if ((last_addr >= 0x10000) &&
> +                (cfg_entry->data != (PT_BAR_ALLF & ~bar_ro_mask))) {
> +                PT_LOG("Warning: Guest attempt to set Base Address "
> +                       "over the 64KB. [%02x:%02x.%x][Offset:%02xh]"
> +                       "[Address:%08xh][Size:%08xh]\n",
> +                       pci_bus_num(d->bus), PCI_SLOT(d->devfn),
> +                       PCI_FUNC(d->devfn),
> +                       reg->offset, new_addr, r_size);
> +            }
> +            /* just remove mapping */
> +            r->addr = -1;
> +            goto exit;
> +        }
> +        break;
> +    case PT_BAR_FLAG_UPPER:
> +        if (cfg_entry->data) {
> +            if (cfg_entry->data != (PT_BAR_ALLF & ~bar_ro_mask)) {
> +                PT_LOG("Warning: Guest attempt to set high MMIO Base Address. "
> +                       "Ignore mapping. "
> +                       "[%02x:%02x.%x][Offset:%02xh][High Address:%08xh]\n",
> +                       pci_bus_num(d->bus), PCI_SLOT(d->devfn),
> +                       PCI_FUNC(d->devfn), reg->offset, cfg_entry->data);
> +            }
> +            /* clear lower address */
> +            d->io_regions[index-1].addr = -1;
> +        } else {
> +            /* find lower 32bit BAR */
> +            prev_offset = (reg->offset - 4);
> +            reg_grp_entry = pt_find_reg_grp(s, prev_offset);
> +            if (reg_grp_entry) {
> +                reg_entry = pt_find_reg(reg_grp_entry, prev_offset);
> +                if (reg_entry) {
> +                    /* restore lower address */
> +                    d->io_regions[index-1].addr = reg_entry->data;
> +                } else {
> +                    return -1;
> +                }
> +            } else {
> +                return -1;
> +            }
> +        }
> +
> +        /* never mapping the 'empty' upper region,
> +         * because we'll do it enough for the lower region.
> +         */
> +        r->addr = -1;
> +        goto exit;
> +    default:
> +        break;
> +    }
> +
> +    /* update the corresponding virtual region address */
> +    /*
> +     * When guest code tries to get block size of mmio, it will write all "1"s
> +     * into pci bar register. In this case, cfg_entry->data == writable_mask.
> +     * Especially for devices with large mmio, the value of writable_mask
> +     * is likely to be a guest physical address that has been mapped to ram
> +     * rather than mmio. Remapping this value to mmio should be prevented.
> +     */
> +
> +    if (cfg_entry->data != writable_mask) {
> +        r->addr = cfg_entry->data;
> +    }
> +
> +exit:
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~bar_emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* After BAR reg update, we need to remap BAR */
> +    reg_grp_entry = pt_find_reg_grp(s, PCI_COMMAND);
> +    if (reg_grp_entry) {
> +        reg_entry = pt_find_reg(reg_grp_entry, PCI_COMMAND);
> +        if (reg_entry) {
> +            pt_bar_mapping_one(s, index, reg_entry->data & PCI_COMMAND_IO,
> +                               reg_entry->data & PCI_COMMAND_MEMORY);
> +        }
> +    }
> +
> +    return 0;
> +}
> +static int pt_bar_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                              uint32_t real_offset, uint32_t dev_value,
> +                              uint32_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t bar_emu_mask = 0;
> +    int index = 0;
> +
> +    /* get BAR index */
> +    index = pt_bar_offset_to_index(reg->offset);
> +    if (index < 0) {
> +        hw_error("Internal error: Invalid BAR index[%d]. "
> +                 "I/O emulator exit.\n", index);
> +    }
> +
> +    /* use value from kernel sysfs */
> +    if (s->bases[index].bar_flag == PT_BAR_FLAG_UPPER) {
> +        *value = s->real_device->io_regions[index - 1].base_addr >> 32;
> +    } else {
> +        *value = base_address_with_flags(&s->real_device->io_regions[index]);
> +    }
> +
> +    /* set emulate mask depend on BAR flag */
> +    switch (s->bases[index].bar_flag) {
> +    case PT_BAR_FLAG_MEM:
> +        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
> +        break;
> +    case PT_BAR_FLAG_IO:
> +        bar_emu_mask = PT_BAR_IO_EMU_MASK;
> +        break;
> +    case PT_BAR_FLAG_UPPER:
> +        bar_emu_mask = PT_BAR_ALLF;
> +        break;
> +    default:
> +        break;
> +    }
> +
> +    /* create value for restoring to I/O device register */
> +    *value = PT_MERGE_VALUE(*value, dev_value, bar_emu_mask);
> +
> +    return 0;
> +}
> +
> +/* write Exp ROM BAR */
> +static int pt_exp_rom_bar_reg_write(XenPCIPassthroughState *s,
> +                                    XenPTReg *cfg_entry, uint32_t *value,
> +                                    uint32_t dev_value, uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegion *base = NULL;
> +    PCIDevice *d = (PCIDevice *)&s->dev;
> +    PCIIORegion *r;
> +    uint32_t writable_mask = 0;
> +    uint32_t throughable_mask = 0;
> +    pcibus_t r_size = 0;
> +    uint32_t bar_emu_mask = 0;
> +    uint32_t bar_ro_mask = 0;
> +
> +    r = &d->io_regions[PCI_ROM_SLOT];
> +    r_size = r->size;
> +    base = &s->bases[PCI_ROM_SLOT];
> +    /* align memory type resource size */
> +    pt_get_emul_size(base->bar_flag, r_size);
> +
> +    /* set emulate mask and read-only mask */
> +    bar_emu_mask = reg->emu_mask;
> +    bar_ro_mask = (reg->ro_mask | (r_size - 1)) & ~PCI_ROM_ADDRESS_ENABLE;
> +
> +    /* modify emulate register */
> +    writable_mask = ~bar_ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* update the corresponding virtual region address */
> +    /*
> +     * When guest code tries to get block size of mmio, it will write all "1"s
> +     * into pci bar register. In this case, cfg_entry->data == writable_mask.
> +     * Especially for devices with large mmio, the value of writable_mask
> +     * is likely to be a guest physical address that has been mapped to ram
> +     * rather than mmio. Remapping this value to mmio should be prevented.
> +     */
> +
> +    if (cfg_entry->data != writable_mask) {
> +        r->addr = cfg_entry->data;
> +    }
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~bar_emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* After BAR reg update, we need to remap BAR*/
> +    reg_grp_entry = pt_find_reg_grp(s, PCI_COMMAND);
> +    if (reg_grp_entry) {
> +        reg_entry = pt_find_reg(reg_grp_entry, PCI_COMMAND);
> +        if (reg_entry) {
> +            pt_bar_mapping_one(s, PCI_ROM_SLOT,
> +                               reg_entry->data & PCI_COMMAND_IO,
> +                               reg_entry->data & PCI_COMMAND_MEMORY);
> +        }
> +    }
> +
> +    return 0;
> +}
> +/* restore ROM BAR */
> +static int pt_exp_rom_bar_reg_restore(XenPCIPassthroughState *s,
> +                                      XenPTReg *cfg_entry,
> +                                      uint32_t real_offset,
> +                                      uint32_t dev_value, uint32_t *value)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +
> +    /* use value from kernel sysfs */
> +    *value =
> +        PT_MERGE_VALUE(host_pci_get_long(s->real_device, PCI_ROM_ADDRESS),
> +                       dev_value, reg->emu_mask);
> +    return 0;
> +}
> +
> +/* Header Type0 reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_header0_tbl[] = {
> +    /* Vendor ID reg */
> +    {
> +        .offset     = PCI_VENDOR_ID,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFFFF,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_vendor_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* Device ID reg */
> +    {
> +        .offset     = PCI_DEVICE_ID,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFFFF,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_device_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* Command reg */
> +    {
> +        .offset     = PCI_COMMAND,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xF880,
> +        .emu_mask   = 0x0740,
> +        .init       = pt_common_reg_init,
> +        .u.w.read   = pt_cmd_reg_read,
> +        .u.w.write  = pt_cmd_reg_write,
> +        .u.w.restore  = pt_cmd_reg_restore,
> +    },
> +    /* Capabilities Pointer reg */
> +    {
> +        .offset     = PCI_CAPABILITY_LIST,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Status reg */
> +    /* use emulated Cap Ptr value to initialize,
> +     * so need to be declared after Cap Ptr reg
> +     */
> +    {
> +        .offset     = PCI_STATUS,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0x06FF,
> +        .emu_mask   = 0x0010,
> +        .init       = pt_status_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* Cache Line Size reg */
> +    {
> +        .offset     = PCI_CACHE_LINE_SIZE,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0x00,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_common_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = pt_byte_reg_restore,
> +    },
> +    /* Latency Timer reg */
> +    {
> +        .offset     = PCI_LATENCY_TIMER,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0x00,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_common_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = pt_byte_reg_restore,
> +    },
> +    /* Header Type reg */
> +    {
> +        .offset     = PCI_HEADER_TYPE,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0x00,
> +        .init       = pt_header_type_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Interrupt Line reg */
> +    {
> +        .offset     = PCI_INTERRUPT_LINE,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0x00,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_common_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Interrupt Pin reg */
> +    {
> +        .offset     = PCI_INTERRUPT_PIN,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_irqpin_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* BAR 0 reg */
> +    /* mask of BAR need to be decided later, depends on IO/MEM type */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_0,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 1 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_1,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 2 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_2,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 3 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_3,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 4 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_4,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* BAR 5 reg */
> +    {
> +        .offset     = PCI_BASE_ADDRESS_5,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_bar_reg_read,
> +        .u.dw.write = pt_bar_reg_write,
> +        .u.dw.restore = pt_bar_reg_restore,
> +    },
> +    /* Expansion ROM BAR reg */
> +    {
> +        .offset     = PCI_ROM_ADDRESS,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .ro_mask    = 0x000007FE,
> +        .emu_mask   = 0xFFFFF800,
> +        .init       = pt_bar_reg_init,
> +        .u.dw.read  = pt_long_reg_read,
> +        .u.dw.write = pt_exp_rom_bar_reg_write,
> +        .u.dw.restore = pt_exp_rom_bar_reg_restore,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/*********************************
> + * Vital Product Data Capability
> + */
> +
> +/* Vital Product Data Capability Structure reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_vpd_tbl[] = {
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/**************************************
> + * Vendor Specific Capability
> + */
> +
> +/* Vendor Specific Capability Structure reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_vendor_tbl[] = {
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/*****************************
> + * PCI Express Capability
> + */
> +
> +/* initialize Link Control register */
> +static uint32_t pt_linkctrl_reg_init(XenPCIPassthroughState *s,
> +                                     XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    uint8_t cap_ver = 0;
> +    uint8_t dev_type = 0;
> +
> +    /* TODO maybe better to use fonction from hw/pcie.c */

function
> +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> +                           + PCI_EXP_FLAGS)
> +        & PCI_EXP_FLAGS_VERS;
> +    dev_type = (pci_get_byte(s->dev.config + real_offset - reg->offset
> +                             + PCI_EXP_FLAGS)
> +                & PCI_EXP_FLAGS_TYPE) >> 4;
> +
> +    /* no need to initialize in case of Root Complex Integrated Endpoint
> +     * with cap_ver 1.x

Why?

> +     */
> +    if ((dev_type == PCI_EXP_TYPE_RC_END) && (cap_ver == 1)) {
> +        return PT_INVALID_REG;
> +    }
> +
> +    return reg->init_val;
> +}
> +/* initialize Device Control 2 register */
> +static uint32_t pt_devctrl2_reg_init(XenPCIPassthroughState *s,
> +                                     XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    uint8_t cap_ver = 0;
> +
> +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> +                           + PCI_EXP_FLAGS)
> +        & PCI_EXP_FLAGS_VERS;
> +
> +    /* no need to initialize in case of cap_ver 1.x */
> +    if (cap_ver == 1) {
> +        return PT_INVALID_REG;
> +    }
> +
> +    return reg->init_val;
> +}
> +/* initialize Link Control 2 register */
> +static uint32_t pt_linkctrl2_reg_init(XenPCIPassthroughState *s,
> +                                      XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    int reg_field = 0;
> +    uint8_t cap_ver = 0;
> +
> +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> +                           + PCI_EXP_FLAGS)
> +        & PCI_EXP_FLAGS_VERS;

This looks like a weird tab issue, but it might be just my mailer.

> +
> +    /* no need to initialize in case of cap_ver 1.x */
> +    if (cap_ver == 1) {
> +        return PT_INVALID_REG;
> +    }
> +
> +    /* set Supported Link Speed */
> +    reg_field |= PCI_EXP_LNKCAP_SLS &
> +        pci_get_byte(s->dev.config + real_offset - reg->offset
> +                     + PCI_EXP_LNKCAP);
> +
> +    return reg_field;
> +}
> +
> +/* PCI Express Capability Structure reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_pcie_tbl[] = {
> +    /* Next Pointer reg */
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Device Capabilities reg */
> +    {
> +        .offset     = PCI_EXP_DEVCAP,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .ro_mask    = 0x1FFCFFFF,
> +        .emu_mask   = 0x10000000,
> +        .init       = pt_common_reg_init,
> +        .u.dw.read  = pt_long_reg_read,
> +        .u.dw.write = pt_long_reg_write,
> +        .u.dw.restore = NULL,
> +    },
> +    /* Device Control reg */
> +    {
> +        .offset     = PCI_EXP_DEVCTL,
> +        .size       = 2,
> +        .init_val   = 0x2810,
> +        .ro_mask    = 0x8400,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_common_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = pt_word_reg_restore,
> +    },
> +    /* Link Control reg */
> +    {
> +        .offset     = PCI_EXP_LNKCTL,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFC34,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_linkctrl_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = pt_word_reg_restore,
> +    },
> +    /* Device Control 2 reg */
> +    {
> +        .offset     = 0x28,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFFE0,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_devctrl2_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = pt_word_reg_restore,
> +    },
> +    /* Link Control 2 reg */
> +    {
> +        .offset     = 0x30,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xE040,
> +        .emu_mask   = 0xFFFF,
> +        .init       = pt_linkctrl2_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = pt_word_reg_restore,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/*********************************
> + * Power Management Capability
> + */
> +
> +/* initialize Power Management Capabilities register */
> +static uint32_t pt_pmc_reg_init(XenPCIPassthroughState *s,
> +                                XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    PCIDevice *d = &s->dev;
> +
> +    if (!s->power_mgmt) {
> +        return reg->init_val;
> +    }
> +
> +    /* set Power Management Capabilities register */
> +    s->pm_state->pmc_field = pci_get_word(d->config + real_offset);
> +
> +    return reg->init_val;
> +}
> +/* initialize PCI Power Management Control/Status register */
> +static uint32_t pt_pmcsr_reg_init(XenPCIPassthroughState *s,
> +                                  XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint16_t cap_ver  = 0;
> +
> +    if (!s->power_mgmt) {
> +        return reg->init_val;
> +    }
> +
> +    /* check PCI Power Management support version */
> +    cap_ver = s->pm_state->pmc_field & PCI_PM_CAP_VER_MASK;
> +
> +    if (cap_ver > 2) {
> +        /* set No Soft Reset */
> +        s->pm_state->no_soft_reset =
> +            pci_get_byte(d->config + real_offset) & PCI_PM_CTRL_NO_SOFT_RESET;
> +    }
> +
> +    /* wake up real physical device */
> +    switch (host_pci_get_word(s->real_device, real_offset)
> +            & PCI_PM_CTRL_STATE_MASK) {
> +    case 0:
> +        break;
> +    case 1:
> +        PT_LOG("Power state transition D1 -> D0active\n");
> +        host_pci_set_word(s->real_device, real_offset, 0);
> +        break;
> +    case 2:
> +        PT_LOG("Power state transition D2 -> D0active\n");
> +        host_pci_set_word(s->real_device, real_offset, 0);
> +        usleep(200);

Heheh..
> +        break;
> +    case 3:
> +        PT_LOG("Power state transition D3hot -> D0active\n");
> +        host_pci_set_word(s->real_device, real_offset, 0);
> +        usleep(10 * 1000);
> +        pt_init_pci_config(s);
> +        break;
> +    }
> +
> +    return reg->init_val;
> +}
> +/* read Power Management Control/Status register */
> +static int pt_pmcsr_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                             uint16_t *value, uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t valid_emu_mask = reg->emu_mask;
> +
> +    if (!s->power_mgmt) {
> +        valid_emu_mask |= PCI_PM_CTRL_STATE_MASK | PCI_PM_CTRL_NO_SOFT_RESET;
> +    }
> +
> +    valid_emu_mask = valid_emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
> +
> +    return 0;
> +}
> +/* reset Interrupt and I/O resource  */
> +static void pt_reset_interrupt_and_io_mapping(XenPCIPassthroughState *s)
> +{
> +    PCIDevice *d = &s->dev;
> +    PCIIORegion *r;
> +    int i = 0;
> +    uint8_t e_device = 0;
> +    uint8_t e_intx = 0;
> +
> +    /* unbind INTx */
> +    e_device = PCI_SLOT(s->dev.devfn);
> +    e_intx = pci_intx(s);
> +
> +    if (s->machine_irq) {
> +        if (xc_domain_unbind_pt_irq(xen_xc, xen_domid, s->machine_irq,
> +                                    PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 0)) {
> +            PT_LOG("Error: Unbinding of interrupt failed!\n");
> +        }
> +    }
> +
> +    /* clear all virtual region address */
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        r = &d->io_regions[i];
> +        r->addr = -1;
> +    }
> +
> +    /* unmapping BAR */
> +    pt_bar_mapping(s, 0, 0);
> +}
> +/* check power state transition */
> +static int check_power_state(XenPCIPassthroughState *s)
> +{
> +    XenPTPM *pm_state = s->pm_state;
> +    PCIDevice *d = &s->dev;
> +    uint16_t read_val = 0;
> +    uint16_t cur_state = 0;
> +
> +    /* get current power state */
> +    read_val = host_pci_get_word(s->real_device,
> +                                 pm_state->pm_base + PCI_PM_CTRL);
> +    cur_state = read_val & PCI_PM_CTRL_STATE_MASK;
> +
> +    if (pm_state->req_state != cur_state) {
> +        PT_LOG("Error: Failed to change power state. "
> +               "[%02x:%02x.%x][requested state:%d][current state:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               pm_state->req_state, cur_state);
> +        return -1;
> +    }
> +    return 0;
> +}
> +/* write Power Management Control/Status register */
> +static void pt_from_d3hot_to_d0_with_reset(void *opaque)
> +{
> +    XenPCIPassthroughState *s = opaque;
> +    XenPTPM *pm_state = s->pm_state;
> +    int ret = 0;
> +
> +    /* check power state */
> +    ret = check_power_state(s);
> +
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    pt_init_pci_config(s);
> +
> +out:
> +    /* power state transition flags off */
> +    pm_state->flags &= ~PT_FLAG_TRANSITING;
> +
> +    qemu_free_timer(pm_state->pm_timer);
> +    pm_state->pm_timer = NULL;
> +}
> +static void pt_default_power_transition(void *opaque)
> +{
> +    XenPCIPassthroughState *ptdev = opaque;
> +    XenPTPM *pm_state = ptdev->pm_state;
> +
> +    /* check power state */
> +    check_power_state(ptdev);
> +
> +    /* power state transition flags off */
> +    pm_state->flags &= ~PT_FLAG_TRANSITING;
> +
> +    qemu_free_timer(pm_state->pm_timer);
> +    pm_state->pm_timer = NULL;
> +}
> +static int pt_pmcsr_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                              uint16_t *value, uint16_t dev_value,
> +                              uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    PCIDevice *d = &s->dev;
> +    uint16_t emu_mask = reg->emu_mask;
> +    uint16_t writable_mask = 0;
> +    uint16_t throughable_mask = 0;
> +    XenPTPM *pm_state = s->pm_state;
> +
> +    if (!s->power_mgmt) {
> +        emu_mask |= PCI_PM_CTRL_STATE_MASK | PCI_PM_CTRL_NO_SOFT_RESET;
> +    }
> +
> +    /* modify emulate register */
> +    writable_mask = emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    if (!s->power_mgmt) {
> +        return 0;
> +    }
> +
> +    /* set I/O device power state */
> +    pm_state->cur_state = dev_value & PCI_PM_CTRL_STATE_MASK;
> +
> +    /* set Guest requested PowerState */
> +    pm_state->req_state = *value & PCI_PM_CTRL_STATE_MASK;
> +
> +    /* check power state transition or not */
> +    if (pm_state->cur_state == pm_state->req_state) {
> +        /* not power state transition */
> +        return 0;
> +    }
> +
> +    /* check enable power state transition */
> +    if ((pm_state->req_state != 0) &&
> +        (pm_state->cur_state > pm_state->req_state)) {
> +        PT_LOG("Error: Invalid power transition. "
> +               "[%02x:%02x.%x][requested state:%d][current state:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               pm_state->req_state, pm_state->cur_state);
> +
> +        return 0;
> +    }
> +
> +    /* check if this device supports the requested power state */
> +    if (((pm_state->req_state == 1) && !(pm_state->pmc_field & PCI_PM_CAP_D1))
> +        || ((pm_state->req_state == 2) &&
> +            !(pm_state->pmc_field & PCI_PM_CAP_D2))) {
> +        PT_LOG("Error: Invalid power transition. "
> +               "[%02x:%02x.%x][requested state:%d][current state:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               pm_state->req_state, pm_state->cur_state);
> +
> +        return 0;
> +    }
> +
> +    /* in case of transition related to D3hot, it's necessary to wait 10 ms.
> +     * But because writing to register will be performed later on actually,
> +     * don't start QEMUTimer right now, just alloc and init QEMUTimer here.
> +     */
> +    if ((pm_state->cur_state == 3) || (pm_state->req_state == 3)) {
> +        if (pm_state->req_state == 0) {
> +            /* alloc and init QEMUTimer */
> +            if (!pm_state->no_soft_reset) {
> +                pm_state->pm_timer = qemu_new_timer_ms(rt_clock,
> +                    pt_from_d3hot_to_d0_with_reset, s);
> +
> +                /* reset Interrupt and I/O resource mapping */
> +                pt_reset_interrupt_and_io_mapping(s);
> +            } else {
> +                pm_state->pm_timer = qemu_new_timer_ms(rt_clock,
> +                                        pt_default_power_transition, s);
> +            }
> +        } else {
> +            /* alloc and init QEMUTimer */
> +            pm_state->pm_timer = qemu_new_timer_ms(rt_clock,
> +                pt_default_power_transition, s);
> +        }
> +
> +        /* set power state transition delay */
> +        pm_state->pm_delay = 10;
> +
> +        /* power state transition flags on */
> +        pm_state->flags |= PT_FLAG_TRANSITING;
> +    }
> +    /* in case of transition related to D0, D1 and D2,
> +     * no need to use QEMUTimer.
> +     * So, we perfom writing to register here and then read it back.
> +     */
> +    else {
> +        /* write power state to I/O device register */
> +        host_pci_set_word(s->real_device, pm_state->pm_base + PCI_PM_CTRL,
> +                          *value);
> +
> +        /* in case of transition related to D2,
> +         * it's necessary to wait 200 usec.
> +         * But because QEMUTimer do not support microsec unit right now,
> +         * so we do wait ourself here.
> +         */
> +        if ((pm_state->cur_state == 2) || (pm_state->req_state == 2)) {
> +            usleep(200);
> +        }
> +
> +        /* check power state */
> +        check_power_state(s);
> +
> +        /* recreate value for writing to I/O device register */
> +        *value = host_pci_get_word(s->real_device,
> +                                   pm_state->pm_base + PCI_PM_CTRL);
> +    }
> +
> +    return 0;
> +}
> +
> +/* restore Power Management Control/Status register */
> +static int pt_pmcsr_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
> +                                uint32_t real_offset, uint16_t dev_value,
> +                                uint16_t *value)
> +{
> +    /* create value for restoring to I/O device register
> +     * No need to restore, just clear PME Enable and PME Status bit
> +     * Note: register type of PME Status bit is RW1C, so clear by writing 1b
> +     */
> +    *value = (dev_value & ~PCI_PM_CTRL_PME_ENABLE) | PCI_PM_CTRL_PME_STATUS;
> +
> +    return 0;
> +}
> +
> +
> +/* Power Management Capability reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_pm_tbl[] = {
> +    /* Next Pointer reg */
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Power Management Capabilities reg */
> +    {
> +        .offset     = PCI_CAP_FLAGS,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFFFF,
> +        .emu_mask   = 0xF9C8,
> +        .init       = pt_pmc_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_word_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* PCI Power Management Control/Status reg */
> +    {
> +        .offset     = PCI_PM_CTRL,
> +        .size       = 2,
> +        .init_val   = 0x0008,
> +        .ro_mask    = 0xE1FC,
> +        .emu_mask   = 0x8100,
> +        .init       = pt_pmcsr_reg_init,
> +        .u.w.read   = pt_pmcsr_reg_read,
> +        .u.w.write  = pt_pmcsr_reg_write,
> +        .u.w.restore  = pt_pmcsr_reg_restore,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/****************************
> + * Capabilities
> + */
> +
> +/* AER register operations */
> +
> +static void aer_save_one_register(XenPCIPassthroughState *s, int offset)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint32_t aer_base = s->pm_state->aer_base;
> +    uint32_t val = 0;
> +
> +    val = host_pci_get_long(s->real_device, aer_base + offset);
> +    pci_set_long(d->config + aer_base + offset, val);
> +}
> +static void pt_aer_reg_save(XenPCIPassthroughState *s)
> +{
> +    /* after reset, following register values should be restored.
> +     * So, save them.
> +     */
> +    aer_save_one_register(s, PCI_ERR_UNCOR_MASK);
> +    aer_save_one_register(s, PCI_ERR_UNCOR_SEVER);
> +    aer_save_one_register(s, PCI_ERR_COR_MASK);
> +    aer_save_one_register(s, PCI_ERR_CAP);
> +}
> +static void aer_restore_one_register(XenPCIPassthroughState *s, int offset)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint32_t aer_base = s->pm_state->aer_base;
> +    uint32_t config = 0;
> +
> +    config = pci_get_long(d->config + aer_base + offset);
> +    host_pci_set_long(s->real_device, aer_base + offset, config);
> +}
> +static void pt_aer_reg_restore(XenPCIPassthroughState *s)
> +{
> +    /* the following registers should be reconfigured to correct values
> +     * after reset. restore them.
> +     * other registers should not be reconfigured after reset
> +     * if there is no reason
> +     */
> +    aer_restore_one_register(s, PCI_ERR_UNCOR_MASK);
> +    aer_restore_one_register(s, PCI_ERR_UNCOR_SEVER);
> +    aer_restore_one_register(s, PCI_ERR_COR_MASK);
> +    aer_restore_one_register(s, PCI_ERR_CAP);
> +}
> +
> +/* capability structure register group size functions */
> +
> +static uint8_t pt_reg_grp_size_init(XenPCIPassthroughState *s,
> +                                    const XenPTRegGroupInfo *grp_reg,
> +                                    uint32_t base_offset)
> +{
> +    return grp_reg->grp_size;
> +}
> +/* get Power Management Capability Structure register group size */
> +static uint8_t pt_pm_size_init(XenPCIPassthroughState *s,
> +                               const XenPTRegGroupInfo *grp_reg,
> +                               uint32_t base_offset)
> +{
> +    if (!s->power_mgmt) {
> +        return grp_reg->grp_size;
> +    }
> +
> +    s->pm_state = g_malloc0(sizeof (XenPTPM));
> +
> +    /* set Power Management Capability base offset */
> +    s->pm_state->pm_base = base_offset;
> +
> +    /* find AER register and set AER Capability base offset */
> +    s->pm_state->aer_base = host_pci_find_ext_cap_offset(s->real_device,
> +                                                         PCI_EXT_CAP_ID_ERR);
> +
> +    /* save AER register */
> +    if (s->pm_state->aer_base) {
> +        pt_aer_reg_save(s);
> +    }
> +
> +    return grp_reg->grp_size;
> +}
> +/* get Vendor Specific Capability Structure register group size */
> +static uint8_t pt_vendor_size_init(XenPCIPassthroughState *s,
> +                                   const XenPTRegGroupInfo *grp_reg,
> +                                   uint32_t base_offset)
> +{
> +    return pci_get_byte(s->dev.config + base_offset + 0x02);
> +}
> +/* get PCI Express Capability Structure register group size */
> +static uint8_t pt_pcie_size_init(XenPCIPassthroughState *s,
> +                                 const XenPTRegGroupInfo *grp_reg,
> +                                 uint32_t base_offset)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint16_t exp_flag = 0;
> +    uint16_t type = 0;
> +    uint16_t version = 0;
> +    uint8_t pcie_size = 0;
> +
> +    exp_flag = pci_get_word(d->config + base_offset + PCI_EXP_FLAGS);
> +    type = (exp_flag & PCI_EXP_FLAGS_TYPE) >> 4;
> +    version = exp_flag & PCI_EXP_FLAGS_VERS;
> +
> +    /* calculate size depend on capability version and device/port type */
> +    /* in case of PCI Express Base Specification Rev 1.x */
> +    if (version == 1) {
> +        /* The PCI Express Capabilities, Device Capabilities, and Device
> +         * Status/Control registers are required for all PCI Express devices.
> +         * The Link Capabilities and Link Status/Control are required for all
> +         * Endpoints that are not Root Complex Integrated Endpoints. Endpoints
> +         * are not required to implement registers other than those listed
> +         * above and terminate the capability structure.
> +         */
> +        switch (type) {
> +        case PCI_EXP_TYPE_ENDPOINT:
> +        case PCI_EXP_TYPE_LEG_END:
> +            pcie_size = 0x14;
> +            break;
> +        case PCI_EXP_TYPE_RC_END:
> +            /* has no link */
> +            pcie_size = 0x0C;
> +            break;
> +        /* only EndPoint passthrough is supported */
> +        case PCI_EXP_TYPE_ROOT_PORT:
> +        case PCI_EXP_TYPE_UPSTREAM:
> +        case PCI_EXP_TYPE_DOWNSTREAM:
> +        case PCI_EXP_TYPE_PCI_BRIDGE:
> +        case PCI_EXP_TYPE_PCIE_BRIDGE:
> +        case PCI_EXP_TYPE_RC_EC:
> +        default:
> +            hw_error("Internal error: Unsupported device/port type[%d]. "
> +                     "I/O emulator exit.\n", type);
> +        }
> +    }
> +    /* in case of PCI Express Base Specification Rev 2.0 */
> +    else if (version == 2) {
> +        switch (type) {
> +        case PCI_EXP_TYPE_ENDPOINT:
> +        case PCI_EXP_TYPE_LEG_END:
> +        case PCI_EXP_TYPE_RC_END:
> +            /* For Functions that do not implement the registers,
> +             * these spaces must be hardwired to 0b.
> +             */
> +            pcie_size = 0x3C;
> +            break;
> +        /* only EndPoint passthrough is supported */
> +        case PCI_EXP_TYPE_ROOT_PORT:
> +        case PCI_EXP_TYPE_UPSTREAM:
> +        case PCI_EXP_TYPE_DOWNSTREAM:
> +        case PCI_EXP_TYPE_PCI_BRIDGE:
> +        case PCI_EXP_TYPE_PCIE_BRIDGE:
> +        case PCI_EXP_TYPE_RC_EC:
> +        default:
> +            hw_error("Internal error: Unsupported device/port type[%d]. "
> +                     "I/O emulator exit.\n", type);
> +        }
> +    } else {
> +        hw_error("Internal error: Unsupported capability version[%d]. "
> +                 "I/O emulator exit.\n", version);
> +    }
> +
> +    return pcie_size;
> +}
> +
> +static const XenPTRegGroupInfo pt_emu_reg_grp_tbl[] = {
> +    /* Header Type0 reg group */
> +    {
> +        .grp_id      = 0xFF,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0x40,
> +        .size_init   = pt_reg_grp_size_init,
> +        .emu_reg_tbl = pt_emu_reg_header0_tbl,
> +    },
> +    /* PCI PowerManagement Capability reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_PM,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = PCI_PM_SIZEOF,
> +        .size_init   = pt_pm_size_init,
> +        .emu_reg_tbl = pt_emu_reg_pm_tbl,
> +    },
> +    /* AGP Capability Structure reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_AGP,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x30,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* Vital Product Data Capability Structure reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_VPD,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0x08,
> +        .size_init   = pt_reg_grp_size_init,
> +        .emu_reg_tbl = pt_emu_reg_vpd_tbl,
> +    },
> +    /* Slot Identification reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_SLOTID,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x04,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* PCI-X Capabilities List Item reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_PCIX,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x18,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* Vendor Specific Capability Structure reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_VNDR,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0xFF,
> +        .size_init   = pt_vendor_size_init,
> +        .emu_reg_tbl = pt_emu_reg_vendor_tbl,
> +    },
> +    /* SHPC Capability List Item reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_SHPC,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x08,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* Subsystem ID and Subsystem Vendor ID Capability List Item reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_SSVID,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x08,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* AGP 8x Capability Structure reg group */
> +    {
> +        .grp_id     = PCI_CAP_ID_AGP3,
> +        .grp_type   = GRP_TYPE_HARDWIRED,
> +        .grp_size   = 0x30,
> +        .size_init  = pt_reg_grp_size_init,
> +    },
> +    /* PCI Express Capability Structure reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_EXP,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0xFF,
> +        .size_init   = pt_pcie_size_init,
> +        .emu_reg_tbl = pt_emu_reg_pcie_tbl,
> +    },
> +    {
> +        .grp_size = 0,
> +    },
> +};
> +
> +/* initialize Capabilities Pointer or Next Pointer register */
> +static uint32_t pt_ptr_reg_init(XenPCIPassthroughState *s,
> +                                XenPTRegInfo *reg, uint32_t real_offset)
> +{
> +    /* uint32_t reg_field = (uint32_t)s->dev.config[real_offset]; */
> +    uint32_t reg_field = pci_get_byte(s->dev.config + real_offset);
> +    int i;
> +
> +    /* find capability offset */
> +    while (reg_field) {
> +        for (i = 0; pt_emu_reg_grp_tbl[i].grp_size != 0; i++) {
> +            if (pt_hide_dev_cap(s->real_device,
> +                                pt_emu_reg_grp_tbl[i].grp_id)) {
> +                continue;
> +            }
> +            if (pt_emu_reg_grp_tbl[i].grp_id == s->dev.config[reg_field]) {
> +                if (pt_emu_reg_grp_tbl[i].grp_type == GRP_TYPE_EMU) {
> +                    goto out;
> +                }
> +                /* ignore the 0 hardwired capability, find next one */
> +                break;
> +            }
> +        }
> +        /* next capability */
> +        /* reg_field = (uint32_t)s->dev.config[reg_field + 1]; */
> +        reg_field = pci_get_byte(s->dev.config + reg_field + 1);
> +    }
> +
> +out:
> +    return reg_field;
> +}
> +
> +
> +/*************
> + * Main
> + */
> +
> +/* restore a part of I/O device register */
> +static void pt_config_restore(XenPCIPassthroughState *s)
> +{
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegInfo *reg = NULL;
> +    uint32_t real_offset = 0;
> +    uint32_t read_val = 0;
> +    uint32_t val = 0;
> +    int ret = 0;
> +
> +    /* find emulate register group entry */
> +    QLIST_FOREACH(reg_grp_entry, &s->reg_grp_tbl, entries) {
> +        /* find emulate register entry */
> +        QLIST_FOREACH(reg_entry, &reg_grp_entry->reg_tbl_list, entries) {
> +            reg = reg_entry->reg;
> +
> +            /* check whether restoring is needed */
> +            if (!reg->u.b.restore) {
> +                continue;
> +            }
> +
> +            real_offset = reg_grp_entry->base_offset + reg->offset;
> +
> +            /* read I/O device register value */
> +            ret = host_pci_get_block(s->real_device, real_offset,
> +                                     (uint8_t *)&read_val, reg->size);
> +
> +            if (!ret) {
> +                PT_LOG("Error: pci_read_block failed. "
> +                       "return value[%d].\n", ret);
> +                memset(&read_val, 0xff, reg->size);
> +            }
> +
> +            val = 0;
> +
> +            /* restore based on register size */
> +            switch (reg->size) {
> +            case 1:
> +                /* byte register */
> +                ret = reg->u.b.restore(s, reg_entry, real_offset,
> +                                       (uint8_t)read_val, (uint8_t *)&val);
> +                break;
> +            case 2:
> +                /* word register */
> +                ret = reg->u.w.restore(s, reg_entry, real_offset,
> +                                       (uint16_t)read_val, (uint16_t *)&val);
> +                break;
> +            case 4:
> +                /* double word register */
> +                ret = reg->u.dw.restore(s, reg_entry, real_offset,
> +                                        (uint32_t)read_val, (uint32_t *)&val);
> +                break;
> +            }
> +
> +            /* restoring error */
> +            if (ret < 0) {
> +                hw_error("Internal error: Invalid restoring "
> +                         "return value[%d]. I/O emulator exit.\n", ret);
> +            }
> +
> +            PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
> +                          pci_bus_num(s->dev.bus), PCI_SLOT(s->dev.devfn),
> +                          PCI_FUNC(s->dev.devfn),
> +                          real_offset, val, reg->size);
> +
> +            ret = host_pci_set_block(s->real_device, real_offset,
> +                                     (uint8_t *)&val, reg->size);
> +
> +            if (!ret) {
> +                PT_LOG("Error: pci_write_block failed. "
> +                       "return value[%d].\n", ret);
> +            }
> +        }
> +    }
> +
> +    /* if AER supported, restore it */
> +    if (s->pm_state->aer_base) {
> +        pt_aer_reg_restore(s);
> +    }
> +}
> +/* reinitialize all emulate registers */
> +static void pt_config_reinit(XenPCIPassthroughState *s)
> +{
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegInfo *reg = NULL;
> +
> +    /* find emulate register group entry */
> +    QLIST_FOREACH(reg_grp_entry, &s->reg_grp_tbl, entries) {
> +        /* find emulate register entry */
> +        QLIST_FOREACH(reg_entry, &reg_grp_entry->reg_tbl_list, entries) {
> +            reg = reg_entry->reg;
> +            if (reg->init) {
> +                /* initialize emulate register */
> +                reg_entry->data =
> +                    reg->init(s, reg_entry->reg,
> +                              reg_grp_entry->base_offset + reg->offset);
> +            }
> +        }
> +    }
> +}
> +
> +static int pt_init_pci_config(XenPCIPassthroughState *s)
> +{
> +    PCIDevice *d = &s->dev;
> +    int ret = 0;
> +
> +    PT_LOG("Reinitialize PCI configuration registers due to power state"
> +           " transition with internal reset. [%02x:%02x.%x]\n",
> +           pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn));
> +
> +    /* restore a part of I/O device register */
> +    pt_config_restore(s);
> +
> +    /* reinitialize all emulate register */
> +    pt_config_reinit(s);
> +
> +    /* rebind machine_irq to device */
> +    if (s->machine_irq != 0) {
> +        uint8_t e_device = PCI_SLOT(s->dev.devfn);
> +        uint8_t e_intx = pci_intx(s);
> +
> +        ret = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, s->machine_irq, 0,
> +                                        e_device, e_intx);
> +        if (ret < 0) {
> +            PT_LOG("Error: Rebinding of interrupt failed! ret=%d\n", ret);
> +        }
> +    }
> +
> +    return ret;
> +}
> +
> +static uint8_t find_cap_offset(XenPCIPassthroughState *s, uint8_t cap)
> +{
> +    int id;
> +    int max_cap = 48;
> +    int pos = PCI_CAPABILITY_LIST;
> +    int status;
> +
> +    status = host_pci_get_byte(s->real_device, PCI_STATUS);
> +    if ((status & PCI_STATUS_CAP_LIST) == 0) {
> +        return 0;
> +    }
> +
> +    while (max_cap--) {
> +        pos = host_pci_get_byte(s->real_device, pos);
> +        if (pos < 0x40) {
> +            break;
> +        }
> +
> +        pos &= ~3;
> +        id = host_pci_get_byte(s->real_device, pos + PCI_CAP_LIST_ID);
> +
> +        if (id == 0xff) {
> +            break;
> +        }
> +        if (id == cap) {
> +            return pos;
> +        }
> +
> +        pos += PCI_CAP_LIST_NEXT;
> +    }
> +    return 0;
> +}
> +
> +static void pt_config_reg_init(XenPCIPassthroughState *s,
> +                               XenPTRegGroup *reg_grp, XenPTRegInfo *reg)
> +{
> +    XenPTReg *reg_entry;
> +    uint32_t data = 0;
> +
> +    reg_entry = g_malloc0(sizeof (XenPTReg));
> +
> +    reg_entry->reg = reg;
> +    reg_entry->data = 0;
> +
> +    if (reg->init) {
> +        /* initialize emulate register */
> +        data = reg->init(s, reg_entry->reg,
> +                         reg_grp->base_offset + reg->offset);
> +        if (data == PT_INVALID_REG) {
> +            /* free unused BAR register entry */
> +            free(reg_entry);
> +            return;
> +        }
> +        /* set register value */
> +        reg_entry->data = data;
> +    }
> +    /* list add register entry */
> +    QLIST_INSERT_HEAD(&reg_grp->reg_tbl_list, reg_entry, entries);
> +
> +    return;
> +}
> +
> +void pt_config_init(XenPCIPassthroughState *s)
> +{
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    uint32_t reg_grp_offset = 0;
> +    XenPTRegInfo *reg_tbl = NULL;
> +    int i, j;
> +
> +    QLIST_INIT(&s->reg_grp_tbl);
> +
> +    for (i = 0; pt_emu_reg_grp_tbl[i].grp_size != 0; i++) {
> +        if (pt_emu_reg_grp_tbl[i].grp_id != 0xFF) {
> +            if (pt_hide_dev_cap(s->real_device,
> +                                pt_emu_reg_grp_tbl[i].grp_id)) {
> +                continue;
> +            }
> +
> +            reg_grp_offset = find_cap_offset(s, pt_emu_reg_grp_tbl[i].grp_id);
> +
> +            if (!reg_grp_offset) {
> +                continue;
> +            }
> +        }
> +
> +        reg_grp_entry = g_malloc0(sizeof (XenPTRegGroup));
> +        QLIST_INIT(&reg_grp_entry->reg_tbl_list);
> +        QLIST_INSERT_HEAD(&s->reg_grp_tbl, reg_grp_entry, entries);
> +
> +        reg_grp_entry->base_offset = reg_grp_offset;
> +        reg_grp_entry->reg_grp = pt_emu_reg_grp_tbl + i;
> +        if (pt_emu_reg_grp_tbl[i].size_init) {
> +            /* get register group size */
> +            reg_grp_entry->size =
> +                pt_emu_reg_grp_tbl[i].size_init(s, reg_grp_entry->reg_grp,
> +                                                reg_grp_offset);
> +        }
> +
> +        if (pt_emu_reg_grp_tbl[i].grp_type == GRP_TYPE_EMU) {
> +            if (pt_emu_reg_grp_tbl[i].emu_reg_tbl) {
> +                reg_tbl = pt_emu_reg_grp_tbl[i].emu_reg_tbl;
> +                /* initialize capability register */
> +                for (j = 0; reg_tbl->size != 0; j++, reg_tbl++) {
> +                    /* initialize capability register */
> +                    pt_config_reg_init(s, reg_grp_entry, reg_tbl);
> +                }
> +            }
> +        }
> +        reg_grp_offset = 0;
> +    }
> +
> +    return;
> +}
> +
> +/* delete all emulate register */
> +void pt_config_delete(XenPCIPassthroughState *s)
> +{
> +    struct XenPTRegGroup *reg_group, *next_grp;
> +    struct XenPTReg *reg, *next_reg;
> +
> +    /* free Power Management info table */
> +    if (s->pm_state) {
> +        if (s->pm_state->pm_timer) {
> +            qemu_del_timer(s->pm_state->pm_timer);
> +            qemu_free_timer(s->pm_state->pm_timer);
> +            s->pm_state->pm_timer = NULL;
> +        }
> +
> +        g_free(s->pm_state);
> +    }
> +
> +    /* free all register group entry */
> +    QLIST_FOREACH_SAFE(reg_group, &s->reg_grp_tbl, entries, next_grp) {
> +        /* free all register entry */
> +        QLIST_FOREACH_SAFE(reg, &reg_group->reg_tbl_list, entries, next_reg) {
> +            QLIST_REMOVE(reg, entries);
> +            g_free(reg);
> +        }
> +
> +        QLIST_REMOVE(reg_group, entries);
> +        g_free(reg_group);
> +    }
> +}
> -- 
> Anthony PERARD
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
Anthony PERARD - Nov. 11, 2011, 5:40 p.m.
On Thu, 10 Nov 2011, Konrad Rzeszutek Wilk wrote:

> On Fri, Oct 28, 2011 at 04:07:34PM +0100, Anthony PERARD wrote:
> > From: Allen Kay <allen.m.kay@intel.com>
> >
> > Signed-off-by: Allen Kay <allen.m.kay@intel.com>
> > Signed-off-by: Guy Zana <guy@neocleus.com>
> > Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
> > ---
> >  Makefile.target                      |    1 +
> >  hw/xen_pci_passthrough.h             |    2 +
> >  hw/xen_pci_passthrough_config_init.c | 2068 ++++++++++++++++++++++++++++++++++
> >  3 files changed, 2071 insertions(+), 0 deletions(-)
> >  create mode 100644 hw/xen_pci_passthrough_config_init.c
> >

[...]

> > +/* A return value of 1 means the capability should NOT be exposed to guest. */
> > +static int pt_hide_dev_cap(const HostPCIDevice *d, uint8_t grp_id)
> > +{
> > +    switch (grp_id) {
> > +    case PCI_CAP_ID_EXP:
> > +        /* The PCI Express Capability Structure of the VF of Intel 82599 10GbE
> > +         * Controller looks trivial, e.g., the PCI Express Capabilities
> > +         * Register is 0. We should not try to expose it to guest.
>
> Why not?

Because (an old commit):

passthrough: support the assignment of the VF of Intel 82599 10GbE Controller

The datasheet is available at
http://download.intel.com/design/network/datashts/82599_datasheet.pdf

See 'Table 9.7. VF PCIe Configuration Space' of the datasheet, the PCI
Express Capability Structure of the VF of Intel 82599 10GbE Controller looks
trivial, e.g., the PCI Express Capabilities Register is 0, so the Capability
Version is 0 and pt_pcie_size_init() would fail.

We should not try to expose the PCIe cap of the device to guest.

a patch from Dexuan Cui <dexuan.cui@intel.com>

> > +         */
> > +        if (d->vendor_id == PCI_VENDOR_ID_INTEL &&
> > +                d->device_id == PCI_DEVICE_ID_INTEL_82599_VF) {
> > +            return 1;
> > +        }
> > +        break;
> > +    }
> > +    return 0;
> > +}
> > +
> > +/*   find emulate register group entry */
> > +XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address)
> > +{
> > +    XenPTRegGroup *entry = NULL;
> > +
> > +    /* find register group entry */
> > +    QLIST_FOREACH(entry, &s->reg_grp_tbl, entries) {
> > +        /* check address */
> > +        if ((entry->base_offset <= address)
> > +            && ((entry->base_offset + entry->size) > address)) {
> > +            return entry;
> > +        }
> > +    }
> > +
> > +    /* group entry not found */
> > +    return NULL;
> > +}
> > +
> > +/* find emulate register entry */
> > +XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address)
> > +{
> > +    XenPTReg *reg_entry = NULL;
> > +    XenPTRegInfo *reg = NULL;
> > +    uint32_t real_offset = 0;
> > +
> > +    /* find register entry */
> > +    QLIST_FOREACH(reg_entry, &reg_grp->reg_tbl_list, entries) {
> > +        reg = reg_entry->reg;
> > +        real_offset = reg_grp->base_offset + reg->offset;
> > +        /* check address */
> > +        if ((real_offset <= address)
> > +            && ((real_offset + reg->size) > address)) {
> > +            return reg_entry;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +/* parse BAR */
> > +static PTBarFlag pt_bar_reg_parse(XenPCIPassthroughState *s, XenPTRegInfo *reg)
> > +{
> > +    PCIDevice *d = &s->dev;
> > +    XenPTRegion *region = NULL;
> > +    PCIIORegion *r;
> > +    int index = 0;
> > +
> > +    /* check 64bit BAR */
> > +    index = pt_bar_offset_to_index(reg->offset);
> > +    if ((0 < index) && (index < PCI_ROM_SLOT)) {
>
> This is  a bit confusing. Can you make the index be on the same
> side, like
>
> if ((0 < index) && (PCI_ROM_SLOT > index)
>
> or better:
>
> if ((index < 0) && (index < PCI_ROM_SLOT))
>
> um, which looks wrong. Should it be 'index > 0' ?

Every other form is a bit confusing to me. I'd like to write
0 < index < ROM_SLOT, so I know that index is between 0 and ROM_SLOT.
But, it's C and not math, so I wrote the closest way I can.

> > +        int flags = s->real_device->io_regions[index - 1].flags;
>
> Do we want to check the index - 1 to make sure it is not negative?

We have:
  0 < index < ROM_SLOT
so (index - 1) give us:
  0 <= index - 1 < ROM_SLOT - 1

So (index - 1) can be 0, but under 0.
;)


[...]

> > +/********************
> > + * Header Type0
> > + */
> > +
> > +static uint32_t pt_vendor_reg_init(XenPCIPassthroughState *s,
> > +                                   XenPTRegInfo *reg, uint32_t real_offset)
> > +{
> > +    return s->real_device->vendor_id;
> > +}
> > +static uint32_t pt_device_reg_init(XenPCIPassthroughState *s,
> > +                                   XenPTRegInfo *reg, uint32_t real_offset)
> > +{
> > +    return s->real_device->device_id;
> > +}
> > +static uint32_t pt_status_reg_init(XenPCIPassthroughState *s,
> > +                                   XenPTRegInfo *reg, uint32_t real_offset)
> > +{
> > +    XenPTRegGroup *reg_grp_entry = NULL;
> > +    XenPTReg *reg_entry = NULL;
> > +    int reg_field = 0;
> > +
> > +    /* find Header register group */
> > +    reg_grp_entry = pt_find_reg_grp(s, PCI_CAPABILITY_LIST);
> > +    if (reg_grp_entry) {
> > +        /* find Capabilities Pointer register */
> > +        reg_entry = pt_find_reg(reg_grp_entry, PCI_CAPABILITY_LIST);
> > +        if (reg_entry) {
> > +            /* check Capabilities Pointer register */
> > +            if (reg_entry->data) {
> > +                reg_field |= PCI_STATUS_CAP_LIST;
> > +            } else {
> > +                reg_field &= ~PCI_STATUS_CAP_LIST;
> > +            }
> > +        } else {
> > +            hw_error("Internal error: Couldn't find pt_reg_tbl for "
> > +                     "Capabilities Pointer register. I/O emulator exit.\n");
>
> Yikes. abort here? Um, can we just return a fault code instead?

This should probably not happend, I suppose.

> > +        }
> > +    } else {
> > +        hw_error("Internal error: Couldn't find pt_reg_grp_tbl for Header. "
> > +                 "I/O emulator exit.\n");
> > +    }
> > +
> > +    return reg_field;
> > +}

[...]

> > +static uint32_t pt_bar_reg_init(XenPCIPassthroughState *s, XenPTRegInfo *reg,
> > +                                uint32_t real_offset)
> > +{
> > +    int reg_field = 0;
> > +    int index;
> > +
> > +    /* get BAR index */
> > +    index = pt_bar_offset_to_index(reg->offset);
> > +    if (index < 0) {
> > +        hw_error("Internal error: Invalid BAR index[%d]. "
> > +                 "I/O emulator exit.\n", index);
> > +    }
> > +
> > +    /* set initial guest physical base address to -1 */
> > +    s->bases[index].e_physbase = -1;
>
> Um, use that define PCI_.. something macro.

:(, e_physbase is uint32, and PCI_BAR_UNMAPPED is uint64.

> > +
> > +    /* set BAR flag */
> > +    s->bases[index].bar_flag = pt_bar_reg_parse(s, reg);
> > +    if (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED) {
> > +        reg_field = PT_INVALID_REG;
> > +    }
> > +
> > +    return reg_field;
> > +}

[...]

> > +
> > +
> > +/*****************************
> > + * PCI Express Capability
> > + */
> > +
> > +/* initialize Link Control register */
> > +static uint32_t pt_linkctrl_reg_init(XenPCIPassthroughState *s,
> > +                                     XenPTRegInfo *reg, uint32_t real_offset)
> > +{
> > +    uint8_t cap_ver = 0;
> > +    uint8_t dev_type = 0;
> > +
> > +    /* TODO maybe better to use fonction from hw/pcie.c */
>
> function
> > +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> > +                           + PCI_EXP_FLAGS)
> > +        & PCI_EXP_FLAGS_VERS;
> > +    dev_type = (pci_get_byte(s->dev.config + real_offset - reg->offset
> > +                             + PCI_EXP_FLAGS)
> > +                & PCI_EXP_FLAGS_TYPE) >> 4;
> > +
> > +    /* no need to initialize in case of Root Complex Integrated Endpoint
> > +     * with cap_ver 1.x
>
> Why?

Who knows? I don't. And `git log` does not give me more information.

> > +     */
> > +    if ((dev_type == PCI_EXP_TYPE_RC_END) && (cap_ver == 1)) {
> > +        return PT_INVALID_REG;
> > +    }
> > +
> > +    return reg->init_val;
> > +}
> > +/* initialize Device Control 2 register */
> > +static uint32_t pt_devctrl2_reg_init(XenPCIPassthroughState *s,
> > +                                     XenPTRegInfo *reg, uint32_t real_offset)
> > +{
> > +    uint8_t cap_ver = 0;
> > +
> > +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> > +                           + PCI_EXP_FLAGS)
> > +        & PCI_EXP_FLAGS_VERS;
> > +
> > +    /* no need to initialize in case of cap_ver 1.x */
> > +    if (cap_ver == 1) {
> > +        return PT_INVALID_REG;
> > +    }
> > +
> > +    return reg->init_val;
> > +}
> > +/* initialize Link Control 2 register */
> > +static uint32_t pt_linkctrl2_reg_init(XenPCIPassthroughState *s,
> > +                                      XenPTRegInfo *reg, uint32_t real_offset)
> > +{
> > +    int reg_field = 0;
> > +    uint8_t cap_ver = 0;
> > +
> > +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> > +                           + PCI_EXP_FLAGS)
> > +        & PCI_EXP_FLAGS_VERS;
>
> This looks like a weird tab issue, but it might be just my mailer.

Nop, there is no tab.

Maybe writing it like that:
> cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
>                        + PCI_EXP_FLAGS);
> cap_ver &= PCI_EXP_FLAGS_VERS;
would be better.

> > +
> > +    /* no need to initialize in case of cap_ver 1.x */
> > +    if (cap_ver == 1) {
> > +        return PT_INVALID_REG;
> > +    }
> > +
> > +    /* set Supported Link Speed */
> > +    reg_field |= PCI_EXP_LNKCAP_SLS &
> > +        pci_get_byte(s->dev.config + real_offset - reg->offset
> > +                     + PCI_EXP_LNKCAP);
> > +
> > +    return reg_field;
> > +}
> > +

[...]

> > +/*********************************
> > + * Power Management Capability
> > + */
> > +
> > +/* initialize Power Management Capabilities register */
> > +static uint32_t pt_pmc_reg_init(XenPCIPassthroughState *s,
> > +                                XenPTRegInfo *reg, uint32_t real_offset)
> > +{
> > +    PCIDevice *d = &s->dev;
> > +
> > +    if (!s->power_mgmt) {
> > +        return reg->init_val;
> > +    }
> > +
> > +    /* set Power Management Capabilities register */
> > +    s->pm_state->pmc_field = pci_get_word(d->config + real_offset);
> > +
> > +    return reg->init_val;
> > +}
> > +/* initialize PCI Power Management Control/Status register */
> > +static uint32_t pt_pmcsr_reg_init(XenPCIPassthroughState *s,
> > +                                  XenPTRegInfo *reg, uint32_t real_offset)
> > +{
> > +    PCIDevice *d = &s->dev;
> > +    uint16_t cap_ver  = 0;
> > +
> > +    if (!s->power_mgmt) {
> > +        return reg->init_val;
> > +    }
> > +
> > +    /* check PCI Power Management support version */
> > +    cap_ver = s->pm_state->pmc_field & PCI_PM_CAP_VER_MASK;
> > +
> > +    if (cap_ver > 2) {
> > +        /* set No Soft Reset */
> > +        s->pm_state->no_soft_reset =
> > +            pci_get_byte(d->config + real_offset) & PCI_PM_CTRL_NO_SOFT_RESET;
> > +    }
> > +
> > +    /* wake up real physical device */
> > +    switch (host_pci_get_word(s->real_device, real_offset)
> > +            & PCI_PM_CTRL_STATE_MASK) {
> > +    case 0:
> > +        break;
> > +    case 1:
> > +        PT_LOG("Power state transition D1 -> D0active\n");
> > +        host_pci_set_word(s->real_device, real_offset, 0);
> > +        break;
> > +    case 2:
> > +        PT_LOG("Power state transition D2 -> D0active\n");
> > +        host_pci_set_word(s->real_device, real_offset, 0);
> > +        usleep(200);
>
> Heheh..

I don't know if I can remove it safely, or not.

> > +        break;
> > +    case 3:
> > +        PT_LOG("Power state transition D3hot -> D0active\n");
> > +        host_pci_set_word(s->real_device, real_offset, 0);
> > +        usleep(10 * 1000);

Same for this one.

> > +        pt_init_pci_config(s);
> > +        break;
> > +    }
> > +
> > +    return reg->init_val;
> > +}
Konrad Rzeszutek Wilk - Nov. 11, 2011, 6:11 p.m.
> > > +    case PCI_CAP_ID_EXP:
> > > +        /* The PCI Express Capability Structure of the VF of Intel 82599 10GbE
> > > +         * Controller looks trivial, e.g., the PCI Express Capabilities
> > > +         * Register is 0. We should not try to expose it to guest.
> >
> > Why not?
> 
> Because (an old commit):
> 
> passthrough: support the assignment of the VF of Intel 82599 10GbE Controller
> 
> The datasheet is available at
> http://download.intel.com/design/network/datashts/82599_datasheet.pdf
> 
> See 'Table 9.7. VF PCIe Configuration Space' of the datasheet, the PCI
> Express Capability Structure of the VF of Intel 82599 10GbE Controller looks
> trivial, e.g., the PCI Express Capabilities Register is 0, so the Capability
> Version is 0 and pt_pcie_size_init() would fail.
> 
> We should not try to expose the PCIe cap of the device to guest.
> 
> a patch from Dexuan Cui <dexuan.cui@intel.com>

Lets inlude that in the description here..
> 
> > > +         */
> > > +        if (d->vendor_id == PCI_VENDOR_ID_INTEL &&
> > > +                d->device_id == PCI_DEVICE_ID_INTEL_82599_VF) {
> > > +            return 1;
> > > +        }
> > > +        break;
> > > +    }
> > > +    return 0;
> > > +}
> > > +
> > > +/*   find emulate register group entry */
> > > +XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address)
> > > +{
> > > +    XenPTRegGroup *entry = NULL;
> > > +
> > > +    /* find register group entry */
> > > +    QLIST_FOREACH(entry, &s->reg_grp_tbl, entries) {
> > > +        /* check address */
> > > +        if ((entry->base_offset <= address)
> > > +            && ((entry->base_offset + entry->size) > address)) {
> > > +            return entry;
> > > +        }
> > > +    }
> > > +
> > > +    /* group entry not found */
> > > +    return NULL;
> > > +}
> > > +
> > > +/* find emulate register entry */
> > > +XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address)
> > > +{
> > > +    XenPTReg *reg_entry = NULL;
> > > +    XenPTRegInfo *reg = NULL;
> > > +    uint32_t real_offset = 0;
> > > +
> > > +    /* find register entry */
> > > +    QLIST_FOREACH(reg_entry, &reg_grp->reg_tbl_list, entries) {
> > > +        reg = reg_entry->reg;
> > > +        real_offset = reg_grp->base_offset + reg->offset;
> > > +        /* check address */
> > > +        if ((real_offset <= address)
> > > +            && ((real_offset + reg->size) > address)) {
> > > +            return reg_entry;
> > > +        }
> > > +    }
> > > +
> > > +    return NULL;
> > > +}
> > > +
> > > +/* parse BAR */
> > > +static PTBarFlag pt_bar_reg_parse(XenPCIPassthroughState *s, XenPTRegInfo *reg)
> > > +{
> > > +    PCIDevice *d = &s->dev;
> > > +    XenPTRegion *region = NULL;
> > > +    PCIIORegion *r;
> > > +    int index = 0;
> > > +
> > > +    /* check 64bit BAR */
> > > +    index = pt_bar_offset_to_index(reg->offset);
> > > +    if ((0 < index) && (index < PCI_ROM_SLOT)) {
> >
> > This is  a bit confusing. Can you make the index be on the same
> > side, like
> >
> > if ((0 < index) && (PCI_ROM_SLOT > index)
> >
> > or better:
> >
> > if ((index < 0) && (index < PCI_ROM_SLOT))
> >
> > um, which looks wrong. Should it be 'index > 0' ?
> 
> Every other form is a bit confusing to me. I'd like to write
> 0 < index < ROM_SLOT, so I know that index is between 0 and ROM_SLOT.
> But, it's C and not math, so I wrote the closest way I can.
> 
> > > +        int flags = s->real_device->io_regions[index - 1].flags;
> >
> > Do we want to check the index - 1 to make sure it is not negative?
> 
> We have:
>   0 < index < ROM_SLOT
> so (index - 1) give us:
>   0 <= index - 1 < ROM_SLOT - 1
> 
> So (index - 1) can be 0, but under 0.
> ;)

Right! Ok, then please ignore my comment.

.. snip..
> > > +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> > > +                           + PCI_EXP_FLAGS)
> > > +        & PCI_EXP_FLAGS_VERS;
> > > +    dev_type = (pci_get_byte(s->dev.config + real_offset - reg->offset
> > > +                             + PCI_EXP_FLAGS)
> > > +                & PCI_EXP_FLAGS_TYPE) >> 4;
> > > +
> > > +    /* no need to initialize in case of Root Complex Integrated Endpoint
> > > +     * with cap_ver 1.x
> >
> > Why?
> 
> Who knows? I don't. And `git log` does not give me more information.

<laughs> OK, could the earlier author provide some ideas? Or perhaps
there is something akin in the Linux code.

.. snip..
> > > +    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> > > +                           + PCI_EXP_FLAGS)
> > > +        & PCI_EXP_FLAGS_VERS;
> >
> > This looks like a weird tab issue, but it might be just my mailer.
> 
> Nop, there is no tab.
> 
> Maybe writing it like that:
> > cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
> >                        + PCI_EXP_FLAGS);
> > cap_ver &= PCI_EXP_FLAGS_VERS;
> would be better.

OK.
> 
> > > +
> > > +    /* no need to initialize in case of cap_ver 1.x */
> > > +    if (cap_ver == 1) {
> > > +        return PT_INVALID_REG;

.. snip..
> > > +    case 2:
> > > +        PT_LOG("Power state transition D2 -> D0active\n");
> > > +        host_pci_set_word(s->real_device, real_offset, 0);
> > > +        usleep(200);
> >
> > Heheh..
> 
> I don't know if I can remove it safely, or not.

Probably not. One of my machines reguarly gets confused when the SSD disk
returns VPD information way to fast and it ends up using the name of a
previous disk.. So the usleep is probably very much required (and in
all likehood defined in the PCI spec).
Ian Campbell - Nov. 11, 2011, 8:37 p.m.
On Fri, 2011-11-11 at 17:40 +0000, Anthony PERARD wrote:
> 
> > if ((index < 0) && (index < PCI_ROM_SLOT))
> >
> > um, which looks wrong. Should it be 'index > 0' ?
> 
> Every other form is a bit confusing to me. I'd like to write
> 0 < index < ROM_SLOT, so I know that index is between 0 and ROM_SLOT.
> But, it's C and not math, so I wrote the closest way I can.

"0 < index < ROM_SLOT" ==> "0 < index && index < ROM_SLOT"
but you have "index < 0 && ..." which is backwards.

Ian.

Patch

diff --git a/Makefile.target b/Makefile.target
index 36ea47d..c32c688 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -219,6 +219,7 @@  obj-i386-$(CONFIG_XEN) += xen_platform.o
 obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
 obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
 obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_helpers.o
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_config_init.o
 
 # Inter-VM PCI shared memory
 CONFIG_IVSHMEM =
diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
index 2d1979d..ebc04fd 100644
--- a/hw/xen_pci_passthrough.h
+++ b/hw/xen_pci_passthrough.h
@@ -61,6 +61,8 @@  typedef int (*conf_byte_restore)
 /* power state transition */
 #define PT_FLAG_TRANSITING 0x0001
 
+#define PT_BAR_ALLF        0xFFFFFFFF  /* BAR ALLF value */
+
 
 typedef enum {
     GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
diff --git a/hw/xen_pci_passthrough_config_init.c b/hw/xen_pci_passthrough_config_init.c
new file mode 100644
index 0000000..4103b59
--- /dev/null
+++ b/hw/xen_pci_passthrough_config_init.c
@@ -0,0 +1,2068 @@ 
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Alex Novik <alex@neocleus.com>
+ * Allen Kay <allen.m.kay@intel.com>
+ * Guy Zana <guy@neocleus.com>
+ *
+ * This file implements direct PCI assignment to a HVM guest
+ */
+
+#include "qemu-timer.h"
+#include "xen_backend.h"
+#include "xen_pci_passthrough.h"
+
+#define PT_MERGE_VALUE(value, data, val_mask) \
+    (((value) & (val_mask)) | ((data) & ~(val_mask)))
+
+#define PT_INVALID_REG          0xFFFFFFFF      /* invalid register value */
+
+/* prototype */
+
+static uint32_t pt_ptr_reg_init(XenPCIPassthroughState *s, XenPTRegInfo *reg,
+                                uint32_t real_offset);
+static int pt_init_pci_config(XenPCIPassthroughState *s);
+
+
+/* helper */
+
+/* A return value of 1 means the capability should NOT be exposed to guest. */
+static int pt_hide_dev_cap(const HostPCIDevice *d, uint8_t grp_id)
+{
+    switch (grp_id) {
+    case PCI_CAP_ID_EXP:
+        /* The PCI Express Capability Structure of the VF of Intel 82599 10GbE
+         * Controller looks trivial, e.g., the PCI Express Capabilities
+         * Register is 0. We should not try to expose it to guest.
+         */
+        if (d->vendor_id == PCI_VENDOR_ID_INTEL &&
+                d->device_id == PCI_DEVICE_ID_INTEL_82599_VF) {
+            return 1;
+        }
+        break;
+    }
+    return 0;
+}
+
+/*   find emulate register group entry */
+XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address)
+{
+    XenPTRegGroup *entry = NULL;
+
+    /* find register group entry */
+    QLIST_FOREACH(entry, &s->reg_grp_tbl, entries) {
+        /* check address */
+        if ((entry->base_offset <= address)
+            && ((entry->base_offset + entry->size) > address)) {
+            return entry;
+        }
+    }
+
+    /* group entry not found */
+    return NULL;
+}
+
+/* find emulate register entry */
+XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address)
+{
+    XenPTReg *reg_entry = NULL;
+    XenPTRegInfo *reg = NULL;
+    uint32_t real_offset = 0;
+
+    /* find register entry */
+    QLIST_FOREACH(reg_entry, &reg_grp->reg_tbl_list, entries) {
+        reg = reg_entry->reg;
+        real_offset = reg_grp->base_offset + reg->offset;
+        /* check address */
+        if ((real_offset <= address)
+            && ((real_offset + reg->size) > address)) {
+            return reg_entry;
+        }
+    }
+
+    return NULL;
+}
+
+/* parse BAR */
+static PTBarFlag pt_bar_reg_parse(XenPCIPassthroughState *s, XenPTRegInfo *reg)
+{
+    PCIDevice *d = &s->dev;
+    XenPTRegion *region = NULL;
+    PCIIORegion *r;
+    int index = 0;
+
+    /* check 64bit BAR */
+    index = pt_bar_offset_to_index(reg->offset);
+    if ((0 < index) && (index < PCI_ROM_SLOT)) {
+        int flags = s->real_device->io_regions[index - 1].flags;
+
+        if ((flags & IORESOURCE_MEM) && (flags & IORESOURCE_MEM_64)) {
+            region = &s->bases[index - 1];
+            if (region->bar_flag != PT_BAR_FLAG_UPPER) {
+                return PT_BAR_FLAG_UPPER;
+            }
+        }
+    }
+
+    /* check unused BAR */
+    r = &d->io_regions[index];
+    if (r->size == 0) {
+        return PT_BAR_FLAG_UNUSED;
+    }
+
+    /* for ExpROM BAR */
+    if (index == PCI_ROM_SLOT) {
+        return PT_BAR_FLAG_MEM;
+    }
+
+    /* check BAR I/O indicator */
+    if (s->real_device->io_regions[index].flags & IORESOURCE_IO) {
+        return PT_BAR_FLAG_IO;
+    } else {
+        return PT_BAR_FLAG_MEM;
+    }
+}
+
+
+/****************
+ * general register functions
+ */
+
+/* register initialization function */
+
+static uint32_t pt_common_reg_init(XenPCIPassthroughState *s,
+                                   XenPTRegInfo *reg, uint32_t real_offset)
+{
+    return reg->init_val;
+}
+
+/* Read register functions */
+
+static int pt_byte_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint8_t *value, uint8_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint8_t valid_emu_mask = 0;
+
+    /* emulate byte register */
+    valid_emu_mask = reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+    return 0;
+}
+static int pt_word_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint16_t *value, uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t valid_emu_mask = 0;
+
+    /* emulate word register */
+    valid_emu_mask = reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+    return 0;
+}
+static int pt_long_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint32_t *value, uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint32_t valid_emu_mask = 0;
+
+    /* emulate long register */
+    valid_emu_mask = reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+   return 0;
+}
+
+/* Write register functions */
+
+static int pt_byte_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                             uint8_t *value, uint8_t dev_value,
+                             uint8_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint8_t writable_mask = 0;
+    uint8_t throughable_mask = 0;
+
+    /* modify emulate register */
+    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    return 0;
+}
+static int pt_word_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                             uint16_t *value, uint16_t dev_value,
+                             uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t writable_mask = 0;
+    uint16_t throughable_mask = 0;
+
+    /* modify emulate register */
+    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    return 0;
+}
+static int pt_long_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                             uint32_t *value, uint32_t dev_value,
+                             uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint32_t writable_mask = 0;
+    uint32_t throughable_mask = 0;
+
+    /* modify emulate register */
+    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    return 0;
+}
+
+/* common restore register fonctions */
+static int pt_byte_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                               uint32_t real_offset, uint8_t dev_value,
+                               uint8_t *value)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    PCIDevice *d = &s->dev;
+
+    /* use I/O device register's value as restore value */
+    *value = pci_get_byte(d->config + real_offset);
+
+    /* create value for restoring to I/O device register */
+    *value = PT_MERGE_VALUE(*value, dev_value, reg->emu_mask);
+
+    return 0;
+}
+static int pt_word_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                               uint32_t real_offset, uint16_t dev_value,
+                               uint16_t *value)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    PCIDevice *d = &s->dev;
+
+    /* use I/O device register's value as restore value */
+    *value = pci_get_word(d->config + real_offset);
+
+    /* create value for restoring to I/O device register */
+    *value = PT_MERGE_VALUE(*value, dev_value, reg->emu_mask);
+
+    return 0;
+}
+
+
+/* XenPTRegInfo declaration
+ * - only for emulated register (either a part or whole bit).
+ * - for passthrough register that need special behavior (like interacting with
+ *   other component), set emu_mask to all 0 and specify r/w func properly.
+ * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
+ */
+
+/********************
+ * Header Type0
+ */
+
+static uint32_t pt_vendor_reg_init(XenPCIPassthroughState *s,
+                                   XenPTRegInfo *reg, uint32_t real_offset)
+{
+    return s->real_device->vendor_id;
+}
+static uint32_t pt_device_reg_init(XenPCIPassthroughState *s,
+                                   XenPTRegInfo *reg, uint32_t real_offset)
+{
+    return s->real_device->device_id;
+}
+static uint32_t pt_status_reg_init(XenPCIPassthroughState *s,
+                                   XenPTRegInfo *reg, uint32_t real_offset)
+{
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    int reg_field = 0;
+
+    /* find Header register group */
+    reg_grp_entry = pt_find_reg_grp(s, PCI_CAPABILITY_LIST);
+    if (reg_grp_entry) {
+        /* find Capabilities Pointer register */
+        reg_entry = pt_find_reg(reg_grp_entry, PCI_CAPABILITY_LIST);
+        if (reg_entry) {
+            /* check Capabilities Pointer register */
+            if (reg_entry->data) {
+                reg_field |= PCI_STATUS_CAP_LIST;
+            } else {
+                reg_field &= ~PCI_STATUS_CAP_LIST;
+            }
+        } else {
+            hw_error("Internal error: Couldn't find pt_reg_tbl for "
+                     "Capabilities Pointer register. I/O emulator exit.\n");
+        }
+    } else {
+        hw_error("Internal error: Couldn't find pt_reg_grp_tbl for Header. "
+                 "I/O emulator exit.\n");
+    }
+
+    return reg_field;
+}
+static uint32_t pt_header_type_reg_init(XenPCIPassthroughState *s,
+                                        XenPTRegInfo *reg,
+                                        uint32_t real_offset)
+{
+    /* read PCI_HEADER_TYPE */
+    return reg->init_val | 0x80;
+}
+
+/* initialize Interrupt Pin register */
+static uint32_t pt_irqpin_reg_init(XenPCIPassthroughState *s,
+                                   XenPTRegInfo *reg, uint32_t real_offset)
+{
+    return pci_read_intx(s);
+}
+
+/* Command register */
+static int pt_cmd_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                           uint16_t *value, uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t valid_emu_mask = 0;
+    uint16_t emu_mask = reg->emu_mask;
+
+    if (s->is_virtfn) {
+        emu_mask |= PCI_COMMAND_MEMORY;
+    }
+
+    /* emulate word register */
+    valid_emu_mask = emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+    return 0;
+}
+static int pt_cmd_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint16_t *value, uint16_t dev_value,
+                            uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t writable_mask = 0;
+    uint16_t throughable_mask = 0;
+    uint16_t wr_value = *value;
+    uint16_t emu_mask = reg->emu_mask;
+
+    if (s->is_virtfn) {
+        emu_mask |= PCI_COMMAND_MEMORY;
+    }
+
+    /* modify emulate register */
+    writable_mask = ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~emu_mask & valid_mask;
+
+    if (*value & PCI_COMMAND_INTX_DISABLE) {
+        throughable_mask |= PCI_COMMAND_INTX_DISABLE;
+    } else {
+        if (s->machine_irq) {
+            throughable_mask |= PCI_COMMAND_INTX_DISABLE;
+        }
+    }
+
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    /* mapping BAR */
+    pt_bar_mapping(s, wr_value & PCI_COMMAND_IO,
+                   wr_value & PCI_COMMAND_MEMORY);
+
+    return 0;
+}
+static int pt_cmd_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                              uint32_t real_offset, uint16_t dev_value,
+                              uint16_t *value)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    PCIDevice *d = &s->dev;
+    uint16_t restorable_mask = 0;
+
+    /* use I/O device register's value as restore value */
+    *value = pci_get_word(d->config + real_offset);
+
+    /* create value for restoring to I/O device register
+     * but do not include Fast Back-to-Back Enable bit.
+     */
+    restorable_mask = reg->emu_mask & ~PCI_COMMAND_FAST_BACK;
+    *value = PT_MERGE_VALUE(*value, dev_value, restorable_mask);
+
+    if (!s->machine_irq) {
+        *value |= PCI_COMMAND_INTX_DISABLE;
+    } else {
+        *value &= ~PCI_COMMAND_INTX_DISABLE;
+    }
+
+    return 0;
+}
+
+/* BAR */
+#define PT_BAR_MEM_RO_MASK      0x0000000F      /* BAR ReadOnly mask(Memory) */
+#define PT_BAR_MEM_EMU_MASK     0xFFFFFFF0      /* BAR emul mask(Memory) */
+#define PT_BAR_IO_RO_MASK       0x00000003      /* BAR ReadOnly mask(I/O) */
+#define PT_BAR_IO_EMU_MASK      0xFFFFFFFC      /* BAR emul mask(I/O) */
+
+static inline uint32_t base_address_with_flags(HostPCIIORegion *hr)
+{
+    if ((hr->flags & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
+        return hr->base_addr | (hr->flags & ~PCI_BASE_ADDRESS_IO_MASK);
+    } else {
+        return hr->base_addr | (hr->flags & ~PCI_BASE_ADDRESS_MEM_MASK);
+    }
+}
+
+static uint32_t pt_bar_reg_init(XenPCIPassthroughState *s, XenPTRegInfo *reg,
+                                uint32_t real_offset)
+{
+    int reg_field = 0;
+    int index;
+
+    /* get BAR index */
+    index = pt_bar_offset_to_index(reg->offset);
+    if (index < 0) {
+        hw_error("Internal error: Invalid BAR index[%d]. "
+                 "I/O emulator exit.\n", index);
+    }
+
+    /* set initial guest physical base address to -1 */
+    s->bases[index].e_physbase = -1;
+
+    /* set BAR flag */
+    s->bases[index].bar_flag = pt_bar_reg_parse(s, reg);
+    if (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED) {
+        reg_field = PT_INVALID_REG;
+    }
+
+    return reg_field;
+}
+static int pt_bar_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                           uint32_t *value, uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint32_t valid_emu_mask = 0;
+    uint32_t bar_emu_mask = 0;
+    int index;
+
+    /* get BAR index */
+    index = pt_bar_offset_to_index(reg->offset);
+    if (index < 0) {
+        hw_error("Internal error: Invalid BAR index[%d]. "
+                 "I/O emulator exit.\n", index);
+    }
+
+    /* use fixed-up value from kernel sysfs */
+    *value = base_address_with_flags(&s->real_device->io_regions[index]);
+
+    /* set emulate mask depend on BAR flag */
+    switch (s->bases[index].bar_flag) {
+    case PT_BAR_FLAG_MEM:
+        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
+        break;
+    case PT_BAR_FLAG_IO:
+        bar_emu_mask = PT_BAR_IO_EMU_MASK;
+        break;
+    case PT_BAR_FLAG_UPPER:
+        bar_emu_mask = PT_BAR_ALLF;
+        break;
+    default:
+        break;
+    }
+
+    /* emulate BAR */
+    valid_emu_mask = bar_emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+   return 0;
+}
+static int pt_bar_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint32_t *value, uint32_t dev_value,
+                            uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegion *base = NULL;
+    PCIDevice *d = &s->dev;
+    PCIIORegion *r;
+    uint32_t writable_mask = 0;
+    uint32_t throughable_mask = 0;
+    uint32_t bar_emu_mask = 0;
+    uint32_t bar_ro_mask = 0;
+    uint32_t new_addr, last_addr;
+    uint32_t prev_offset;
+    uint32_t r_size = 0;
+    int index = 0;
+
+    /* get BAR index */
+    index = pt_bar_offset_to_index(reg->offset);
+    if (index < 0) {
+        hw_error("Internal error: Invalid BAR index[%d]. "
+                 "I/O emulator exit.\n", index);
+    }
+
+    r = &d->io_regions[index];
+    base = &s->bases[index];
+    r_size = pt_get_emul_size(base->bar_flag, r->size);
+
+    /* set emulate mask and read-only mask depend on BAR flag */
+    switch (s->bases[index].bar_flag) {
+    case PT_BAR_FLAG_MEM:
+        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
+        bar_ro_mask = PT_BAR_MEM_RO_MASK | (r_size - 1);
+        break;
+    case PT_BAR_FLAG_IO:
+        bar_emu_mask = PT_BAR_IO_EMU_MASK;
+        bar_ro_mask = PT_BAR_IO_RO_MASK | (r_size - 1);
+        break;
+    case PT_BAR_FLAG_UPPER:
+        bar_emu_mask = PT_BAR_ALLF;
+        bar_ro_mask = 0;    /* all upper 32bit are R/W */
+        break;
+    default:
+        break;
+    }
+
+    /* modify emulate register */
+    writable_mask = bar_emu_mask & ~bar_ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* check whether we need to update the virtual region address or not */
+    switch (s->bases[index].bar_flag) {
+    case PT_BAR_FLAG_MEM:
+        /* nothing to do */
+        break;
+    case PT_BAR_FLAG_IO:
+        new_addr = cfg_entry->data;
+        last_addr = new_addr + r_size - 1;
+        /* check invalid address */
+        if (last_addr <= new_addr || !new_addr || last_addr >= 0x10000) {
+            /* check 64K range */
+            if ((last_addr >= 0x10000) &&
+                (cfg_entry->data != (PT_BAR_ALLF & ~bar_ro_mask))) {
+                PT_LOG("Warning: Guest attempt to set Base Address "
+                       "over the 64KB. [%02x:%02x.%x][Offset:%02xh]"
+                       "[Address:%08xh][Size:%08xh]\n",
+                       pci_bus_num(d->bus), PCI_SLOT(d->devfn),
+                       PCI_FUNC(d->devfn),
+                       reg->offset, new_addr, r_size);
+            }
+            /* just remove mapping */
+            r->addr = -1;
+            goto exit;
+        }
+        break;
+    case PT_BAR_FLAG_UPPER:
+        if (cfg_entry->data) {
+            if (cfg_entry->data != (PT_BAR_ALLF & ~bar_ro_mask)) {
+                PT_LOG("Warning: Guest attempt to set high MMIO Base Address. "
+                       "Ignore mapping. "
+                       "[%02x:%02x.%x][Offset:%02xh][High Address:%08xh]\n",
+                       pci_bus_num(d->bus), PCI_SLOT(d->devfn),
+                       PCI_FUNC(d->devfn), reg->offset, cfg_entry->data);
+            }
+            /* clear lower address */
+            d->io_regions[index-1].addr = -1;
+        } else {
+            /* find lower 32bit BAR */
+            prev_offset = (reg->offset - 4);
+            reg_grp_entry = pt_find_reg_grp(s, prev_offset);
+            if (reg_grp_entry) {
+                reg_entry = pt_find_reg(reg_grp_entry, prev_offset);
+                if (reg_entry) {
+                    /* restore lower address */
+                    d->io_regions[index-1].addr = reg_entry->data;
+                } else {
+                    return -1;
+                }
+            } else {
+                return -1;
+            }
+        }
+
+        /* never mapping the 'empty' upper region,
+         * because we'll do it enough for the lower region.
+         */
+        r->addr = -1;
+        goto exit;
+    default:
+        break;
+    }
+
+    /* update the corresponding virtual region address */
+    /*
+     * When guest code tries to get block size of mmio, it will write all "1"s
+     * into pci bar register. In this case, cfg_entry->data == writable_mask.
+     * Especially for devices with large mmio, the value of writable_mask
+     * is likely to be a guest physical address that has been mapped to ram
+     * rather than mmio. Remapping this value to mmio should be prevented.
+     */
+
+    if (cfg_entry->data != writable_mask) {
+        r->addr = cfg_entry->data;
+    }
+
+exit:
+    /* create value for writing to I/O device register */
+    throughable_mask = ~bar_emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    /* After BAR reg update, we need to remap BAR */
+    reg_grp_entry = pt_find_reg_grp(s, PCI_COMMAND);
+    if (reg_grp_entry) {
+        reg_entry = pt_find_reg(reg_grp_entry, PCI_COMMAND);
+        if (reg_entry) {
+            pt_bar_mapping_one(s, index, reg_entry->data & PCI_COMMAND_IO,
+                               reg_entry->data & PCI_COMMAND_MEMORY);
+        }
+    }
+
+    return 0;
+}
+static int pt_bar_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                              uint32_t real_offset, uint32_t dev_value,
+                              uint32_t *value)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint32_t bar_emu_mask = 0;
+    int index = 0;
+
+    /* get BAR index */
+    index = pt_bar_offset_to_index(reg->offset);
+    if (index < 0) {
+        hw_error("Internal error: Invalid BAR index[%d]. "
+                 "I/O emulator exit.\n", index);
+    }
+
+    /* use value from kernel sysfs */
+    if (s->bases[index].bar_flag == PT_BAR_FLAG_UPPER) {
+        *value = s->real_device->io_regions[index - 1].base_addr >> 32;
+    } else {
+        *value = base_address_with_flags(&s->real_device->io_regions[index]);
+    }
+
+    /* set emulate mask depend on BAR flag */
+    switch (s->bases[index].bar_flag) {
+    case PT_BAR_FLAG_MEM:
+        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
+        break;
+    case PT_BAR_FLAG_IO:
+        bar_emu_mask = PT_BAR_IO_EMU_MASK;
+        break;
+    case PT_BAR_FLAG_UPPER:
+        bar_emu_mask = PT_BAR_ALLF;
+        break;
+    default:
+        break;
+    }
+
+    /* create value for restoring to I/O device register */
+    *value = PT_MERGE_VALUE(*value, dev_value, bar_emu_mask);
+
+    return 0;
+}
+
+/* write Exp ROM BAR */
+static int pt_exp_rom_bar_reg_write(XenPCIPassthroughState *s,
+                                    XenPTReg *cfg_entry, uint32_t *value,
+                                    uint32_t dev_value, uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegion *base = NULL;
+    PCIDevice *d = (PCIDevice *)&s->dev;
+    PCIIORegion *r;
+    uint32_t writable_mask = 0;
+    uint32_t throughable_mask = 0;
+    pcibus_t r_size = 0;
+    uint32_t bar_emu_mask = 0;
+    uint32_t bar_ro_mask = 0;
+
+    r = &d->io_regions[PCI_ROM_SLOT];
+    r_size = r->size;
+    base = &s->bases[PCI_ROM_SLOT];
+    /* align memory type resource size */
+    pt_get_emul_size(base->bar_flag, r_size);
+
+    /* set emulate mask and read-only mask */
+    bar_emu_mask = reg->emu_mask;
+    bar_ro_mask = (reg->ro_mask | (r_size - 1)) & ~PCI_ROM_ADDRESS_ENABLE;
+
+    /* modify emulate register */
+    writable_mask = ~bar_ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* update the corresponding virtual region address */
+    /*
+     * When guest code tries to get block size of mmio, it will write all "1"s
+     * into pci bar register. In this case, cfg_entry->data == writable_mask.
+     * Especially for devices with large mmio, the value of writable_mask
+     * is likely to be a guest physical address that has been mapped to ram
+     * rather than mmio. Remapping this value to mmio should be prevented.
+     */
+
+    if (cfg_entry->data != writable_mask) {
+        r->addr = cfg_entry->data;
+    }
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~bar_emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    /* After BAR reg update, we need to remap BAR*/
+    reg_grp_entry = pt_find_reg_grp(s, PCI_COMMAND);
+    if (reg_grp_entry) {
+        reg_entry = pt_find_reg(reg_grp_entry, PCI_COMMAND);
+        if (reg_entry) {
+            pt_bar_mapping_one(s, PCI_ROM_SLOT,
+                               reg_entry->data & PCI_COMMAND_IO,
+                               reg_entry->data & PCI_COMMAND_MEMORY);
+        }
+    }
+
+    return 0;
+}
+/* restore ROM BAR */
+static int pt_exp_rom_bar_reg_restore(XenPCIPassthroughState *s,
+                                      XenPTReg *cfg_entry,
+                                      uint32_t real_offset,
+                                      uint32_t dev_value, uint32_t *value)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+
+    /* use value from kernel sysfs */
+    *value =
+        PT_MERGE_VALUE(host_pci_get_long(s->real_device, PCI_ROM_ADDRESS),
+                       dev_value, reg->emu_mask);
+    return 0;
+}
+
+/* Header Type0 reg static infomation table */
+static XenPTRegInfo pt_emu_reg_header0_tbl[] = {
+    /* Vendor ID reg */
+    {
+        .offset     = PCI_VENDOR_ID,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFFFF,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_vendor_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+        .u.w.restore  = NULL,
+    },
+    /* Device ID reg */
+    {
+        .offset     = PCI_DEVICE_ID,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFFFF,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_device_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+        .u.w.restore  = NULL,
+    },
+    /* Command reg */
+    {
+        .offset     = PCI_COMMAND,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xF880,
+        .emu_mask   = 0x0740,
+        .init       = pt_common_reg_init,
+        .u.w.read   = pt_cmd_reg_read,
+        .u.w.write  = pt_cmd_reg_write,
+        .u.w.restore  = pt_cmd_reg_restore,
+    },
+    /* Capabilities Pointer reg */
+    {
+        .offset     = PCI_CAPABILITY_LIST,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = NULL,
+    },
+    /* Status reg */
+    /* use emulated Cap Ptr value to initialize,
+     * so need to be declared after Cap Ptr reg
+     */
+    {
+        .offset     = PCI_STATUS,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0x06FF,
+        .emu_mask   = 0x0010,
+        .init       = pt_status_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+        .u.w.restore  = NULL,
+    },
+    /* Cache Line Size reg */
+    {
+        .offset     = PCI_CACHE_LINE_SIZE,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0x00,
+        .emu_mask   = 0xFF,
+        .init       = pt_common_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = pt_byte_reg_restore,
+    },
+    /* Latency Timer reg */
+    {
+        .offset     = PCI_LATENCY_TIMER,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0x00,
+        .emu_mask   = 0xFF,
+        .init       = pt_common_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = pt_byte_reg_restore,
+    },
+    /* Header Type reg */
+    {
+        .offset     = PCI_HEADER_TYPE,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0x00,
+        .init       = pt_header_type_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = NULL,
+    },
+    /* Interrupt Line reg */
+    {
+        .offset     = PCI_INTERRUPT_LINE,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0x00,
+        .emu_mask   = 0xFF,
+        .init       = pt_common_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = NULL,
+    },
+    /* Interrupt Pin reg */
+    {
+        .offset     = PCI_INTERRUPT_PIN,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_irqpin_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = NULL,
+    },
+    /* BAR 0 reg */
+    /* mask of BAR need to be decided later, depends on IO/MEM type */
+    {
+        .offset     = PCI_BASE_ADDRESS_0,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+        .u.dw.restore = pt_bar_reg_restore,
+    },
+    /* BAR 1 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_1,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+        .u.dw.restore = pt_bar_reg_restore,
+    },
+    /* BAR 2 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_2,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+        .u.dw.restore = pt_bar_reg_restore,
+    },
+    /* BAR 3 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_3,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+        .u.dw.restore = pt_bar_reg_restore,
+    },
+    /* BAR 4 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_4,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+        .u.dw.restore = pt_bar_reg_restore,
+    },
+    /* BAR 5 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_5,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+        .u.dw.restore = pt_bar_reg_restore,
+    },
+    /* Expansion ROM BAR reg */
+    {
+        .offset     = PCI_ROM_ADDRESS,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .ro_mask    = 0x000007FE,
+        .emu_mask   = 0xFFFFF800,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_long_reg_read,
+        .u.dw.write = pt_exp_rom_bar_reg_write,
+        .u.dw.restore = pt_exp_rom_bar_reg_restore,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/*********************************
+ * Vital Product Data Capability
+ */
+
+/* Vital Product Data Capability Structure reg static infomation table */
+static XenPTRegInfo pt_emu_reg_vpd_tbl[] = {
+    {
+        .offset     = PCI_CAP_LIST_NEXT,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = NULL,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/**************************************
+ * Vendor Specific Capability
+ */
+
+/* Vendor Specific Capability Structure reg static infomation table */
+static XenPTRegInfo pt_emu_reg_vendor_tbl[] = {
+    {
+        .offset     = PCI_CAP_LIST_NEXT,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = NULL,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/*****************************
+ * PCI Express Capability
+ */
+
+/* initialize Link Control register */
+static uint32_t pt_linkctrl_reg_init(XenPCIPassthroughState *s,
+                                     XenPTRegInfo *reg, uint32_t real_offset)
+{
+    uint8_t cap_ver = 0;
+    uint8_t dev_type = 0;
+
+    /* TODO maybe better to use fonction from hw/pcie.c */
+    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
+                           + PCI_EXP_FLAGS)
+        & PCI_EXP_FLAGS_VERS;
+    dev_type = (pci_get_byte(s->dev.config + real_offset - reg->offset
+                             + PCI_EXP_FLAGS)
+                & PCI_EXP_FLAGS_TYPE) >> 4;
+
+    /* no need to initialize in case of Root Complex Integrated Endpoint
+     * with cap_ver 1.x
+     */
+    if ((dev_type == PCI_EXP_TYPE_RC_END) && (cap_ver == 1)) {
+        return PT_INVALID_REG;
+    }
+
+    return reg->init_val;
+}
+/* initialize Device Control 2 register */
+static uint32_t pt_devctrl2_reg_init(XenPCIPassthroughState *s,
+                                     XenPTRegInfo *reg, uint32_t real_offset)
+{
+    uint8_t cap_ver = 0;
+
+    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
+                           + PCI_EXP_FLAGS)
+        & PCI_EXP_FLAGS_VERS;
+
+    /* no need to initialize in case of cap_ver 1.x */
+    if (cap_ver == 1) {
+        return PT_INVALID_REG;
+    }
+
+    return reg->init_val;
+}
+/* initialize Link Control 2 register */
+static uint32_t pt_linkctrl2_reg_init(XenPCIPassthroughState *s,
+                                      XenPTRegInfo *reg, uint32_t real_offset)
+{
+    int reg_field = 0;
+    uint8_t cap_ver = 0;
+
+    cap_ver = pci_get_byte(s->dev.config + real_offset - reg->offset
+                           + PCI_EXP_FLAGS)
+        & PCI_EXP_FLAGS_VERS;
+
+    /* no need to initialize in case of cap_ver 1.x */
+    if (cap_ver == 1) {
+        return PT_INVALID_REG;
+    }
+
+    /* set Supported Link Speed */
+    reg_field |= PCI_EXP_LNKCAP_SLS &
+        pci_get_byte(s->dev.config + real_offset - reg->offset
+                     + PCI_EXP_LNKCAP);
+
+    return reg_field;
+}
+
+/* PCI Express Capability Structure reg static infomation table */
+static XenPTRegInfo pt_emu_reg_pcie_tbl[] = {
+    /* Next Pointer reg */
+    {
+        .offset     = PCI_CAP_LIST_NEXT,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = NULL,
+    },
+    /* Device Capabilities reg */
+    {
+        .offset     = PCI_EXP_DEVCAP,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .ro_mask    = 0x1FFCFFFF,
+        .emu_mask   = 0x10000000,
+        .init       = pt_common_reg_init,
+        .u.dw.read  = pt_long_reg_read,
+        .u.dw.write = pt_long_reg_write,
+        .u.dw.restore = NULL,
+    },
+    /* Device Control reg */
+    {
+        .offset     = PCI_EXP_DEVCTL,
+        .size       = 2,
+        .init_val   = 0x2810,
+        .ro_mask    = 0x8400,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_common_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+        .u.w.restore  = pt_word_reg_restore,
+    },
+    /* Link Control reg */
+    {
+        .offset     = PCI_EXP_LNKCTL,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFC34,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_linkctrl_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+        .u.w.restore  = pt_word_reg_restore,
+    },
+    /* Device Control 2 reg */
+    {
+        .offset     = 0x28,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFFE0,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_devctrl2_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+        .u.w.restore  = pt_word_reg_restore,
+    },
+    /* Link Control 2 reg */
+    {
+        .offset     = 0x30,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xE040,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_linkctrl2_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+        .u.w.restore  = pt_word_reg_restore,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/*********************************
+ * Power Management Capability
+ */
+
+/* initialize Power Management Capabilities register */
+static uint32_t pt_pmc_reg_init(XenPCIPassthroughState *s,
+                                XenPTRegInfo *reg, uint32_t real_offset)
+{
+    PCIDevice *d = &s->dev;
+
+    if (!s->power_mgmt) {
+        return reg->init_val;
+    }
+
+    /* set Power Management Capabilities register */
+    s->pm_state->pmc_field = pci_get_word(d->config + real_offset);
+
+    return reg->init_val;
+}
+/* initialize PCI Power Management Control/Status register */
+static uint32_t pt_pmcsr_reg_init(XenPCIPassthroughState *s,
+                                  XenPTRegInfo *reg, uint32_t real_offset)
+{
+    PCIDevice *d = &s->dev;
+    uint16_t cap_ver  = 0;
+
+    if (!s->power_mgmt) {
+        return reg->init_val;
+    }
+
+    /* check PCI Power Management support version */
+    cap_ver = s->pm_state->pmc_field & PCI_PM_CAP_VER_MASK;
+
+    if (cap_ver > 2) {
+        /* set No Soft Reset */
+        s->pm_state->no_soft_reset =
+            pci_get_byte(d->config + real_offset) & PCI_PM_CTRL_NO_SOFT_RESET;
+    }
+
+    /* wake up real physical device */
+    switch (host_pci_get_word(s->real_device, real_offset)
+            & PCI_PM_CTRL_STATE_MASK) {
+    case 0:
+        break;
+    case 1:
+        PT_LOG("Power state transition D1 -> D0active\n");
+        host_pci_set_word(s->real_device, real_offset, 0);
+        break;
+    case 2:
+        PT_LOG("Power state transition D2 -> D0active\n");
+        host_pci_set_word(s->real_device, real_offset, 0);
+        usleep(200);
+        break;
+    case 3:
+        PT_LOG("Power state transition D3hot -> D0active\n");
+        host_pci_set_word(s->real_device, real_offset, 0);
+        usleep(10 * 1000);
+        pt_init_pci_config(s);
+        break;
+    }
+
+    return reg->init_val;
+}
+/* read Power Management Control/Status register */
+static int pt_pmcsr_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                             uint16_t *value, uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t valid_emu_mask = reg->emu_mask;
+
+    if (!s->power_mgmt) {
+        valid_emu_mask |= PCI_PM_CTRL_STATE_MASK | PCI_PM_CTRL_NO_SOFT_RESET;
+    }
+
+    valid_emu_mask = valid_emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+    return 0;
+}
+/* reset Interrupt and I/O resource  */
+static void pt_reset_interrupt_and_io_mapping(XenPCIPassthroughState *s)
+{
+    PCIDevice *d = &s->dev;
+    PCIIORegion *r;
+    int i = 0;
+    uint8_t e_device = 0;
+    uint8_t e_intx = 0;
+
+    /* unbind INTx */
+    e_device = PCI_SLOT(s->dev.devfn);
+    e_intx = pci_intx(s);
+
+    if (s->machine_irq) {
+        if (xc_domain_unbind_pt_irq(xen_xc, xen_domid, s->machine_irq,
+                                    PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 0)) {
+            PT_LOG("Error: Unbinding of interrupt failed!\n");
+        }
+    }
+
+    /* clear all virtual region address */
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        r = &d->io_regions[i];
+        r->addr = -1;
+    }
+
+    /* unmapping BAR */
+    pt_bar_mapping(s, 0, 0);
+}
+/* check power state transition */
+static int check_power_state(XenPCIPassthroughState *s)
+{
+    XenPTPM *pm_state = s->pm_state;
+    PCIDevice *d = &s->dev;
+    uint16_t read_val = 0;
+    uint16_t cur_state = 0;
+
+    /* get current power state */
+    read_val = host_pci_get_word(s->real_device,
+                                 pm_state->pm_base + PCI_PM_CTRL);
+    cur_state = read_val & PCI_PM_CTRL_STATE_MASK;
+
+    if (pm_state->req_state != cur_state) {
+        PT_LOG("Error: Failed to change power state. "
+               "[%02x:%02x.%x][requested state:%d][current state:%d]\n",
+               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+               pm_state->req_state, cur_state);
+        return -1;
+    }
+    return 0;
+}
+/* write Power Management Control/Status register */
+static void pt_from_d3hot_to_d0_with_reset(void *opaque)
+{
+    XenPCIPassthroughState *s = opaque;
+    XenPTPM *pm_state = s->pm_state;
+    int ret = 0;
+
+    /* check power state */
+    ret = check_power_state(s);
+
+    if (ret < 0) {
+        goto out;
+    }
+
+    pt_init_pci_config(s);
+
+out:
+    /* power state transition flags off */
+    pm_state->flags &= ~PT_FLAG_TRANSITING;
+
+    qemu_free_timer(pm_state->pm_timer);
+    pm_state->pm_timer = NULL;
+}
+static void pt_default_power_transition(void *opaque)
+{
+    XenPCIPassthroughState *ptdev = opaque;
+    XenPTPM *pm_state = ptdev->pm_state;
+
+    /* check power state */
+    check_power_state(ptdev);
+
+    /* power state transition flags off */
+    pm_state->flags &= ~PT_FLAG_TRANSITING;
+
+    qemu_free_timer(pm_state->pm_timer);
+    pm_state->pm_timer = NULL;
+}
+static int pt_pmcsr_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                              uint16_t *value, uint16_t dev_value,
+                              uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    PCIDevice *d = &s->dev;
+    uint16_t emu_mask = reg->emu_mask;
+    uint16_t writable_mask = 0;
+    uint16_t throughable_mask = 0;
+    XenPTPM *pm_state = s->pm_state;
+
+    if (!s->power_mgmt) {
+        emu_mask |= PCI_PM_CTRL_STATE_MASK | PCI_PM_CTRL_NO_SOFT_RESET;
+    }
+
+    /* modify emulate register */
+    writable_mask = emu_mask & ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    if (!s->power_mgmt) {
+        return 0;
+    }
+
+    /* set I/O device power state */
+    pm_state->cur_state = dev_value & PCI_PM_CTRL_STATE_MASK;
+
+    /* set Guest requested PowerState */
+    pm_state->req_state = *value & PCI_PM_CTRL_STATE_MASK;
+
+    /* check power state transition or not */
+    if (pm_state->cur_state == pm_state->req_state) {
+        /* not power state transition */
+        return 0;
+    }
+
+    /* check enable power state transition */
+    if ((pm_state->req_state != 0) &&
+        (pm_state->cur_state > pm_state->req_state)) {
+        PT_LOG("Error: Invalid power transition. "
+               "[%02x:%02x.%x][requested state:%d][current state:%d]\n",
+               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+               pm_state->req_state, pm_state->cur_state);
+
+        return 0;
+    }
+
+    /* check if this device supports the requested power state */
+    if (((pm_state->req_state == 1) && !(pm_state->pmc_field & PCI_PM_CAP_D1))
+        || ((pm_state->req_state == 2) &&
+            !(pm_state->pmc_field & PCI_PM_CAP_D2))) {
+        PT_LOG("Error: Invalid power transition. "
+               "[%02x:%02x.%x][requested state:%d][current state:%d]\n",
+               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+               pm_state->req_state, pm_state->cur_state);
+
+        return 0;
+    }
+
+    /* in case of transition related to D3hot, it's necessary to wait 10 ms.
+     * But because writing to register will be performed later on actually,
+     * don't start QEMUTimer right now, just alloc and init QEMUTimer here.
+     */
+    if ((pm_state->cur_state == 3) || (pm_state->req_state == 3)) {
+        if (pm_state->req_state == 0) {
+            /* alloc and init QEMUTimer */
+            if (!pm_state->no_soft_reset) {
+                pm_state->pm_timer = qemu_new_timer_ms(rt_clock,
+                    pt_from_d3hot_to_d0_with_reset, s);
+
+                /* reset Interrupt and I/O resource mapping */
+                pt_reset_interrupt_and_io_mapping(s);
+            } else {
+                pm_state->pm_timer = qemu_new_timer_ms(rt_clock,
+                                        pt_default_power_transition, s);
+            }
+        } else {
+            /* alloc and init QEMUTimer */
+            pm_state->pm_timer = qemu_new_timer_ms(rt_clock,
+                pt_default_power_transition, s);
+        }
+
+        /* set power state transition delay */
+        pm_state->pm_delay = 10;
+
+        /* power state transition flags on */
+        pm_state->flags |= PT_FLAG_TRANSITING;
+    }
+    /* in case of transition related to D0, D1 and D2,
+     * no need to use QEMUTimer.
+     * So, we perfom writing to register here and then read it back.
+     */
+    else {
+        /* write power state to I/O device register */
+        host_pci_set_word(s->real_device, pm_state->pm_base + PCI_PM_CTRL,
+                          *value);
+
+        /* in case of transition related to D2,
+         * it's necessary to wait 200 usec.
+         * But because QEMUTimer do not support microsec unit right now,
+         * so we do wait ourself here.
+         */
+        if ((pm_state->cur_state == 2) || (pm_state->req_state == 2)) {
+            usleep(200);
+        }
+
+        /* check power state */
+        check_power_state(s);
+
+        /* recreate value for writing to I/O device register */
+        *value = host_pci_get_word(s->real_device,
+                                   pm_state->pm_base + PCI_PM_CTRL);
+    }
+
+    return 0;
+}
+
+/* restore Power Management Control/Status register */
+static int pt_pmcsr_reg_restore(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                                uint32_t real_offset, uint16_t dev_value,
+                                uint16_t *value)
+{
+    /* create value for restoring to I/O device register
+     * No need to restore, just clear PME Enable and PME Status bit
+     * Note: register type of PME Status bit is RW1C, so clear by writing 1b
+     */
+    *value = (dev_value & ~PCI_PM_CTRL_PME_ENABLE) | PCI_PM_CTRL_PME_STATUS;
+
+    return 0;
+}
+
+
+/* Power Management Capability reg static infomation table */
+static XenPTRegInfo pt_emu_reg_pm_tbl[] = {
+    /* Next Pointer reg */
+    {
+        .offset     = PCI_CAP_LIST_NEXT,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+        .u.b.restore  = NULL,
+    },
+    /* Power Management Capabilities reg */
+    {
+        .offset     = PCI_CAP_FLAGS,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFFFF,
+        .emu_mask   = 0xF9C8,
+        .init       = pt_pmc_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+        .u.w.restore  = NULL,
+    },
+    /* PCI Power Management Control/Status reg */
+    {
+        .offset     = PCI_PM_CTRL,
+        .size       = 2,
+        .init_val   = 0x0008,
+        .ro_mask    = 0xE1FC,
+        .emu_mask   = 0x8100,
+        .init       = pt_pmcsr_reg_init,
+        .u.w.read   = pt_pmcsr_reg_read,
+        .u.w.write  = pt_pmcsr_reg_write,
+        .u.w.restore  = pt_pmcsr_reg_restore,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/****************************
+ * Capabilities
+ */
+
+/* AER register operations */
+
+static void aer_save_one_register(XenPCIPassthroughState *s, int offset)
+{
+    PCIDevice *d = &s->dev;
+    uint32_t aer_base = s->pm_state->aer_base;
+    uint32_t val = 0;
+
+    val = host_pci_get_long(s->real_device, aer_base + offset);
+    pci_set_long(d->config + aer_base + offset, val);
+}
+static void pt_aer_reg_save(XenPCIPassthroughState *s)
+{
+    /* after reset, following register values should be restored.
+     * So, save them.
+     */
+    aer_save_one_register(s, PCI_ERR_UNCOR_MASK);
+    aer_save_one_register(s, PCI_ERR_UNCOR_SEVER);
+    aer_save_one_register(s, PCI_ERR_COR_MASK);
+    aer_save_one_register(s, PCI_ERR_CAP);
+}
+static void aer_restore_one_register(XenPCIPassthroughState *s, int offset)
+{
+    PCIDevice *d = &s->dev;
+    uint32_t aer_base = s->pm_state->aer_base;
+    uint32_t config = 0;
+
+    config = pci_get_long(d->config + aer_base + offset);
+    host_pci_set_long(s->real_device, aer_base + offset, config);
+}
+static void pt_aer_reg_restore(XenPCIPassthroughState *s)
+{
+    /* the following registers should be reconfigured to correct values
+     * after reset. restore them.
+     * other registers should not be reconfigured after reset
+     * if there is no reason
+     */
+    aer_restore_one_register(s, PCI_ERR_UNCOR_MASK);
+    aer_restore_one_register(s, PCI_ERR_UNCOR_SEVER);
+    aer_restore_one_register(s, PCI_ERR_COR_MASK);
+    aer_restore_one_register(s, PCI_ERR_CAP);
+}
+
+/* capability structure register group size functions */
+
+static uint8_t pt_reg_grp_size_init(XenPCIPassthroughState *s,
+                                    const XenPTRegGroupInfo *grp_reg,
+                                    uint32_t base_offset)
+{
+    return grp_reg->grp_size;
+}
+/* get Power Management Capability Structure register group size */
+static uint8_t pt_pm_size_init(XenPCIPassthroughState *s,
+                               const XenPTRegGroupInfo *grp_reg,
+                               uint32_t base_offset)
+{
+    if (!s->power_mgmt) {
+        return grp_reg->grp_size;
+    }
+
+    s->pm_state = g_malloc0(sizeof (XenPTPM));
+
+    /* set Power Management Capability base offset */
+    s->pm_state->pm_base = base_offset;
+
+    /* find AER register and set AER Capability base offset */
+    s->pm_state->aer_base = host_pci_find_ext_cap_offset(s->real_device,
+                                                         PCI_EXT_CAP_ID_ERR);
+
+    /* save AER register */
+    if (s->pm_state->aer_base) {
+        pt_aer_reg_save(s);
+    }
+
+    return grp_reg->grp_size;
+}
+/* get Vendor Specific Capability Structure register group size */
+static uint8_t pt_vendor_size_init(XenPCIPassthroughState *s,
+                                   const XenPTRegGroupInfo *grp_reg,
+                                   uint32_t base_offset)
+{
+    return pci_get_byte(s->dev.config + base_offset + 0x02);
+}
+/* get PCI Express Capability Structure register group size */
+static uint8_t pt_pcie_size_init(XenPCIPassthroughState *s,
+                                 const XenPTRegGroupInfo *grp_reg,
+                                 uint32_t base_offset)
+{
+    PCIDevice *d = &s->dev;
+    uint16_t exp_flag = 0;
+    uint16_t type = 0;
+    uint16_t version = 0;
+    uint8_t pcie_size = 0;
+
+    exp_flag = pci_get_word(d->config + base_offset + PCI_EXP_FLAGS);
+    type = (exp_flag & PCI_EXP_FLAGS_TYPE) >> 4;
+    version = exp_flag & PCI_EXP_FLAGS_VERS;
+
+    /* calculate size depend on capability version and device/port type */
+    /* in case of PCI Express Base Specification Rev 1.x */
+    if (version == 1) {
+        /* The PCI Express Capabilities, Device Capabilities, and Device
+         * Status/Control registers are required for all PCI Express devices.
+         * The Link Capabilities and Link Status/Control are required for all
+         * Endpoints that are not Root Complex Integrated Endpoints. Endpoints
+         * are not required to implement registers other than those listed
+         * above and terminate the capability structure.
+         */
+        switch (type) {
+        case PCI_EXP_TYPE_ENDPOINT:
+        case PCI_EXP_TYPE_LEG_END:
+            pcie_size = 0x14;
+            break;
+        case PCI_EXP_TYPE_RC_END:
+            /* has no link */
+            pcie_size = 0x0C;
+            break;
+        /* only EndPoint passthrough is supported */
+        case PCI_EXP_TYPE_ROOT_PORT:
+        case PCI_EXP_TYPE_UPSTREAM:
+        case PCI_EXP_TYPE_DOWNSTREAM:
+        case PCI_EXP_TYPE_PCI_BRIDGE:
+        case PCI_EXP_TYPE_PCIE_BRIDGE:
+        case PCI_EXP_TYPE_RC_EC:
+        default:
+            hw_error("Internal error: Unsupported device/port type[%d]. "
+                     "I/O emulator exit.\n", type);
+        }
+    }
+    /* in case of PCI Express Base Specification Rev 2.0 */
+    else if (version == 2) {
+        switch (type) {
+        case PCI_EXP_TYPE_ENDPOINT:
+        case PCI_EXP_TYPE_LEG_END:
+        case PCI_EXP_TYPE_RC_END:
+            /* For Functions that do not implement the registers,
+             * these spaces must be hardwired to 0b.
+             */
+            pcie_size = 0x3C;
+            break;
+        /* only EndPoint passthrough is supported */
+        case PCI_EXP_TYPE_ROOT_PORT:
+        case PCI_EXP_TYPE_UPSTREAM:
+        case PCI_EXP_TYPE_DOWNSTREAM:
+        case PCI_EXP_TYPE_PCI_BRIDGE:
+        case PCI_EXP_TYPE_PCIE_BRIDGE:
+        case PCI_EXP_TYPE_RC_EC:
+        default:
+            hw_error("Internal error: Unsupported device/port type[%d]. "
+                     "I/O emulator exit.\n", type);
+        }
+    } else {
+        hw_error("Internal error: Unsupported capability version[%d]. "
+                 "I/O emulator exit.\n", version);
+    }
+
+    return pcie_size;
+}
+
+static const XenPTRegGroupInfo pt_emu_reg_grp_tbl[] = {
+    /* Header Type0 reg group */
+    {
+        .grp_id      = 0xFF,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = 0x40,
+        .size_init   = pt_reg_grp_size_init,
+        .emu_reg_tbl = pt_emu_reg_header0_tbl,
+    },
+    /* PCI PowerManagement Capability reg group */
+    {
+        .grp_id      = PCI_CAP_ID_PM,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = PCI_PM_SIZEOF,
+        .size_init   = pt_pm_size_init,
+        .emu_reg_tbl = pt_emu_reg_pm_tbl,
+    },
+    /* AGP Capability Structure reg group */
+    {
+        .grp_id     = PCI_CAP_ID_AGP,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x30,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* Vital Product Data Capability Structure reg group */
+    {
+        .grp_id      = PCI_CAP_ID_VPD,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = 0x08,
+        .size_init   = pt_reg_grp_size_init,
+        .emu_reg_tbl = pt_emu_reg_vpd_tbl,
+    },
+    /* Slot Identification reg group */
+    {
+        .grp_id     = PCI_CAP_ID_SLOTID,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x04,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* PCI-X Capabilities List Item reg group */
+    {
+        .grp_id     = PCI_CAP_ID_PCIX,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x18,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* Vendor Specific Capability Structure reg group */
+    {
+        .grp_id      = PCI_CAP_ID_VNDR,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = 0xFF,
+        .size_init   = pt_vendor_size_init,
+        .emu_reg_tbl = pt_emu_reg_vendor_tbl,
+    },
+    /* SHPC Capability List Item reg group */
+    {
+        .grp_id     = PCI_CAP_ID_SHPC,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x08,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* Subsystem ID and Subsystem Vendor ID Capability List Item reg group */
+    {
+        .grp_id     = PCI_CAP_ID_SSVID,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x08,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* AGP 8x Capability Structure reg group */
+    {
+        .grp_id     = PCI_CAP_ID_AGP3,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x30,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* PCI Express Capability Structure reg group */
+    {
+        .grp_id      = PCI_CAP_ID_EXP,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = 0xFF,
+        .size_init   = pt_pcie_size_init,
+        .emu_reg_tbl = pt_emu_reg_pcie_tbl,
+    },
+    {
+        .grp_size = 0,
+    },
+};
+
+/* initialize Capabilities Pointer or Next Pointer register */
+static uint32_t pt_ptr_reg_init(XenPCIPassthroughState *s,
+                                XenPTRegInfo *reg, uint32_t real_offset)
+{
+    /* uint32_t reg_field = (uint32_t)s->dev.config[real_offset]; */
+    uint32_t reg_field = pci_get_byte(s->dev.config + real_offset);
+    int i;
+
+    /* find capability offset */
+    while (reg_field) {
+        for (i = 0; pt_emu_reg_grp_tbl[i].grp_size != 0; i++) {
+            if (pt_hide_dev_cap(s->real_device,
+                                pt_emu_reg_grp_tbl[i].grp_id)) {
+                continue;
+            }
+            if (pt_emu_reg_grp_tbl[i].grp_id == s->dev.config[reg_field]) {
+                if (pt_emu_reg_grp_tbl[i].grp_type == GRP_TYPE_EMU) {
+                    goto out;
+                }
+                /* ignore the 0 hardwired capability, find next one */
+                break;
+            }
+        }
+        /* next capability */
+        /* reg_field = (uint32_t)s->dev.config[reg_field + 1]; */
+        reg_field = pci_get_byte(s->dev.config + reg_field + 1);
+    }
+
+out:
+    return reg_field;
+}
+
+
+/*************
+ * Main
+ */
+
+/* restore a part of I/O device register */
+static void pt_config_restore(XenPCIPassthroughState *s)
+{
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegInfo *reg = NULL;
+    uint32_t real_offset = 0;
+    uint32_t read_val = 0;
+    uint32_t val = 0;
+    int ret = 0;
+
+    /* find emulate register group entry */
+    QLIST_FOREACH(reg_grp_entry, &s->reg_grp_tbl, entries) {
+        /* find emulate register entry */
+        QLIST_FOREACH(reg_entry, &reg_grp_entry->reg_tbl_list, entries) {
+            reg = reg_entry->reg;
+
+            /* check whether restoring is needed */
+            if (!reg->u.b.restore) {
+                continue;
+            }
+
+            real_offset = reg_grp_entry->base_offset + reg->offset;
+
+            /* read I/O device register value */
+            ret = host_pci_get_block(s->real_device, real_offset,
+                                     (uint8_t *)&read_val, reg->size);
+
+            if (!ret) {
+                PT_LOG("Error: pci_read_block failed. "
+                       "return value[%d].\n", ret);
+                memset(&read_val, 0xff, reg->size);
+            }
+
+            val = 0;
+
+            /* restore based on register size */
+            switch (reg->size) {
+            case 1:
+                /* byte register */
+                ret = reg->u.b.restore(s, reg_entry, real_offset,
+                                       (uint8_t)read_val, (uint8_t *)&val);
+                break;
+            case 2:
+                /* word register */
+                ret = reg->u.w.restore(s, reg_entry, real_offset,
+                                       (uint16_t)read_val, (uint16_t *)&val);
+                break;
+            case 4:
+                /* double word register */
+                ret = reg->u.dw.restore(s, reg_entry, real_offset,
+                                        (uint32_t)read_val, (uint32_t *)&val);
+                break;
+            }
+
+            /* restoring error */
+            if (ret < 0) {
+                hw_error("Internal error: Invalid restoring "
+                         "return value[%d]. I/O emulator exit.\n", ret);
+            }
+
+            PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
+                          pci_bus_num(s->dev.bus), PCI_SLOT(s->dev.devfn),
+                          PCI_FUNC(s->dev.devfn),
+                          real_offset, val, reg->size);
+
+            ret = host_pci_set_block(s->real_device, real_offset,
+                                     (uint8_t *)&val, reg->size);
+
+            if (!ret) {
+                PT_LOG("Error: pci_write_block failed. "
+                       "return value[%d].\n", ret);
+            }
+        }
+    }
+
+    /* if AER supported, restore it */
+    if (s->pm_state->aer_base) {
+        pt_aer_reg_restore(s);
+    }
+}
+/* reinitialize all emulate registers */
+static void pt_config_reinit(XenPCIPassthroughState *s)
+{
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegInfo *reg = NULL;
+
+    /* find emulate register group entry */
+    QLIST_FOREACH(reg_grp_entry, &s->reg_grp_tbl, entries) {
+        /* find emulate register entry */
+        QLIST_FOREACH(reg_entry, &reg_grp_entry->reg_tbl_list, entries) {
+            reg = reg_entry->reg;
+            if (reg->init) {
+                /* initialize emulate register */
+                reg_entry->data =
+                    reg->init(s, reg_entry->reg,
+                              reg_grp_entry->base_offset + reg->offset);
+            }
+        }
+    }
+}
+
+static int pt_init_pci_config(XenPCIPassthroughState *s)
+{
+    PCIDevice *d = &s->dev;
+    int ret = 0;
+
+    PT_LOG("Reinitialize PCI configuration registers due to power state"
+           " transition with internal reset. [%02x:%02x.%x]\n",
+           pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn));
+
+    /* restore a part of I/O device register */
+    pt_config_restore(s);
+
+    /* reinitialize all emulate register */
+    pt_config_reinit(s);
+
+    /* rebind machine_irq to device */
+    if (s->machine_irq != 0) {
+        uint8_t e_device = PCI_SLOT(s->dev.devfn);
+        uint8_t e_intx = pci_intx(s);
+
+        ret = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, s->machine_irq, 0,
+                                        e_device, e_intx);
+        if (ret < 0) {
+            PT_LOG("Error: Rebinding of interrupt failed! ret=%d\n", ret);
+        }
+    }
+
+    return ret;
+}
+
+static uint8_t find_cap_offset(XenPCIPassthroughState *s, uint8_t cap)
+{
+    int id;
+    int max_cap = 48;
+    int pos = PCI_CAPABILITY_LIST;
+    int status;
+
+    status = host_pci_get_byte(s->real_device, PCI_STATUS);
+    if ((status & PCI_STATUS_CAP_LIST) == 0) {
+        return 0;
+    }
+
+    while (max_cap--) {
+        pos = host_pci_get_byte(s->real_device, pos);
+        if (pos < 0x40) {
+            break;
+        }
+
+        pos &= ~3;
+        id = host_pci_get_byte(s->real_device, pos + PCI_CAP_LIST_ID);
+
+        if (id == 0xff) {
+            break;
+        }
+        if (id == cap) {
+            return pos;
+        }
+
+        pos += PCI_CAP_LIST_NEXT;
+    }
+    return 0;
+}
+
+static void pt_config_reg_init(XenPCIPassthroughState *s,
+                               XenPTRegGroup *reg_grp, XenPTRegInfo *reg)
+{
+    XenPTReg *reg_entry;
+    uint32_t data = 0;
+
+    reg_entry = g_malloc0(sizeof (XenPTReg));
+
+    reg_entry->reg = reg;
+    reg_entry->data = 0;
+
+    if (reg->init) {
+        /* initialize emulate register */
+        data = reg->init(s, reg_entry->reg,
+                         reg_grp->base_offset + reg->offset);
+        if (data == PT_INVALID_REG) {
+            /* free unused BAR register entry */
+            free(reg_entry);
+            return;
+        }
+        /* set register value */
+        reg_entry->data = data;
+    }
+    /* list add register entry */
+    QLIST_INSERT_HEAD(&reg_grp->reg_tbl_list, reg_entry, entries);
+
+    return;
+}
+
+void pt_config_init(XenPCIPassthroughState *s)
+{
+    XenPTRegGroup *reg_grp_entry = NULL;
+    uint32_t reg_grp_offset = 0;
+    XenPTRegInfo *reg_tbl = NULL;
+    int i, j;
+
+    QLIST_INIT(&s->reg_grp_tbl);
+
+    for (i = 0; pt_emu_reg_grp_tbl[i].grp_size != 0; i++) {
+        if (pt_emu_reg_grp_tbl[i].grp_id != 0xFF) {
+            if (pt_hide_dev_cap(s->real_device,
+                                pt_emu_reg_grp_tbl[i].grp_id)) {
+                continue;
+            }
+
+            reg_grp_offset = find_cap_offset(s, pt_emu_reg_grp_tbl[i].grp_id);
+
+            if (!reg_grp_offset) {
+                continue;
+            }
+        }
+
+        reg_grp_entry = g_malloc0(sizeof (XenPTRegGroup));
+        QLIST_INIT(&reg_grp_entry->reg_tbl_list);
+        QLIST_INSERT_HEAD(&s->reg_grp_tbl, reg_grp_entry, entries);
+
+        reg_grp_entry->base_offset = reg_grp_offset;
+        reg_grp_entry->reg_grp = pt_emu_reg_grp_tbl + i;
+        if (pt_emu_reg_grp_tbl[i].size_init) {
+            /* get register group size */
+            reg_grp_entry->size =
+                pt_emu_reg_grp_tbl[i].size_init(s, reg_grp_entry->reg_grp,
+                                                reg_grp_offset);
+        }
+
+        if (pt_emu_reg_grp_tbl[i].grp_type == GRP_TYPE_EMU) {
+            if (pt_emu_reg_grp_tbl[i].emu_reg_tbl) {
+                reg_tbl = pt_emu_reg_grp_tbl[i].emu_reg_tbl;
+                /* initialize capability register */
+                for (j = 0; reg_tbl->size != 0; j++, reg_tbl++) {
+                    /* initialize capability register */
+                    pt_config_reg_init(s, reg_grp_entry, reg_tbl);
+                }
+            }
+        }
+        reg_grp_offset = 0;
+    }
+
+    return;
+}
+
+/* delete all emulate register */
+void pt_config_delete(XenPCIPassthroughState *s)
+{
+    struct XenPTRegGroup *reg_group, *next_grp;
+    struct XenPTReg *reg, *next_reg;
+
+    /* free Power Management info table */
+    if (s->pm_state) {
+        if (s->pm_state->pm_timer) {
+            qemu_del_timer(s->pm_state->pm_timer);
+            qemu_free_timer(s->pm_state->pm_timer);
+            s->pm_state->pm_timer = NULL;
+        }
+
+        g_free(s->pm_state);
+    }
+
+    /* free all register group entry */
+    QLIST_FOREACH_SAFE(reg_group, &s->reg_grp_tbl, entries, next_grp) {
+        /* free all register entry */
+        QLIST_FOREACH_SAFE(reg, &reg_group->reg_tbl_list, entries, next_reg) {
+            QLIST_REMOVE(reg, entries);
+            g_free(reg);
+        }
+
+        QLIST_REMOVE(reg_group, entries);
+        g_free(reg_group);
+    }
+}