Patchwork [V3,07/10] Introduce Xen PCI Passthrough, qdevice (1/3)

login
register
mail settings
Submitter Anthony PERARD
Date Oct. 28, 2011, 3:07 p.m.
Message ID <1319814456-8158-8-git-send-email-anthony.perard@citrix.com>
Download mbox | patch
Permalink /patch/122426/
State New
Headers show

Comments

Anthony PERARD - Oct. 28, 2011, 3:07 p.m.
From: Allen Kay <allen.m.kay@intel.com>

Signed-off-by: Allen Kay <allen.m.kay@intel.com>
Signed-off-by: Guy Zana <guy@neocleus.com>
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 Makefile.target                  |    2 +
 hw/xen_pci_passthrough.c         |  838 ++++++++++++++++++++++++++++++++++++++
 hw/xen_pci_passthrough.h         |  223 ++++++++++
 hw/xen_pci_passthrough_helpers.c |   46 ++
 4 files changed, 1109 insertions(+), 0 deletions(-)
 create mode 100644 hw/xen_pci_passthrough.c
 create mode 100644 hw/xen_pci_passthrough.h
 create mode 100644 hw/xen_pci_passthrough_helpers.c
Stefano Stabellini - Nov. 8, 2011, 12:56 p.m.
On Fri, 28 Oct 2011, Anthony PERARD wrote:
> From: Allen Kay <allen.m.kay@intel.com>
> 
> Signed-off-by: Allen Kay <allen.m.kay@intel.com>
> Signed-off-by: Guy Zana <guy@neocleus.com>
> Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
> ---
>  Makefile.target                  |    2 +
>  hw/xen_pci_passthrough.c         |  838 ++++++++++++++++++++++++++++++++++++++
>  hw/xen_pci_passthrough.h         |  223 ++++++++++
>  hw/xen_pci_passthrough_helpers.c |   46 ++
>  4 files changed, 1109 insertions(+), 0 deletions(-)
>  create mode 100644 hw/xen_pci_passthrough.c
>  create mode 100644 hw/xen_pci_passthrough.h
>  create mode 100644 hw/xen_pci_passthrough_helpers.c
> 
> diff --git a/Makefile.target b/Makefile.target
> index 243f9f2..36ea47d 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -217,6 +217,8 @@ obj-i386-$(CONFIG_XEN) += xen_platform.o
> 
>  # Xen PCI Passthrough
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
> +obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
> +obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_helpers.o
> 
>  # Inter-VM PCI shared memory
>  CONFIG_IVSHMEM =
> diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
> new file mode 100644
> index 0000000..b97c5b6
> --- /dev/null
> +++ b/hw/xen_pci_passthrough.c
> @@ -0,0 +1,838 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + * Copyright (c) 2007, Intel Corporation.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Alex Novik <alex@neocleus.com>
> + * Allen Kay <allen.m.kay@intel.com>
> + * Guy Zana <guy@neocleus.com>
> + *
> + * This file implements direct PCI assignment to a HVM guest
> + */
> +
> +/*
> + * Interrupt Disable policy:
> + *
> + * INTx interrupt:
> + *   Initialize(register_real_device)
> + *     Map INTx(xc_physdev_map_pirq):
> + *       <fail>
> + *         - Set real Interrupt Disable bit to '1'.
> + *         - Set machine_irq and assigned_device->machine_irq to '0'.
> + *         * Don't bind INTx.
> + *
> + *     Bind INTx(xc_domain_bind_pt_pci_irq):
> + *       <fail>
> + *         - Set real Interrupt Disable bit to '1'.
> + *         - Unmap INTx.
> + *         - Decrement mapped_machine_irq[machine_irq]
> + *         - Set assigned_device->machine_irq to '0'.
> + *
> + *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
> + *     Write '0'
> + *       <ptdev->msi_trans_en is false>
> + *         - Set real bit to '0' if assigned_device->machine_irq isn't '0'.
> + *
> + *     Write '1'
> + *       <ptdev->msi_trans_en is false>
> + *         - Set real bit to '1'.
> + *
> + * MSI-INTx translation.
> + *   Initialize(xc_physdev_map_pirq_msi/pt_msi_setup)
> + *     Bind MSI-INTx(xc_domain_bind_pt_irq)
> + *       <fail>
> + *         - Unmap MSI.
> + *           <success>
> + *             - Set dev->msi->pirq to '-1'.
> + *           <fail>
> + *             - Do nothing.
> + *
> + *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
> + *     Write '0'
> + *       <ptdev->msi_trans_en is true>
> + *         - Set MSI Enable bit to '1'.
> + *
> + *     Write '1'
> + *       <ptdev->msi_trans_en is true>
> + *         - Set MSI Enable bit to '0'.
> + *
> + * MSI interrupt:
> + *   Initialize MSI register(pt_msi_setup, pt_msi_update)
> + *     Bind MSI(xc_domain_update_msi_irq)
> + *       <fail>
> + *         - Unmap MSI.
> + *         - Set dev->msi->pirq to '-1'.
> + *
> + * MSI-X interrupt:
> + *   Initialize MSI-X register(pt_msix_update_one)
> + *     Bind MSI-X(xc_domain_update_msi_irq)
> + *       <fail>
> + *         - Unmap MSI-X.
> + *         - Set entry->pirq to '-1'.
> + */
> +

you should move all the MSI related comments to the MSI patch


> +#include <sys/ioctl.h>
> +
> +#include "pci.h"
> +#include "xen.h"
> +#include "xen_backend.h"
> +#include "xen_pci_passthrough.h"
> +
> +#define PCI_BAR_ENTRIES (6)
> +
> +#define PT_NR_IRQS          (256)
> +char mapped_machine_irq[PT_NR_IRQS] = {0};
> +
> +/* Config Space */
> +static int pt_pci_config_access_check(PCIDevice *d, uint32_t address, int len)
> +{
> +    /* check offset range */
> +    if (address >= 0xFF) {
> +        PT_LOG("Error: Failed to access register with offset exceeding FFh. "
> +               "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               address, len);
> +        return -1;
> +    }
> +
> +    /* check read size */
> +    if ((len != 1) && (len != 2) && (len != 4)) {
> +        PT_LOG("Error: Failed to access register with invalid access length. "
> +               "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               address, len);
> +        return -1;
> +    }
> +
> +    /* check offset alignment */
> +    if (address & (len - 1)) {
> +        PT_LOG("Error: Failed to access register with invalid access size "
> +            "alignment. [%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
> +            pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +            address, len);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +int pt_bar_offset_to_index(uint32_t offset)
> +{
> +    int index = 0;
> +
> +    /* check Exp ROM BAR */
> +    if (offset == PCI_ROM_ADDRESS) {
> +        return PCI_ROM_SLOT;
> +    }
> +
> +    /* calculate BAR index */
> +    index = (offset - PCI_BASE_ADDRESS_0) >> 2;
> +    if (index >= PCI_NUM_REGIONS) {
> +        return -1;
> +    }
> +
> +    return index;
> +}
> +
> +static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
> +    uint32_t val = 0;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    int rc = 0;
> +    int emul_len = 0;
> +    uint32_t find_addr = address;
> +
> +    if (pt_pci_config_access_check(d, address, len)) {
> +        goto exit;
> +    }
> +
> +    /* check power state transition flags */
> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> +        /* can't accept until previous power state transition is completed.
> +         * so finished previous request here.
> +         */
> +        PT_LOG("Warning: guest want to write durring power state transition\n");
> +        goto exit;
> +    }
> +
> +    /* find register group entry */
> +    reg_grp_entry = pt_find_reg_grp(s, address);
> +    if (reg_grp_entry) {
> +        /* check 0 Hardwired register group */
> +        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
> +            /* no need to emulate, just return 0 */
> +            val = 0;
> +            goto exit;
> +        }
> +    }
> +
> +    /* read I/O device register value */
> +    rc = host_pci_get_block(s->real_device, address, (uint8_t *)&val, len);
> +    if (!rc) {
> +        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
> +        memset(&val, 0xff, len);
> +    }
> +
> +    /* just return the I/O device register value for
> +     * passthrough type register group */
> +    if (reg_grp_entry == NULL) {
> +        goto exit;
> +    }
> +
> +    /* adjust the read value to appropriate CFC-CFF window */
> +    val <<= (address & 3) << 3;
> +    emul_len = len;
> +
> +    /* loop Guest request size */
> +    while (emul_len > 0) {
> +        /* find register entry to be emulated */
> +        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
> +        if (reg_entry) {
> +            XenPTRegInfo *reg = reg_entry->reg;
> +            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
> +            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
> +            uint8_t *ptr_val = NULL;
> +
> +            valid_mask <<= (find_addr - real_offset) << 3;
> +            ptr_val = (uint8_t *)&val + (real_offset & 3);
> +
> +            /* do emulation depend on register size */
> +            switch (reg->size) {
> +            case 1:
> +                if (reg->u.b.read) {
> +                    rc = reg->u.b.read(s, reg_entry, ptr_val, valid_mask);
> +                }
> +                break;
> +            case 2:
> +                if (reg->u.w.read) {
> +                    rc = reg->u.w.read(s, reg_entry,
> +                                       (uint16_t *)ptr_val, valid_mask);
> +                }
> +                break;
> +            case 4:
> +                if (reg->u.dw.read) {
> +                    rc = reg->u.dw.read(s, reg_entry,
> +                                        (uint32_t *)ptr_val, valid_mask);
> +                }
> +                break;
> +            }
> +
> +            if (rc < 0) {
> +                hw_error("Internal error: Invalid read emulation "
> +                         "return value[%d]. I/O emulator exit.\n", rc);
> +            }
> +
> +            /* calculate next address to find */
> +            emul_len -= reg->size;
> +            if (emul_len > 0) {
> +                find_addr = real_offset + reg->size;
> +            }
> +        } else {
> +            /* nothing to do with passthrough type register,
> +             * continue to find next byte */
> +            emul_len--;
> +            find_addr++;
> +        }
> +    }
> +
> +    /* need to shift back before returning them to pci bus emulator */
> +    val >>= ((address & 3) << 3);
> +
> +exit:
> +    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
> +                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +                  address, val, len);
> +    return val;
> +}
> +
> +static void pt_pci_write_config(PCIDevice *d, uint32_t address,
> +                                uint32_t val, int len)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
> +    int index = 0;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    int rc = 0;
> +    uint32_t read_val = 0;
> +    int emul_len = 0;
> +    XenPTReg *reg_entry = NULL;
> +    uint32_t find_addr = address;
> +    XenPTRegInfo *reg = NULL;
> +
> +    if (pt_pci_config_access_check(d, address, len)) {
> +        return;
> +    }
> +
> +    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
> +                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +                  address, val, len);
> +
> +    /* check unused BAR register */
> +    index = pt_bar_offset_to_index(address);
> +    if ((index >= 0) && (val > 0 && val < PT_BAR_ALLF) &&
> +        (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED)) {
> +        PT_LOG("Warning: Guest attempt to set address to unused Base Address "
> +               "Register. [%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               address, len);
> +    }
> +
> +    /* check power state transition flags */
> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> +        /* can't accept untill previous power state transition is completed.
> +         * so finished previous request here.
> +         */
> +        PT_LOG("Warning: guest want to write durring power state transition\n");
> +        return;
> +    }
> +
> +    /* find register group entry */
> +    reg_grp_entry = pt_find_reg_grp(s, address);
> +    if (reg_grp_entry) {
> +        /* check 0 Hardwired register group */
> +        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
> +            /* ignore silently */
> +            PT_LOG("Warning: Access to 0 Hardwired register. "
> +                   "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
> +                   pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +                   address, len);
> +            return;
> +        }
> +    }
> +
> +    /* read I/O device register value */
> +    rc = host_pci_get_block(s->real_device, address,
> +                             (uint8_t *)&read_val, len);
> +    if (!rc) {
> +        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
> +        memset(&read_val, 0xff, len);
> +    }
> +
> +    /* pass directly to libpci for passthrough type register group */
> +    if (reg_grp_entry == NULL) {
> +        goto out;
> +    }
> +
> +    /* adjust the read and write value to appropriate CFC-CFF window */
> +    read_val <<= (address & 3) << 3;
> +    val <<= (address & 3) << 3;
> +    emul_len = len;
> +
> +    /* loop Guest request size */
> +    while (emul_len > 0) {
> +        /* find register entry to be emulated */
> +        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
> +        if (reg_entry) {
> +            reg = reg_entry->reg;
> +            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
> +            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
> +            uint8_t *ptr_val = NULL;
> +
> +            valid_mask <<= (find_addr - real_offset) << 3;
> +            ptr_val = (uint8_t *)&val + (real_offset & 3);
> +
> +            /* do emulation depend on register size */
> +            switch (reg->size) {
> +            case 1:
> +                if (reg->u.b.write) {
> +                    rc = reg->u.b.write(s, reg_entry, ptr_val,
> +                                        read_val >> ((real_offset & 3) << 3),
> +                                        valid_mask);
> +                }
> +                break;
> +            case 2:
> +                if (reg->u.w.write) {
> +                    rc = reg->u.w.write(s, reg_entry, (uint16_t *)ptr_val,
> +                                        (read_val >> ((real_offset & 3) << 3)),
> +                                        valid_mask);
> +                }
> +                break;
> +            case 4:
> +                if (reg->u.dw.write) {
> +                    rc = reg->u.dw.write(s, reg_entry, (uint32_t *)ptr_val,
> +                                         (read_val >> ((real_offset & 3) << 3)),
> +                                         valid_mask);
> +                }
> +                break;
> +            }
> +
> +            if (rc < 0) {
> +                hw_error("Internal error: Invalid write emulation "
> +                         "return value[%d]. I/O emulator exit.\n", rc);
> +            }
> +
> +            /* calculate next address to find */
> +            emul_len -= reg->size;
> +            if (emul_len > 0) {
> +                find_addr = real_offset + reg->size;
> +            }
> +        } else {
> +            /* nothing to do with passthrough type register,
> +             * continue to find next byte */
> +            emul_len--;
> +            find_addr++;
> +        }
> +    }
> +
> +    /* need to shift back before passing them to libpci */
> +    val >>= (address & 3) << 3;
> +
> +out:
> +    if (!(reg && reg->no_wb)) {
> +        /* unknown regs are passed through */
> +        rc = host_pci_set_block(s->real_device, address, (uint8_t *)&val, len);
> +
> +        if (!rc) {
> +            PT_LOG("Error: pci_write_block failed. return value[%d].\n", rc);
> +        }
> +    }
> +
> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> +        qemu_mod_timer(s->pm_state->pm_timer,
> +                       qemu_get_clock_ms(rt_clock) + s->pm_state->pm_delay);
> +    }
> +}

Where is this timer allocated and initialized?


> +/* ioport/iomem space*/
> +static void pt_iomem_map(XenPCIPassthroughState *s, int i,
> +                         pcibus_t e_phys, pcibus_t e_size, int type)
> +{
> +    uint32_t old_ebase = s->bases[i].e_physbase;
> +    bool first_map = s->bases[i].e_size == 0;
> +    int ret = 0;
> +
> +    s->bases[i].e_physbase = e_phys;
> +    s->bases[i].e_size = e_size;
> +
> +    PT_LOG("e_phys=%#"PRIx64" maddr=%#"PRIx64" type=%%d"
> +           " len=%#"PRIx64" index=%d first_map=%d\n",
> +           e_phys, s->bases[i].access.maddr, /*type,*/
> +           e_size, i, first_map);
> +
> +    if (e_size == 0) {
> +        return;
> +    }
> +
> +    if (!first_map && old_ebase != -1) {
> +        /* Remove old mapping */
> +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
> +                               old_ebase >> XC_PAGE_SHIFT,
> +                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> +                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
> +                               DPCI_REMOVE_MAPPING);
> +        if (ret != 0) {
> +            PT_LOG("Error: remove old mapping failed!\n");
> +            return;
> +        }
> +    }
> +
> +    /* map only valid guest address */
> +    if (e_phys != -1) {
> +        /* Create new mapping */
> +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
> +                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
> +                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> +                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
> +                                   DPCI_ADD_MAPPING);
> +
> +        if (ret != 0) {
> +            PT_LOG("Error: create new mapping failed!\n");
> +        }
> +    }
> +}
> +
> +static void pt_ioport_map(XenPCIPassthroughState *s, int i,
> +                          pcibus_t e_phys, pcibus_t e_size, int type)
> +{
> +    uint32_t old_ebase = s->bases[i].e_physbase;
> +    bool first_map = s->bases[i].e_size == 0;
> +    int ret = 0;
> +
> +    s->bases[i].e_physbase = e_phys;
> +    s->bases[i].e_size = e_size;
> +
> +    PT_LOG("e_phys=%#04"PRIx64" pio_base=%#04"PRIx64" len=%"PRId64" index=%d"
> +           " first_map=%d\n",
> +           e_phys, s->bases[i].access.pio_base, e_size, i, first_map);
> +
> +    if (e_size == 0) {
> +        return;
> +    }
> +
> +    if (!first_map && old_ebase != -1) {
> +        /* Remove old mapping */
> +        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, old_ebase,
> +                                       s->bases[i].access.pio_base, e_size,
> +                                       DPCI_REMOVE_MAPPING);
> +        if (ret != 0) {
> +            PT_LOG("Error: remove old mapping failed!\n");
> +            return;
> +        }
> +    }
> +
> +    /* map only valid guest address (include 0) */
> +    if (e_phys != -1) {
> +        /* Create new mapping */
> +        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, e_phys,
> +                                       s->bases[i].access.pio_base, e_size,
> +                                       DPCI_ADD_MAPPING);
> +        if (ret != 0) {
> +            PT_LOG("Error: create new mapping failed!\n");
> +        }
> +    }
> +
> +}
> +
> +
> +/* mapping BAR */
> +
> +void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
> +                        int io_enable, int mem_enable)
> +{
> +    PCIDevice *dev = &s->dev;
> +    PCIIORegion *r;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegion *base = NULL;
> +    pcibus_t r_size = 0, r_addr = -1;
> +    int rc = 0;
> +
> +    r = &dev->io_regions[bar];
> +
> +    /* check valid region */
> +    if (!r->size) {
> +        return;
> +    }
> +
> +    base = &s->bases[bar];
> +    /* skip unused BAR or upper 64bit BAR */
> +    if ((base->bar_flag == PT_BAR_FLAG_UNUSED)
> +        || (base->bar_flag == PT_BAR_FLAG_UPPER)) {
> +           return;
> +    }
> +
> +    /* copy region address to temporary */
> +    r_addr = r->addr;
> +
> +    /* need unmapping in case I/O Space or Memory Space disable */
> +    if (((base->bar_flag == PT_BAR_FLAG_IO) && !io_enable) ||
> +        ((base->bar_flag == PT_BAR_FLAG_MEM) && !mem_enable)) {
> +        r_addr = -1;
> +    }
> +    if ((bar == PCI_ROM_SLOT) && (r_addr != -1)) {
> +        reg_grp_entry = pt_find_reg_grp(s, PCI_ROM_ADDRESS);
> +        if (reg_grp_entry) {
> +            reg_entry = pt_find_reg(reg_grp_entry, PCI_ROM_ADDRESS);
> +            if (reg_entry && !(reg_entry->data & PCI_ROM_ADDRESS_ENABLE)) {
> +                r_addr = -1;
> +            }
> +        }
> +    }
> +
> +    /* prevent guest software mapping memory resource to 00000000h */
> +    if ((base->bar_flag == PT_BAR_FLAG_MEM) && (r_addr == 0)) {
> +        r_addr = -1;
> +    }
> +
> +    r_size = pt_get_emul_size(base->bar_flag, r->size);
> +
> +    rc = pci_check_bar_overlap(dev, r_addr, r_size, r->type);
> +    if (rc > 0) {
> +        PT_LOG("Warning: s[%02x:%02x.%x][Region:%d][Address:%"FMT_PCIBUS"h]"
> +               "[Size:%"FMT_PCIBUS"h] is overlapped.\n", pci_bus_num(dev->bus),
> +               PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), bar,
> +               r_addr, r_size);
> +    }
> +
> +    /* check whether we need to update the mapping or not */
> +    if (r_addr != s->bases[bar].e_physbase) {
> +        /* mapping BAR */
> +        if (base->bar_flag == PT_BAR_FLAG_IO) {
> +            pt_ioport_map(s, bar, r_addr, r_size, r->type);
> +        } else {
> +            pt_iomem_map(s, bar, r_addr, r_size, r->type);
> +        }
> +    }
> +}
> +
> +void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable)
> +{
> +    int i;
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        pt_bar_mapping_one(s, i, io_enable, mem_enable);
> +    }
> +}
> +
> +/* register regions */
> +static int pt_register_regions(XenPCIPassthroughState *s)
> +{
> +    int i = 0;
> +    uint32_t bar_data = 0;
> +    HostPCIDevice *d = s->real_device;
> +
> +    /* Register PIO/MMIO BARs */
> +    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
> +        HostPCIIORegion *r = &d->io_regions[i];
> +
> +        if (r->base_addr) {
> +            s->bases[i].e_physbase = r->base_addr;
> +            s->bases[i].access.u = r->base_addr;
> +
> +            /* Register current region */
> +            if (r->flags & IORESOURCE_IO) {
> +                memory_region_init_io(&s->bar[i], NULL, NULL,
> +                                      "xen-pci-pt-bar", r->size);
> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_IO,
> +                                 &s->bar[i]);
> +            } else if (r->flags & IORESOURCE_PREFETCH) {
> +                memory_region_init_io(&s->bar[i], NULL, NULL,
> +                                      "xen-pci-pt-bar", r->size);
> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_MEM_PREFETCH,
> +                                 &s->bar[i]);
> +            } else {
> +                memory_region_init_io(&s->bar[i], NULL, NULL,
> +                                      "xen-pci-pt-bar", r->size);
> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_MEMORY,
> +                                 &s->bar[i]);
> +            }
> +
> +            PT_LOG("IO region registered (size=0x%08"PRIx64
> +                   " base_addr=0x%08"PRIx64")\n",
> +                   r->size, r->base_addr);
> +        }
> +    }
> +
> +    /* Register expansion ROM address */
> +    if (d->rom.base_addr && d->rom.size) {
> +        /* Re-set BAR reported by OS, otherwise ROM can't be read. */
> +        bar_data = host_pci_get_long(d, PCI_ROM_ADDRESS);
> +        if ((bar_data & PCI_ROM_ADDRESS_MASK) == 0) {
> +            bar_data |= d->rom.base_addr & PCI_ROM_ADDRESS_MASK;
> +            host_pci_set_long(d, PCI_ROM_ADDRESS, bar_data);
> +        }
> +
> +        s->bases[PCI_ROM_SLOT].e_physbase = d->rom.base_addr;
> +        s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
> +
> +        memory_region_init_rom_device(&s->rom, NULL, NULL, &s->dev.qdev,
> +                                      "xen-pci-pt-rom", d->rom.size);
> +        pci_register_bar(&s->dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
> +                         &s->rom);
> +
> +        PT_LOG("Expansion ROM registered (size=0x%08"PRIx64
> +               " base_addr=0x%08"PRIx64")\n",
> +               d->rom.size, d->rom.base_addr);
> +    }
> +
> +    return 0;
> +}
> +
> +static void pt_unregister_regions(XenPCIPassthroughState *s)
> +{
> +    int i, type, rc;
> +    uint32_t e_size;
> +    PCIDevice *d = &s->dev;
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        e_size = s->bases[i].e_size;
> +        if ((e_size == 0) || (s->bases[i].e_physbase == -1)) {
> +            continue;
> +        }
> +
> +        type = d->io_regions[i].type;
> +
> +        if (type == PCI_BASE_ADDRESS_SPACE_MEMORY
> +            || type == PCI_BASE_ADDRESS_MEM_PREFETCH) {
> +            rc = xc_domain_memory_mapping(xen_xc, xen_domid,
> +                    s->bases[i].e_physbase >> XC_PAGE_SHIFT,
> +                    s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> +                    (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
> +                    DPCI_REMOVE_MAPPING);
> +            if (rc != 0) {
> +                PT_LOG("Error: remove old mem mapping failed!\n");
> +                continue;
> +            }
> +
> +        } else if (type == PCI_BASE_ADDRESS_SPACE_IO) {
> +            rc = xc_domain_ioport_mapping(xen_xc, xen_domid,
> +                        s->bases[i].e_physbase,
> +                        s->bases[i].access.pio_base,
> +                        e_size,
> +                        DPCI_REMOVE_MAPPING);
> +            if (rc != 0) {
> +                PT_LOG("Error: remove old io mapping failed!\n");
> +                continue;
> +            }
> +        }
> +    }
> +}
> +
> +static int pt_initfn(PCIDevice *pcidev)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
> +    int dom, bus;
> +    unsigned slot, func;
> +    int rc = 0;
> +    uint32_t machine_irq;
> +    int pirq = -1;
> +
> +    if (pci_parse_devaddr(s->hostaddr, &dom, &bus, &slot, &func) < 0) {
> +        fprintf(stderr, "error parse bdf: %s\n", s->hostaddr);
> +        return -1;
> +    }
> +
> +    /* register real device */
> +    PT_LOG("Assigning real physical device %02x:%02x.%x to devfn %i ...\n",
> +           bus, slot, func, s->dev.devfn);
> +
> +    s->real_device = host_pci_device_get(bus, slot, func);
> +    if (!s->real_device) {
> +        return -1;
> +    }
> +
> +    s->is_virtfn = s->real_device->is_virtfn;
> +    if (s->is_virtfn) {
> +        PT_LOG("%04x:%02x:%02x.%x is a SR-IOV Virtual Function\n",
> +               s->real_device->domain, bus, slot, func);
> +    }
> +
> +    /* Initialize virtualized PCI configuration (Extended 256 Bytes) */
> +    if (host_pci_get_block(s->real_device, 0, pcidev->config,
> +                           PCI_CONFIG_SPACE_SIZE) == -1) {
> +        return -1;
> +    }
> +
> +    /* Handle real device's MMIO/PIO BARs */
> +    pt_register_regions(s);
> +
> +    /* reinitialize each config register to be emulated */
> +    pt_config_init(s);

this function is implemented in the next patch, so you might as well add
this call there


> +    /* Bind interrupt */
> +    if (!s->dev.config[PCI_INTERRUPT_PIN]) {
> +        PT_LOG("no pin interrupt\n");
> +        goto out;
> +    }
> +
> +    machine_irq = host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE);
> +    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
> +
> +    if (rc) {
> +        PT_LOG("Error: Mapping irq failed, rc = %d\n", rc);
> +
> +        /* Disable PCI intx assertion (turn on bit10 of devctl) */
> +        host_pci_set_word(s->real_device,
> +                          PCI_COMMAND,
> +                          pci_get_word(s->dev.config + PCI_COMMAND)
> +                          | PCI_COMMAND_INTX_DISABLE);
> +        machine_irq = 0;
> +        s->machine_irq = 0;
> +    } else {
> +        machine_irq = pirq;
> +        s->machine_irq = pirq;
> +        mapped_machine_irq[machine_irq]++;
> +    }
> +
> +    /* bind machine_irq to device */
> +    if (rc < 0 && machine_irq != 0) {
> +        uint8_t e_device = PCI_SLOT(s->dev.devfn);
> +        uint8_t e_intx = pci_intx(s);
> +
> +        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq, 0,
> +                                       e_device, e_intx);
> +        if (rc < 0) {
> +            PT_LOG("Error: Binding of interrupt failed! rc=%d\n", rc);
> +
> +            /* Disable PCI intx assertion (turn on bit10 of devctl) */
> +            host_pci_set_word(s->real_device, PCI_COMMAND,
> +                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
> +                              | PCI_COMMAND_INTX_DISABLE);
> +            mapped_machine_irq[machine_irq]--;
> +
> +            if (mapped_machine_irq[machine_irq] == 0) {
> +                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
> +                    PT_LOG("Error: Unmapping of interrupt failed! rc=%d\n",
> +                           rc);
> +                }
> +            }
> +            s->machine_irq = 0;
> +        }
> +    }
> +
> +out:
> +    PT_LOG("Real physical device %02x:%02x.%x registered successfuly!\n"
> +           "IRQ type = %s\n", bus, slot, func, "INTx");
> +
> +    return 0;
> +}
> +
> +static int pt_unregister_device(PCIDevice *pcidev)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
> +    uint8_t e_device, e_intx;
> +    uint32_t machine_irq;
> +    int rc;
> +
> +    /* Unbind interrupt */
> +    e_device = PCI_SLOT(s->dev.devfn);
> +    e_intx = pci_intx(s);
> +    machine_irq = s->machine_irq;
> +
> +    if (machine_irq) {
> +        rc = xc_domain_unbind_pt_irq(xen_xc, xen_domid, machine_irq,
> +                                     PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 0);
> +        if (rc < 0) {
> +            PT_LOG("Error: Unbinding of interrupt failed! rc=%d\n", rc);
> +        }
> +    }
> +
> +    if (machine_irq) {
> +        mapped_machine_irq[machine_irq]--;
> +
> +        if (mapped_machine_irq[machine_irq] == 0) {
> +            rc = xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq);
> +
> +            if (rc < 0) {
> +                PT_LOG("Error: Unmaping of interrupt failed! rc=%d\n", rc);
> +            }
> +        }
> +    }
> +
> +    /* delete all emulated config registers */
> +    pt_config_delete(s);
> +
> +    /* unregister real device's MMIO/PIO BARs */
> +    pt_unregister_regions(s);
> +
> +    host_pci_device_put(s->real_device);
> +
> +    return 0;
> +}
> +
> +static PCIDeviceInfo xen_pci_passthrough = {
> +    .init = pt_initfn,
> +    .exit = pt_unregister_device,
> +    .qdev.name = "xen-pci-passthrough",
> +    .qdev.desc = "Assign an host pci device with Xen",
> +    .qdev.size = sizeof(XenPCIPassthroughState),
> +    .config_read = pt_pci_read_config,
> +    .config_write = pt_pci_write_config,
> +    .is_express = 0,
> +    .qdev.props = (Property[]) {
> +        DEFINE_PROP_STRING("hostaddr", XenPCIPassthroughState, hostaddr),
> +        DEFINE_PROP_BIT("power-mgmt", XenPCIPassthroughState, power_mgmt,
> +                        0, false),
> +        DEFINE_PROP_END_OF_LIST(),
> +    }
> +};
> +
> +static void xen_passthrough_register(void)
> +{
> +    pci_qdev_register(&xen_pci_passthrough);
> +}
> +
> +device_init(xen_passthrough_register);
> diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
> new file mode 100644
> index 0000000..2d1979d
> --- /dev/null
> +++ b/hw/xen_pci_passthrough.h
> @@ -0,0 +1,223 @@
> +#ifndef QEMU_HW_XEN_PCI_PASSTHROUGH_H
> +#  define QEMU_HW_XEN_PCI_PASSTHROUGH_H
> +
> +#include "qemu-common.h"
> +#include "xen_common.h"
> +#include "pci.h"
> +#include "host-pci-device.h"
> +
> +#define PT_LOGGING_ENABLED
> +#define PT_DEBUG_PCI_CONFIG_ACCESS
> +
> +#ifdef PT_LOGGING_ENABLED
> +#  define PT_LOG(_f, _a...)   fprintf(stderr, "%s: " _f, __func__, ##_a)
> +#else
> +#  define PT_LOG(_f, _a...)
> +#endif
> +
> +#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
> +#  define PT_LOG_CONFIG(_f, _a...) PT_LOG(_f, ##_a)
> +#else
> +#  define PT_LOG_CONFIG(_f, _a...)
> +#endif
> +
> +
> +typedef struct XenPTRegInfo XenPTRegInfo;
> +typedef struct XenPTReg XenPTReg;
> +
> +typedef struct XenPCIPassthroughState XenPCIPassthroughState;
> +
> +/* function type for config reg */
> +typedef uint32_t (*conf_reg_init)
> +    (XenPCIPassthroughState *, XenPTRegInfo *, uint32_t real_offset);
> +typedef int (*conf_dword_write)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint32_t *val, uint32_t dev_value, uint32_t valid_mask);
> +typedef int (*conf_word_write)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint16_t *val, uint16_t dev_value, uint16_t valid_mask);
> +typedef int (*conf_byte_write)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint8_t *val, uint8_t dev_value, uint8_t valid_mask);
> +typedef int (*conf_dword_read)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint32_t *val, uint32_t valid_mask);
> +typedef int (*conf_word_read)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint16_t *val, uint16_t valid_mask);
> +typedef int (*conf_byte_read)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint8_t *val, uint8_t valid_mask);
> +typedef int (*conf_dword_restore)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
> +     uint32_t dev_value, uint32_t *val);
> +typedef int (*conf_word_restore)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
> +     uint16_t dev_value, uint16_t *val);
> +typedef int (*conf_byte_restore)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
> +     uint8_t dev_value, uint8_t *val);
> +
> +/* power state transition */
> +#define PT_FLAG_TRANSITING 0x0001
> +
> +
> +typedef enum {
> +    GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
> +    GRP_TYPE_EMU,                               /* emul reg group */
> +} RegisterGroupType;
> +
> +typedef enum {
> +    PT_BAR_FLAG_MEM = 0,                        /* Memory type BAR */
> +    PT_BAR_FLAG_IO,                             /* I/O type BAR */
> +    PT_BAR_FLAG_UPPER,                          /* upper 64bit BAR */
> +    PT_BAR_FLAG_UNUSED,                         /* unused BAR */
> +} PTBarFlag;
> +
> +
> +typedef struct XenPTRegion {
> +    /* Virtual phys base & size */
> +    uint32_t e_physbase;
> +    uint32_t e_size;
> +    /* Index of region in qemu */
> +    uint32_t memory_index;
> +    /* BAR flag */
> +    PTBarFlag bar_flag;
> +    /* Translation of the emulated address */
> +    union {
> +        uint64_t maddr;
> +        uint64_t pio_base;
> +        uint64_t u;
> +    } access;
> +} XenPTRegion;
> +
> +/* XenPTRegInfo declaration
> + * - only for emulated register (either a part or whole bit).
> + * - for passthrough register that need special behavior (like interacting with
> + *   other component), set emu_mask to all 0 and specify r/w func properly.
> + * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
> + */
> +
> +/* emulated register infomation */
> +struct XenPTRegInfo {
> +    uint32_t offset;
> +    uint32_t size;
> +    uint32_t init_val;
> +    /* reg read only field mask (ON:RO/ROS, OFF:other) */
> +    uint32_t ro_mask;
> +    /* reg emulate field mask (ON:emu, OFF:passthrough) */
> +    uint32_t emu_mask;
> +    /* no write back allowed */
> +    uint32_t no_wb;
> +    conf_reg_init init;
> +    /* read/write/restore function pointer
> +     * for double_word/word/byte size */
> +    union {
> +        struct {
> +            conf_dword_write write;
> +            conf_dword_read read;
> +            conf_dword_restore restore;
> +        } dw;
> +        struct {
> +            conf_word_write write;
> +            conf_word_read read;
> +            conf_word_restore restore;
> +        } w;
> +        struct {
> +            conf_byte_write write;
> +            conf_byte_read read;
> +            conf_byte_restore restore;
> +        } b;
> +    } u;
> +};
> +
> +/* emulated register management */
> +struct XenPTReg {
> +    QLIST_ENTRY(XenPTReg) entries;
> +    XenPTRegInfo *reg;
> +    uint32_t data;
> +};
> +
> +typedef struct XenPTRegGroupInfo XenPTRegGroupInfo;
> +
> +/* emul reg group size initialize method */
> +typedef uint8_t (*pt_reg_size_init_fn)
> +    (XenPCIPassthroughState *, const XenPTRegGroupInfo *,
> +     uint32_t base_offset);
> +
> +/* emulated register group infomation */
> +struct XenPTRegGroupInfo {
> +    uint8_t grp_id;
> +    RegisterGroupType grp_type;
> +    uint8_t grp_size;
> +    pt_reg_size_init_fn size_init;
> +    XenPTRegInfo *emu_reg_tbl;
> +};
> +
> +/* emul register group management table */
> +typedef struct XenPTRegGroup {
> +    QLIST_ENTRY(XenPTRegGroup) entries;
> +    const XenPTRegGroupInfo *reg_grp;
> +    uint32_t base_offset;
> +    uint8_t size;
> +    QLIST_HEAD(, XenPTReg) reg_tbl_list;
> +} XenPTRegGroup;
> +
> +
> +typedef struct XenPTPM {
> +    QEMUTimer *pm_timer;  /* QEMUTimer struct */
> +    int no_soft_reset;    /* No Soft Reset flags */
> +    uint16_t flags;       /* power state transition flags */
> +    uint16_t pmc_field;   /* Power Management Capabilities field */
> +    int pm_delay;         /* power state transition delay */
> +    uint16_t cur_state;   /* current power state */
> +    uint16_t req_state;   /* requested power state */
> +    uint32_t pm_base;     /* Power Management Capability reg base offset */
> +    uint32_t aer_base;    /* AER Capability reg base offset */
> +} XenPTPM;
> +
> +struct XenPCIPassthroughState {
> +    PCIDevice dev;
> +
> +    char *hostaddr;
> +    bool is_virtfn;
> +    HostPCIDevice *real_device;
> +    XenPTRegion bases[PCI_NUM_REGIONS]; /* Access regions */
> +    QLIST_HEAD(, XenPTRegGroup) reg_grp_tbl;
> +
> +    uint32_t machine_irq;
> +
> +    uint32_t power_mgmt;
> +    XenPTPM *pm_state;
> +
> +    MemoryRegion bar[PCI_NUM_REGIONS - 1];
> +    MemoryRegion rom;
> +};
> +
> +void pt_config_init(XenPCIPassthroughState *s);
> +void pt_config_delete(XenPCIPassthroughState *s);
> +void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable);
> +void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
> +                        int io_enable, int mem_enable);
> +XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address);
> +XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address);
> +int pt_bar_offset_to_index(uint32_t offset);
> +
> +static inline pcibus_t pt_get_emul_size(PTBarFlag flag, pcibus_t r_size)
> +{
> +    /* align resource size (memory type only) */
> +    if (flag == PT_BAR_FLAG_MEM) {
> +        return (r_size + XC_PAGE_SIZE - 1) & XC_PAGE_MASK;
> +    } else {
> +        return r_size;
> +    }
> +}
> +
> +/* INTx */
> +static inline uint8_t pci_read_intx(XenPCIPassthroughState *s)
> +{
> +    return host_pci_get_byte(s->real_device, PCI_INTERRUPT_PIN);
> +}
> +uint8_t pci_intx(XenPCIPassthroughState *ptdev);
> +
> +#endif /* !QEMU_HW_XEN_PCI_PASSTHROUGH_H */
> diff --git a/hw/xen_pci_passthrough_helpers.c b/hw/xen_pci_passthrough_helpers.c
> new file mode 100644
> index 0000000..192e918
> --- /dev/null
> +++ b/hw/xen_pci_passthrough_helpers.c
> @@ -0,0 +1,46 @@
> +#include "xen_pci_passthrough.h"
> +
> +/* The PCI Local Bus Specification, Rev. 3.0, {
> + * Section 6.2.4 Miscellaneous Registers, pp 223
> + * outlines 5 valid values for the intertupt pin (intx).
> + *  0: For devices (or device functions) that don't use an interrupt in
> + *  1: INTA#
> + *  2: INTB#
> + *  3: INTC#
> + *  4: INTD#
> + *
> + * Xen uses the following 4 values for intx
> + *  0: INTA#
> + *  1: INTB#
> + *  2: INTC#
> + *  3: INTD#
> + *
> + * Observing that these list of values are not the same, pci_read_intx()
> + * uses the following mapping from hw to xen values.
> + * This seems to reflect the current usage within Xen.
> + *
> + * PCI hardware    | Xen | Notes
> + * ----------------+-----+----------------------------------------------------
> + * 0               | 0   | No interrupt
> + * 1               | 0   | INTA#
> + * 2               | 1   | INTB#
> + * 3               | 2   | INTC#
> + * 4               | 3   | INTD#
> + * any other value | 0   | This should never happen, log error message
> +}
> + */
> +uint8_t pci_intx(XenPCIPassthroughState *ptdev)
> +{
> +    uint8_t r_val = pci_read_intx(ptdev);
> +
> +    PT_LOG("intx=%i\n", r_val);
> +    if (r_val < 1 || r_val > 4) {
> +        PT_LOG("Interrupt pin read from hardware is out of range: "
> +               "value=%i, acceptable range is 1 - 4\n", r_val);
> +        r_val = 0;
> +    } else {
> +        r_val -= 1;
> +    }
> +
> +    return r_val;
> +}
 
if xen_pci_passthrough_helpers.c is only going to contain this function
you might as well declared it static inline and move it to
xen_pci_passthrough.h
Anthony PERARD - Nov. 9, 2011, 5:03 p.m.
On Tue, Nov 8, 2011 at 12:56, Stefano Stabellini
<stefano.stabellini@eu.citrix.com> wrote:
> On Fri, 28 Oct 2011, Anthony PERARD wrote:
>> From: Allen Kay <allen.m.kay@intel.com>
>>
>> Signed-off-by: Allen Kay <allen.m.kay@intel.com>
>> Signed-off-by: Guy Zana <guy@neocleus.com>
>> Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
>> ---
>>  Makefile.target                  |    2 +
>>  hw/xen_pci_passthrough.c         |  838 ++++++++++++++++++++++++++++++++++++++
>>  hw/xen_pci_passthrough.h         |  223 ++++++++++
>>  hw/xen_pci_passthrough_helpers.c |   46 ++
>>  4 files changed, 1109 insertions(+), 0 deletions(-)
>>  create mode 100644 hw/xen_pci_passthrough.c
>>  create mode 100644 hw/xen_pci_passthrough.h
>>  create mode 100644 hw/xen_pci_passthrough_helpers.c
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index 243f9f2..36ea47d 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -217,6 +217,8 @@ obj-i386-$(CONFIG_XEN) += xen_platform.o
>>
>>  # Xen PCI Passthrough
>>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
>> +obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
>> +obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_helpers.o
>>
>>  # Inter-VM PCI shared memory
>>  CONFIG_IVSHMEM =
>> diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
>> new file mode 100644
>> index 0000000..b97c5b6
>> --- /dev/null
>> +++ b/hw/xen_pci_passthrough.c
>> @@ -0,0 +1,838 @@
>> +/*
>> + * Copyright (c) 2007, Neocleus Corporation.
>> + * Copyright (c) 2007, Intel Corporation.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Alex Novik <alex@neocleus.com>
>> + * Allen Kay <allen.m.kay@intel.com>
>> + * Guy Zana <guy@neocleus.com>
>> + *
>> + * This file implements direct PCI assignment to a HVM guest
>> + */
>> +
>> +/*
>> + * Interrupt Disable policy:
>> + *
>> + * INTx interrupt:
>> + *   Initialize(register_real_device)
>> + *     Map INTx(xc_physdev_map_pirq):
>> + *       <fail>
>> + *         - Set real Interrupt Disable bit to '1'.
>> + *         - Set machine_irq and assigned_device->machine_irq to '0'.
>> + *         * Don't bind INTx.
>> + *
>> + *     Bind INTx(xc_domain_bind_pt_pci_irq):
>> + *       <fail>
>> + *         - Set real Interrupt Disable bit to '1'.
>> + *         - Unmap INTx.
>> + *         - Decrement mapped_machine_irq[machine_irq]
>> + *         - Set assigned_device->machine_irq to '0'.
>> + *
>> + *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
>> + *     Write '0'
>> + *       <ptdev->msi_trans_en is false>
>> + *         - Set real bit to '0' if assigned_device->machine_irq isn't '0'.
>> + *
>> + *     Write '1'
>> + *       <ptdev->msi_trans_en is false>
>> + *         - Set real bit to '1'.
>> + *
>> + * MSI-INTx translation.
>> + *   Initialize(xc_physdev_map_pirq_msi/pt_msi_setup)
>> + *     Bind MSI-INTx(xc_domain_bind_pt_irq)
>> + *       <fail>
>> + *         - Unmap MSI.
>> + *           <success>
>> + *             - Set dev->msi->pirq to '-1'.
>> + *           <fail>
>> + *             - Do nothing.
>> + *
>> + *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
>> + *     Write '0'
>> + *       <ptdev->msi_trans_en is true>
>> + *         - Set MSI Enable bit to '1'.
>> + *
>> + *     Write '1'
>> + *       <ptdev->msi_trans_en is true>
>> + *         - Set MSI Enable bit to '0'.
>> + *
>> + * MSI interrupt:
>> + *   Initialize MSI register(pt_msi_setup, pt_msi_update)
>> + *     Bind MSI(xc_domain_update_msi_irq)
>> + *       <fail>
>> + *         - Unmap MSI.
>> + *         - Set dev->msi->pirq to '-1'.
>> + *
>> + * MSI-X interrupt:
>> + *   Initialize MSI-X register(pt_msix_update_one)
>> + *     Bind MSI-X(xc_domain_update_msi_irq)
>> + *       <fail>
>> + *         - Unmap MSI-X.
>> + *         - Set entry->pirq to '-1'.
>> + */
>> +
>
> you should move all the MSI related comments to the MSI patch

OK, I will move MSI comments.

>> +#include <sys/ioctl.h>
>> +
>> +#include "pci.h"
>> +#include "xen.h"
>> +#include "xen_backend.h"
>> +#include "xen_pci_passthrough.h"
>> +
>> +#define PCI_BAR_ENTRIES (6)
>> +
>> +#define PT_NR_IRQS          (256)
>> +char mapped_machine_irq[PT_NR_IRQS] = {0};
>> +
>> +/* Config Space */
>> +static int pt_pci_config_access_check(PCIDevice *d, uint32_t address, int len)
>> +{
>> +    /* check offset range */
>> +    if (address >= 0xFF) {
>> +        PT_LOG("Error: Failed to access register with offset exceeding FFh. "
>> +               "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
>> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
>> +               address, len);
>> +        return -1;
>> +    }
>> +
>> +    /* check read size */
>> +    if ((len != 1) && (len != 2) && (len != 4)) {
>> +        PT_LOG("Error: Failed to access register with invalid access length. "
>> +               "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
>> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
>> +               address, len);
>> +        return -1;
>> +    }
>> +
>> +    /* check offset alignment */
>> +    if (address & (len - 1)) {
>> +        PT_LOG("Error: Failed to access register with invalid access size "
>> +            "alignment. [%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
>> +            pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
>> +            address, len);
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +int pt_bar_offset_to_index(uint32_t offset)
>> +{
>> +    int index = 0;
>> +
>> +    /* check Exp ROM BAR */
>> +    if (offset == PCI_ROM_ADDRESS) {
>> +        return PCI_ROM_SLOT;
>> +    }
>> +
>> +    /* calculate BAR index */
>> +    index = (offset - PCI_BASE_ADDRESS_0) >> 2;
>> +    if (index >= PCI_NUM_REGIONS) {
>> +        return -1;
>> +    }
>> +
>> +    return index;
>> +}
>> +
>> +static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
>> +{
>> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
>> +    uint32_t val = 0;
>> +    XenPTRegGroup *reg_grp_entry = NULL;
>> +    XenPTReg *reg_entry = NULL;
>> +    int rc = 0;
>> +    int emul_len = 0;
>> +    uint32_t find_addr = address;
>> +
>> +    if (pt_pci_config_access_check(d, address, len)) {
>> +        goto exit;
>> +    }
>> +
>> +    /* check power state transition flags */
>> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
>> +        /* can't accept until previous power state transition is completed.
>> +         * so finished previous request here.
>> +         */
>> +        PT_LOG("Warning: guest want to write durring power state transition\n");
>> +        goto exit;
>> +    }
>> +
>> +    /* find register group entry */
>> +    reg_grp_entry = pt_find_reg_grp(s, address);
>> +    if (reg_grp_entry) {
>> +        /* check 0 Hardwired register group */
>> +        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
>> +            /* no need to emulate, just return 0 */
>> +            val = 0;
>> +            goto exit;
>> +        }
>> +    }
>> +
>> +    /* read I/O device register value */
>> +    rc = host_pci_get_block(s->real_device, address, (uint8_t *)&val, len);
>> +    if (!rc) {
>> +        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
>> +        memset(&val, 0xff, len);
>> +    }
>> +
>> +    /* just return the I/O device register value for
>> +     * passthrough type register group */
>> +    if (reg_grp_entry == NULL) {
>> +        goto exit;
>> +    }
>> +
>> +    /* adjust the read value to appropriate CFC-CFF window */
>> +    val <<= (address & 3) << 3;
>> +    emul_len = len;
>> +
>> +    /* loop Guest request size */
>> +    while (emul_len > 0) {
>> +        /* find register entry to be emulated */
>> +        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
>> +        if (reg_entry) {
>> +            XenPTRegInfo *reg = reg_entry->reg;
>> +            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
>> +            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
>> +            uint8_t *ptr_val = NULL;
>> +
>> +            valid_mask <<= (find_addr - real_offset) << 3;
>> +            ptr_val = (uint8_t *)&val + (real_offset & 3);
>> +
>> +            /* do emulation depend on register size */
>> +            switch (reg->size) {
>> +            case 1:
>> +                if (reg->u.b.read) {
>> +                    rc = reg->u.b.read(s, reg_entry, ptr_val, valid_mask);
>> +                }
>> +                break;
>> +            case 2:
>> +                if (reg->u.w.read) {
>> +                    rc = reg->u.w.read(s, reg_entry,
>> +                                       (uint16_t *)ptr_val, valid_mask);
>> +                }
>> +                break;
>> +            case 4:
>> +                if (reg->u.dw.read) {
>> +                    rc = reg->u.dw.read(s, reg_entry,
>> +                                        (uint32_t *)ptr_val, valid_mask);
>> +                }
>> +                break;
>> +            }
>> +
>> +            if (rc < 0) {
>> +                hw_error("Internal error: Invalid read emulation "
>> +                         "return value[%d]. I/O emulator exit.\n", rc);
>> +            }
>> +
>> +            /* calculate next address to find */
>> +            emul_len -= reg->size;
>> +            if (emul_len > 0) {
>> +                find_addr = real_offset + reg->size;
>> +            }
>> +        } else {
>> +            /* nothing to do with passthrough type register,
>> +             * continue to find next byte */
>> +            emul_len--;
>> +            find_addr++;
>> +        }
>> +    }
>> +
>> +    /* need to shift back before returning them to pci bus emulator */
>> +    val >>= ((address & 3) << 3);
>> +
>> +exit:
>> +    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
>> +                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
>> +                  address, val, len);
>> +    return val;
>> +}
>> +
>> +static void pt_pci_write_config(PCIDevice *d, uint32_t address,
>> +                                uint32_t val, int len)
>> +{
>> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
>> +    int index = 0;
>> +    XenPTRegGroup *reg_grp_entry = NULL;
>> +    int rc = 0;
>> +    uint32_t read_val = 0;
>> +    int emul_len = 0;
>> +    XenPTReg *reg_entry = NULL;
>> +    uint32_t find_addr = address;
>> +    XenPTRegInfo *reg = NULL;
>> +
>> +    if (pt_pci_config_access_check(d, address, len)) {
>> +        return;
>> +    }
>> +
>> +    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
>> +                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
>> +                  address, val, len);
>> +
>> +    /* check unused BAR register */
>> +    index = pt_bar_offset_to_index(address);
>> +    if ((index >= 0) && (val > 0 && val < PT_BAR_ALLF) &&
>> +        (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED)) {
>> +        PT_LOG("Warning: Guest attempt to set address to unused Base Address "
>> +               "Register. [%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
>> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
>> +               address, len);
>> +    }
>> +
>> +    /* check power state transition flags */
>> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
>> +        /* can't accept untill previous power state transition is completed.
>> +         * so finished previous request here.
>> +         */
>> +        PT_LOG("Warning: guest want to write durring power state transition\n");
>> +        return;
>> +    }
>> +
>> +    /* find register group entry */
>> +    reg_grp_entry = pt_find_reg_grp(s, address);
>> +    if (reg_grp_entry) {
>> +        /* check 0 Hardwired register group */
>> +        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
>> +            /* ignore silently */
>> +            PT_LOG("Warning: Access to 0 Hardwired register. "
>> +                   "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
>> +                   pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
>> +                   address, len);
>> +            return;
>> +        }
>> +    }
>> +
>> +    /* read I/O device register value */
>> +    rc = host_pci_get_block(s->real_device, address,
>> +                             (uint8_t *)&read_val, len);
>> +    if (!rc) {
>> +        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
>> +        memset(&read_val, 0xff, len);
>> +    }
>> +
>> +    /* pass directly to libpci for passthrough type register group */
>> +    if (reg_grp_entry == NULL) {
>> +        goto out;
>> +    }
>> +
>> +    /* adjust the read and write value to appropriate CFC-CFF window */
>> +    read_val <<= (address & 3) << 3;
>> +    val <<= (address & 3) << 3;
>> +    emul_len = len;
>> +
>> +    /* loop Guest request size */
>> +    while (emul_len > 0) {
>> +        /* find register entry to be emulated */
>> +        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
>> +        if (reg_entry) {
>> +            reg = reg_entry->reg;
>> +            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
>> +            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
>> +            uint8_t *ptr_val = NULL;
>> +
>> +            valid_mask <<= (find_addr - real_offset) << 3;
>> +            ptr_val = (uint8_t *)&val + (real_offset & 3);
>> +
>> +            /* do emulation depend on register size */
>> +            switch (reg->size) {
>> +            case 1:
>> +                if (reg->u.b.write) {
>> +                    rc = reg->u.b.write(s, reg_entry, ptr_val,
>> +                                        read_val >> ((real_offset & 3) << 3),
>> +                                        valid_mask);
>> +                }
>> +                break;
>> +            case 2:
>> +                if (reg->u.w.write) {
>> +                    rc = reg->u.w.write(s, reg_entry, (uint16_t *)ptr_val,
>> +                                        (read_val >> ((real_offset & 3) << 3)),
>> +                                        valid_mask);
>> +                }
>> +                break;
>> +            case 4:
>> +                if (reg->u.dw.write) {
>> +                    rc = reg->u.dw.write(s, reg_entry, (uint32_t *)ptr_val,
>> +                                         (read_val >> ((real_offset & 3) << 3)),
>> +                                         valid_mask);
>> +                }
>> +                break;
>> +            }
>> +
>> +            if (rc < 0) {
>> +                hw_error("Internal error: Invalid write emulation "
>> +                         "return value[%d]. I/O emulator exit.\n", rc);
>> +            }
>> +
>> +            /* calculate next address to find */
>> +            emul_len -= reg->size;
>> +            if (emul_len > 0) {
>> +                find_addr = real_offset + reg->size;
>> +            }
>> +        } else {
>> +            /* nothing to do with passthrough type register,
>> +             * continue to find next byte */
>> +            emul_len--;
>> +            find_addr++;
>> +        }
>> +    }
>> +
>> +    /* need to shift back before passing them to libpci */
>> +    val >>= (address & 3) << 3;
>> +
>> +out:
>> +    if (!(reg && reg->no_wb)) {
>> +        /* unknown regs are passed through */
>> +        rc = host_pci_set_block(s->real_device, address, (uint8_t *)&val, len);
>> +
>> +        if (!rc) {
>> +            PT_LOG("Error: pci_write_block failed. return value[%d].\n", rc);
>> +        }
>> +    }
>> +
>> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
>> +        qemu_mod_timer(s->pm_state->pm_timer,
>> +                       qemu_get_clock_ms(rt_clock) + s->pm_state->pm_delay);
>> +    }
>> +}
>
> Where is this timer allocated and initialized?

In the next patch, I will move this lines to the releated patch.

>> +/* ioport/iomem space*/
>> +static void pt_iomem_map(XenPCIPassthroughState *s, int i,
>> +                         pcibus_t e_phys, pcibus_t e_size, int type)
>> +{
>> +    uint32_t old_ebase = s->bases[i].e_physbase;
>> +    bool first_map = s->bases[i].e_size == 0;
>> +    int ret = 0;
>> +
>> +    s->bases[i].e_physbase = e_phys;
>> +    s->bases[i].e_size = e_size;
>> +
>> +    PT_LOG("e_phys=%#"PRIx64" maddr=%#"PRIx64" type=%%d"
>> +           " len=%#"PRIx64" index=%d first_map=%d\n",
>> +           e_phys, s->bases[i].access.maddr, /*type,*/
>> +           e_size, i, first_map);
>> +
>> +    if (e_size == 0) {
>> +        return;
>> +    }
>> +
>> +    if (!first_map && old_ebase != -1) {
>> +        /* Remove old mapping */
>> +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
>> +                               old_ebase >> XC_PAGE_SHIFT,
>> +                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
>> +                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
>> +                               DPCI_REMOVE_MAPPING);
>> +        if (ret != 0) {
>> +            PT_LOG("Error: remove old mapping failed!\n");
>> +            return;
>> +        }
>> +    }
>> +
>> +    /* map only valid guest address */
>> +    if (e_phys != -1) {
>> +        /* Create new mapping */
>> +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
>> +                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
>> +                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
>> +                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
>> +                                   DPCI_ADD_MAPPING);
>> +
>> +        if (ret != 0) {
>> +            PT_LOG("Error: create new mapping failed!\n");
>> +        }
>> +    }
>> +}
>> +
>> +static void pt_ioport_map(XenPCIPassthroughState *s, int i,
>> +                          pcibus_t e_phys, pcibus_t e_size, int type)
>> +{
>> +    uint32_t old_ebase = s->bases[i].e_physbase;
>> +    bool first_map = s->bases[i].e_size == 0;
>> +    int ret = 0;
>> +
>> +    s->bases[i].e_physbase = e_phys;
>> +    s->bases[i].e_size = e_size;
>> +
>> +    PT_LOG("e_phys=%#04"PRIx64" pio_base=%#04"PRIx64" len=%"PRId64" index=%d"
>> +           " first_map=%d\n",
>> +           e_phys, s->bases[i].access.pio_base, e_size, i, first_map);
>> +
>> +    if (e_size == 0) {
>> +        return;
>> +    }
>> +
>> +    if (!first_map && old_ebase != -1) {
>> +        /* Remove old mapping */
>> +        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, old_ebase,
>> +                                       s->bases[i].access.pio_base, e_size,
>> +                                       DPCI_REMOVE_MAPPING);
>> +        if (ret != 0) {
>> +            PT_LOG("Error: remove old mapping failed!\n");
>> +            return;
>> +        }
>> +    }
>> +
>> +    /* map only valid guest address (include 0) */
>> +    if (e_phys != -1) {
>> +        /* Create new mapping */
>> +        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, e_phys,
>> +                                       s->bases[i].access.pio_base, e_size,
>> +                                       DPCI_ADD_MAPPING);
>> +        if (ret != 0) {
>> +            PT_LOG("Error: create new mapping failed!\n");
>> +        }
>> +    }
>> +
>> +}
>> +
>> +
>> +/* mapping BAR */
>> +
>> +void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
>> +                        int io_enable, int mem_enable)
>> +{
>> +    PCIDevice *dev = &s->dev;
>> +    PCIIORegion *r;
>> +    XenPTRegGroup *reg_grp_entry = NULL;
>> +    XenPTReg *reg_entry = NULL;
>> +    XenPTRegion *base = NULL;
>> +    pcibus_t r_size = 0, r_addr = -1;
>> +    int rc = 0;
>> +
>> +    r = &dev->io_regions[bar];
>> +
>> +    /* check valid region */
>> +    if (!r->size) {
>> +        return;
>> +    }
>> +
>> +    base = &s->bases[bar];
>> +    /* skip unused BAR or upper 64bit BAR */
>> +    if ((base->bar_flag == PT_BAR_FLAG_UNUSED)
>> +        || (base->bar_flag == PT_BAR_FLAG_UPPER)) {
>> +           return;
>> +    }
>> +
>> +    /* copy region address to temporary */
>> +    r_addr = r->addr;
>> +
>> +    /* need unmapping in case I/O Space or Memory Space disable */
>> +    if (((base->bar_flag == PT_BAR_FLAG_IO) && !io_enable) ||
>> +        ((base->bar_flag == PT_BAR_FLAG_MEM) && !mem_enable)) {
>> +        r_addr = -1;
>> +    }
>> +    if ((bar == PCI_ROM_SLOT) && (r_addr != -1)) {
>> +        reg_grp_entry = pt_find_reg_grp(s, PCI_ROM_ADDRESS);
>> +        if (reg_grp_entry) {
>> +            reg_entry = pt_find_reg(reg_grp_entry, PCI_ROM_ADDRESS);
>> +            if (reg_entry && !(reg_entry->data & PCI_ROM_ADDRESS_ENABLE)) {
>> +                r_addr = -1;
>> +            }
>> +        }
>> +    }
>> +
>> +    /* prevent guest software mapping memory resource to 00000000h */
>> +    if ((base->bar_flag == PT_BAR_FLAG_MEM) && (r_addr == 0)) {
>> +        r_addr = -1;
>> +    }
>> +
>> +    r_size = pt_get_emul_size(base->bar_flag, r->size);
>> +
>> +    rc = pci_check_bar_overlap(dev, r_addr, r_size, r->type);
>> +    if (rc > 0) {
>> +        PT_LOG("Warning: s[%02x:%02x.%x][Region:%d][Address:%"FMT_PCIBUS"h]"
>> +               "[Size:%"FMT_PCIBUS"h] is overlapped.\n", pci_bus_num(dev->bus),
>> +               PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), bar,
>> +               r_addr, r_size);
>> +    }
>> +
>> +    /* check whether we need to update the mapping or not */
>> +    if (r_addr != s->bases[bar].e_physbase) {
>> +        /* mapping BAR */
>> +        if (base->bar_flag == PT_BAR_FLAG_IO) {
>> +            pt_ioport_map(s, bar, r_addr, r_size, r->type);
>> +        } else {
>> +            pt_iomem_map(s, bar, r_addr, r_size, r->type);
>> +        }
>> +    }
>> +}
>> +
>> +void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
>> +        pt_bar_mapping_one(s, i, io_enable, mem_enable);
>> +    }
>> +}
>> +
>> +/* register regions */
>> +static int pt_register_regions(XenPCIPassthroughState *s)
>> +{
>> +    int i = 0;
>> +    uint32_t bar_data = 0;
>> +    HostPCIDevice *d = s->real_device;
>> +
>> +    /* Register PIO/MMIO BARs */
>> +    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
>> +        HostPCIIORegion *r = &d->io_regions[i];
>> +
>> +        if (r->base_addr) {
>> +            s->bases[i].e_physbase = r->base_addr;
>> +            s->bases[i].access.u = r->base_addr;
>> +
>> +            /* Register current region */
>> +            if (r->flags & IORESOURCE_IO) {
>> +                memory_region_init_io(&s->bar[i], NULL, NULL,
>> +                                      "xen-pci-pt-bar", r->size);
>> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_IO,
>> +                                 &s->bar[i]);
>> +            } else if (r->flags & IORESOURCE_PREFETCH) {
>> +                memory_region_init_io(&s->bar[i], NULL, NULL,
>> +                                      "xen-pci-pt-bar", r->size);
>> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_MEM_PREFETCH,
>> +                                 &s->bar[i]);
>> +            } else {
>> +                memory_region_init_io(&s->bar[i], NULL, NULL,
>> +                                      "xen-pci-pt-bar", r->size);
>> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_MEMORY,
>> +                                 &s->bar[i]);
>> +            }
>> +
>> +            PT_LOG("IO region registered (size=0x%08"PRIx64
>> +                   " base_addr=0x%08"PRIx64")\n",
>> +                   r->size, r->base_addr);
>> +        }
>> +    }
>> +
>> +    /* Register expansion ROM address */
>> +    if (d->rom.base_addr && d->rom.size) {
>> +        /* Re-set BAR reported by OS, otherwise ROM can't be read. */
>> +        bar_data = host_pci_get_long(d, PCI_ROM_ADDRESS);
>> +        if ((bar_data & PCI_ROM_ADDRESS_MASK) == 0) {
>> +            bar_data |= d->rom.base_addr & PCI_ROM_ADDRESS_MASK;
>> +            host_pci_set_long(d, PCI_ROM_ADDRESS, bar_data);
>> +        }
>> +
>> +        s->bases[PCI_ROM_SLOT].e_physbase = d->rom.base_addr;
>> +        s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
>> +
>> +        memory_region_init_rom_device(&s->rom, NULL, NULL, &s->dev.qdev,
>> +                                      "xen-pci-pt-rom", d->rom.size);
>> +        pci_register_bar(&s->dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
>> +                         &s->rom);
>> +
>> +        PT_LOG("Expansion ROM registered (size=0x%08"PRIx64
>> +               " base_addr=0x%08"PRIx64")\n",
>> +               d->rom.size, d->rom.base_addr);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static void pt_unregister_regions(XenPCIPassthroughState *s)
>> +{
>> +    int i, type, rc;
>> +    uint32_t e_size;
>> +    PCIDevice *d = &s->dev;
>> +
>> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
>> +        e_size = s->bases[i].e_size;
>> +        if ((e_size == 0) || (s->bases[i].e_physbase == -1)) {
>> +            continue;
>> +        }
>> +
>> +        type = d->io_regions[i].type;
>> +
>> +        if (type == PCI_BASE_ADDRESS_SPACE_MEMORY
>> +            || type == PCI_BASE_ADDRESS_MEM_PREFETCH) {
>> +            rc = xc_domain_memory_mapping(xen_xc, xen_domid,
>> +                    s->bases[i].e_physbase >> XC_PAGE_SHIFT,
>> +                    s->bases[i].access.maddr >> XC_PAGE_SHIFT,
>> +                    (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
>> +                    DPCI_REMOVE_MAPPING);
>> +            if (rc != 0) {
>> +                PT_LOG("Error: remove old mem mapping failed!\n");
>> +                continue;
>> +            }
>> +
>> +        } else if (type == PCI_BASE_ADDRESS_SPACE_IO) {
>> +            rc = xc_domain_ioport_mapping(xen_xc, xen_domid,
>> +                        s->bases[i].e_physbase,
>> +                        s->bases[i].access.pio_base,
>> +                        e_size,
>> +                        DPCI_REMOVE_MAPPING);
>> +            if (rc != 0) {
>> +                PT_LOG("Error: remove old io mapping failed!\n");
>> +                continue;
>> +            }
>> +        }
>> +    }
>> +}
>> +
>> +static int pt_initfn(PCIDevice *pcidev)
>> +{
>> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
>> +    int dom, bus;
>> +    unsigned slot, func;
>> +    int rc = 0;
>> +    uint32_t machine_irq;
>> +    int pirq = -1;
>> +
>> +    if (pci_parse_devaddr(s->hostaddr, &dom, &bus, &slot, &func) < 0) {
>> +        fprintf(stderr, "error parse bdf: %s\n", s->hostaddr);
>> +        return -1;
>> +    }
>> +
>> +    /* register real device */
>> +    PT_LOG("Assigning real physical device %02x:%02x.%x to devfn %i ...\n",
>> +           bus, slot, func, s->dev.devfn);
>> +
>> +    s->real_device = host_pci_device_get(bus, slot, func);
>> +    if (!s->real_device) {
>> +        return -1;
>> +    }
>> +
>> +    s->is_virtfn = s->real_device->is_virtfn;
>> +    if (s->is_virtfn) {
>> +        PT_LOG("%04x:%02x:%02x.%x is a SR-IOV Virtual Function\n",
>> +               s->real_device->domain, bus, slot, func);
>> +    }
>> +
>> +    /* Initialize virtualized PCI configuration (Extended 256 Bytes) */
>> +    if (host_pci_get_block(s->real_device, 0, pcidev->config,
>> +                           PCI_CONFIG_SPACE_SIZE) == -1) {
>> +        return -1;
>> +    }
>> +
>> +    /* Handle real device's MMIO/PIO BARs */
>> +    pt_register_regions(s);
>> +
>> +    /* reinitialize each config register to be emulated */
>> +    pt_config_init(s);
>
> this function is implemented in the next patch, so you might as well add
> this call there

Ok, I will move this.

>> +    /* Bind interrupt */
>> +    if (!s->dev.config[PCI_INTERRUPT_PIN]) {
>> +        PT_LOG("no pin interrupt\n");
>> +        goto out;
>> +    }
>> +
>> +    machine_irq = host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE);
>> +    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
>> +
>> +    if (rc) {
>> +        PT_LOG("Error: Mapping irq failed, rc = %d\n", rc);
>> +
>> +        /* Disable PCI intx assertion (turn on bit10 of devctl) */
>> +        host_pci_set_word(s->real_device,
>> +                          PCI_COMMAND,
>> +                          pci_get_word(s->dev.config + PCI_COMMAND)
>> +                          | PCI_COMMAND_INTX_DISABLE);
>> +        machine_irq = 0;
>> +        s->machine_irq = 0;
>> +    } else {
>> +        machine_irq = pirq;
>> +        s->machine_irq = pirq;
>> +        mapped_machine_irq[machine_irq]++;
>> +    }
>> +
>> +    /* bind machine_irq to device */
>> +    if (rc < 0 && machine_irq != 0) {
>> +        uint8_t e_device = PCI_SLOT(s->dev.devfn);
>> +        uint8_t e_intx = pci_intx(s);
>> +
>> +        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq, 0,
>> +                                       e_device, e_intx);
>> +        if (rc < 0) {
>> +            PT_LOG("Error: Binding of interrupt failed! rc=%d\n", rc);
>> +
>> +            /* Disable PCI intx assertion (turn on bit10 of devctl) */
>> +            host_pci_set_word(s->real_device, PCI_COMMAND,
>> +                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
>> +                              | PCI_COMMAND_INTX_DISABLE);
>> +            mapped_machine_irq[machine_irq]--;
>> +
>> +            if (mapped_machine_irq[machine_irq] == 0) {
>> +                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
>> +                    PT_LOG("Error: Unmapping of interrupt failed! rc=%d\n",
>> +                           rc);
>> +                }
>> +            }
>> +            s->machine_irq = 0;
>> +        }
>> +    }
>> +
>> +out:
>> +    PT_LOG("Real physical device %02x:%02x.%x registered successfuly!\n"
>> +           "IRQ type = %s\n", bus, slot, func, "INTx");
>> +
>> +    return 0;
>> +}
>> +
>> +static int pt_unregister_device(PCIDevice *pcidev)
>> +{
>> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
>> +    uint8_t e_device, e_intx;
>> +    uint32_t machine_irq;
>> +    int rc;
>> +
>> +    /* Unbind interrupt */
>> +    e_device = PCI_SLOT(s->dev.devfn);
>> +    e_intx = pci_intx(s);
>> +    machine_irq = s->machine_irq;
>> +
>> +    if (machine_irq) {
>> +        rc = xc_domain_unbind_pt_irq(xen_xc, xen_domid, machine_irq,
>> +                                     PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 0);
>> +        if (rc < 0) {
>> +            PT_LOG("Error: Unbinding of interrupt failed! rc=%d\n", rc);
>> +        }
>> +    }
>> +
>> +    if (machine_irq) {
>> +        mapped_machine_irq[machine_irq]--;
>> +
>> +        if (mapped_machine_irq[machine_irq] == 0) {
>> +            rc = xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq);
>> +
>> +            if (rc < 0) {
>> +                PT_LOG("Error: Unmaping of interrupt failed! rc=%d\n", rc);
>> +            }
>> +        }
>> +    }
>> +
>> +    /* delete all emulated config registers */
>> +    pt_config_delete(s);
>> +
>> +    /* unregister real device's MMIO/PIO BARs */
>> +    pt_unregister_regions(s);
>> +
>> +    host_pci_device_put(s->real_device);
>> +
>> +    return 0;
>> +}
>> +
>> +static PCIDeviceInfo xen_pci_passthrough = {
>> +    .init = pt_initfn,
>> +    .exit = pt_unregister_device,
>> +    .qdev.name = "xen-pci-passthrough",
>> +    .qdev.desc = "Assign an host pci device with Xen",
>> +    .qdev.size = sizeof(XenPCIPassthroughState),
>> +    .config_read = pt_pci_read_config,
>> +    .config_write = pt_pci_write_config,
>> +    .is_express = 0,
>> +    .qdev.props = (Property[]) {
>> +        DEFINE_PROP_STRING("hostaddr", XenPCIPassthroughState, hostaddr),
>> +        DEFINE_PROP_BIT("power-mgmt", XenPCIPassthroughState, power_mgmt,
>> +                        0, false),
>> +        DEFINE_PROP_END_OF_LIST(),
>> +    }
>> +};
>> +
>> +static void xen_passthrough_register(void)
>> +{
>> +    pci_qdev_register(&xen_pci_passthrough);
>> +}
>> +
>> +device_init(xen_passthrough_register);
>> diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
>> new file mode 100644
>> index 0000000..2d1979d
>> --- /dev/null
>> +++ b/hw/xen_pci_passthrough.h
>> @@ -0,0 +1,223 @@
>> +#ifndef QEMU_HW_XEN_PCI_PASSTHROUGH_H
>> +#  define QEMU_HW_XEN_PCI_PASSTHROUGH_H
>> +
>> +#include "qemu-common.h"
>> +#include "xen_common.h"
>> +#include "pci.h"
>> +#include "host-pci-device.h"
>> +
>> +#define PT_LOGGING_ENABLED
>> +#define PT_DEBUG_PCI_CONFIG_ACCESS
>> +
>> +#ifdef PT_LOGGING_ENABLED
>> +#  define PT_LOG(_f, _a...)   fprintf(stderr, "%s: " _f, __func__, ##_a)
>> +#else
>> +#  define PT_LOG(_f, _a...)
>> +#endif
>> +
>> +#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
>> +#  define PT_LOG_CONFIG(_f, _a...) PT_LOG(_f, ##_a)
>> +#else
>> +#  define PT_LOG_CONFIG(_f, _a...)
>> +#endif
>> +
>> +
>> +typedef struct XenPTRegInfo XenPTRegInfo;
>> +typedef struct XenPTReg XenPTReg;
>> +
>> +typedef struct XenPCIPassthroughState XenPCIPassthroughState;
>> +
>> +/* function type for config reg */
>> +typedef uint32_t (*conf_reg_init)
>> +    (XenPCIPassthroughState *, XenPTRegInfo *, uint32_t real_offset);
>> +typedef int (*conf_dword_write)
>> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
>> +     uint32_t *val, uint32_t dev_value, uint32_t valid_mask);
>> +typedef int (*conf_word_write)
>> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
>> +     uint16_t *val, uint16_t dev_value, uint16_t valid_mask);
>> +typedef int (*conf_byte_write)
>> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
>> +     uint8_t *val, uint8_t dev_value, uint8_t valid_mask);
>> +typedef int (*conf_dword_read)
>> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
>> +     uint32_t *val, uint32_t valid_mask);
>> +typedef int (*conf_word_read)
>> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
>> +     uint16_t *val, uint16_t valid_mask);
>> +typedef int (*conf_byte_read)
>> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
>> +     uint8_t *val, uint8_t valid_mask);
>> +typedef int (*conf_dword_restore)
>> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
>> +     uint32_t dev_value, uint32_t *val);
>> +typedef int (*conf_word_restore)
>> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
>> +     uint16_t dev_value, uint16_t *val);
>> +typedef int (*conf_byte_restore)
>> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
>> +     uint8_t dev_value, uint8_t *val);
>> +
>> +/* power state transition */
>> +#define PT_FLAG_TRANSITING 0x0001
>> +
>> +
>> +typedef enum {
>> +    GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
>> +    GRP_TYPE_EMU,                               /* emul reg group */
>> +} RegisterGroupType;
>> +
>> +typedef enum {
>> +    PT_BAR_FLAG_MEM = 0,                        /* Memory type BAR */
>> +    PT_BAR_FLAG_IO,                             /* I/O type BAR */
>> +    PT_BAR_FLAG_UPPER,                          /* upper 64bit BAR */
>> +    PT_BAR_FLAG_UNUSED,                         /* unused BAR */
>> +} PTBarFlag;
>> +
>> +
>> +typedef struct XenPTRegion {
>> +    /* Virtual phys base & size */
>> +    uint32_t e_physbase;
>> +    uint32_t e_size;
>> +    /* Index of region in qemu */
>> +    uint32_t memory_index;
>> +    /* BAR flag */
>> +    PTBarFlag bar_flag;
>> +    /* Translation of the emulated address */
>> +    union {
>> +        uint64_t maddr;
>> +        uint64_t pio_base;
>> +        uint64_t u;
>> +    } access;
>> +} XenPTRegion;
>> +
>> +/* XenPTRegInfo declaration
>> + * - only for emulated register (either a part or whole bit).
>> + * - for passthrough register that need special behavior (like interacting with
>> + *   other component), set emu_mask to all 0 and specify r/w func properly.
>> + * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
>> + */
>> +
>> +/* emulated register infomation */
>> +struct XenPTRegInfo {
>> +    uint32_t offset;
>> +    uint32_t size;
>> +    uint32_t init_val;
>> +    /* reg read only field mask (ON:RO/ROS, OFF:other) */
>> +    uint32_t ro_mask;
>> +    /* reg emulate field mask (ON:emu, OFF:passthrough) */
>> +    uint32_t emu_mask;
>> +    /* no write back allowed */
>> +    uint32_t no_wb;
>> +    conf_reg_init init;
>> +    /* read/write/restore function pointer
>> +     * for double_word/word/byte size */
>> +    union {
>> +        struct {
>> +            conf_dword_write write;
>> +            conf_dword_read read;
>> +            conf_dword_restore restore;
>> +        } dw;
>> +        struct {
>> +            conf_word_write write;
>> +            conf_word_read read;
>> +            conf_word_restore restore;
>> +        } w;
>> +        struct {
>> +            conf_byte_write write;
>> +            conf_byte_read read;
>> +            conf_byte_restore restore;
>> +        } b;
>> +    } u;
>> +};
>> +
>> +/* emulated register management */
>> +struct XenPTReg {
>> +    QLIST_ENTRY(XenPTReg) entries;
>> +    XenPTRegInfo *reg;
>> +    uint32_t data;
>> +};
>> +
>> +typedef struct XenPTRegGroupInfo XenPTRegGroupInfo;
>> +
>> +/* emul reg group size initialize method */
>> +typedef uint8_t (*pt_reg_size_init_fn)
>> +    (XenPCIPassthroughState *, const XenPTRegGroupInfo *,
>> +     uint32_t base_offset);
>> +
>> +/* emulated register group infomation */
>> +struct XenPTRegGroupInfo {
>> +    uint8_t grp_id;
>> +    RegisterGroupType grp_type;
>> +    uint8_t grp_size;
>> +    pt_reg_size_init_fn size_init;
>> +    XenPTRegInfo *emu_reg_tbl;
>> +};
>> +
>> +/* emul register group management table */
>> +typedef struct XenPTRegGroup {
>> +    QLIST_ENTRY(XenPTRegGroup) entries;
>> +    const XenPTRegGroupInfo *reg_grp;
>> +    uint32_t base_offset;
>> +    uint8_t size;
>> +    QLIST_HEAD(, XenPTReg) reg_tbl_list;
>> +} XenPTRegGroup;
>> +
>> +
>> +typedef struct XenPTPM {
>> +    QEMUTimer *pm_timer;  /* QEMUTimer struct */
>> +    int no_soft_reset;    /* No Soft Reset flags */
>> +    uint16_t flags;       /* power state transition flags */
>> +    uint16_t pmc_field;   /* Power Management Capabilities field */
>> +    int pm_delay;         /* power state transition delay */
>> +    uint16_t cur_state;   /* current power state */
>> +    uint16_t req_state;   /* requested power state */
>> +    uint32_t pm_base;     /* Power Management Capability reg base offset */
>> +    uint32_t aer_base;    /* AER Capability reg base offset */
>> +} XenPTPM;
>> +
>> +struct XenPCIPassthroughState {
>> +    PCIDevice dev;
>> +
>> +    char *hostaddr;
>> +    bool is_virtfn;
>> +    HostPCIDevice *real_device;
>> +    XenPTRegion bases[PCI_NUM_REGIONS]; /* Access regions */
>> +    QLIST_HEAD(, XenPTRegGroup) reg_grp_tbl;
>> +
>> +    uint32_t machine_irq;
>> +
>> +    uint32_t power_mgmt;
>> +    XenPTPM *pm_state;
>> +
>> +    MemoryRegion bar[PCI_NUM_REGIONS - 1];
>> +    MemoryRegion rom;
>> +};
>> +
>> +void pt_config_init(XenPCIPassthroughState *s);
>> +void pt_config_delete(XenPCIPassthroughState *s);
>> +void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable);
>> +void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
>> +                        int io_enable, int mem_enable);
>> +XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address);
>> +XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address);
>> +int pt_bar_offset_to_index(uint32_t offset);
>> +
>> +static inline pcibus_t pt_get_emul_size(PTBarFlag flag, pcibus_t r_size)
>> +{
>> +    /* align resource size (memory type only) */
>> +    if (flag == PT_BAR_FLAG_MEM) {
>> +        return (r_size + XC_PAGE_SIZE - 1) & XC_PAGE_MASK;
>> +    } else {
>> +        return r_size;
>> +    }
>> +}
>> +
>> +/* INTx */
>> +static inline uint8_t pci_read_intx(XenPCIPassthroughState *s)
>> +{
>> +    return host_pci_get_byte(s->real_device, PCI_INTERRUPT_PIN);
>> +}
>> +uint8_t pci_intx(XenPCIPassthroughState *ptdev);
>> +
>> +#endif /* !QEMU_HW_XEN_PCI_PASSTHROUGH_H */
>> diff --git a/hw/xen_pci_passthrough_helpers.c b/hw/xen_pci_passthrough_helpers.c
>> new file mode 100644
>> index 0000000..192e918
>> --- /dev/null
>> +++ b/hw/xen_pci_passthrough_helpers.c
>> @@ -0,0 +1,46 @@
>> +#include "xen_pci_passthrough.h"
>> +
>> +/* The PCI Local Bus Specification, Rev. 3.0, {
>> + * Section 6.2.4 Miscellaneous Registers, pp 223
>> + * outlines 5 valid values for the intertupt pin (intx).
>> + *  0: For devices (or device functions) that don't use an interrupt in
>> + *  1: INTA#
>> + *  2: INTB#
>> + *  3: INTC#
>> + *  4: INTD#
>> + *
>> + * Xen uses the following 4 values for intx
>> + *  0: INTA#
>> + *  1: INTB#
>> + *  2: INTC#
>> + *  3: INTD#
>> + *
>> + * Observing that these list of values are not the same, pci_read_intx()
>> + * uses the following mapping from hw to xen values.
>> + * This seems to reflect the current usage within Xen.
>> + *
>> + * PCI hardware    | Xen | Notes
>> + * ----------------+-----+----------------------------------------------------
>> + * 0               | 0   | No interrupt
>> + * 1               | 0   | INTA#
>> + * 2               | 1   | INTB#
>> + * 3               | 2   | INTC#
>> + * 4               | 3   | INTD#
>> + * any other value | 0   | This should never happen, log error message
>> +}
>> + */
>> +uint8_t pci_intx(XenPCIPassthroughState *ptdev)
>> +{
>> +    uint8_t r_val = pci_read_intx(ptdev);
>> +
>> +    PT_LOG("intx=%i\n", r_val);
>> +    if (r_val < 1 || r_val > 4) {
>> +        PT_LOG("Interrupt pin read from hardware is out of range: "
>> +               "value=%i, acceptable range is 1 - 4\n", r_val);
>> +        r_val = 0;
>> +    } else {
>> +        r_val -= 1;
>> +    }
>> +
>> +    return r_val;
>> +}
>
> if xen_pci_passthrough_helpers.c is only going to contain this function
> you might as well declared it static inline and move it to
> xen_pci_passthrough.h

Ok, I will.
Konrad Rzeszutek Wilk - Nov. 10, 2011, 9:28 p.m.
On Fri, Oct 28, 2011 at 04:07:33PM +0100, Anthony PERARD wrote:
> From: Allen Kay <allen.m.kay@intel.com>
> 

This is going to be a bit lame review..

> +static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
> +    uint32_t val = 0;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    int rc = 0;
> +    int emul_len = 0;
> +    uint32_t find_addr = address;
> +
> +    if (pt_pci_config_access_check(d, address, len)) {
> +        goto exit;
> +    }
> +
> +    /* check power state transition flags */
> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> +        /* can't accept until previous power state transition is completed.
> +         * so finished previous request here.
> +         */
> +        PT_LOG("Warning: guest want to write durring power state transition\n");

during
> +        goto exit;
> +    }
> +
> +    /* find register group entry */
> +    reg_grp_entry = pt_find_reg_grp(s, address);
> +    if (reg_grp_entry) {
> +        /* check 0 Hardwired register group */
> +        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
> +            /* no need to emulate, just return 0 */
> +            val = 0;
> +            goto exit;
> +        }
> +    }
> +
> +    /* read I/O device register value */
> +    rc = host_pci_get_block(s->real_device, address, (uint8_t *)&val, len);
> +    if (!rc) {
> +        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
> +        memset(&val, 0xff, len);
> +    }
> +
> +    /* just return the I/O device register value for
> +     * passthrough type register group */
> +    if (reg_grp_entry == NULL) {
> +        goto exit;
> +    }
> +
> +    /* adjust the read value to appropriate CFC-CFF window */
> +    val <<= (address & 3) << 3;
> +    emul_len = len;
> +
> +    /* loop Guest request size */

Perhaps 'loop around the guest request size' ?

> +    while (emul_len > 0) {
> +        /* find register entry to be emulated */
> +        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
> +        if (reg_entry) {
> +            XenPTRegInfo *reg = reg_entry->reg;
> +            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
> +            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
> +            uint8_t *ptr_val = NULL;
> +
> +            valid_mask <<= (find_addr - real_offset) << 3;
> +            ptr_val = (uint8_t *)&val + (real_offset & 3);
> +
> +            /* do emulation depend on register size */

based on register size

> +            switch (reg->size) {
> +            case 1:
> +                if (reg->u.b.read) {
> +                    rc = reg->u.b.read(s, reg_entry, ptr_val, valid_mask);
> +                }
> +                break;
> +            case 2:
> +                if (reg->u.w.read) {
> +                    rc = reg->u.w.read(s, reg_entry,
> +                                       (uint16_t *)ptr_val, valid_mask);
> +                }
> +                break;
> +            case 4:
> +                if (reg->u.dw.read) {
> +                    rc = reg->u.dw.read(s, reg_entry,
> +                                        (uint32_t *)ptr_val, valid_mask);
> +                }
> +                break;
> +            }
> +
> +            if (rc < 0) {
> +                hw_error("Internal error: Invalid read emulation "
> +                         "return value[%d]. I/O emulator exit.\n", rc);
> +            }
> +
> +            /* calculate next address to find */
> +            emul_len -= reg->size;
> +            if (emul_len > 0) {
> +                find_addr = real_offset + reg->size;
> +            }
> +        } else {
> +            /* nothing to do with passthrough type register,
> +             * continue to find next byte */
> +            emul_len--;
> +            find_addr++;
> +        }
> +    }
> +
> +    /* need to shift back before returning them to pci bus emulator */
> +    val >>= ((address & 3) << 3);
> +
> +exit:
> +    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
> +                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +                  address, val, len);
> +    return val;
> +}
> +
> +static void pt_pci_write_config(PCIDevice *d, uint32_t address,
> +                                uint32_t val, int len)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
> +    int index = 0;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    int rc = 0;
> +    uint32_t read_val = 0;
> +    int emul_len = 0;
> +    XenPTReg *reg_entry = NULL;
> +    uint32_t find_addr = address;
> +    XenPTRegInfo *reg = NULL;
> +
> +    if (pt_pci_config_access_check(d, address, len)) {
> +        return;
> +    }
> +
> +    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
> +                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +                  address, val, len);
> +
> +    /* check unused BAR register */
> +    index = pt_bar_offset_to_index(address);
> +    if ((index >= 0) && (val > 0 && val < PT_BAR_ALLF) &&
> +        (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED)) {
> +        PT_LOG("Warning: Guest attempt to set address to unused Base Address "

So.. it is called PT_LOG, but the first thing it says is Warning. So should it be
PT_WARN?

> +               "Register. [%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
> +               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +               address, len);
> +    }
> +
> +    /* check power state transition flags */
> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> +        /* can't accept untill previous power state transition is completed.

until
> +         * so finished previous request here.

finish
> +         */
> +        PT_LOG("Warning: guest want to write durring power state transition\n");

during
> +        return;
> +    }
> +
> +    /* find register group entry */
> +    reg_grp_entry = pt_find_reg_grp(s, address);
> +    if (reg_grp_entry) {
> +        /* check 0 Hardwired register group */
> +        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
> +            /* ignore silently */
> +            PT_LOG("Warning: Access to 0 Hardwired register. "
> +                   "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
> +                   pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> +                   address, len);
> +            return;
> +        }
> +    }
> +
> +    /* read I/O device register value */
> +    rc = host_pci_get_block(s->real_device, address,
> +                             (uint8_t *)&read_val, len);
> +    if (!rc) {
> +        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);

There isn't a PT_ERR? Hm, looking at the code there is only PT_LOG. Perhaps
declearing PT_ERR and PT_WARN might be a good idea? In case in the future
one wants different levels of this? Or do we really not care much about that?

> +        memset(&read_val, 0xff, len);
> +    }
> +
> +    /* pass directly to libpci for passthrough type register group */

Um, is the libpci requirement a certain thing?

> +    if (reg_grp_entry == NULL) {
> +        goto out;
> +    }
> +
> +    /* adjust the read and write value to appropriate CFC-CFF window */
> +    read_val <<= (address & 3) << 3;
> +    val <<= (address & 3) << 3;
> +    emul_len = len;
> +
> +    /* loop Guest request size */

loop around what the guest requested..

> +    while (emul_len > 0) {
> +        /* find register entry to be emulated */
> +        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
> +        if (reg_entry) {
> +            reg = reg_entry->reg;
> +            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
> +            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
> +            uint8_t *ptr_val = NULL;
> +
> +            valid_mask <<= (find_addr - real_offset) << 3;
> +            ptr_val = (uint8_t *)&val + (real_offset & 3);
> +
> +            /* do emulation depend on register size */

based 
> +            switch (reg->size) {
> +            case 1:
> +                if (reg->u.b.write) {
> +                    rc = reg->u.b.write(s, reg_entry, ptr_val,
> +                                        read_val >> ((real_offset & 3) << 3),
> +                                        valid_mask);
> +                }
> +                break;
> +            case 2:
> +                if (reg->u.w.write) {
> +                    rc = reg->u.w.write(s, reg_entry, (uint16_t *)ptr_val,
> +                                        (read_val >> ((real_offset & 3) << 3)),
> +                                        valid_mask);
> +                }
> +                break;
> +            case 4:
> +                if (reg->u.dw.write) {
> +                    rc = reg->u.dw.write(s, reg_entry, (uint32_t *)ptr_val,
> +                                         (read_val >> ((real_offset & 3) << 3)),
> +                                         valid_mask);
> +                }
> +                break;
> +            }
> +
> +            if (rc < 0) {
> +                hw_error("Internal error: Invalid write emulation "
> +                         "return value[%d]. I/O emulator exit.\n", rc);

Oh. I hadn't realized this, but you are using hw_error. Which is
calling 'abort'! Yikes. Is there no way to recover from this? Say return 0xfffff?

> +            }
> +
> +            /* calculate next address to find */
> +            emul_len -= reg->size;
> +            if (emul_len > 0) {
> +                find_addr = real_offset + reg->size;
> +            }
> +        } else {
> +            /* nothing to do with passthrough type register,
> +             * continue to find next byte */
> +            emul_len--;
> +            find_addr++;
> +        }
> +    }
> +
> +    /* need to shift back before passing them to libpci */
> +    val >>= (address & 3) << 3;
> +
> +out:
> +    if (!(reg && reg->no_wb)) {
> +        /* unknown regs are passed through */
> +        rc = host_pci_set_block(s->real_device, address, (uint8_t *)&val, len);
> +
> +        if (!rc) {
> +            PT_LOG("Error: pci_write_block failed. return value[%d].\n", rc);
> +        }
> +    }
> +
> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> +        qemu_mod_timer(s->pm_state->pm_timer,
> +                       qemu_get_clock_ms(rt_clock) + s->pm_state->pm_delay);
> +    }
> +}
> +
> +/* ioport/iomem space*/
> +static void pt_iomem_map(XenPCIPassthroughState *s, int i,
> +                         pcibus_t e_phys, pcibus_t e_size, int type)
> +{
> +    uint32_t old_ebase = s->bases[i].e_physbase;
> +    bool first_map = s->bases[i].e_size == 0;
> +    int ret = 0;
> +
> +    s->bases[i].e_physbase = e_phys;
> +    s->bases[i].e_size = e_size;
> +
> +    PT_LOG("e_phys=%#"PRIx64" maddr=%#"PRIx64" type=%%d"
> +           " len=%#"PRIx64" index=%d first_map=%d\n",
> +           e_phys, s->bases[i].access.maddr, /*type,*/
> +           e_size, i, first_map);
> +
> +    if (e_size == 0) {
> +        return;
> +    }
> +
> +    if (!first_map && old_ebase != -1) {

old_ebase != PCI_BAR_UNMAPPED ?

> +        /* Remove old mapping */
> +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
> +                               old_ebase >> XC_PAGE_SHIFT,
> +                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> +                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
> +                               DPCI_REMOVE_MAPPING);
> +        if (ret != 0) {
> +            PT_LOG("Error: remove old mapping failed!\n");
> +            return;
> +        }
> +    }
> +
> +    /* map only valid guest address */
> +    if (e_phys != -1) {

PCI_BAR_UNMAPPED

> +        /* Create new mapping */
> +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
> +                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
> +                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> +                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
> +                                   DPCI_ADD_MAPPING);
> +
> +        if (ret != 0) {
> +            PT_LOG("Error: create new mapping failed!\n");
> +        }
> +    }
> +}
> +
> +static void pt_ioport_map(XenPCIPassthroughState *s, int i,
> +                          pcibus_t e_phys, pcibus_t e_size, int type)
> +{
> +    uint32_t old_ebase = s->bases[i].e_physbase;
> +    bool first_map = s->bases[i].e_size == 0;
> +    int ret = 0;
> +
> +    s->bases[i].e_physbase = e_phys;
> +    s->bases[i].e_size = e_size;
> +
> +    PT_LOG("e_phys=%#04"PRIx64" pio_base=%#04"PRIx64" len=%"PRId64" index=%d"
> +           " first_map=%d\n",
> +           e_phys, s->bases[i].access.pio_base, e_size, i, first_map);
> +
> +    if (e_size == 0) {
> +        return;
> +    }
> +
> +    if (!first_map && old_ebase != -1) {

PCI_BAR_UNMAPPED
> +        /* Remove old mapping */
> +        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, old_ebase,
> +                                       s->bases[i].access.pio_base, e_size,
> +                                       DPCI_REMOVE_MAPPING);
> +        if (ret != 0) {
> +            PT_LOG("Error: remove old mapping failed!\n");
> +            return;
> +        }
> +    }
> +
> +    /* map only valid guest address (include 0) */
> +    if (e_phys != -1) {
> +        /* Create new mapping */
> +        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, e_phys,
> +                                       s->bases[i].access.pio_base, e_size,
> +                                       DPCI_ADD_MAPPING);
> +        if (ret != 0) {
> +            PT_LOG("Error: create new mapping failed!\n");
> +        }
> +    }
> +
> +}
> +
> +
> +/* mapping BAR */
> +
> +void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
> +                        int io_enable, int mem_enable)
> +{
> +    PCIDevice *dev = &s->dev;
> +    PCIIORegion *r;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegion *base = NULL;
> +    pcibus_t r_size = 0, r_addr = -1;

PCI_BAR_UNMAPPED

> +    int rc = 0;
> +
> +    r = &dev->io_regions[bar];
> +
> +    /* check valid region */
> +    if (!r->size) {
> +        return;
> +    }
> +
> +    base = &s->bases[bar];
> +    /* skip unused BAR or upper 64bit BAR */
> +    if ((base->bar_flag == PT_BAR_FLAG_UNUSED)
> +        || (base->bar_flag == PT_BAR_FLAG_UPPER)) {
> +           return;
> +    }
> +
> +    /* copy region address to temporary */
> +    r_addr = r->addr;
> +
> +    /* need unmapping in case I/O Space or Memory Space disable */
> +    if (((base->bar_flag == PT_BAR_FLAG_IO) && !io_enable) ||
> +        ((base->bar_flag == PT_BAR_FLAG_MEM) && !mem_enable)) {
> +        r_addr = -1;
> +    }
> +    if ((bar == PCI_ROM_SLOT) && (r_addr != -1)) {
> +        reg_grp_entry = pt_find_reg_grp(s, PCI_ROM_ADDRESS);
> +        if (reg_grp_entry) {
> +            reg_entry = pt_find_reg(reg_grp_entry, PCI_ROM_ADDRESS);
> +            if (reg_entry && !(reg_entry->data & PCI_ROM_ADDRESS_ENABLE)) {
> +                r_addr = -1;

PCI_BAR_UNMAPPED

> +            }
> +        }
> +    }
> +
> +    /* prevent guest software mapping memory resource to 00000000h */
> +    if ((base->bar_flag == PT_BAR_FLAG_MEM) && (r_addr == 0)) {
> +        r_addr = -1;
> +    }
> +
> +    r_size = pt_get_emul_size(base->bar_flag, r->size);
> +
> +    rc = pci_check_bar_overlap(dev, r_addr, r_size, r->type);
> +    if (rc > 0) {
> +        PT_LOG("Warning: s[%02x:%02x.%x][Region:%d][Address:%"FMT_PCIBUS"h]"
> +               "[Size:%"FMT_PCIBUS"h] is overlapped.\n", pci_bus_num(dev->bus),
> +               PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), bar,
> +               r_addr, r_size);
> +    }
> +
> +    /* check whether we need to update the mapping or not */
> +    if (r_addr != s->bases[bar].e_physbase) {
> +        /* mapping BAR */
> +        if (base->bar_flag == PT_BAR_FLAG_IO) {
> +            pt_ioport_map(s, bar, r_addr, r_size, r->type);
> +        } else {
> +            pt_iomem_map(s, bar, r_addr, r_size, r->type);
> +        }
> +    }
> +}
> +
> +void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable)
> +{
> +    int i;
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        pt_bar_mapping_one(s, i, io_enable, mem_enable);
> +    }
> +}
> +
> +/* register regions */
> +static int pt_register_regions(XenPCIPassthroughState *s)
> +{
> +    int i = 0;
> +    uint32_t bar_data = 0;
> +    HostPCIDevice *d = s->real_device;
> +
> +    /* Register PIO/MMIO BARs */
> +    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
> +        HostPCIIORegion *r = &d->io_regions[i];
> +
> +        if (r->base_addr) {

So should you check for PCI_BAR_UNMAPPED or is that not really
required here as the pci_register_bar would do it?

> +            s->bases[i].e_physbase = r->base_addr;
> +            s->bases[i].access.u = r->base_addr;
> +
> +            /* Register current region */
> +            if (r->flags & IORESOURCE_IO) {
> +                memory_region_init_io(&s->bar[i], NULL, NULL,
> +                                      "xen-pci-pt-bar", r->size);

You can make the "xen_pci-pt-bar" be a #define somewhere and reuse that.

> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_IO,
> +                                 &s->bar[i]);
> +            } else if (r->flags & IORESOURCE_PREFETCH) {
> +                memory_region_init_io(&s->bar[i], NULL, NULL,
> +                                      "xen-pci-pt-bar", r->size);
> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_MEM_PREFETCH,
> +                                 &s->bar[i]);
> +            } else {
> +                memory_region_init_io(&s->bar[i], NULL, NULL,
> +                                      "xen-pci-pt-bar", r->size);
> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_MEMORY,
> +                                 &s->bar[i]);
> +            }
> +
> +            PT_LOG("IO region registered (size=0x%08"PRIx64
> +                   " base_addr=0x%08"PRIx64")\n",
> +                   r->size, r->base_addr);
> +        }
> +    }
> +
> +    /* Register expansion ROM address */
> +    if (d->rom.base_addr && d->rom.size) {
> +        /* Re-set BAR reported by OS, otherwise ROM can't be read. */
> +        bar_data = host_pci_get_long(d, PCI_ROM_ADDRESS);
> +        if ((bar_data & PCI_ROM_ADDRESS_MASK) == 0) {
> +            bar_data |= d->rom.base_addr & PCI_ROM_ADDRESS_MASK;
> +            host_pci_set_long(d, PCI_ROM_ADDRESS, bar_data);
> +        }
> +
> +        s->bases[PCI_ROM_SLOT].e_physbase = d->rom.base_addr;
> +        s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
> +
> +        memory_region_init_rom_device(&s->rom, NULL, NULL, &s->dev.qdev,
> +                                      "xen-pci-pt-rom", d->rom.size);
> +        pci_register_bar(&s->dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
> +                         &s->rom);
> +
> +        PT_LOG("Expansion ROM registered (size=0x%08"PRIx64
> +               " base_addr=0x%08"PRIx64")\n",
> +               d->rom.size, d->rom.base_addr);
> +    }
> +
> +    return 0;
> +}
> +
> +static void pt_unregister_regions(XenPCIPassthroughState *s)
> +{
> +    int i, type, rc;
> +    uint32_t e_size;
> +    PCIDevice *d = &s->dev;
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        e_size = s->bases[i].e_size;
> +        if ((e_size == 0) || (s->bases[i].e_physbase == -1)) {
> +            continue;
> +        }
> +
> +        type = d->io_regions[i].type;
> +
> +        if (type == PCI_BASE_ADDRESS_SPACE_MEMORY
> +            || type == PCI_BASE_ADDRESS_MEM_PREFETCH) {
> +            rc = xc_domain_memory_mapping(xen_xc, xen_domid,
> +                    s->bases[i].e_physbase >> XC_PAGE_SHIFT,
> +                    s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> +                    (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
> +                    DPCI_REMOVE_MAPPING);
> +            if (rc != 0) {
> +                PT_LOG("Error: remove old mem mapping failed!\n");
> +                continue;
> +            }
> +
> +        } else if (type == PCI_BASE_ADDRESS_SPACE_IO) {
> +            rc = xc_domain_ioport_mapping(xen_xc, xen_domid,
> +                        s->bases[i].e_physbase,
> +                        s->bases[i].access.pio_base,
> +                        e_size,
> +                        DPCI_REMOVE_MAPPING);
> +            if (rc != 0) {
> +                PT_LOG("Error: remove old io mapping failed!\n");
> +                continue;
> +            }
> +        }
> +    }
> +}
> +
> +static int pt_initfn(PCIDevice *pcidev)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
> +    int dom, bus;
> +    unsigned slot, func;
> +    int rc = 0;
> +    uint32_t machine_irq;
> +    int pirq = -1;
> +
> +    if (pci_parse_devaddr(s->hostaddr, &dom, &bus, &slot, &func) < 0) {
> +        fprintf(stderr, "error parse bdf: %s\n", s->hostaddr);
> +        return -1;
> +    }
> +
> +    /* register real device */
> +    PT_LOG("Assigning real physical device %02x:%02x.%x to devfn %i ...\n",
> +           bus, slot, func, s->dev.devfn);
> +
> +    s->real_device = host_pci_device_get(bus, slot, func);
> +    if (!s->real_device) {
> +        return -1;
> +    }
> +
> +    s->is_virtfn = s->real_device->is_virtfn;
> +    if (s->is_virtfn) {
> +        PT_LOG("%04x:%02x:%02x.%x is a SR-IOV Virtual Function\n",
> +               s->real_device->domain, bus, slot, func);
> +    }
> +
> +    /* Initialize virtualized PCI configuration (Extended 256 Bytes) */
> +    if (host_pci_get_block(s->real_device, 0, pcidev->config,
> +                           PCI_CONFIG_SPACE_SIZE) == -1) {
> +        return -1;
> +    }
> +
> +    /* Handle real device's MMIO/PIO BARs */
> +    pt_register_regions(s);
> +
> +    /* reinitialize each config register to be emulated */
> +    pt_config_init(s);
> +
> +    /* Bind interrupt */
> +    if (!s->dev.config[PCI_INTERRUPT_PIN]) {
> +        PT_LOG("no pin interrupt\n");

Perhaps include some details of which device failed?

> +        goto out;
> +    }
> +
> +    machine_irq = host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE);
> +    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
> +
> +    if (rc) {
> +        PT_LOG("Error: Mapping irq failed, rc = %d\n", rc);

Can you also include the IRQ it tried to map (both machine and pirq).

> +
> +        /* Disable PCI intx assertion (turn on bit10 of devctl) */
> +        host_pci_set_word(s->real_device,
> +                          PCI_COMMAND,
> +                          pci_get_word(s->dev.config + PCI_COMMAND)
> +                          | PCI_COMMAND_INTX_DISABLE);
> +        machine_irq = 0;
> +        s->machine_irq = 0;
> +    } else {
> +        machine_irq = pirq;
> +        s->machine_irq = pirq;
> +        mapped_machine_irq[machine_irq]++;
> +    }
> +
> +    /* bind machine_irq to device */
> +    if (rc < 0 && machine_irq != 0) {
> +        uint8_t e_device = PCI_SLOT(s->dev.devfn);
> +        uint8_t e_intx = pci_intx(s);
> +
> +        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq, 0,
> +                                       e_device, e_intx);
> +        if (rc < 0) {
> +            PT_LOG("Error: Binding of interrupt failed! rc=%d\n", rc);

A bit details - name of the device, the IRQ,..

> +
> +            /* Disable PCI intx assertion (turn on bit10 of devctl) */
> +            host_pci_set_word(s->real_device, PCI_COMMAND,
> +                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
> +                              | PCI_COMMAND_INTX_DISABLE);
> +            mapped_machine_irq[machine_irq]--;
> +
> +            if (mapped_machine_irq[machine_irq] == 0) {
> +                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
> +                    PT_LOG("Error: Unmapping of interrupt failed! rc=%d\n",
> +                           rc);

And here too. It would be beneficial to have on the error paths lots of 
nice details so that in the field it will be easier to find out what
went wrong (and match up PIRQ with the GSI).
Anthony PERARD - Nov. 11, 2011, 4:27 p.m.
On Thu, 10 Nov 2011, Konrad Rzeszutek Wilk wrote:

> On Fri, Oct 28, 2011 at 04:07:33PM +0100, Anthony PERARD wrote:
> > From: Allen Kay <allen.m.kay@intel.com>
> >
>
> This is going to be a bit lame review..
>

[...]

> > +        return;
> > +    }
> > +
> > +    /* find register group entry */
> > +    reg_grp_entry = pt_find_reg_grp(s, address);
> > +    if (reg_grp_entry) {
> > +        /* check 0 Hardwired register group */
> > +        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
> > +            /* ignore silently */
> > +            PT_LOG("Warning: Access to 0 Hardwired register. "
> > +                   "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
> > +                   pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> > +                   address, len);
> > +            return;
> > +        }
> > +    }
> > +
> > +    /* read I/O device register value */
> > +    rc = host_pci_get_block(s->real_device, address,
> > +                             (uint8_t *)&read_val, len);
> > +    if (!rc) {
> > +        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
>
> There isn't a PT_ERR? Hm, looking at the code there is only PT_LOG. Perhaps
> declearing PT_ERR and PT_WARN might be a good idea? In case in the future
> one wants different levels of this? Or do we really not care much about that?

I will add this two macros.

> > +        memset(&read_val, 0xff, len);
> > +    }
> > +
> > +    /* pass directly to libpci for passthrough type register group */
>
> Um, is the libpci requirement a certain thing?

:(, it's just an old comment. libpci is not used anymore and have been
replaced by host-pci-device. I will replace libpci in the comment by
"the real device".


[...]

> > +            switch (reg->size) {
> > +            case 1:
> > +                if (reg->u.b.write) {
> > +                    rc = reg->u.b.write(s, reg_entry, ptr_val,
> > +                                        read_val >> ((real_offset & 3) << 3),
> > +                                        valid_mask);
> > +                }
> > +                break;
> > +            case 2:
> > +                if (reg->u.w.write) {
> > +                    rc = reg->u.w.write(s, reg_entry, (uint16_t *)ptr_val,
> > +                                        (read_val >> ((real_offset & 3) << 3)),
> > +                                        valid_mask);
> > +                }
> > +                break;
> > +            case 4:
> > +                if (reg->u.dw.write) {
> > +                    rc = reg->u.dw.write(s, reg_entry, (uint32_t *)ptr_val,
> > +                                         (read_val >> ((real_offset & 3) << 3)),
> > +                                         valid_mask);
> > +                }
> > +                break;
> > +            }
> > +
> > +            if (rc < 0) {
> > +                hw_error("Internal error: Invalid write emulation "
> > +                         "return value[%d]. I/O emulator exit.\n", rc);
>
> Oh. I hadn't realized this, but you are using hw_error. Which is
> calling 'abort'! Yikes. Is there no way to recover from this? Say return 0xfffff?

In qemu-xen-traditionnal, it was an exit(1). I do not know the
consequence of a bad write, and I can not return anythings. So I suppose
that the guest would know that somethings wrong only on the next read.

Instead of abort();, I can just do nothing and return. Or we could unplug
the device from QEMU.

Any preference?

> > +            }
> > +
> > +            /* calculate next address to find */
> > +            emul_len -= reg->size;
> > +            if (emul_len > 0) {
> > +                find_addr = real_offset + reg->size;
> > +            }
> > +        } else {
> > +            /* nothing to do with passthrough type register,
> > +             * continue to find next byte */
> > +            emul_len--;
> > +            find_addr++;
> > +        }
> > +    }
> > +
> > +    /* need to shift back before passing them to libpci */
> > +    val >>= (address & 3) << 3;
> > +
> > +out:
> > +    if (!(reg && reg->no_wb)) {
> > +        /* unknown regs are passed through */
> > +        rc = host_pci_set_block(s->real_device, address, (uint8_t *)&val, len);
> > +
> > +        if (!rc) {
> > +            PT_LOG("Error: pci_write_block failed. return value[%d].\n", rc);
> > +        }
> > +    }
> > +
> > +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> > +        qemu_mod_timer(s->pm_state->pm_timer,
> > +                       qemu_get_clock_ms(rt_clock) + s->pm_state->pm_delay);
> > +    }
> > +}
> > +
> > +/* ioport/iomem space*/
> > +static void pt_iomem_map(XenPCIPassthroughState *s, int i,
> > +                         pcibus_t e_phys, pcibus_t e_size, int type)
> > +{
> > +    uint32_t old_ebase = s->bases[i].e_physbase;
> > +    bool first_map = s->bases[i].e_size == 0;
> > +    int ret = 0;
> > +
> > +    s->bases[i].e_physbase = e_phys;
> > +    s->bases[i].e_size = e_size;
> > +
> > +    PT_LOG("e_phys=%#"PRIx64" maddr=%#"PRIx64" type=%%d"
> > +           " len=%#"PRIx64" index=%d first_map=%d\n",
> > +           e_phys, s->bases[i].access.maddr, /*type,*/
> > +           e_size, i, first_map);
> > +
> > +    if (e_size == 0) {
> > +        return;
> > +    }
> > +
> > +    if (!first_map && old_ebase != -1) {
>
> old_ebase != PCI_BAR_UNMAPPED ?

:(, no. Because old_ebase is a uint32_t and PCI_BAR_UNMAPPED is
pcibus_t (uint64_t in Xen case).

I'm not sure that a good idee to change the type of old_ebase as
xc_domain_memory_mapping bellow takes only uint32_t.

But, if I can replace a -1 by PCI_BAR_UNMAPPED, I will.

> > +        /* Remove old mapping */
> > +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
> > +                               old_ebase >> XC_PAGE_SHIFT,
> > +                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> > +                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
> > +                               DPCI_REMOVE_MAPPING);
> > +        if (ret != 0) {
> > +            PT_LOG("Error: remove old mapping failed!\n");
> > +            return;
> > +        }
> > +    }
> > +
> > +    /* map only valid guest address */
> > +    if (e_phys != -1) {
>
> PCI_BAR_UNMAPPED
>
> > +        /* Create new mapping */
> > +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
> > +                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
> > +                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> > +                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
> > +                                   DPCI_ADD_MAPPING);
> > +
> > +        if (ret != 0) {
> > +            PT_LOG("Error: create new mapping failed!\n");
> > +        }
> > +    }
> > +}

[...]

> > +void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable)
> > +{
> > +    int i;
> > +
> > +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> > +        pt_bar_mapping_one(s, i, io_enable, mem_enable);
> > +    }
> > +}
> > +
> > +/* register regions */
> > +static int pt_register_regions(XenPCIPassthroughState *s)
> > +{
> > +    int i = 0;
> > +    uint32_t bar_data = 0;
> > +    HostPCIDevice *d = s->real_device;
> > +
> > +    /* Register PIO/MMIO BARs */
> > +    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
> > +        HostPCIIORegion *r = &d->io_regions[i];
> > +
> > +        if (r->base_addr) {
>
> So should you check for PCI_BAR_UNMAPPED or is that not really
> required here as the pci_register_bar would do it?

Actually, this value come from the real device (the value in
sysfs/resource). So, I think it's just 0 if it's not mapped.

Here, it's probably better to check for the size instead, to know if
there is actually a BAR.

> > +            s->bases[i].e_physbase = r->base_addr;
> > +            s->bases[i].access.u = r->base_addr;
> > +
> > +            /* Register current region */
> > +            if (r->flags & IORESOURCE_IO) {
> > +                memory_region_init_io(&s->bar[i], NULL, NULL,
> > +                                      "xen-pci-pt-bar", r->size);
>
> You can make the "xen_pci-pt-bar" be a #define somewhere and reuse that.
>
> > +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_IO,
> > +                                 &s->bar[i]);
> > +            } else if (r->flags & IORESOURCE_PREFETCH) {
> > +                memory_region_init_io(&s->bar[i], NULL, NULL,
> > +                                      "xen-pci-pt-bar", r->size);
> > +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_MEM_PREFETCH,
> > +                                 &s->bar[i]);
> > +            } else {
> > +                memory_region_init_io(&s->bar[i], NULL, NULL,
> > +                                      "xen-pci-pt-bar", r->size);
> > +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_MEMORY,
> > +                                 &s->bar[i]);
> > +            }
> > +
> > +            PT_LOG("IO region registered (size=0x%08"PRIx64
> > +                   " base_addr=0x%08"PRIx64")\n",
> > +                   r->size, r->base_addr);
> > +        }
> > +    }
> > +
> > +    /* Register expansion ROM address */
> > +    if (d->rom.base_addr && d->rom.size) {
> > +        /* Re-set BAR reported by OS, otherwise ROM can't be read. */
> > +        bar_data = host_pci_get_long(d, PCI_ROM_ADDRESS);
> > +        if ((bar_data & PCI_ROM_ADDRESS_MASK) == 0) {
> > +            bar_data |= d->rom.base_addr & PCI_ROM_ADDRESS_MASK;
> > +            host_pci_set_long(d, PCI_ROM_ADDRESS, bar_data);
> > +        }
> > +
> > +        s->bases[PCI_ROM_SLOT].e_physbase = d->rom.base_addr;
> > +        s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
> > +
> > +        memory_region_init_rom_device(&s->rom, NULL, NULL, &s->dev.qdev,
> > +                                      "xen-pci-pt-rom", d->rom.size);
> > +        pci_register_bar(&s->dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
> > +                         &s->rom);
> > +
> > +        PT_LOG("Expansion ROM registered (size=0x%08"PRIx64
> > +               " base_addr=0x%08"PRIx64")\n",
> > +               d->rom.size, d->rom.base_addr);
> > +    }
> > +
> > +    return 0;
> > +}
> > +

[...]

> > +static int pt_initfn(PCIDevice *pcidev)
> > +{
> > +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
> > +    int dom, bus;
> > +    unsigned slot, func;
> > +    int rc = 0;
> > +    uint32_t machine_irq;
> > +    int pirq = -1;
> > +
> > +    if (pci_parse_devaddr(s->hostaddr, &dom, &bus, &slot, &func) < 0) {
> > +        fprintf(stderr, "error parse bdf: %s\n", s->hostaddr);
> > +        return -1;
> > +    }
> > +
> > +    /* register real device */
> > +    PT_LOG("Assigning real physical device %02x:%02x.%x to devfn %i ...\n",
> > +           bus, slot, func, s->dev.devfn);
> > +
> > +    s->real_device = host_pci_device_get(bus, slot, func);
> > +    if (!s->real_device) {
> > +        return -1;
> > +    }
> > +
> > +    s->is_virtfn = s->real_device->is_virtfn;
> > +    if (s->is_virtfn) {
> > +        PT_LOG("%04x:%02x:%02x.%x is a SR-IOV Virtual Function\n",
> > +               s->real_device->domain, bus, slot, func);
> > +    }
> > +
> > +    /* Initialize virtualized PCI configuration (Extended 256 Bytes) */
> > +    if (host_pci_get_block(s->real_device, 0, pcidev->config,
> > +                           PCI_CONFIG_SPACE_SIZE) == -1) {
> > +        return -1;
> > +    }
> > +
> > +    /* Handle real device's MMIO/PIO BARs */
> > +    pt_register_regions(s);
> > +
> > +    /* reinitialize each config register to be emulated */
> > +    pt_config_init(s);
> > +
> > +    /* Bind interrupt */
> > +    if (!s->dev.config[PCI_INTERRUPT_PIN]) {
> > +        PT_LOG("no pin interrupt\n");
>
> Perhaps include some details of which device failed?

There is already detailed about the device at the beginning of the
function. Is it not enough?

> > +        goto out;
> > +    }
> > +
> > +    machine_irq = host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE);
> > +    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
> > +
> > +    if (rc) {
> > +        PT_LOG("Error: Mapping irq failed, rc = %d\n", rc);
>
> Can you also include the IRQ it tried to map (both machine and pirq).

Yep.

> > +
> > +        /* Disable PCI intx assertion (turn on bit10 of devctl) */
> > +        host_pci_set_word(s->real_device,
> > +                          PCI_COMMAND,
> > +                          pci_get_word(s->dev.config + PCI_COMMAND)
> > +                          | PCI_COMMAND_INTX_DISABLE);
> > +        machine_irq = 0;
> > +        s->machine_irq = 0;
> > +    } else {
> > +        machine_irq = pirq;
> > +        s->machine_irq = pirq;
> > +        mapped_machine_irq[machine_irq]++;
> > +    }
> > +
> > +    /* bind machine_irq to device */
> > +    if (rc < 0 && machine_irq != 0) {
> > +        uint8_t e_device = PCI_SLOT(s->dev.devfn);
> > +        uint8_t e_intx = pci_intx(s);
> > +
> > +        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq, 0,
> > +                                       e_device, e_intx);
> > +        if (rc < 0) {
> > +            PT_LOG("Error: Binding of interrupt failed! rc=%d\n", rc);
>
> A bit details - name of the device, the IRQ,..
>
> > +
> > +            /* Disable PCI intx assertion (turn on bit10 of devctl) */
> > +            host_pci_set_word(s->real_device, PCI_COMMAND,
> > +                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
> > +                              | PCI_COMMAND_INTX_DISABLE);
> > +            mapped_machine_irq[machine_irq]--;
> > +
> > +            if (mapped_machine_irq[machine_irq] == 0) {
> > +                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
> > +                    PT_LOG("Error: Unmapping of interrupt failed! rc=%d\n",
> > +                           rc);
>
> And here too. It would be beneficial to have on the error paths lots of
> nice details so that in the field it will be easier to find out what
> went wrong (and match up PIRQ with the GSI).

Yes, I will try to improve the messages.

It's also probably good to always print the errors.


Thanks,
Konrad Rzeszutek Wilk - Nov. 11, 2011, 6:05 p.m.
> > > +                hw_error("Internal error: Invalid write emulation "
> > > +                         "return value[%d]. I/O emulator exit.\n", rc);
> >
> > Oh. I hadn't realized this, but you are using hw_error. Which is
> > calling 'abort'! Yikes. Is there no way to recover from this? Say return 0xfffff?
> 
> In qemu-xen-traditionnal, it was an exit(1). I do not know the
> consequence of a bad write, and I can not return anythings. So I suppose
> that the guest would know that somethings wrong only on the next read.
> 
> Instead of abort();, I can just do nothing and return. Or we could unplug
> the device from QEMU.
> 
> Any preference?

I think this calls for an experiment. If Linux still functions if you completly
unplug the device, then I would say unplug it (b/c in most likelyhood the reason
you can't write is b/c the host has unplugged the device).

> 
> > > +            }
> > > +
> > > +            /* calculate next address to find */
> > > +            emul_len -= reg->size;
> > > +            if (emul_len > 0) {
> > > +                find_addr = real_offset + reg->size;
> > > +            }
> > > +        } else {
> > > +            /* nothing to do with passthrough type register,
> > > +             * continue to find next byte */
> > > +            emul_len--;
> > > +            find_addr++;
> > > +        }
> > > +    }
> > > +
> > > +    /* need to shift back before passing them to libpci */
> > > +    val >>= (address & 3) << 3;
> > > +
> > > +out:
> > > +    if (!(reg && reg->no_wb)) {
> > > +        /* unknown regs are passed through */
> > > +        rc = host_pci_set_block(s->real_device, address, (uint8_t *)&val, len);
> > > +
> > > +        if (!rc) {
> > > +            PT_LOG("Error: pci_write_block failed. return value[%d].\n", rc);
> > > +        }
> > > +    }
> > > +
> > > +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> > > +        qemu_mod_timer(s->pm_state->pm_timer,
> > > +                       qemu_get_clock_ms(rt_clock) + s->pm_state->pm_delay);
> > > +    }
> > > +}
> > > +
> > > +/* ioport/iomem space*/
> > > +static void pt_iomem_map(XenPCIPassthroughState *s, int i,
> > > +                         pcibus_t e_phys, pcibus_t e_size, int type)
> > > +{
> > > +    uint32_t old_ebase = s->bases[i].e_physbase;
> > > +    bool first_map = s->bases[i].e_size == 0;
> > > +    int ret = 0;
> > > +
> > > +    s->bases[i].e_physbase = e_phys;
> > > +    s->bases[i].e_size = e_size;
> > > +
> > > +    PT_LOG("e_phys=%#"PRIx64" maddr=%#"PRIx64" type=%%d"
> > > +           " len=%#"PRIx64" index=%d first_map=%d\n",
> > > +           e_phys, s->bases[i].access.maddr, /*type,*/
> > > +           e_size, i, first_map);
> > > +
> > > +    if (e_size == 0) {
> > > +        return;
> > > +    }
> > > +
> > > +    if (!first_map && old_ebase != -1) {
> >
> > old_ebase != PCI_BAR_UNMAPPED ?
> 
> :(, no. Because old_ebase is a uint32_t and PCI_BAR_UNMAPPED is
> pcibus_t (uint64_t in Xen case).

I somehow thought it was defined as -1.. but 
> 
> I'm not sure that a good idee to change the type of old_ebase as
> xc_domain_memory_mapping bellow takes only uint32_t.
> 
> But, if I can replace a -1 by PCI_BAR_UNMAPPED, I will.

.. or something close to it. _PCI_BAR_UNMAPPED?
.. snip..

> > > +    /* Register PIO/MMIO BARs */
> > > +    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
> > > +        HostPCIIORegion *r = &d->io_regions[i];
> > > +
> > > +        if (r->base_addr) {
> >
> > So should you check for PCI_BAR_UNMAPPED or is that not really
> > required here as the pci_register_bar would do it?
> 
> Actually, this value come from the real device (the value in
> sysfs/resource). So, I think it's just 0 if it's not mapped.

Ah! Right.
> 
> Here, it's probably better to check for the size instead, to know if
> there is actually a BAR.

<nods>
> 
> > > +            s->bases[i].e_physbase = r->base_addr;
> > > +            s->bases[i].access.u = r->base_addr;
> > > +
> > > +            /* Register current region */
> > > +            if (r->flags & IORESOURCE_IO) {
> > > +                memory_region_init_io(&s->bar[i], NULL, NULL,
> > > +                                      "xen-pci-pt-bar", r->size);
> >
> > You can make the "xen_pci-pt-bar" be a #define somewhere and reuse that.

.. snip ..
> > > +    if (!s->dev.config[PCI_INTERRUPT_PIN]) {
> > > +        PT_LOG("no pin interrupt\n");
> >
> > Perhaps include some details of which device failed?
> 
> There is already detailed about the device at the beginning of the
> function. Is it not enough?

I was thinking parallel operations. So it could be there are multiple
PCI requests and you might not know which device's pin is wrong.

> 
> > > +        goto out;
> > > +    }
> > > +
> > > +    machine_irq = host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE);
> > > +    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
> > > +
> > > +    if (rc) {
> > > +        PT_LOG("Error: Mapping irq failed, rc = %d\n", rc);
> >
> > Can you also include the IRQ it tried to map (both machine and pirq).
> 
> Yep.
> 
> > > +
> > > +        /* Disable PCI intx assertion (turn on bit10 of devctl) */
> > > +        host_pci_set_word(s->real_device,
> > > +                          PCI_COMMAND,
> > > +                          pci_get_word(s->dev.config + PCI_COMMAND)
> > > +                          | PCI_COMMAND_INTX_DISABLE);
> > > +        machine_irq = 0;
> > > +        s->machine_irq = 0;
> > > +    } else {
> > > +        machine_irq = pirq;
> > > +        s->machine_irq = pirq;
> > > +        mapped_machine_irq[machine_irq]++;
> > > +    }
> > > +
> > > +    /* bind machine_irq to device */
> > > +    if (rc < 0 && machine_irq != 0) {
> > > +        uint8_t e_device = PCI_SLOT(s->dev.devfn);
> > > +        uint8_t e_intx = pci_intx(s);
> > > +
> > > +        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq, 0,
> > > +                                       e_device, e_intx);
> > > +        if (rc < 0) {
> > > +            PT_LOG("Error: Binding of interrupt failed! rc=%d\n", rc);
> >
> > A bit details - name of the device, the IRQ,..
> >
> > > +
> > > +            /* Disable PCI intx assertion (turn on bit10 of devctl) */
> > > +            host_pci_set_word(s->real_device, PCI_COMMAND,
> > > +                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
> > > +                              | PCI_COMMAND_INTX_DISABLE);
> > > +            mapped_machine_irq[machine_irq]--;
> > > +
> > > +            if (mapped_machine_irq[machine_irq] == 0) {
> > > +                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
> > > +                    PT_LOG("Error: Unmapping of interrupt failed! rc=%d\n",
> > > +                           rc);
> >
> > And here too. It would be beneficial to have on the error paths lots of
> > nice details so that in the field it will be easier to find out what
> > went wrong (and match up PIRQ with the GSI).
> 
> Yes, I will try to improve the messages.
> 
> It's also probably good to always print the errors.

<nods> Thanks.
Stefano Stabellini - Nov. 14, 2011, 11:09 a.m.
On Fri, 11 Nov 2011, Konrad Rzeszutek Wilk wrote:
> > > > +                hw_error("Internal error: Invalid write emulation "
> > > > +                         "return value[%d]. I/O emulator exit.\n", rc);
> > >
> > > Oh. I hadn't realized this, but you are using hw_error. Which is
> > > calling 'abort'! Yikes. Is there no way to recover from this? Say return 0xfffff?
> > 
> > In qemu-xen-traditionnal, it was an exit(1). I do not know the
> > consequence of a bad write, and I can not return anythings. So I suppose
> > that the guest would know that somethings wrong only on the next read.
> > 
> > Instead of abort();, I can just do nothing and return. Or we could unplug
> > the device from QEMU.
> > 
> > Any preference?
> 
> I think this calls for an experiment. If Linux still functions if you completly
> unplug the device, then I would say unplug it (b/c in most likelyhood the reason
> you can't write is b/c the host has unplugged the device).

It would make sense to try to PCI hot-unplug the device, however
considering that it requires guest support, it cannot be used to safely
handle an error like this one. Also it requires some interactions that
might not be possible anymore at this point.
I would destroy the domain instead, using a graceful shutdown if
possible. Something similar to libxl_domain_shutdown.
Konrad Rzeszutek Wilk - Nov. 14, 2011, 6:11 p.m.
On Mon, Nov 14, 2011 at 11:09:31AM +0000, Stefano Stabellini wrote:
> On Fri, 11 Nov 2011, Konrad Rzeszutek Wilk wrote:
> > > > > +                hw_error("Internal error: Invalid write emulation "
> > > > > +                         "return value[%d]. I/O emulator exit.\n", rc);
> > > >
> > > > Oh. I hadn't realized this, but you are using hw_error. Which is
> > > > calling 'abort'! Yikes. Is there no way to recover from this? Say return 0xfffff?
> > > 
> > > In qemu-xen-traditionnal, it was an exit(1). I do not know the
> > > consequence of a bad write, and I can not return anythings. So I suppose
> > > that the guest would know that somethings wrong only on the next read.
> > > 
> > > Instead of abort();, I can just do nothing and return. Or we could unplug
> > > the device from QEMU.
> > > 
> > > Any preference?
> > 
> > I think this calls for an experiment. If Linux still functions if you completly
> > unplug the device, then I would say unplug it (b/c in most likelyhood the reason
> > you can't write is b/c the host has unplugged the device).
> 
> It would make sense to try to PCI hot-unplug the device, however
> considering that it requires guest support, it cannot be used to safely
> handle an error like this one. Also it requires some interactions that
> might not be possible anymore at this point.
> I would destroy the domain instead, using a graceful shutdown if
> possible. Something similar to libxl_domain_shutdown.

Sounds good, and we should also print something prudent to the log _why_
we just killed the guest.

Patch

diff --git a/Makefile.target b/Makefile.target
index 243f9f2..36ea47d 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -217,6 +217,8 @@  obj-i386-$(CONFIG_XEN) += xen_platform.o
 
 # Xen PCI Passthrough
 obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_helpers.o
 
 # Inter-VM PCI shared memory
 CONFIG_IVSHMEM =
diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
new file mode 100644
index 0000000..b97c5b6
--- /dev/null
+++ b/hw/xen_pci_passthrough.c
@@ -0,0 +1,838 @@ 
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Alex Novik <alex@neocleus.com>
+ * Allen Kay <allen.m.kay@intel.com>
+ * Guy Zana <guy@neocleus.com>
+ *
+ * This file implements direct PCI assignment to a HVM guest
+ */
+
+/*
+ * Interrupt Disable policy:
+ *
+ * INTx interrupt:
+ *   Initialize(register_real_device)
+ *     Map INTx(xc_physdev_map_pirq):
+ *       <fail>
+ *         - Set real Interrupt Disable bit to '1'.
+ *         - Set machine_irq and assigned_device->machine_irq to '0'.
+ *         * Don't bind INTx.
+ *
+ *     Bind INTx(xc_domain_bind_pt_pci_irq):
+ *       <fail>
+ *         - Set real Interrupt Disable bit to '1'.
+ *         - Unmap INTx.
+ *         - Decrement mapped_machine_irq[machine_irq]
+ *         - Set assigned_device->machine_irq to '0'.
+ *
+ *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
+ *     Write '0'
+ *       <ptdev->msi_trans_en is false>
+ *         - Set real bit to '0' if assigned_device->machine_irq isn't '0'.
+ *
+ *     Write '1'
+ *       <ptdev->msi_trans_en is false>
+ *         - Set real bit to '1'.
+ *
+ * MSI-INTx translation.
+ *   Initialize(xc_physdev_map_pirq_msi/pt_msi_setup)
+ *     Bind MSI-INTx(xc_domain_bind_pt_irq)
+ *       <fail>
+ *         - Unmap MSI.
+ *           <success>
+ *             - Set dev->msi->pirq to '-1'.
+ *           <fail>
+ *             - Do nothing.
+ *
+ *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
+ *     Write '0'
+ *       <ptdev->msi_trans_en is true>
+ *         - Set MSI Enable bit to '1'.
+ *
+ *     Write '1'
+ *       <ptdev->msi_trans_en is true>
+ *         - Set MSI Enable bit to '0'.
+ *
+ * MSI interrupt:
+ *   Initialize MSI register(pt_msi_setup, pt_msi_update)
+ *     Bind MSI(xc_domain_update_msi_irq)
+ *       <fail>
+ *         - Unmap MSI.
+ *         - Set dev->msi->pirq to '-1'.
+ *
+ * MSI-X interrupt:
+ *   Initialize MSI-X register(pt_msix_update_one)
+ *     Bind MSI-X(xc_domain_update_msi_irq)
+ *       <fail>
+ *         - Unmap MSI-X.
+ *         - Set entry->pirq to '-1'.
+ */
+
+#include <sys/ioctl.h>
+
+#include "pci.h"
+#include "xen.h"
+#include "xen_backend.h"
+#include "xen_pci_passthrough.h"
+
+#define PCI_BAR_ENTRIES (6)
+
+#define PT_NR_IRQS          (256)
+char mapped_machine_irq[PT_NR_IRQS] = {0};
+
+/* Config Space */
+static int pt_pci_config_access_check(PCIDevice *d, uint32_t address, int len)
+{
+    /* check offset range */
+    if (address >= 0xFF) {
+        PT_LOG("Error: Failed to access register with offset exceeding FFh. "
+               "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+               address, len);
+        return -1;
+    }
+
+    /* check read size */
+    if ((len != 1) && (len != 2) && (len != 4)) {
+        PT_LOG("Error: Failed to access register with invalid access length. "
+               "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+               address, len);
+        return -1;
+    }
+
+    /* check offset alignment */
+    if (address & (len - 1)) {
+        PT_LOG("Error: Failed to access register with invalid access size "
+            "alignment. [%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+            pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+            address, len);
+        return -1;
+    }
+
+    return 0;
+}
+
+int pt_bar_offset_to_index(uint32_t offset)
+{
+    int index = 0;
+
+    /* check Exp ROM BAR */
+    if (offset == PCI_ROM_ADDRESS) {
+        return PCI_ROM_SLOT;
+    }
+
+    /* calculate BAR index */
+    index = (offset - PCI_BASE_ADDRESS_0) >> 2;
+    if (index >= PCI_NUM_REGIONS) {
+        return -1;
+    }
+
+    return index;
+}
+
+static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    uint32_t val = 0;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    int rc = 0;
+    int emul_len = 0;
+    uint32_t find_addr = address;
+
+    if (pt_pci_config_access_check(d, address, len)) {
+        goto exit;
+    }
+
+    /* check power state transition flags */
+    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
+        /* can't accept until previous power state transition is completed.
+         * so finished previous request here.
+         */
+        PT_LOG("Warning: guest want to write durring power state transition\n");
+        goto exit;
+    }
+
+    /* find register group entry */
+    reg_grp_entry = pt_find_reg_grp(s, address);
+    if (reg_grp_entry) {
+        /* check 0 Hardwired register group */
+        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
+            /* no need to emulate, just return 0 */
+            val = 0;
+            goto exit;
+        }
+    }
+
+    /* read I/O device register value */
+    rc = host_pci_get_block(s->real_device, address, (uint8_t *)&val, len);
+    if (!rc) {
+        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
+        memset(&val, 0xff, len);
+    }
+
+    /* just return the I/O device register value for
+     * passthrough type register group */
+    if (reg_grp_entry == NULL) {
+        goto exit;
+    }
+
+    /* adjust the read value to appropriate CFC-CFF window */
+    val <<= (address & 3) << 3;
+    emul_len = len;
+
+    /* loop Guest request size */
+    while (emul_len > 0) {
+        /* find register entry to be emulated */
+        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
+        if (reg_entry) {
+            XenPTRegInfo *reg = reg_entry->reg;
+            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
+            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
+            uint8_t *ptr_val = NULL;
+
+            valid_mask <<= (find_addr - real_offset) << 3;
+            ptr_val = (uint8_t *)&val + (real_offset & 3);
+
+            /* do emulation depend on register size */
+            switch (reg->size) {
+            case 1:
+                if (reg->u.b.read) {
+                    rc = reg->u.b.read(s, reg_entry, ptr_val, valid_mask);
+                }
+                break;
+            case 2:
+                if (reg->u.w.read) {
+                    rc = reg->u.w.read(s, reg_entry,
+                                       (uint16_t *)ptr_val, valid_mask);
+                }
+                break;
+            case 4:
+                if (reg->u.dw.read) {
+                    rc = reg->u.dw.read(s, reg_entry,
+                                        (uint32_t *)ptr_val, valid_mask);
+                }
+                break;
+            }
+
+            if (rc < 0) {
+                hw_error("Internal error: Invalid read emulation "
+                         "return value[%d]. I/O emulator exit.\n", rc);
+            }
+
+            /* calculate next address to find */
+            emul_len -= reg->size;
+            if (emul_len > 0) {
+                find_addr = real_offset + reg->size;
+            }
+        } else {
+            /* nothing to do with passthrough type register,
+             * continue to find next byte */
+            emul_len--;
+            find_addr++;
+        }
+    }
+
+    /* need to shift back before returning them to pci bus emulator */
+    val >>= ((address & 3) << 3);
+
+exit:
+    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
+                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+                  address, val, len);
+    return val;
+}
+
+static void pt_pci_write_config(PCIDevice *d, uint32_t address,
+                                uint32_t val, int len)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    int index = 0;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    int rc = 0;
+    uint32_t read_val = 0;
+    int emul_len = 0;
+    XenPTReg *reg_entry = NULL;
+    uint32_t find_addr = address;
+    XenPTRegInfo *reg = NULL;
+
+    if (pt_pci_config_access_check(d, address, len)) {
+        return;
+    }
+
+    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
+                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+                  address, val, len);
+
+    /* check unused BAR register */
+    index = pt_bar_offset_to_index(address);
+    if ((index >= 0) && (val > 0 && val < PT_BAR_ALLF) &&
+        (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED)) {
+        PT_LOG("Warning: Guest attempt to set address to unused Base Address "
+               "Register. [%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+               address, len);
+    }
+
+    /* check power state transition flags */
+    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
+        /* can't accept untill previous power state transition is completed.
+         * so finished previous request here.
+         */
+        PT_LOG("Warning: guest want to write durring power state transition\n");
+        return;
+    }
+
+    /* find register group entry */
+    reg_grp_entry = pt_find_reg_grp(s, address);
+    if (reg_grp_entry) {
+        /* check 0 Hardwired register group */
+        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
+            /* ignore silently */
+            PT_LOG("Warning: Access to 0 Hardwired register. "
+                   "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+                   pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+                   address, len);
+            return;
+        }
+    }
+
+    /* read I/O device register value */
+    rc = host_pci_get_block(s->real_device, address,
+                             (uint8_t *)&read_val, len);
+    if (!rc) {
+        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
+        memset(&read_val, 0xff, len);
+    }
+
+    /* pass directly to libpci for passthrough type register group */
+    if (reg_grp_entry == NULL) {
+        goto out;
+    }
+
+    /* adjust the read and write value to appropriate CFC-CFF window */
+    read_val <<= (address & 3) << 3;
+    val <<= (address & 3) << 3;
+    emul_len = len;
+
+    /* loop Guest request size */
+    while (emul_len > 0) {
+        /* find register entry to be emulated */
+        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
+        if (reg_entry) {
+            reg = reg_entry->reg;
+            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
+            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
+            uint8_t *ptr_val = NULL;
+
+            valid_mask <<= (find_addr - real_offset) << 3;
+            ptr_val = (uint8_t *)&val + (real_offset & 3);
+
+            /* do emulation depend on register size */
+            switch (reg->size) {
+            case 1:
+                if (reg->u.b.write) {
+                    rc = reg->u.b.write(s, reg_entry, ptr_val,
+                                        read_val >> ((real_offset & 3) << 3),
+                                        valid_mask);
+                }
+                break;
+            case 2:
+                if (reg->u.w.write) {
+                    rc = reg->u.w.write(s, reg_entry, (uint16_t *)ptr_val,
+                                        (read_val >> ((real_offset & 3) << 3)),
+                                        valid_mask);
+                }
+                break;
+            case 4:
+                if (reg->u.dw.write) {
+                    rc = reg->u.dw.write(s, reg_entry, (uint32_t *)ptr_val,
+                                         (read_val >> ((real_offset & 3) << 3)),
+                                         valid_mask);
+                }
+                break;
+            }
+
+            if (rc < 0) {
+                hw_error("Internal error: Invalid write emulation "
+                         "return value[%d]. I/O emulator exit.\n", rc);
+            }
+
+            /* calculate next address to find */
+            emul_len -= reg->size;
+            if (emul_len > 0) {
+                find_addr = real_offset + reg->size;
+            }
+        } else {
+            /* nothing to do with passthrough type register,
+             * continue to find next byte */
+            emul_len--;
+            find_addr++;
+        }
+    }
+
+    /* need to shift back before passing them to libpci */
+    val >>= (address & 3) << 3;
+
+out:
+    if (!(reg && reg->no_wb)) {
+        /* unknown regs are passed through */
+        rc = host_pci_set_block(s->real_device, address, (uint8_t *)&val, len);
+
+        if (!rc) {
+            PT_LOG("Error: pci_write_block failed. return value[%d].\n", rc);
+        }
+    }
+
+    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
+        qemu_mod_timer(s->pm_state->pm_timer,
+                       qemu_get_clock_ms(rt_clock) + s->pm_state->pm_delay);
+    }
+}
+
+/* ioport/iomem space*/
+static void pt_iomem_map(XenPCIPassthroughState *s, int i,
+                         pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    uint32_t old_ebase = s->bases[i].e_physbase;
+    bool first_map = s->bases[i].e_size == 0;
+    int ret = 0;
+
+    s->bases[i].e_physbase = e_phys;
+    s->bases[i].e_size = e_size;
+
+    PT_LOG("e_phys=%#"PRIx64" maddr=%#"PRIx64" type=%%d"
+           " len=%#"PRIx64" index=%d first_map=%d\n",
+           e_phys, s->bases[i].access.maddr, /*type,*/
+           e_size, i, first_map);
+
+    if (e_size == 0) {
+        return;
+    }
+
+    if (!first_map && old_ebase != -1) {
+        /* Remove old mapping */
+        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
+                               old_ebase >> XC_PAGE_SHIFT,
+                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
+                               DPCI_REMOVE_MAPPING);
+        if (ret != 0) {
+            PT_LOG("Error: remove old mapping failed!\n");
+            return;
+        }
+    }
+
+    /* map only valid guest address */
+    if (e_phys != -1) {
+        /* Create new mapping */
+        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
+                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
+                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
+                                   DPCI_ADD_MAPPING);
+
+        if (ret != 0) {
+            PT_LOG("Error: create new mapping failed!\n");
+        }
+    }
+}
+
+static void pt_ioport_map(XenPCIPassthroughState *s, int i,
+                          pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    uint32_t old_ebase = s->bases[i].e_physbase;
+    bool first_map = s->bases[i].e_size == 0;
+    int ret = 0;
+
+    s->bases[i].e_physbase = e_phys;
+    s->bases[i].e_size = e_size;
+
+    PT_LOG("e_phys=%#04"PRIx64" pio_base=%#04"PRIx64" len=%"PRId64" index=%d"
+           " first_map=%d\n",
+           e_phys, s->bases[i].access.pio_base, e_size, i, first_map);
+
+    if (e_size == 0) {
+        return;
+    }
+
+    if (!first_map && old_ebase != -1) {
+        /* Remove old mapping */
+        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, old_ebase,
+                                       s->bases[i].access.pio_base, e_size,
+                                       DPCI_REMOVE_MAPPING);
+        if (ret != 0) {
+            PT_LOG("Error: remove old mapping failed!\n");
+            return;
+        }
+    }
+
+    /* map only valid guest address (include 0) */
+    if (e_phys != -1) {
+        /* Create new mapping */
+        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, e_phys,
+                                       s->bases[i].access.pio_base, e_size,
+                                       DPCI_ADD_MAPPING);
+        if (ret != 0) {
+            PT_LOG("Error: create new mapping failed!\n");
+        }
+    }
+
+}
+
+
+/* mapping BAR */
+
+void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
+                        int io_enable, int mem_enable)
+{
+    PCIDevice *dev = &s->dev;
+    PCIIORegion *r;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegion *base = NULL;
+    pcibus_t r_size = 0, r_addr = -1;
+    int rc = 0;
+
+    r = &dev->io_regions[bar];
+
+    /* check valid region */
+    if (!r->size) {
+        return;
+    }
+
+    base = &s->bases[bar];
+    /* skip unused BAR or upper 64bit BAR */
+    if ((base->bar_flag == PT_BAR_FLAG_UNUSED)
+        || (base->bar_flag == PT_BAR_FLAG_UPPER)) {
+           return;
+    }
+
+    /* copy region address to temporary */
+    r_addr = r->addr;
+
+    /* need unmapping in case I/O Space or Memory Space disable */
+    if (((base->bar_flag == PT_BAR_FLAG_IO) && !io_enable) ||
+        ((base->bar_flag == PT_BAR_FLAG_MEM) && !mem_enable)) {
+        r_addr = -1;
+    }
+    if ((bar == PCI_ROM_SLOT) && (r_addr != -1)) {
+        reg_grp_entry = pt_find_reg_grp(s, PCI_ROM_ADDRESS);
+        if (reg_grp_entry) {
+            reg_entry = pt_find_reg(reg_grp_entry, PCI_ROM_ADDRESS);
+            if (reg_entry && !(reg_entry->data & PCI_ROM_ADDRESS_ENABLE)) {
+                r_addr = -1;
+            }
+        }
+    }
+
+    /* prevent guest software mapping memory resource to 00000000h */
+    if ((base->bar_flag == PT_BAR_FLAG_MEM) && (r_addr == 0)) {
+        r_addr = -1;
+    }
+
+    r_size = pt_get_emul_size(base->bar_flag, r->size);
+
+    rc = pci_check_bar_overlap(dev, r_addr, r_size, r->type);
+    if (rc > 0) {
+        PT_LOG("Warning: s[%02x:%02x.%x][Region:%d][Address:%"FMT_PCIBUS"h]"
+               "[Size:%"FMT_PCIBUS"h] is overlapped.\n", pci_bus_num(dev->bus),
+               PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), bar,
+               r_addr, r_size);
+    }
+
+    /* check whether we need to update the mapping or not */
+    if (r_addr != s->bases[bar].e_physbase) {
+        /* mapping BAR */
+        if (base->bar_flag == PT_BAR_FLAG_IO) {
+            pt_ioport_map(s, bar, r_addr, r_size, r->type);
+        } else {
+            pt_iomem_map(s, bar, r_addr, r_size, r->type);
+        }
+    }
+}
+
+void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable)
+{
+    int i;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        pt_bar_mapping_one(s, i, io_enable, mem_enable);
+    }
+}
+
+/* register regions */
+static int pt_register_regions(XenPCIPassthroughState *s)
+{
+    int i = 0;
+    uint32_t bar_data = 0;
+    HostPCIDevice *d = s->real_device;
+
+    /* Register PIO/MMIO BARs */
+    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
+        HostPCIIORegion *r = &d->io_regions[i];
+
+        if (r->base_addr) {
+            s->bases[i].e_physbase = r->base_addr;
+            s->bases[i].access.u = r->base_addr;
+
+            /* Register current region */
+            if (r->flags & IORESOURCE_IO) {
+                memory_region_init_io(&s->bar[i], NULL, NULL,
+                                      "xen-pci-pt-bar", r->size);
+                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_IO,
+                                 &s->bar[i]);
+            } else if (r->flags & IORESOURCE_PREFETCH) {
+                memory_region_init_io(&s->bar[i], NULL, NULL,
+                                      "xen-pci-pt-bar", r->size);
+                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_MEM_PREFETCH,
+                                 &s->bar[i]);
+            } else {
+                memory_region_init_io(&s->bar[i], NULL, NULL,
+                                      "xen-pci-pt-bar", r->size);
+                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_MEMORY,
+                                 &s->bar[i]);
+            }
+
+            PT_LOG("IO region registered (size=0x%08"PRIx64
+                   " base_addr=0x%08"PRIx64")\n",
+                   r->size, r->base_addr);
+        }
+    }
+
+    /* Register expansion ROM address */
+    if (d->rom.base_addr && d->rom.size) {
+        /* Re-set BAR reported by OS, otherwise ROM can't be read. */
+        bar_data = host_pci_get_long(d, PCI_ROM_ADDRESS);
+        if ((bar_data & PCI_ROM_ADDRESS_MASK) == 0) {
+            bar_data |= d->rom.base_addr & PCI_ROM_ADDRESS_MASK;
+            host_pci_set_long(d, PCI_ROM_ADDRESS, bar_data);
+        }
+
+        s->bases[PCI_ROM_SLOT].e_physbase = d->rom.base_addr;
+        s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
+
+        memory_region_init_rom_device(&s->rom, NULL, NULL, &s->dev.qdev,
+                                      "xen-pci-pt-rom", d->rom.size);
+        pci_register_bar(&s->dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
+                         &s->rom);
+
+        PT_LOG("Expansion ROM registered (size=0x%08"PRIx64
+               " base_addr=0x%08"PRIx64")\n",
+               d->rom.size, d->rom.base_addr);
+    }
+
+    return 0;
+}
+
+static void pt_unregister_regions(XenPCIPassthroughState *s)
+{
+    int i, type, rc;
+    uint32_t e_size;
+    PCIDevice *d = &s->dev;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        e_size = s->bases[i].e_size;
+        if ((e_size == 0) || (s->bases[i].e_physbase == -1)) {
+            continue;
+        }
+
+        type = d->io_regions[i].type;
+
+        if (type == PCI_BASE_ADDRESS_SPACE_MEMORY
+            || type == PCI_BASE_ADDRESS_MEM_PREFETCH) {
+            rc = xc_domain_memory_mapping(xen_xc, xen_domid,
+                    s->bases[i].e_physbase >> XC_PAGE_SHIFT,
+                    s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                    (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
+                    DPCI_REMOVE_MAPPING);
+            if (rc != 0) {
+                PT_LOG("Error: remove old mem mapping failed!\n");
+                continue;
+            }
+
+        } else if (type == PCI_BASE_ADDRESS_SPACE_IO) {
+            rc = xc_domain_ioport_mapping(xen_xc, xen_domid,
+                        s->bases[i].e_physbase,
+                        s->bases[i].access.pio_base,
+                        e_size,
+                        DPCI_REMOVE_MAPPING);
+            if (rc != 0) {
+                PT_LOG("Error: remove old io mapping failed!\n");
+                continue;
+            }
+        }
+    }
+}
+
+static int pt_initfn(PCIDevice *pcidev)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
+    int dom, bus;
+    unsigned slot, func;
+    int rc = 0;
+    uint32_t machine_irq;
+    int pirq = -1;
+
+    if (pci_parse_devaddr(s->hostaddr, &dom, &bus, &slot, &func) < 0) {
+        fprintf(stderr, "error parse bdf: %s\n", s->hostaddr);
+        return -1;
+    }
+
+    /* register real device */
+    PT_LOG("Assigning real physical device %02x:%02x.%x to devfn %i ...\n",
+           bus, slot, func, s->dev.devfn);
+
+    s->real_device = host_pci_device_get(bus, slot, func);
+    if (!s->real_device) {
+        return -1;
+    }
+
+    s->is_virtfn = s->real_device->is_virtfn;
+    if (s->is_virtfn) {
+        PT_LOG("%04x:%02x:%02x.%x is a SR-IOV Virtual Function\n",
+               s->real_device->domain, bus, slot, func);
+    }
+
+    /* Initialize virtualized PCI configuration (Extended 256 Bytes) */
+    if (host_pci_get_block(s->real_device, 0, pcidev->config,
+                           PCI_CONFIG_SPACE_SIZE) == -1) {
+        return -1;
+    }
+
+    /* Handle real device's MMIO/PIO BARs */
+    pt_register_regions(s);
+
+    /* reinitialize each config register to be emulated */
+    pt_config_init(s);
+
+    /* Bind interrupt */
+    if (!s->dev.config[PCI_INTERRUPT_PIN]) {
+        PT_LOG("no pin interrupt\n");
+        goto out;
+    }
+
+    machine_irq = host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE);
+    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
+
+    if (rc) {
+        PT_LOG("Error: Mapping irq failed, rc = %d\n", rc);
+
+        /* Disable PCI intx assertion (turn on bit10 of devctl) */
+        host_pci_set_word(s->real_device,
+                          PCI_COMMAND,
+                          pci_get_word(s->dev.config + PCI_COMMAND)
+                          | PCI_COMMAND_INTX_DISABLE);
+        machine_irq = 0;
+        s->machine_irq = 0;
+    } else {
+        machine_irq = pirq;
+        s->machine_irq = pirq;
+        mapped_machine_irq[machine_irq]++;
+    }
+
+    /* bind machine_irq to device */
+    if (rc < 0 && machine_irq != 0) {
+        uint8_t e_device = PCI_SLOT(s->dev.devfn);
+        uint8_t e_intx = pci_intx(s);
+
+        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq, 0,
+                                       e_device, e_intx);
+        if (rc < 0) {
+            PT_LOG("Error: Binding of interrupt failed! rc=%d\n", rc);
+
+            /* Disable PCI intx assertion (turn on bit10 of devctl) */
+            host_pci_set_word(s->real_device, PCI_COMMAND,
+                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
+                              | PCI_COMMAND_INTX_DISABLE);
+            mapped_machine_irq[machine_irq]--;
+
+            if (mapped_machine_irq[machine_irq] == 0) {
+                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
+                    PT_LOG("Error: Unmapping of interrupt failed! rc=%d\n",
+                           rc);
+                }
+            }
+            s->machine_irq = 0;
+        }
+    }
+
+out:
+    PT_LOG("Real physical device %02x:%02x.%x registered successfuly!\n"
+           "IRQ type = %s\n", bus, slot, func, "INTx");
+
+    return 0;
+}
+
+static int pt_unregister_device(PCIDevice *pcidev)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
+    uint8_t e_device, e_intx;
+    uint32_t machine_irq;
+    int rc;
+
+    /* Unbind interrupt */
+    e_device = PCI_SLOT(s->dev.devfn);
+    e_intx = pci_intx(s);
+    machine_irq = s->machine_irq;
+
+    if (machine_irq) {
+        rc = xc_domain_unbind_pt_irq(xen_xc, xen_domid, machine_irq,
+                                     PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 0);
+        if (rc < 0) {
+            PT_LOG("Error: Unbinding of interrupt failed! rc=%d\n", rc);
+        }
+    }
+
+    if (machine_irq) {
+        mapped_machine_irq[machine_irq]--;
+
+        if (mapped_machine_irq[machine_irq] == 0) {
+            rc = xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq);
+
+            if (rc < 0) {
+                PT_LOG("Error: Unmaping of interrupt failed! rc=%d\n", rc);
+            }
+        }
+    }
+
+    /* delete all emulated config registers */
+    pt_config_delete(s);
+
+    /* unregister real device's MMIO/PIO BARs */
+    pt_unregister_regions(s);
+
+    host_pci_device_put(s->real_device);
+
+    return 0;
+}
+
+static PCIDeviceInfo xen_pci_passthrough = {
+    .init = pt_initfn,
+    .exit = pt_unregister_device,
+    .qdev.name = "xen-pci-passthrough",
+    .qdev.desc = "Assign an host pci device with Xen",
+    .qdev.size = sizeof(XenPCIPassthroughState),
+    .config_read = pt_pci_read_config,
+    .config_write = pt_pci_write_config,
+    .is_express = 0,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_STRING("hostaddr", XenPCIPassthroughState, hostaddr),
+        DEFINE_PROP_BIT("power-mgmt", XenPCIPassthroughState, power_mgmt,
+                        0, false),
+        DEFINE_PROP_END_OF_LIST(),
+    }
+};
+
+static void xen_passthrough_register(void)
+{
+    pci_qdev_register(&xen_pci_passthrough);
+}
+
+device_init(xen_passthrough_register);
diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
new file mode 100644
index 0000000..2d1979d
--- /dev/null
+++ b/hw/xen_pci_passthrough.h
@@ -0,0 +1,223 @@ 
+#ifndef QEMU_HW_XEN_PCI_PASSTHROUGH_H
+#  define QEMU_HW_XEN_PCI_PASSTHROUGH_H
+
+#include "qemu-common.h"
+#include "xen_common.h"
+#include "pci.h"
+#include "host-pci-device.h"
+
+#define PT_LOGGING_ENABLED
+#define PT_DEBUG_PCI_CONFIG_ACCESS
+
+#ifdef PT_LOGGING_ENABLED
+#  define PT_LOG(_f, _a...)   fprintf(stderr, "%s: " _f, __func__, ##_a)
+#else
+#  define PT_LOG(_f, _a...)
+#endif
+
+#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
+#  define PT_LOG_CONFIG(_f, _a...) PT_LOG(_f, ##_a)
+#else
+#  define PT_LOG_CONFIG(_f, _a...)
+#endif
+
+
+typedef struct XenPTRegInfo XenPTRegInfo;
+typedef struct XenPTReg XenPTReg;
+
+typedef struct XenPCIPassthroughState XenPCIPassthroughState;
+
+/* function type for config reg */
+typedef uint32_t (*conf_reg_init)
+    (XenPCIPassthroughState *, XenPTRegInfo *, uint32_t real_offset);
+typedef int (*conf_dword_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint32_t *val, uint32_t dev_value, uint32_t valid_mask);
+typedef int (*conf_word_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint16_t *val, uint16_t dev_value, uint16_t valid_mask);
+typedef int (*conf_byte_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint8_t *val, uint8_t dev_value, uint8_t valid_mask);
+typedef int (*conf_dword_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint32_t *val, uint32_t valid_mask);
+typedef int (*conf_word_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint16_t *val, uint16_t valid_mask);
+typedef int (*conf_byte_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint8_t *val, uint8_t valid_mask);
+typedef int (*conf_dword_restore)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
+     uint32_t dev_value, uint32_t *val);
+typedef int (*conf_word_restore)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
+     uint16_t dev_value, uint16_t *val);
+typedef int (*conf_byte_restore)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
+     uint8_t dev_value, uint8_t *val);
+
+/* power state transition */
+#define PT_FLAG_TRANSITING 0x0001
+
+
+typedef enum {
+    GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
+    GRP_TYPE_EMU,                               /* emul reg group */
+} RegisterGroupType;
+
+typedef enum {
+    PT_BAR_FLAG_MEM = 0,                        /* Memory type BAR */
+    PT_BAR_FLAG_IO,                             /* I/O type BAR */
+    PT_BAR_FLAG_UPPER,                          /* upper 64bit BAR */
+    PT_BAR_FLAG_UNUSED,                         /* unused BAR */
+} PTBarFlag;
+
+
+typedef struct XenPTRegion {
+    /* Virtual phys base & size */
+    uint32_t e_physbase;
+    uint32_t e_size;
+    /* Index of region in qemu */
+    uint32_t memory_index;
+    /* BAR flag */
+    PTBarFlag bar_flag;
+    /* Translation of the emulated address */
+    union {
+        uint64_t maddr;
+        uint64_t pio_base;
+        uint64_t u;
+    } access;
+} XenPTRegion;
+
+/* XenPTRegInfo declaration
+ * - only for emulated register (either a part or whole bit).
+ * - for passthrough register that need special behavior (like interacting with
+ *   other component), set emu_mask to all 0 and specify r/w func properly.
+ * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
+ */
+
+/* emulated register infomation */
+struct XenPTRegInfo {
+    uint32_t offset;
+    uint32_t size;
+    uint32_t init_val;
+    /* reg read only field mask (ON:RO/ROS, OFF:other) */
+    uint32_t ro_mask;
+    /* reg emulate field mask (ON:emu, OFF:passthrough) */
+    uint32_t emu_mask;
+    /* no write back allowed */
+    uint32_t no_wb;
+    conf_reg_init init;
+    /* read/write/restore function pointer
+     * for double_word/word/byte size */
+    union {
+        struct {
+            conf_dword_write write;
+            conf_dword_read read;
+            conf_dword_restore restore;
+        } dw;
+        struct {
+            conf_word_write write;
+            conf_word_read read;
+            conf_word_restore restore;
+        } w;
+        struct {
+            conf_byte_write write;
+            conf_byte_read read;
+            conf_byte_restore restore;
+        } b;
+    } u;
+};
+
+/* emulated register management */
+struct XenPTReg {
+    QLIST_ENTRY(XenPTReg) entries;
+    XenPTRegInfo *reg;
+    uint32_t data;
+};
+
+typedef struct XenPTRegGroupInfo XenPTRegGroupInfo;
+
+/* emul reg group size initialize method */
+typedef uint8_t (*pt_reg_size_init_fn)
+    (XenPCIPassthroughState *, const XenPTRegGroupInfo *,
+     uint32_t base_offset);
+
+/* emulated register group infomation */
+struct XenPTRegGroupInfo {
+    uint8_t grp_id;
+    RegisterGroupType grp_type;
+    uint8_t grp_size;
+    pt_reg_size_init_fn size_init;
+    XenPTRegInfo *emu_reg_tbl;
+};
+
+/* emul register group management table */
+typedef struct XenPTRegGroup {
+    QLIST_ENTRY(XenPTRegGroup) entries;
+    const XenPTRegGroupInfo *reg_grp;
+    uint32_t base_offset;
+    uint8_t size;
+    QLIST_HEAD(, XenPTReg) reg_tbl_list;
+} XenPTRegGroup;
+
+
+typedef struct XenPTPM {
+    QEMUTimer *pm_timer;  /* QEMUTimer struct */
+    int no_soft_reset;    /* No Soft Reset flags */
+    uint16_t flags;       /* power state transition flags */
+    uint16_t pmc_field;   /* Power Management Capabilities field */
+    int pm_delay;         /* power state transition delay */
+    uint16_t cur_state;   /* current power state */
+    uint16_t req_state;   /* requested power state */
+    uint32_t pm_base;     /* Power Management Capability reg base offset */
+    uint32_t aer_base;    /* AER Capability reg base offset */
+} XenPTPM;
+
+struct XenPCIPassthroughState {
+    PCIDevice dev;
+
+    char *hostaddr;
+    bool is_virtfn;
+    HostPCIDevice *real_device;
+    XenPTRegion bases[PCI_NUM_REGIONS]; /* Access regions */
+    QLIST_HEAD(, XenPTRegGroup) reg_grp_tbl;
+
+    uint32_t machine_irq;
+
+    uint32_t power_mgmt;
+    XenPTPM *pm_state;
+
+    MemoryRegion bar[PCI_NUM_REGIONS - 1];
+    MemoryRegion rom;
+};
+
+void pt_config_init(XenPCIPassthroughState *s);
+void pt_config_delete(XenPCIPassthroughState *s);
+void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable);
+void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
+                        int io_enable, int mem_enable);
+XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address);
+XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address);
+int pt_bar_offset_to_index(uint32_t offset);
+
+static inline pcibus_t pt_get_emul_size(PTBarFlag flag, pcibus_t r_size)
+{
+    /* align resource size (memory type only) */
+    if (flag == PT_BAR_FLAG_MEM) {
+        return (r_size + XC_PAGE_SIZE - 1) & XC_PAGE_MASK;
+    } else {
+        return r_size;
+    }
+}
+
+/* INTx */
+static inline uint8_t pci_read_intx(XenPCIPassthroughState *s)
+{
+    return host_pci_get_byte(s->real_device, PCI_INTERRUPT_PIN);
+}
+uint8_t pci_intx(XenPCIPassthroughState *ptdev);
+
+#endif /* !QEMU_HW_XEN_PCI_PASSTHROUGH_H */
diff --git a/hw/xen_pci_passthrough_helpers.c b/hw/xen_pci_passthrough_helpers.c
new file mode 100644
index 0000000..192e918
--- /dev/null
+++ b/hw/xen_pci_passthrough_helpers.c
@@ -0,0 +1,46 @@ 
+#include "xen_pci_passthrough.h"
+
+/* The PCI Local Bus Specification, Rev. 3.0, {
+ * Section 6.2.4 Miscellaneous Registers, pp 223
+ * outlines 5 valid values for the intertupt pin (intx).
+ *  0: For devices (or device functions) that don't use an interrupt in
+ *  1: INTA#
+ *  2: INTB#
+ *  3: INTC#
+ *  4: INTD#
+ *
+ * Xen uses the following 4 values for intx
+ *  0: INTA#
+ *  1: INTB#
+ *  2: INTC#
+ *  3: INTD#
+ *
+ * Observing that these list of values are not the same, pci_read_intx()
+ * uses the following mapping from hw to xen values.
+ * This seems to reflect the current usage within Xen.
+ *
+ * PCI hardware    | Xen | Notes
+ * ----------------+-----+----------------------------------------------------
+ * 0               | 0   | No interrupt
+ * 1               | 0   | INTA#
+ * 2               | 1   | INTB#
+ * 3               | 2   | INTC#
+ * 4               | 3   | INTD#
+ * any other value | 0   | This should never happen, log error message
+}
+ */
+uint8_t pci_intx(XenPCIPassthroughState *ptdev)
+{
+    uint8_t r_val = pci_read_intx(ptdev);
+
+    PT_LOG("intx=%i\n", r_val);
+    if (r_val < 1 || r_val > 4) {
+        PT_LOG("Interrupt pin read from hardware is out of range: "
+               "value=%i, acceptable range is 1 - 4\n", r_val);
+        r_val = 0;
+    } else {
+        r_val -= 1;
+    }
+
+    return r_val;
+}