diff mbox

[4/4] kvm: i386: Add classic PCI device assignment

Message ID 825e653c9cfe9d8e26185917cbe1f1dd7ae299e2.1346048917.git.jan.kiszka@web.de
State New
Headers show

Commit Message

Jan Kiszka Aug. 27, 2012, 6:28 a.m. UTC
From: Jan Kiszka <jan.kiszka@siemens.com>

This adds PCI device assignment for i386 targets using the classic KVM
interfaces. This version is 100% identical to what is being maintained
in qemu-kvm for several years and is supported by libvirt as well. It is
expected to remain relevant for another couple of years until kernels
without full-features and performance-wise equivalent VFIO support are
obsolete.

A refactoring to-do that should be done in-tree is to model MSI and
MSI-X support via the generic PCI layer, similar to what VFIO is already
doing for MSI-X. This should improve the correctness and clean up the
code from duplicate logic.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 hw/kvm/Makefile.objs |    2 +-
 hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1930 insertions(+), 1 deletions(-)
 create mode 100644 hw/kvm/pci-assign.c

Comments

Andreas Färber Aug. 27, 2012, 12:07 p.m. UTC | #1
Hi,

Am 27.08.2012 08:28, schrieb Jan Kiszka:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> This adds PCI device assignment for i386 targets using the classic KVM
> interfaces. This version is 100% identical to what is being maintained
> in qemu-kvm for several years and is supported by libvirt as well. It is
> expected to remain relevant for another couple of years until kernels
> without full-features and performance-wise equivalent VFIO support are
> obsolete.
> 
> A refactoring to-do that should be done in-tree is to model MSI and
> MSI-X support via the generic PCI layer, similar to what VFIO is already
> doing for MSI-X. This should improve the correctness and clean up the
> code from duplicate logic.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  hw/kvm/Makefile.objs |    2 +-
>  hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 1930 insertions(+), 1 deletions(-)
>  create mode 100644 hw/kvm/pci-assign.c
[...]
> diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
> new file mode 100644
> index 0000000..9cce02c
> --- /dev/null
> +++ b/hw/kvm/pci-assign.c
> @@ -0,0 +1,1929 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.

The downside of accepting this into qemu.git is that it gets us a huge
blob of GPLv2-only code without history of contributors for GPLv2+
relicensing...

> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.

(Expect the usual GNU address reminder here.)

> + *
> + *
> + *  Assign a PCI device from the host to a guest VM.
> + *
> + *  Adapted for KVM by Qumranet.
> + *
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> + */
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <sys/io.h>
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include "hw/hw.h"
> +#include "hw/pc.h"
> +#include "qemu-error.h"
> +#include "console.h"
> +#include "hw/loader.h"
> +#include "monitor.h"
> +#include "range.h"
> +#include "sysemu.h"
> +#include "hw/pci.h"
> +#include "hw/msi.h"

> +#include "kvm_i386.h"

Am I correct to understand we compile this only for i386 / x86_64?
(apic.o in kvm/Makefile.objs hints in that direction) You may want to
update the description in the comment above accordingly, also mentioning
that this is some deprecated backwards-compatibility thing.

Regards,
Andreas
Jan Kiszka Aug. 27, 2012, 12:15 p.m. UTC | #2
On 2012-08-27 14:07, Andreas Färber wrote:
> Hi,
> 
> Am 27.08.2012 08:28, schrieb Jan Kiszka:
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> This adds PCI device assignment for i386 targets using the classic KVM
>> interfaces. This version is 100% identical to what is being maintained
>> in qemu-kvm for several years and is supported by libvirt as well. It is
>> expected to remain relevant for another couple of years until kernels
>> without full-features and performance-wise equivalent VFIO support are
>> obsolete.
>>
>> A refactoring to-do that should be done in-tree is to model MSI and
>> MSI-X support via the generic PCI layer, similar to what VFIO is already
>> doing for MSI-X. This should improve the correctness and clean up the
>> code from duplicate logic.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>  hw/kvm/Makefile.objs |    2 +-
>>  hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 1930 insertions(+), 1 deletions(-)
>>  create mode 100644 hw/kvm/pci-assign.c
> [...]
>> diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
>> new file mode 100644
>> index 0000000..9cce02c
>> --- /dev/null
>> +++ b/hw/kvm/pci-assign.c
>> @@ -0,0 +1,1929 @@
>> +/*
>> + * Copyright (c) 2007, Neocleus Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms and conditions of the GNU General Public License,
>> + * version 2, as published by the Free Software Foundation.
> 
> The downside of accepting this into qemu.git is that it gets us a huge
> blob of GPLv2-only code without history of contributors for GPLv2+
> relicensing...

The history is documented in qemu-kvm. I personally don't see it will
pay off going through this, but someone else may, and nothing will
prevent trying this at least. I can leave a comment.

BTW, VFIO will be GPLv2 only as well. If I understood Alex correctly, it
is too much derived from this code. IOW: There is probably no PCI
assignment without this restriction in the foreseeable future.

> 
>> + *
>> + * This program is distributed in the hope it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
>> + * more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along with
>> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
>> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> 
> (Expect the usual GNU address reminder here.)

Will fix.

> 
>> + *
>> + *
>> + *  Assign a PCI device from the host to a guest VM.
>> + *
>> + *  Adapted for KVM by Qumranet.
>> + *
>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
>> + */
>> +#include <stdio.h>
>> +#include <unistd.h>
>> +#include <sys/io.h>
>> +#include <sys/mman.h>
>> +#include <sys/types.h>
>> +#include <sys/stat.h>
>> +#include "hw/hw.h"
>> +#include "hw/pc.h"
>> +#include "qemu-error.h"
>> +#include "console.h"
>> +#include "hw/loader.h"
>> +#include "monitor.h"
>> +#include "range.h"
>> +#include "sysemu.h"
>> +#include "hw/pci.h"
>> +#include "hw/msi.h"
> 
>> +#include "kvm_i386.h"
> 
> Am I correct to understand we compile this only for i386 / x86_64?

This is correct.

> (apic.o in kvm/Makefile.objs hints in that direction) You may want to
> update the description in the comment above accordingly, also mentioning
> that this is some deprecated backwards-compatibility thing.

You mean in the header of pci-assign.c? Can do.

Jan
Blue Swirl Aug. 27, 2012, 6:56 p.m. UTC | #3
On Mon, Aug 27, 2012 at 6:28 AM, Jan Kiszka <jan.kiszka@web.de> wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
>
> This adds PCI device assignment for i386 targets using the classic KVM
> interfaces. This version is 100% identical to what is being maintained
> in qemu-kvm for several years and is supported by libvirt as well. It is
> expected to remain relevant for another couple of years until kernels
> without full-features and performance-wise equivalent VFIO support are
> obsolete.
>
> A refactoring to-do that should be done in-tree is to model MSI and
> MSI-X support via the generic PCI layer, similar to what VFIO is already
> doing for MSI-X. This should improve the correctness and clean up the
> code from duplicate logic.
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  hw/kvm/Makefile.objs |    2 +-
>  hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 1930 insertions(+), 1 deletions(-)
>  create mode 100644 hw/kvm/pci-assign.c
>
> diff --git a/hw/kvm/Makefile.objs b/hw/kvm/Makefile.objs
> index 226497a..f620d7f 100644
> --- a/hw/kvm/Makefile.objs
> +++ b/hw/kvm/Makefile.objs
> @@ -1 +1 @@
> -obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o
> +obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o
> diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
> new file mode 100644
> index 0000000..9cce02c
> --- /dev/null
> +++ b/hw/kvm/pci-assign.c
> @@ -0,0 +1,1929 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + *
> + *  Assign a PCI device from the host to a guest VM.
> + *
> + *  Adapted for KVM by Qumranet.
> + *
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> + */
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <sys/io.h>
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include "hw/hw.h"
> +#include "hw/pc.h"
> +#include "qemu-error.h"
> +#include "console.h"
> +#include "hw/loader.h"
> +#include "monitor.h"
> +#include "range.h"
> +#include "sysemu.h"
> +#include "hw/pci.h"
> +#include "hw/msi.h"
> +#include "kvm_i386.h"
> +
> +#define MSIX_PAGE_SIZE 0x1000
> +
> +/* From linux/ioport.h */
> +#define IORESOURCE_IO       0x00000100  /* Resource type */
> +#define IORESOURCE_MEM      0x00000200
> +#define IORESOURCE_IRQ      0x00000400
> +#define IORESOURCE_DMA      0x00000800
> +#define IORESOURCE_PREFETCH 0x00002000  /* No side effects */
> +
> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
> +
> +#ifdef DEVICE_ASSIGNMENT_DEBUG
> +#define DEBUG(fmt, ...)                                       \
> +    do {                                                      \
> +        fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);  \
> +    } while (0)
> +#else
> +#define DEBUG(fmt, ...)
> +#endif
> +
> +typedef struct {
> +    int type;           /* Memory or port I/O */
> +    int valid;
> +    uint32_t base_addr;
> +    uint32_t size;    /* size of the region */
> +    int resource_fd;
> +} PCIRegion;

Please don't use anonymous structs.

> +
> +typedef struct {
> +    uint8_t bus, dev, func; /* Bus inside domain, device and function */
> +    int irq;                /* IRQ number */
> +    uint16_t region_number; /* number of active regions */
> +
> +    /* Port I/O or MMIO Regions */
> +    PCIRegion regions[PCI_NUM_REGIONS - 1];
> +    int config_fd;
> +} PCIDevRegions;
> +
> +typedef struct {
> +    MemoryRegion container;
> +    MemoryRegion real_iomem;
> +    union {
> +        void *r_virtbase;    /* mmapped access address for memory regions */
> +        uint32_t r_baseport; /* the base guest port for I/O regions */
> +    } u;
> +    pcibus_t e_size;    /* emulated size of region in bytes */
> +    pcibus_t r_size;    /* real size of region in bytes */
> +    PCIRegion *region;
> +} AssignedDevRegion;
> +
> +#define ASSIGNED_DEVICE_PREFER_MSI_BIT  0
> +#define ASSIGNED_DEVICE_SHARE_INTX_BIT  1
> +
> +#define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
> +#define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT)
> +
> +typedef struct {
> +    uint32_t addr_lo;
> +    uint32_t addr_hi;
> +    uint32_t data;
> +    uint32_t ctrl;
> +} MSIXTableEntry;
> +
> +typedef enum AssignedIRQType {
> +    ASSIGNED_IRQ_NONE = 0,
> +    ASSIGNED_IRQ_INTX_HOST_INTX,
> +    ASSIGNED_IRQ_INTX_HOST_MSI,
> +    ASSIGNED_IRQ_MSI,
> +    ASSIGNED_IRQ_MSIX
> +} AssignedIRQType;
> +
> +typedef struct AssignedDevice {
> +    PCIDevice dev;
> +    PCIHostDeviceAddress host;
> +    uint32_t dev_id;
> +    uint32_t features;
> +    int intpin;
> +    AssignedDevRegion v_addrs[PCI_NUM_REGIONS - 1];
> +    PCIDevRegions real_device;
> +    PCIINTxRoute intx_route;
> +    AssignedIRQType assigned_irq_type;
> +    struct {
> +#define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
> +#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
> +        uint32_t available;
> +#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
> +#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
> +#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
> +        uint32_t state;
> +    } cap;
> +    uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE];
> +    int msi_virq_nr;
> +    int *msi_virq;
> +    MSIXTableEntry *msix_table;
> +    target_phys_addr_t msix_table_addr;
> +    uint16_t msix_max;
> +    MemoryRegion mmio;
> +    char *configfd_name;

const? Not if this would mean more casts.

> +    int32_t bootindex;
> +} AssignedDevice;
> +
> +static void assigned_dev_update_irq_routing(PCIDevice *dev);
> +
> +static void assigned_dev_load_option_rom(AssignedDevice *dev);
> +
> +static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
> +
> +static uint64_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
> +                                       target_phys_addr_t addr, int size,
> +                                       uint64_t *data)
> +{
> +    uint64_t val = 0;
> +    int fd = dev_region->region->resource_fd;
> +
> +    if (fd >= 0) {
> +        if (data) {
> +            DEBUG("pwrite data=%lx, size=%d, e_phys=%lx, addr=%lx\n",

PRIx64 for data, TARGET_FMT_Plx for e_phys and addr.

> +                  *data, size, addr, addr);
> +            if (pwrite(fd, data, size, addr) != size) {
> +                fprintf(stderr, "%s - pwrite failed %s\n",
> +                        __func__, strerror(errno));
> +            }
> +        } else {
> +            if (pread(fd, &val, size, addr) != size) {
> +                fprintf(stderr, "%s - pread failed %s\n",
> +                        __func__, strerror(errno));
> +                val = (1UL << (size * 8)) - 1;
> +            }
> +            DEBUG("pread val=%lx, size=%d, e_phys=%lx, addr=%lx\n",

Also here. Please fix all DEBUG statements.

> +                  val, size, addr, addr);
> +        }
> +    } else {
> +        uint32_t port = addr + dev_region->u.r_baseport;
> +
> +        if (data) {
> +            DEBUG("out data=%lx, size=%d, e_phys=%lx, host=%x\n",
> +                  *data, size, addr, port);
> +            switch (size) {
> +            case 1:
> +                outb(*data, port);
> +                break;
> +            case 2:
> +                outw(*data, port);
> +                break;
> +            case 4:
> +                outl(*data, port);
> +                break;

Maybe add case 8: and default: with abort(), also below.

> +            }
> +        } else {
> +            switch (size) {
> +            case 1:
> +                val = inb(port);
> +                break;
> +            case 2:
> +                val = inw(port);
> +                break;
> +            case 4:
> +                val = inl(port);
> +                break;
> +            }
> +            DEBUG("in data=%lx, size=%d, e_phys=%lx, host=%x\n",
> +                  val, size, addr, port);
> +        }
> +    }
> +    return val;
> +}
> +
> +static void assigned_dev_ioport_write(void *opaque, target_phys_addr_t addr,
> +                                      uint64_t data, unsigned size)
> +{
> +    assigned_dev_ioport_rw(opaque, addr, size, &data);
> +}
> +
> +static uint64_t assigned_dev_ioport_read(void *opaque,
> +                                         target_phys_addr_t addr, unsigned size)
> +{
> +    return assigned_dev_ioport_rw(opaque, addr, size, NULL);
> +}
> +
> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *in = d->u.r_virtbase + addr;

Don't perform arithmetic with void pointers.

> +    uint32_t r;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *in = d->u.r_virtbase + addr;
> +    uint32_t r;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *in = d->u.r_virtbase + addr;
> +    uint32_t r;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *out = d->u.r_virtbase + addr;
> +
> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *out = d->u.r_virtbase + addr;
> +
> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *out = d->u.r_virtbase + addr;
> +
> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> +    *out = val;
> +}
> +
> +static const MemoryRegionOps slow_bar_ops = {
> +    .old_mmio = {
> +        .read = { slow_bar_readb, slow_bar_readw, slow_bar_readl, },
> +        .write = { slow_bar_writeb, slow_bar_writew, slow_bar_writel, },
> +    },
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +};
> +
> +static void assigned_dev_iomem_setup(PCIDevice *pci_dev, int region_num,
> +                                     pcibus_t e_size)
> +{
> +    AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> +
> +    if (e_size > 0) {
> +        memory_region_init(&region->container, "assigned-dev-container",
> +                           e_size);
> +        memory_region_add_subregion(&region->container, 0, &region->real_iomem);
> +
> +        /* deal with MSI-X MMIO page */
> +        if (real_region->base_addr <= r_dev->msix_table_addr &&
> +                real_region->base_addr + real_region->size >
> +                r_dev->msix_table_addr) {
> +            int offset = r_dev->msix_table_addr - real_region->base_addr;
> +
> +            memory_region_add_subregion_overlap(&region->container,
> +                                                offset,
> +                                                &r_dev->mmio,
> +                                                1);
> +        }
> +    }
> +}
> +
> +static const MemoryRegionOps assigned_dev_ioport_ops = {
> +    .read = assigned_dev_ioport_read,
> +    .write = assigned_dev_ioport_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +};
> +
> +static void assigned_dev_ioport_setup(PCIDevice *pci_dev, int region_num,
> +                                      pcibus_t size)
> +{
> +    AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +
> +    region->e_size = size;
> +    memory_region_init(&region->container, "assigned-dev-container", size);
> +    memory_region_init_io(&region->real_iomem, &assigned_dev_ioport_ops,
> +                          r_dev->v_addrs + region_num,
> +                          "assigned-dev-iomem", size);
> +    memory_region_add_subregion(&region->container, 0, &region->real_iomem);
> +}
> +
> +static uint32_t assigned_dev_pci_read(PCIDevice *d, int pos, int len)
> +{
> +    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
> +    uint32_t val;
> +    ssize_t ret;
> +    int fd = pci_dev->real_device.config_fd;
> +
> +again:
> +    ret = pread(fd, &val, len, pos);
> +    if (ret != len) {
> +        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
> +            goto again;
> +        }
> +
> +        hw_error("pci read failed, ret = %zd errno = %d\n", ret, errno);
> +    }
> +
> +    return val;
> +}
> +
> +static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
> +{
> +    return (uint8_t)assigned_dev_pci_read(d, pos, 1);
> +}
> +
> +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
> +{
> +    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
> +    ssize_t ret;
> +    int fd = pci_dev->real_device.config_fd;
> +
> +again:
> +    ret = pwrite(fd, &val, len, pos);
> +    if (ret != len) {
> +        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
> +            goto again;
> +        }
> +
> +        hw_error("pci write failed, ret = %zd errno = %d\n", ret, errno);
> +    }
> +}
> +
> +static void assigned_dev_emulate_config_read(AssignedDevice *dev,
> +                                             uint32_t offset, uint32_t len)
> +{
> +    memset(dev->emulate_config_read + offset, 0xff, len);
> +}
> +
> +static void assigned_dev_direct_config_read(AssignedDevice *dev,
> +                                            uint32_t offset, uint32_t len)
> +{
> +    memset(dev->emulate_config_read + offset, 0, len);
> +}
> +
> +static void assigned_dev_direct_config_write(AssignedDevice *dev,
> +                                             uint32_t offset, uint32_t len)
> +{
> +    memset(dev->emulate_config_write + offset, 0, len);
> +}
> +
> +static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start)
> +{
> +    int id;
> +    int max_cap = 48;
> +    int pos = start ? start : PCI_CAPABILITY_LIST;
> +    int status;
> +
> +    status = assigned_dev_pci_read_byte(d, PCI_STATUS);
> +    if ((status & PCI_STATUS_CAP_LIST) == 0) {
> +        return 0;
> +    }
> +
> +    while (max_cap--) {
> +        pos = assigned_dev_pci_read_byte(d, pos);
> +        if (pos < 0x40) {
> +            break;
> +        }
> +
> +        pos &= ~3;
> +        id = assigned_dev_pci_read_byte(d, pos + PCI_CAP_LIST_ID);
> +
> +        if (id == 0xff) {
> +            break;
> +        }
> +        if (id == cap) {
> +            return pos;
> +        }
> +
> +        pos += PCI_CAP_LIST_NEXT;
> +    }
> +    return 0;
> +}
> +
> +static int assigned_dev_register_regions(PCIRegion *io_regions,
> +                                         unsigned long regions_num,
> +                                         AssignedDevice *pci_dev)
> +{
> +    uint32_t i;
> +    PCIRegion *cur_region = io_regions;
> +
> +    for (i = 0; i < regions_num; i++, cur_region++) {
> +        if (!cur_region->valid) {
> +            continue;
> +        }
> +
> +        /* handle memory io regions */
> +        if (cur_region->type & IORESOURCE_MEM) {
> +            int t = cur_region->type & IORESOURCE_PREFETCH
> +                ? PCI_BASE_ADDRESS_MEM_PREFETCH
> +                : PCI_BASE_ADDRESS_SPACE_MEMORY;
> +
> +            /* map physical memory */
> +            pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size,
> +                                                    PROT_WRITE | PROT_READ,
> +                                                    MAP_SHARED,
> +                                                    cur_region->resource_fd,
> +                                                    (off_t)0);
> +
> +            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
> +                pci_dev->v_addrs[i].u.r_virtbase = NULL;
> +                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
> +                        "\n", __func__,
> +                        (uint32_t) (cur_region->base_addr));

Why truncate with cast?

> +                return -1;
> +            }
> +
> +            pci_dev->v_addrs[i].r_size = cur_region->size;
> +            pci_dev->v_addrs[i].e_size = 0;
> +
> +            /* add offset */
> +            pci_dev->v_addrs[i].u.r_virtbase +=
> +                (cur_region->base_addr & 0xFFF);
> +
> +            if (cur_region->size & 0xFFF) {
> +                fprintf(stderr, "PCI region %d at address 0x%llx "

PRIx64 ...

> +                        "has size 0x%x, which is not a multiple of 4K. "
> +                        "You might experience some performance hit "
> +                        "due to that.\n",
> +                        i, (unsigned long long)cur_region->base_addr,

... and remove cast.

> +                        cur_region->size);
> +                memory_region_init_io(&pci_dev->v_addrs[i].real_iomem,
> +                                      &slow_bar_ops, &pci_dev->v_addrs[i],
> +                                      "assigned-dev-slow-bar",
> +                                      cur_region->size);
> +            } else {
> +                void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
> +                char name[32];
> +                snprintf(name, sizeof(name), "%s.bar%d",
> +                         object_get_typename(OBJECT(pci_dev)), i);
> +                memory_region_init_ram_ptr(&pci_dev->v_addrs[i].real_iomem,
> +                                           name, cur_region->size,
> +                                           virtbase);
> +                vmstate_register_ram(&pci_dev->v_addrs[i].real_iomem,
> +                                     &pci_dev->dev.qdev);
> +            }
> +
> +            assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size);
> +            pci_register_bar((PCIDevice *) pci_dev, i, t,
> +                             &pci_dev->v_addrs[i].container);
> +            continue;
> +        } else {
> +            /* handle port io regions */
> +            uint32_t val;
> +            int ret;
> +
> +            /* Test kernel support for ioport resource read/write.  Old
> +             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
> +             * so should return EINVAL for a 3 byte read */
> +            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
> +            if (ret >= 0) {
> +                fprintf(stderr, "Unexpected return from I/O port read: %d\n",
> +                        ret);
> +                abort();
> +            } else if (errno != EINVAL) {
> +                fprintf(stderr, "Kernel doesn't support ioport resource "
> +                                "access, hiding this region.\n");
> +                close(pci_dev->v_addrs[i].region->resource_fd);
> +                cur_region->valid = 0;
> +                continue;
> +            }
> +
> +            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
> +            pci_dev->v_addrs[i].r_size = cur_region->size;
> +            pci_dev->v_addrs[i].e_size = 0;
> +
> +            assigned_dev_ioport_setup(&pci_dev->dev, i, cur_region->size);
> +            pci_register_bar((PCIDevice *) pci_dev, i,
> +                             PCI_BASE_ADDRESS_SPACE_IO,
> +                             &pci_dev->v_addrs[i].container);
> +        }
> +    }
> +
> +    /* success */
> +    return 0;
> +}
> +
> +static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
> +{
> +    FILE *f;
> +    char name[128];
> +    long id;
> +
> +    snprintf(name, sizeof(name), "%s%s", devpath, idname);
> +    f = fopen(name, "r");
> +    if (f == NULL) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return -1;
> +    }
> +    if (fscanf(f, "%li\n", &id) == 1) {
> +        *val = id;
> +    } else {
> +        return -1;
> +    }
> +    fclose(f);
> +
> +    return 0;
> +}
> +
> +static int get_real_vendor_id(const char *devpath, uint16_t *val)
> +{
> +    return get_real_id(devpath, "vendor", val);
> +}
> +
> +static int get_real_device_id(const char *devpath, uint16_t *val)
> +{
> +    return get_real_id(devpath, "device", val);
> +}
> +
> +static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg,
> +                           uint8_t r_bus, uint8_t r_dev, uint8_t r_func)
> +{
> +    char dir[128], name[128];
> +    int fd, r = 0, v;
> +    FILE *f;
> +    unsigned long long start, end, size, flags;

uint64_t

> +    uint16_t id;
> +    PCIRegion *rp;
> +    PCIDevRegions *dev = &pci_dev->real_device;
> +
> +    dev->region_number = 0;
> +
> +    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
> +             r_seg, r_bus, r_dev, r_func);
> +
> +    snprintf(name, sizeof(name), "%sconfig", dir);
> +
> +    if (pci_dev->configfd_name && *pci_dev->configfd_name) {
> +        if (qemu_isdigit(pci_dev->configfd_name[0])) {
> +            dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0);
> +        } else {
> +            dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name);
> +            if (dev->config_fd < 0) {
> +                fprintf(stderr, "%s: (%s) unkown\n", __func__,
> +                        pci_dev->configfd_name);
> +                return 1;
> +            }
> +        }
> +    } else {
> +        dev->config_fd = open(name, O_RDWR);
> +
> +        if (dev->config_fd == -1) {
> +            fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +            return 1;
> +        }
> +    }
> +again:
> +    r = read(dev->config_fd, pci_dev->dev.config,
> +             pci_config_size(&pci_dev->dev));
> +    if (r < 0) {
> +        if (errno == EINTR || errno == EAGAIN) {
> +            goto again;
> +        }
> +        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
> +    }
> +
> +    /* Restore or clear multifunction, this is always controlled by qemu */
> +    if (pci_dev->dev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> +        pci_dev->dev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
> +    } else {
> +        pci_dev->dev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
> +    }
> +
> +    /* Clear host resource mapping info.  If we choose not to register a
> +     * BAR, such as might be the case with the option ROM, we can get
> +     * confusing, unwritable, residual addresses from the host here. */
> +    memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
> +    memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4);
> +
> +    snprintf(name, sizeof(name), "%sresource", dir);
> +
> +    f = fopen(name, "r");
> +    if (f == NULL) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return 1;
> +    }
> +
> +    for (r = 0; r < PCI_ROM_SLOT; r++) {
> +        if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3) {

SCNi64

> +            break;
> +        }
> +
> +        rp = dev->regions + r;
> +        rp->valid = 0;
> +        rp->resource_fd = -1;
> +        size = end - start + 1;
> +        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
> +        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) {
> +            continue;
> +        }
> +        if (flags & IORESOURCE_MEM) {
> +            flags &= ~IORESOURCE_IO;
> +        } else {
> +            flags &= ~IORESOURCE_PREFETCH;
> +        }
> +        snprintf(name, sizeof(name), "%sresource%d", dir, r);
> +        fd = open(name, O_RDWR);
> +        if (fd == -1) {
> +            continue;
> +        }
> +        rp->resource_fd = fd;
> +
> +        rp->type = flags;
> +        rp->valid = 1;
> +        rp->base_addr = start;
> +        rp->size = size;
> +        pci_dev->v_addrs[r].region = rp;
> +        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
> +              r, rp->size, start, rp->type, rp->resource_fd);
> +    }
> +
> +    fclose(f);
> +
> +    /* read and fill vendor ID */
> +    v = get_real_vendor_id(dir, &id);
> +    if (v) {
> +        return 1;
> +    }
> +    pci_dev->dev.config[0] = id & 0xff;
> +    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
> +
> +    /* read and fill device ID */
> +    v = get_real_device_id(dir, &id);
> +    if (v) {
> +        return 1;
> +    }
> +    pci_dev->dev.config[2] = id & 0xff;
> +    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
> +
> +    pci_word_test_and_clear_mask(pci_dev->emulate_config_write + PCI_COMMAND,
> +                                 PCI_COMMAND_MASTER | PCI_COMMAND_INTX_DISABLE);
> +
> +    dev->region_number = r;
> +    return 0;
> +}

Pretty long function, how about refactoring?

> +
> +static void free_msi_virqs(AssignedDevice *dev)
> +{
> +    int i;
> +
> +    for (i = 0; i < dev->msi_virq_nr; i++) {
> +        if (dev->msi_virq[i] >= 0) {
> +            kvm_irqchip_release_virq(kvm_state, dev->msi_virq[i]);
> +            dev->msi_virq[i] = -1;
> +        }
> +    }
> +    g_free(dev->msi_virq);
> +    dev->msi_virq = NULL;
> +    dev->msi_virq_nr = 0;
> +}
> +
> +static void free_assigned_device(AssignedDevice *dev)
> +{
> +    int i;
> +
> +    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
> +        assigned_dev_unregister_msix_mmio(dev);
> +    }
> +    for (i = 0; i < dev->real_device.region_number; i++) {
> +        PCIRegion *pci_region = &dev->real_device.regions[i];
> +        AssignedDevRegion *region = &dev->v_addrs[i];
> +
> +        if (!pci_region->valid) {
> +            continue;
> +        }
> +        if (pci_region->type & IORESOURCE_IO) {
> +            if (region->u.r_baseport) {
> +                memory_region_del_subregion(&region->container,
> +                                            &region->real_iomem);
> +                memory_region_destroy(&region->real_iomem);
> +                memory_region_destroy(&region->container);
> +            }
> +        } else if (pci_region->type & IORESOURCE_MEM) {
> +            if (region->u.r_virtbase) {
> +                memory_region_del_subregion(&region->container,
> +                                            &region->real_iomem);
> +
> +                /* Remove MSI-X table subregion */
> +                if (pci_region->base_addr <= dev->msix_table_addr &&
> +                    pci_region->base_addr + pci_region->size >
> +                    dev->msix_table_addr) {
> +                    memory_region_del_subregion(&region->container,
> +                                                &dev->mmio);
> +                }
> +
> +                memory_region_destroy(&region->real_iomem);
> +                memory_region_destroy(&region->container);
> +                if (munmap(region->u.r_virtbase,
> +                           (pci_region->size + 0xFFF) & 0xFFFFF000)) {
> +                    fprintf(stderr,
> +                            "Failed to unmap assigned device region: %s\n",
> +                            strerror(errno));
> +                }
> +            }
> +        }
> +        if (pci_region->resource_fd >= 0) {
> +            close(pci_region->resource_fd);
> +        }
> +    }
> +
> +    if (dev->real_device.config_fd >= 0) {
> +        close(dev->real_device.config_fd);
> +    }
> +
> +    free_msi_virqs(dev);
> +}
> +
> +static void assign_failed_examine(AssignedDevice *dev)
> +{
> +    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> +    uint16_t vendor_id, device_id;
> +    int r;
> +
> +    sprintf(dir, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> +            dev->host.domain, dev->host.bus, dev->host.slot,
> +            dev->host.function);

Please use snprintf(), also below.

> +
> +    sprintf(name, "%sdriver", dir);
> +
> +    r = readlink(name, driver, sizeof(driver));
> +    if ((r <= 0) || r >= sizeof(driver)) {
> +        goto fail;
> +    }
> +
> +    ns = strrchr(driver, '/');
> +    if (!ns) {
> +        goto fail;
> +    }
> +
> +    ns++;
> +
> +    if (get_real_vendor_id(dir, &vendor_id) ||
> +        get_real_device_id(dir, &device_id)) {
> +        goto fail;
> +    }
> +
> +    fprintf(stderr, "*** The driver '%s' is occupying your device "
> +                    "%04x:%02x:%02x.%x.\n",
> +            ns, dev->host.domain, dev->host.bus, dev->host.slot,
> +            dev->host.function);
> +    fprintf(stderr, "***\n");
> +    fprintf(stderr, "*** You can try the following commands to free it:\n");
> +    fprintf(stderr, "***\n");
> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> +                    "new_id\n", vendor_id, device_id);
> +    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> +                    "%s/unbind\n",
> +            dev->host.domain, dev->host.bus, dev->host.slot,
> +            dev->host.function, ns);
> +    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> +                    "pci-stub/bind\n",
> +            dev->host.domain, dev->host.bus, dev->host.slot,
> +            dev->host.function);
> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> +                    "/remove_id\n", vendor_id, device_id);
> +    fprintf(stderr, "***\n");
> +
> +    return;
> +
> +fail:
> +    fprintf(stderr, "Couldn't find out why.\n");
> +}
> +
> +static int assign_device(AssignedDevice *dev)
> +{
> +    uint32_t flags = KVM_DEV_ASSIGN_ENABLE_IOMMU;
> +    int r;
> +
> +    /* Only pass non-zero PCI segment to capable module */
> +    if (!kvm_check_extension(kvm_state, KVM_CAP_PCI_SEGMENT) &&
> +        dev->host.domain) {
> +        fprintf(stderr, "Can't assign device inside non-zero PCI segment "
> +                "as this KVM module doesn't support it.\n");
> +        return -ENODEV;
> +    }
> +
> +    if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) {
> +        fprintf(stderr, "No IOMMU found.  Unable to assign device \"%s\"\n",
> +                dev->dev.qdev.id);
> +        return -ENODEV;
> +    }
> +
> +    if (dev->features & ASSIGNED_DEVICE_SHARE_INTX_MASK &&
> +        kvm_has_intx_set_mask()) {
> +        flags |= KVM_DEV_ASSIGN_PCI_2_3;
> +    }
> +
> +    r = kvm_device_pci_assign(kvm_state, &dev->host, flags, &dev->dev_id);
> +    if (r < 0) {
> +        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> +                dev->dev.qdev.id, strerror(-r));
> +
> +        switch (r) {
> +        case -EBUSY:
> +            assign_failed_examine(dev);
> +            break;
> +        default:
> +            break;
> +        }
> +    }
> +    return r;
> +}
> +
> +static bool check_irqchip_in_kernel(void)
> +{
> +    if (kvm_irqchip_in_kernel()) {
> +        return true;
> +    }
> +    error_report("pci-assign: error: requires KVM with in-kernel irqchip "
> +                 "enabled");
> +    return false;
> +}
> +
> +static int assign_intx(AssignedDevice *dev)
> +{
> +    AssignedIRQType new_type;
> +    PCIINTxRoute intx_route;
> +    bool intx_host_msi;
> +    int r;
> +
> +    /* Interrupt PIN 0 means don't use INTx */
> +    if (assigned_dev_pci_read_byte(&dev->dev, PCI_INTERRUPT_PIN) == 0) {
> +        pci_device_set_intx_routing_notifier(&dev->dev, NULL);
> +        return 0;
> +    }
> +
> +    if (!check_irqchip_in_kernel()) {
> +        return -ENOTSUP;
> +    }
> +
> +    pci_device_set_intx_routing_notifier(&dev->dev,
> +                                         assigned_dev_update_irq_routing);
> +
> +    intx_route = pci_device_route_intx_to_irq(&dev->dev, 0);
> +    assert(intx_route.mode != PCI_INTX_INVERTED);
> +
> +    if (dev->intx_route.mode == intx_route.mode &&
> +        dev->intx_route.irq == intx_route.irq) {
> +        return 0;
> +    }
> +
> +    switch (dev->assigned_irq_type) {
> +    case ASSIGNED_IRQ_INTX_HOST_INTX:
> +    case ASSIGNED_IRQ_INTX_HOST_MSI:
> +        intx_host_msi = dev->assigned_irq_type == ASSIGNED_IRQ_INTX_HOST_MSI;
> +        r = kvm_device_intx_deassign(kvm_state, dev->dev_id, intx_host_msi);
> +        break;
> +    case ASSIGNED_IRQ_MSI:
> +        r = kvm_device_msi_deassign(kvm_state, dev->dev_id);
> +        break;
> +    case ASSIGNED_IRQ_MSIX:
> +        r = kvm_device_msix_deassign(kvm_state, dev->dev_id);
> +        break;
> +    default:
> +        r = 0;
> +        break;
> +    }
> +    if (r) {
> +        perror("assign_intx: deassign");

This error message is not very clear.

> +    }
> +    dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
> +
> +    if (intx_route.mode == PCI_INTX_DISABLED) {
> +        dev->intx_route = intx_route;
> +        return 0;
> +    }
> +
> +retry:
> +    if (dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK &&
> +        dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> +        intx_host_msi = true;
> +        new_type = ASSIGNED_IRQ_INTX_HOST_MSI;
> +    } else {
> +        intx_host_msi = false;
> +        new_type = ASSIGNED_IRQ_INTX_HOST_INTX;
> +    }
> +
> +    r = kvm_device_intx_assign(kvm_state, dev->dev_id, intx_host_msi,
> +                               intx_route.irq);
> +    if (r < 0) {
> +        if (r == -EIO && !(dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK) &&
> +            dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> +            /* Retry with host-side MSI. There might be an IRQ conflict and
> +             * either the kernel or the device doesn't support sharing. */
> +            fprintf(stderr,
> +                    "Host-side INTx sharing not supported, "
> +                    "using MSI instead.\n"
> +                    "Some devices do not to work properly in this mode.\n");
> +            dev->features |= ASSIGNED_DEVICE_PREFER_MSI_MASK;
> +            goto retry;
> +        }
> +        fprintf(stderr, "Failed to assign irq for \"%s\": %s\n",
> +                dev->dev.qdev.id, strerror(-r));
> +        fprintf(stderr, "Perhaps you are assigning a device "
> +                "that shares an IRQ with another device?\n");
> +        return r;
> +    }
> +
> +    dev->intx_route = intx_route;
> +    dev->assigned_irq_type = new_type;
> +    return r;
> +}
> +
> +static void deassign_device(AssignedDevice *dev)
> +{
> +    int r;
> +
> +    r = kvm_device_pci_deassign(kvm_state, dev->dev_id);
> +    if (r < 0) {
> +        fprintf(stderr, "Failed to deassign device \"%s\" : %s\n",
> +                dev->dev.qdev.id, strerror(-r));
> +    }
> +}
> +
> +/* The pci config space got updated. Check if irq numbers have changed
> + * for our devices
> + */
> +static void assigned_dev_update_irq_routing(PCIDevice *dev)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, dev);
> +    Error *err = NULL;
> +    int r;
> +
> +    r = assign_intx(assigned_dev);
> +    if (r < 0) {
> +        qdev_unplug(&dev->qdev, &err);
> +        assert(!err);
> +    }
> +}
> +
> +static void assigned_dev_update_msi(PCIDevice *pci_dev)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap +
> +                                     PCI_MSI_FLAGS);
> +    int r;
> +
> +    /* Some guests gratuitously disable MSI even if they're not using it,
> +     * try to catch this by only deassigning irqs if the guest is using
> +     * MSI or intends to start. */
> +    if (assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSI ||
> +        (ctrl_byte & PCI_MSI_FLAGS_ENABLE)) {
> +        r = kvm_device_msi_deassign(kvm_state, assigned_dev->dev_id);
> +        /* -ENXIO means no assigned irq */
> +        if (r && r != -ENXIO) {
> +            perror("assigned_dev_update_msi: deassign irq");
> +        }
> +
> +        free_msi_virqs(assigned_dev);
> +
> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
> +        pci_device_set_intx_routing_notifier(pci_dev, NULL);
> +    }
> +
> +    if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
> +        uint8_t *pos = pci_dev->config + pci_dev->msi_cap;
> +        MSIMessage msg;
> +        int virq;
> +
> +        msg.address = pci_get_long(pos + PCI_MSI_ADDRESS_LO);
> +        msg.data = pci_get_word(pos + PCI_MSI_DATA_32);
> +        virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        if (virq < 0) {
> +            perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route");
> +            return;
> +        }
> +
> +        assigned_dev->msi_virq = g_malloc(sizeof(*assigned_dev->msi_virq));

Is this ever freed?

> +        assigned_dev->msi_virq_nr = 1;
> +        assigned_dev->msi_virq[0] = virq;
> +        if (kvm_device_msi_assign(kvm_state, assigned_dev->dev_id, virq) < 0) {
> +            perror("assigned_dev_update_msi: kvm_device_msi_assign");
> +        }
> +
> +        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
> +        assigned_dev->intx_route.irq = -1;
> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSI;
> +    } else {
> +        assign_intx(assigned_dev);
> +    }
> +}
> +
> +static bool msix_masked(MSIXTableEntry *entry)
> +{
> +    return (entry->ctrl & cpu_to_le32(0x1)) != 0;
> +}
> +
> +static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
> +{
> +    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint16_t entries_nr = 0;
> +    int i, r = 0;
> +    MSIXTableEntry *entry = adev->msix_table;
> +    MSIMessage msg;
> +
> +    /* Get the usable entry number for allocating */
> +    for (i = 0; i < adev->msix_max; i++, entry++) {
> +        if (msix_masked(entry)) {
> +            continue;
> +        }
> +        entries_nr++;
> +    }
> +
> +    DEBUG("MSI-X entries: %d\n", entries_nr);
> +
> +    /* It's valid to enable MSI-X with all entries masked */
> +    if (!entries_nr) {
> +        return 0;
> +    }
> +
> +    r = kvm_device_msix_init_vectors(kvm_state, adev->dev_id, entries_nr);
> +    if (r != 0) {
> +        fprintf(stderr, "fail to set MSI-X entry number for MSIX! %s\n",
> +                strerror(-r));
> +        return r;
> +    }
> +
> +    free_msi_virqs(adev);
> +
> +    adev->msi_virq_nr = adev->msix_max;
> +    adev->msi_virq = g_malloc(adev->msix_max * sizeof(*adev->msi_virq));
> +
> +    entry = adev->msix_table;
> +    for (i = 0; i < adev->msix_max; i++, entry++) {
> +        adev->msi_virq[i] = -1;
> +
> +        if (msix_masked(entry)) {
> +            continue;
> +        }
> +
> +        msg.address = entry->addr_lo | ((uint64_t)entry->addr_hi << 32);
> +        msg.data = entry->data;
> +        r = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        if (r < 0) {
> +            return r;
> +        }
> +        adev->msi_virq[i] = r;
> +
> +        DEBUG("MSI-X vector %d, gsi %d, addr %08x_%08x, data %08x\n", i,
> +              r, entry->addr_hi, entry->addr_lo, entry->data);
> +
> +        r = kvm_device_msix_set_vector(kvm_state, adev->dev_id, i,
> +                                       adev->msi_virq[i]);
> +        if (r) {
> +            fprintf(stderr, "fail to set MSI-X entry! %s\n", strerror(-r));
> +            break;
> +        }
> +    }
> +
> +    return r;
> +}
> +
> +static void assigned_dev_update_msix(PCIDevice *pci_dev)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint16_t ctrl_word = pci_get_word(pci_dev->config + pci_dev->msix_cap +
> +                                      PCI_MSIX_FLAGS);
> +    int r;
> +
> +    /* Some guests gratuitously disable MSIX even if they're not using it,
> +     * try to catch this by only deassigning irqs if the guest is using
> +     * MSIX or intends to start. */
> +    if ((assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSIX) ||
> +        (ctrl_word & PCI_MSIX_FLAGS_ENABLE)) {
> +        r = kvm_device_msix_deassign(kvm_state, assigned_dev->dev_id);
> +        /* -ENXIO means no assigned irq */
> +        if (r && r != -ENXIO) {
> +            perror("assigned_dev_update_msix: deassign irq");
> +        }
> +
> +        free_msi_virqs(assigned_dev);
> +
> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
> +        pci_device_set_intx_routing_notifier(pci_dev, NULL);
> +    }
> +
> +    if (ctrl_word & PCI_MSIX_FLAGS_ENABLE) {
> +        if (assigned_dev_update_msix_mmio(pci_dev) < 0) {
> +            perror("assigned_dev_update_msix_mmio");
> +            return;
> +        }
> +
> +        if (assigned_dev->msi_virq_nr > 0) {
> +            if (kvm_device_msix_assign(kvm_state, assigned_dev->dev_id) < 0) {
> +                perror("assigned_dev_enable_msix: assign irq");
> +                return;
> +            }
> +        }
> +        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
> +        assigned_dev->intx_route.irq = -1;
> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSIX;
> +    } else {
> +        assign_intx(assigned_dev);
> +    }
> +}
> +
> +static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
> +                                             uint32_t address, int len)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint32_t virt_val = pci_default_read_config(pci_dev, address, len);
> +    uint32_t real_val, emulate_mask, full_emulation_mask;
> +
> +    emulate_mask = 0;
> +    memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len);
> +    emulate_mask = le32_to_cpu(emulate_mask);
> +
> +    full_emulation_mask = 0xffffffff >> (32 - len * 8);
> +
> +    if (emulate_mask != full_emulation_mask) {
> +        real_val = assigned_dev_pci_read(pci_dev, address, len);
> +        return (virt_val & emulate_mask) | (real_val & ~emulate_mask);
> +    } else {
> +        return virt_val;
> +    }
> +}
> +
> +static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
> +                                          uint32_t val, int len)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint16_t old_cmd = pci_get_word(pci_dev->config + PCI_COMMAND);
> +    uint32_t emulate_mask, full_emulation_mask;
> +    int ret;
> +
> +    pci_default_write_config(pci_dev, address, val, len);
> +
> +    if (kvm_has_intx_set_mask() &&
> +        range_covers_byte(address, len, PCI_COMMAND + 1)) {
> +        bool intx_masked = (pci_get_word(pci_dev->config + PCI_COMMAND) &
> +                            PCI_COMMAND_INTX_DISABLE);
> +
> +        if (intx_masked != !!(old_cmd & PCI_COMMAND_INTX_DISABLE)) {
> +            ret = kvm_device_intx_set_mask(kvm_state, assigned_dev->dev_id,
> +                                           intx_masked);
> +            if (ret) {
> +                perror("assigned_dev_pci_write_config: set intx mask");
> +            }
> +        }
> +    }
> +    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> +        if (range_covers_byte(address, len,
> +                              pci_dev->msi_cap + PCI_MSI_FLAGS)) {
> +            assigned_dev_update_msi(pci_dev);
> +        }
> +    }
> +    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
> +        if (range_covers_byte(address, len,
> +                              pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) {
> +            assigned_dev_update_msix(pci_dev);
> +        }
> +    }
> +
> +    emulate_mask = 0;
> +    memcpy(&emulate_mask, assigned_dev->emulate_config_write + address, len);
> +    emulate_mask = le32_to_cpu(emulate_mask);
> +
> +    full_emulation_mask = 0xffffffff >> (32 - len * 8);
> +
> +    if (emulate_mask != full_emulation_mask) {
> +        if (emulate_mask) {
> +            val &= ~emulate_mask;
> +            val |= assigned_dev_pci_read(pci_dev, address, len) & emulate_mask;
> +        }
> +        assigned_dev_pci_write(pci_dev, address, val, len);
> +    }
> +}
> +
> +static void assigned_dev_setup_cap_read(AssignedDevice *dev, uint32_t offset,
> +                                        uint32_t len)
> +{
> +    assigned_dev_direct_config_read(dev, offset, len);
> +    assigned_dev_emulate_config_read(dev, offset + PCI_CAP_LIST_NEXT, 1);
> +}
> +
> +static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
> +{
> +    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    PCIRegion *pci_region = dev->real_device.regions;
> +    int ret, pos;
> +
> +    /* Clear initial capabilities pointer and status copied from hw */
> +    pci_set_byte(pci_dev->config + PCI_CAPABILITY_LIST, 0);
> +    pci_set_word(pci_dev->config + PCI_STATUS,
> +                 pci_get_word(pci_dev->config + PCI_STATUS) &
> +                 ~PCI_STATUS_CAP_LIST);
> +
> +    /* Expose MSI capability
> +     * MSI capability is the 1st capability in capability config */
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0);
> +    if (pos != 0 && kvm_check_extension(kvm_state, KVM_CAP_ASSIGN_DEV_IRQ)) {
> +        if (!check_irqchip_in_kernel()) {
> +            return -ENOTSUP;
> +        }
> +        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;
> +        /* Only 32-bit/no-mask currently supported */
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSI, pos, 10);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        pci_dev->msi_cap = pos;
> +
> +        pci_set_word(pci_dev->config + pos + PCI_MSI_FLAGS,
> +                     pci_get_word(pci_dev->config + pos + PCI_MSI_FLAGS) &
> +                     PCI_MSI_FLAGS_QMASK);
> +        pci_set_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO, 0);
> +        pci_set_word(pci_dev->config + pos + PCI_MSI_DATA_32, 0);
> +
> +        /* Set writable fields */
> +        pci_set_word(pci_dev->wmask + pos + PCI_MSI_FLAGS,
> +                     PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
> +        pci_set_long(pci_dev->wmask + pos + PCI_MSI_ADDRESS_LO, 0xfffffffc);
> +        pci_set_word(pci_dev->wmask + pos + PCI_MSI_DATA_32, 0xffff);
> +    }
> +    /* Expose MSI-X capability */
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0);
> +    if (pos != 0 && kvm_device_msix_supported(kvm_state)) {
> +        int bar_nr;
> +        uint32_t msix_table_entry;
> +
> +        if (!check_irqchip_in_kernel()) {
> +            return -ENOTSUP;
> +        }
> +        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        pci_dev->msix_cap = pos;
> +
> +        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS,
> +                     pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
> +                     PCI_MSIX_FLAGS_QSIZE);
> +
> +        /* Only enable and function mask bits are writable */
> +        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
> +                     PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
> +
> +        msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE);
> +        bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK;
> +        msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK;
> +        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
> +        dev->msix_max = pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS);
> +        dev->msix_max &= PCI_MSIX_FLAGS_QSIZE;
> +        dev->msix_max += 1;
> +    }
> +
> +    /* Minimal PM support, nothing writable, device appears to NAK changes */
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0);
> +    if (pos) {
> +        uint16_t pmc;
> +
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, pos, PCI_PM_SIZEOF);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, PCI_PM_SIZEOF);
> +
> +        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
> +        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
> +        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
> +
> +        /* assign_device will bring the device up to D0, so we don't need
> +         * to worry about doing that ourselves here. */
> +        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL,
> +                     PCI_PM_CTRL_NO_SOFT_RESET);
> +
> +        pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0);
> +        pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0);
> +    }
> +
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0);
> +    if (pos) {
> +        uint8_t version, size = 0;
> +        uint16_t type, devctl, lnksta;
> +        uint32_t devcap, lnkcap;
> +
> +        version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS);
> +        version &= PCI_EXP_FLAGS_VERS;
> +        if (version == 1) {
> +            size = 0x14;
> +        } else if (version == 2) {
> +            /*
> +             * Check for non-std size, accept reduced size to 0x34,
> +             * which is what bcm5761 implemented, violating the
> +             * PCIe v3.0 spec that regs should exist and be read as 0,
> +             * not optionally provided and shorten the struct size.
> +             */
> +            size = MIN(0x3c, PCI_CONFIG_SPACE_SIZE - pos);
> +            if (size < 0x34) {
> +                fprintf(stderr,
> +                        "%s: Invalid size PCIe cap-id 0x%x\n",
> +                        __func__, PCI_CAP_ID_EXP);
> +                return -EINVAL;
> +            } else if (size != 0x3c) {
> +                fprintf(stderr,
> +                        "WARNING, %s: PCIe cap-id 0x%x has "
> +                        "non-standard size 0x%x; std size should be 0x3c\n",
> +                         __func__, PCI_CAP_ID_EXP, size);
> +            }
> +        } else if (version == 0) {
> +            uint16_t vid, did;
> +            vid = pci_get_word(pci_dev->config + PCI_VENDOR_ID);
> +            did = pci_get_word(pci_dev->config + PCI_DEVICE_ID);
> +            if (vid == PCI_VENDOR_ID_INTEL && did == 0x10ed) {
> +                /*
> +                 * quirk for Intel 82599 VF with invalid PCIe capability
> +                 * version, should really be version 2 (same as PF)
> +                 */
> +                size = 0x3c;
> +            }
> +        }
> +
> +        if (size == 0) {
> +            fprintf(stderr,
> +                    "%s: Unsupported PCI express capability version %d\n",
> +                    __func__, version);
> +            return -EINVAL;
> +        }
> +
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_EXP, pos, size);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, size);
> +
> +        type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
> +        type = (type & PCI_EXP_FLAGS_TYPE) >> 4;
> +        if (type != PCI_EXP_TYPE_ENDPOINT &&
> +            type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) {
> +            fprintf(stderr,
> +                    "Device assignment only supports endpoint assignment, "
> +                    "device type %d\n", type);
> +            return -EINVAL;
> +        }
> +
> +        /* capabilities, pass existing read-only copy
> +         * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */
> +
> +        /* device capabilities: hide FLR */
> +        devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP);
> +        devcap &= ~PCI_EXP_DEVCAP_FLR;
> +        pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap);
> +
> +        /* device control: clear all error reporting enable bits, leaving
> +         *                 only a few host values.  Note, these are
> +         *                 all writable, but not passed to hw.
> +         */
> +        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
> +        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
> +                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
> +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
> +        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
> +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
> +
> +        /* Clear device status */
> +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0);
> +
> +        /* Link capabilities, expose links and latencues, clear reporting */
> +        lnkcap = pci_get_long(pci_dev->config + pos + PCI_EXP_LNKCAP);
> +        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
> +                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
> +                   PCI_EXP_LNKCAP_L1EL);
> +        pci_set_long(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
> +
> +        /* Link control, pass existing read-only copy.  Should be writable? */
> +
> +        /* Link status, only expose current speed and width */
> +        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
> +        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
> +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
> +
> +        if (version >= 2) {
> +            /* Slot capabilities, control, status - not needed for endpoints */
> +            pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0);
> +            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0);
> +            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0);
> +
> +            /* Root control, capabilities, status - not needed for endpoints */
> +            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0);
> +            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0);
> +            pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0);
> +
> +            /* Device capabilities/control 2, pass existing read-only copy */
> +            /* Link control 2, pass existing read-only copy */
> +        }
> +    }
> +
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0);
> +    if (pos) {
> +        uint16_t cmd;
> +        uint32_t status;
> +
> +        /* Only expose the minimum, 8 byte capability */
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_PCIX, pos, 8);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, 8);
> +
> +        /* Command register, clear upper bits, including extended modes */
> +        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
> +        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
> +                PCI_X_CMD_MAX_SPLIT);
> +        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
> +
> +        /* Status register, update with emulated PCI bus location, clear
> +         * error bits, leave the rest. */
> +        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
> +        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
> +        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
> +        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
> +                    PCI_X_STATUS_SPL_ERR);
> +        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
> +    }
> +
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0);
> +    if (pos) {
> +        /* Direct R/W passthrough */
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, 8);
> +
> +        /* direct write for cap content */
> +        assigned_dev_direct_config_write(dev, pos + 2, 6);
> +    }
> +
> +    /* Devices can have multiple vendor capabilities, get them all */
> +    for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
> +        pos += PCI_CAP_LIST_NEXT) {
> +        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
> +        /* Direct R/W passthrough */
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_VNDR, pos, len);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, len);
> +
> +        /* direct write for cap content */
> +        assigned_dev_direct_config_write(dev, pos + 2, len - 2);
> +    }
> +
> +    /* If real and virtual capability list status bits differ, virtualize the
> +     * access. */
> +    if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_LIST) !=
> +        (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) &
> +         PCI_STATUS_CAP_LIST)) {
> +        dev->emulate_config_read[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
> +    }
> +
> +    return 0;
> +}

Also this function is way too long.

> +
> +static uint64_t msix_mmio_read(void *opaque, target_phys_addr_t addr,
> +                               unsigned size)
> +{
> +    AssignedDevice *adev = opaque;
> +    uint64_t val;
> +
> +    memcpy(&val, (void *)((uint8_t *)adev->msix_table + addr), size);
> +
> +    return val;
> +}
> +
> +static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
> +                            uint64_t val, unsigned size)
> +{
> +    AssignedDevice *adev = opaque;
> +    PCIDevice *pdev = &adev->dev;
> +    uint16_t ctrl;
> +    MSIXTableEntry orig;
> +    int i = addr >> 4;
> +
> +    if (i >= adev->msix_max) {
> +        return; /* Drop write */
> +    }
> +
> +    ctrl = pci_get_word(pdev->config + pdev->msix_cap + PCI_MSIX_FLAGS);
> +
> +    DEBUG("write to MSI-X table offset 0x%lx, val 0x%lx\n", addr, val);
> +
> +    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
> +        orig = adev->msix_table[i];
> +    }
> +
> +    memcpy((void *)((uint8_t *)adev->msix_table + addr), &val, size);

Cast to void * shouldn't be necessary.

> +
> +    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
> +        MSIXTableEntry *entry = &adev->msix_table[i];
> +
> +        if (!msix_masked(&orig) && msix_masked(entry)) {
> +            /*
> +             * Vector masked, disable it
> +             *
> +             * XXX It's not clear if we can or should actually attempt
> +             * to mask or disable the interrupt.  KVM doesn't have
> +             * support for pending bits and kvm_assign_set_msix_entry
> +             * doesn't modify the device hardware mask.  Interrupts
> +             * while masked are simply not injected to the guest, so
> +             * are lost.  Can we get away with always injecting an
> +             * interrupt on unmask?
> +             */
> +        } else if (msix_masked(&orig) && !msix_masked(entry)) {
> +            /* Vector unmasked */
> +            if (i >= adev->msi_virq_nr || adev->msi_virq[i] < 0) {
> +                /* Previously unassigned vector, start from scratch */
> +                assigned_dev_update_msix(pdev);
> +                return;
> +            } else {
> +                /* Update an existing, previously masked vector */
> +                MSIMessage msg;
> +                int ret;
> +
> +                msg.address = entry->addr_lo |
> +                    ((uint64_t)entry->addr_hi << 32);
> +                msg.data = entry->data;
> +
> +                ret = kvm_irqchip_update_msi_route(kvm_state,
> +                                                   adev->msi_virq[i], msg);
> +                if (ret) {
> +                    fprintf(stderr,
> +                            "Error updating irq routing entry (%d)\n", ret);
> +                }
> +            }
> +        }
> +    }
> +}
> +
> +static const MemoryRegionOps msix_mmio_ops = {
> +    .read = msix_mmio_read,
> +    .write = msix_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    },
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    },
> +};
> +
> +static void msix_reset(AssignedDevice *dev)
> +{
> +    MSIXTableEntry *entry;
> +    int i;
> +
> +    if (!dev->msix_table) {
> +        return;
> +    }
> +
> +    memset(dev->msix_table, 0, MSIX_PAGE_SIZE);
> +
> +    for (i = 0, entry = dev->msix_table; i < dev->msix_max; i++, entry++) {
> +        entry->ctrl = cpu_to_le32(0x1); /* Masked */
> +    }
> +}
> +
> +static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
> +{
> +    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE,
> +                           MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
> +    if (dev->msix_table == MAP_FAILED) {
> +        fprintf(stderr, "fail allocate msix_table! %s\n", strerror(errno));
> +        return -EFAULT;
> +    }
> +
> +    msix_reset(dev);
> +
> +    memory_region_init_io(&dev->mmio, &msix_mmio_ops, dev,
> +                          "assigned-dev-msix", MSIX_PAGE_SIZE);
> +    return 0;
> +}
> +
> +static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
> +{
> +    if (!dev->msix_table) {
> +        return;
> +    }
> +
> +    memory_region_destroy(&dev->mmio);
> +
> +    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
> +        fprintf(stderr, "error unmapping msix_table! %s\n",
> +                strerror(errno));
> +    }
> +    dev->msix_table = NULL;
> +}
> +
> +static const VMStateDescription vmstate_assigned_device = {
> +    .name = "pci-assign",
> +    .unmigratable = 1,
> +};
> +
> +static void reset_assigned_device(DeviceState *dev)
> +{
> +    PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, dev);
> +    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    char reset_file[64];
> +    const char reset[] = "1";
> +    int fd, ret;
> +
> +    /*
> +     * If a guest is reset without being shutdown, MSI/MSI-X can still
> +     * be running.  We want to return the device to a known state on
> +     * reset, so disable those here.  We especially do not want MSI-X
> +     * enabled since it lives in MMIO space, which is about to get
> +     * disabled.
> +     */
> +    if (adev->assigned_irq_type == ASSIGNED_IRQ_MSIX) {
> +        uint16_t ctrl = pci_get_word(pci_dev->config +
> +                                     pci_dev->msix_cap + PCI_MSIX_FLAGS);
> +
> +        pci_set_word(pci_dev->config + pci_dev->msix_cap + PCI_MSIX_FLAGS,
> +                     ctrl & ~PCI_MSIX_FLAGS_ENABLE);
> +        assigned_dev_update_msix(pci_dev);
> +    } else if (adev->assigned_irq_type == ASSIGNED_IRQ_MSI) {
> +        uint8_t ctrl = pci_get_byte(pci_dev->config +
> +                                    pci_dev->msi_cap + PCI_MSI_FLAGS);
> +
> +        pci_set_byte(pci_dev->config + pci_dev->msi_cap + PCI_MSI_FLAGS,
> +                     ctrl & ~PCI_MSI_FLAGS_ENABLE);
> +        assigned_dev_update_msi(pci_dev);
> +    }
> +
> +    snprintf(reset_file, sizeof(reset_file),
> +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
> +             adev->host.domain, adev->host.bus, adev->host.slot,
> +             adev->host.function);
> +
> +    /*
> +     * Issue a device reset via pci-sysfs.  Note that we use write(2) here
> +     * and ignore the return value because some kernels have a bug that
> +     * returns 0 rather than bytes written on success, sending us into an
> +     * infinite retry loop using other write mechanisms.
> +     */
> +    fd = open(reset_file, O_WRONLY);
> +    if (fd != -1) {
> +        ret = write(fd, reset, strlen(reset));
> +        (void)ret;
> +        close(fd);
> +    }
> +
> +    /*
> +     * When a 0 is written to the bus master register, the device is logically
> +     * disconnected from the PCI bus. This avoids further DMA transfers.
> +     */
> +    assigned_dev_pci_write_config(pci_dev, PCI_COMMAND, 0, 1);
> +}
> +
> +static int assigned_initfn(struct PCIDevice *pci_dev)
> +{
> +    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint8_t e_intx;
> +    int r;
> +
> +    if (!kvm_enabled()) {
> +        error_report("pci-assign: error: requires KVM support");
> +        return -1;
> +    }
> +
> +    if (!dev->host.domain && !dev->host.bus && !dev->host.slot &&
> +        !dev->host.function) {
> +        error_report("pci-assign: error: no host device specified");
> +        return -1;
> +    }
> +
> +    /*
> +     * Set up basic config space access control. Will be further refined during
> +     * device initialization.
> +     */
> +    assigned_dev_emulate_config_read(dev, 0, PCI_CONFIG_SPACE_SIZE);
> +    assigned_dev_direct_config_read(dev, PCI_STATUS, 2);
> +    assigned_dev_direct_config_read(dev, PCI_REVISION_ID, 1);
> +    assigned_dev_direct_config_read(dev, PCI_CLASS_PROG, 3);
> +    assigned_dev_direct_config_read(dev, PCI_CACHE_LINE_SIZE, 1);
> +    assigned_dev_direct_config_read(dev, PCI_LATENCY_TIMER, 1);
> +    assigned_dev_direct_config_read(dev, PCI_BIST, 1);
> +    assigned_dev_direct_config_read(dev, PCI_CARDBUS_CIS, 4);
> +    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_VENDOR_ID, 2);
> +    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_ID, 2);
> +    assigned_dev_direct_config_read(dev, PCI_CAPABILITY_LIST + 1, 7);
> +    assigned_dev_direct_config_read(dev, PCI_MIN_GNT, 1);
> +    assigned_dev_direct_config_read(dev, PCI_MAX_LAT, 1);
> +    memcpy(dev->emulate_config_write, dev->emulate_config_read,
> +           sizeof(dev->emulate_config_read));
> +
> +    if (get_real_device(dev, dev->host.domain, dev->host.bus,
> +                        dev->host.slot, dev->host.function)) {
> +        error_report("pci-assign: Error: Couldn't get real device (%s)!",
> +                     dev->dev.qdev.id);
> +        goto out;
> +    }
> +
> +    if (assigned_device_pci_cap_init(pci_dev) < 0) {
> +        goto out;
> +    }
> +
> +    /* intercept MSI-X entry page in the MMIO */
> +    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
> +        if (assigned_dev_register_msix_mmio(dev)) {
> +            goto out;
> +        }
> +    }
> +
> +    /* handle real device's MMIO/PIO BARs */
> +    if (assigned_dev_register_regions(dev->real_device.regions,
> +                                      dev->real_device.region_number,
> +                                      dev)) {
> +        goto out;
> +    }
> +
> +    /* handle interrupt routing */
> +    e_intx = dev->dev.config[0x3d] - 1;
> +    dev->intpin = e_intx;
> +    dev->intx_route.mode = PCI_INTX_DISABLED;
> +    dev->intx_route.irq = -1;
> +
> +    /* assign device to guest */
> +    r = assign_device(dev);
> +    if (r < 0) {
> +        goto out;
> +    }
> +
> +    /* assign legacy INTx to the device */
> +    r = assign_intx(dev);
> +    if (r < 0) {
> +        goto assigned_out;
> +    }
> +
> +    assigned_dev_load_option_rom(dev);
> +
> +    add_boot_device_path(dev->bootindex, &pci_dev->qdev, NULL);
> +
> +    return 0;
> +
> +assigned_out:
> +    deassign_device(dev);
> +out:
> +    free_assigned_device(dev);
> +    return -1;
> +}
> +
> +static void assigned_exitfn(struct PCIDevice *pci_dev)
> +{
> +    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +
> +    deassign_device(dev);
> +    free_assigned_device(dev);
> +}
> +
> +static Property da_properties[] = {

const?

> +    DEFINE_PROP_PCI_HOST_DEVADDR("host", AssignedDevice, host),
> +    DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
> +                    ASSIGNED_DEVICE_PREFER_MSI_BIT, false),
> +    DEFINE_PROP_BIT("share_intx", AssignedDevice, features,
> +                    ASSIGNED_DEVICE_SHARE_INTX_BIT, true),
> +    DEFINE_PROP_INT32("bootindex", AssignedDevice, bootindex, -1),
> +    DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void assign_class_init(ObjectClass *klass, void *data)
> +{
> +    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    k->init         = assigned_initfn;
> +    k->exit         = assigned_exitfn;
> +    k->config_read  = assigned_dev_pci_read_config;
> +    k->config_write = assigned_dev_pci_write_config;
> +    dc->props       = da_properties;
> +    dc->vmsd        = &vmstate_assigned_device;
> +    dc->reset       = reset_assigned_device;
> +}
> +
> +static TypeInfo assign_info = {

const?

> +    .name               = "pci-assign",
> +    .parent             = TYPE_PCI_DEVICE,
> +    .instance_size      = sizeof(AssignedDevice),
> +    .class_init         = assign_class_init,
> +};
> +
> +static void assign_register_types(void)
> +{
> +    type_register_static(&assign_info);
> +}
> +
> +type_init(assign_register_types)
> +
> +/*
> + * Scan the assigned devices for the devices that have an option ROM, and then
> + * load the corresponding ROM data to RAM. If an error occurs while loading an
> + * option ROM, we just ignore that option ROM and continue with the next one.
> + */
> +static void assigned_dev_load_option_rom(AssignedDevice *dev)
> +{
> +    char name[32], rom_file[64];
> +    FILE *fp;
> +    uint8_t val;
> +    struct stat st;
> +    void *ptr;
> +
> +    /* If loading ROM from file, pci handles it */
> +    if (dev->dev.romfile || !dev->dev.rom_bar) {
> +        return;
> +    }
> +
> +    snprintf(rom_file, sizeof(rom_file),
> +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
> +             dev->host.domain, dev->host.bus, dev->host.slot,
> +             dev->host.function);
> +
> +    if (stat(rom_file, &st)) {
> +        return;
> +    }
> +
> +    if (access(rom_file, F_OK)) {
> +        fprintf(stderr, "pci-assign: Insufficient privileges for %s\n",
> +                rom_file);
> +        return;
> +    }
> +
> +    /* Write "1" to the ROM file to enable it */
> +    fp = fopen(rom_file, "r+");
> +    if (fp == NULL) {
> +        return;
> +    }
> +    val = 1;
> +    if (fwrite(&val, 1, 1, fp) != 1) {
> +        goto close_rom;
> +    }
> +    fseek(fp, 0, SEEK_SET);
> +
> +    snprintf(name, sizeof(name), "%s.rom",
> +            object_get_typename(OBJECT(dev)));
> +    memory_region_init_ram(&dev->dev.rom, name, st.st_size);
> +    vmstate_register_ram(&dev->dev.rom, &dev->dev.qdev);
> +    ptr = memory_region_get_ram_ptr(&dev->dev.rom);
> +    memset(ptr, 0xff, st.st_size);
> +
> +    if (!fread(ptr, 1, st.st_size, fp)) {
> +        fprintf(stderr, "pci-assign: Cannot read from host %s\n"
> +                "\tDevice option ROM contents are probably invalid "
> +                "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
> +                "or load from file with romfile=\n", rom_file);
> +        memory_region_destroy(&dev->dev.rom);
> +        goto close_rom;
> +    }
> +
> +    pci_register_bar(&dev->dev, PCI_ROM_SLOT, 0, &dev->dev.rom);
> +    dev->dev.has_rom = true;
> +close_rom:
> +    /* Write "0" to disable ROM */
> +    fseek(fp, 0, SEEK_SET);
> +    val = 0;
> +    if (!fwrite(&val, 1, 1, fp)) {
> +        DEBUG("%s\n", "Failed to disable pci-sysfs rom file");
> +    }
> +    fclose(fp);
> +}
> --
> 1.7.3.4
>
>
Michael S. Tsirkin Aug. 27, 2012, 7:01 p.m. UTC | #4
On Mon, Aug 27, 2012 at 06:56:38PM +0000, Blue Swirl wrote:
> > +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> > +{
> > +    AssignedDevRegion *d = opaque;
> > +    uint8_t *in = d->u.r_virtbase + addr;
> 
> Don't perform arithmetic with void pointers.

Why not?
We require gcc and it's a documented extension there.
Blue Swirl Aug. 27, 2012, 7:06 p.m. UTC | #5
On Mon, Aug 27, 2012 at 7:01 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Aug 27, 2012 at 06:56:38PM +0000, Blue Swirl wrote:
>> > +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>> > +{
>> > +    AssignedDevRegion *d = opaque;
>> > +    uint8_t *in = d->u.r_virtbase + addr;
>>
>> Don't perform arithmetic with void pointers.
>
> Why not?
> We require gcc and it's a documented extension there.

We don't require GCC, Clang can be used for some targets already.
Though it supports this non-standard extension too.

It's a bad idea to introduce dependencies where it's not necessary.

In this case it's not much effort to add the identifier for the struct
and in fact the only benefit ever is that the lazy coder saves a few
key presses.
Jan Kiszka Aug. 28, 2012, 12:30 a.m. UTC | #6
Hi Blue,

thanks for the review. I addressed most of them, the others a commented
below.

On 2012-08-27 20:56, Blue Swirl wrote:
>> +typedef struct AssignedDevice {
>> +    PCIDevice dev;
>> +    PCIHostDeviceAddress host;
>> +    uint32_t dev_id;
>> +    uint32_t features;
>> +    int intpin;
>> +    AssignedDevRegion v_addrs[PCI_NUM_REGIONS - 1];
>> +    PCIDevRegions real_device;
>> +    PCIINTxRoute intx_route;
>> +    AssignedIRQType assigned_irq_type;
>> +    struct {
>> +#define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
>> +#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
>> +        uint32_t available;
>> +#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
>> +#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
>> +#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
>> +        uint32_t state;
>> +    } cap;
>> +    uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE];
>> +    uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE];
>> +    int msi_virq_nr;
>> +    int *msi_virq;
>> +    MSIXTableEntry *msix_table;
>> +    target_phys_addr_t msix_table_addr;
>> +    uint16_t msix_max;
>> +    MemoryRegion mmio;
>> +    char *configfd_name;
> 
> const? Not if this would mean more casts.

DEFINE_PROP_STRING, where this is used, doesn't allow this.

...
>> +    } else {
>> +        uint32_t port = addr + dev_region->u.r_baseport;
>> +
>> +        if (data) {
>> +            DEBUG("out data=%lx, size=%d, e_phys=%lx, host=%x\n",
>> +                  *data, size, addr, port);
>> +            switch (size) {
>> +            case 1:
>> +                outb(*data, port);
>> +                break;
>> +            case 2:
>> +                outw(*data, port);
>> +                break;
>> +            case 4:
>> +                outl(*data, port);
>> +                break;
> 
> Maybe add case 8: and default: with abort(), also below.

PIO is never 8 bytes long, the generic layer protects us.

...
>> +
>> +    fclose(f);
>> +
>> +    /* read and fill vendor ID */
>> +    v = get_real_vendor_id(dir, &id);
>> +    if (v) {
>> +        return 1;
>> +    }
>> +    pci_dev->dev.config[0] = id & 0xff;
>> +    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
>> +
>> +    /* read and fill device ID */
>> +    v = get_real_device_id(dir, &id);
>> +    if (v) {
>> +        return 1;
>> +    }
>> +    pci_dev->dev.config[2] = id & 0xff;
>> +    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
>> +
>> +    pci_word_test_and_clear_mask(pci_dev->emulate_config_write + PCI_COMMAND,
>> +                                 PCI_COMMAND_MASTER | PCI_COMMAND_INTX_DISABLE);
>> +
>> +    dev->region_number = r;
>> +    return 0;
>> +}
> 
> Pretty long function, how about refactoring?

Possibly, but I'd prefer to do such changes in-tree, after the more
important refactoring on MSI[-X] is done.

...
>> +    if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
>> +        uint8_t *pos = pci_dev->config + pci_dev->msi_cap;
>> +        MSIMessage msg;
>> +        int virq;
>> +
>> +        msg.address = pci_get_long(pos + PCI_MSI_ADDRESS_LO);
>> +        msg.data = pci_get_word(pos + PCI_MSI_DATA_32);
>> +        virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +        if (virq < 0) {
>> +            perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route");
>> +            return;
>> +        }
>> +
>> +        assigned_dev->msi_virq = g_malloc(sizeof(*assigned_dev->msi_virq));
> 
> Is this ever freed?

Yep, in free_msi_virqs. If you think you spotted a path where this is
not the case, let me know.

...
>> +
>> +static Property da_properties[] = {
> 
> const?

Nope, properties must remain writable.

> 
>> +    DEFINE_PROP_PCI_HOST_DEVADDR("host", AssignedDevice, host),
>> +    DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
>> +                    ASSIGNED_DEVICE_PREFER_MSI_BIT, false),
>> +    DEFINE_PROP_BIT("share_intx", AssignedDevice, features,
>> +                    ASSIGNED_DEVICE_SHARE_INTX_BIT, true),
>> +    DEFINE_PROP_INT32("bootindex", AssignedDevice, bootindex, -1),
>> +    DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
>> +    DEFINE_PROP_END_OF_LIST(),
>> +};
>> +

Jan
Michael Tokarev Aug. 28, 2012, 7:35 a.m. UTC | #7
On 27.08.2012 22:56, Blue Swirl wrote:
[]
>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint8_t *in = d->u.r_virtbase + addr;
> 
> Don't perform arithmetic with void pointers.

There are a few places in common qemu code which does this for a very
long time.  So I guess it is safe now.

/mjt
Anthony Liguori Aug. 28, 2012, 12:57 p.m. UTC | #8
Andreas Färber <afaerber@suse.de> writes:

> Hi,
>
> Am 27.08.2012 08:28, schrieb Jan Kiszka:
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>> 
>> This adds PCI device assignment for i386 targets using the classic KVM
>> interfaces. This version is 100% identical to what is being maintained
>> in qemu-kvm for several years and is supported by libvirt as well. It is
>> expected to remain relevant for another couple of years until kernels
>> without full-features and performance-wise equivalent VFIO support are
>> obsolete.
>> 
>> A refactoring to-do that should be done in-tree is to model MSI and
>> MSI-X support via the generic PCI layer, similar to what VFIO is already
>> doing for MSI-X. This should improve the correctness and clean up the
>> code from duplicate logic.
>> 
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>  hw/kvm/Makefile.objs |    2 +-
>>  hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 1930 insertions(+), 1 deletions(-)
>>  create mode 100644 hw/kvm/pci-assign.c
> [...]
>> diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
>> new file mode 100644
>> index 0000000..9cce02c
>> --- /dev/null
>> +++ b/hw/kvm/pci-assign.c
>> @@ -0,0 +1,1929 @@
>> +/*
>> + * Copyright (c) 2007, Neocleus Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms and conditions of the GNU General Public License,
>> + * version 2, as published by the Free Software Foundation.
>
> The downside of accepting this into qemu.git is that it gets us a huge
> blob of GPLv2-only code without history of contributors for GPLv2+
> relicensing...

That is 100% okay.

Regards,

Anthony Liguori

>
>> + *
>> + * This program is distributed in the hope it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
>> + * more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along with
>> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
>> + * Place - Suite 330, Boston, MA 02111-1307 USA.
>
> (Expect the usual GNU address reminder here.)
>
>> + *
>> + *
>> + *  Assign a PCI device from the host to a guest VM.
>> + *
>> + *  Adapted for KVM by Qumranet.
>> + *
>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
>> + */
>> +#include <stdio.h>
>> +#include <unistd.h>
>> +#include <sys/io.h>
>> +#include <sys/mman.h>
>> +#include <sys/types.h>
>> +#include <sys/stat.h>
>> +#include "hw/hw.h"
>> +#include "hw/pc.h"
>> +#include "qemu-error.h"
>> +#include "console.h"
>> +#include "hw/loader.h"
>> +#include "monitor.h"
>> +#include "range.h"
>> +#include "sysemu.h"
>> +#include "hw/pci.h"
>> +#include "hw/msi.h"
>
>> +#include "kvm_i386.h"
>
> Am I correct to understand we compile this only for i386 / x86_64?
> (apic.o in kvm/Makefile.objs hints in that direction) You may want to
> update the description in the comment above accordingly, also mentioning
> that this is some deprecated backwards-compatibility thing.
>
> Regards,
> Andreas
>
> -- 
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 28, 2012, 3:04 p.m. UTC | #9
On Mon, Aug 27, 2012 at 08:28:41AM +0200, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> This adds PCI device assignment for i386 targets using the classic KVM
> interfaces. This version is 100% identical to what is being maintained
> in qemu-kvm for several years and is supported by libvirt as well. It is
> expected to remain relevant for another couple of years until kernels
> without full-features and performance-wise equivalent VFIO support are
> obsolete.
> 
> A refactoring to-do that should be done in-tree is to model MSI and
> MSI-X support via the generic PCI layer, similar to what VFIO is already
> doing for MSI-X. This should improve the correctness and clean up the
> code from duplicate logic.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>

Don't see anything major to object to here,
in particular because it's well-contained.
It would be nice to add, in commit log,
the commit ID from qemu-kvm which has code
closest to this.

All comments below can be addressed either
downstream before merging or upstream after
this code is merged.

> ---
>  hw/kvm/Makefile.objs |    2 +-
>  hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 1930 insertions(+), 1 deletions(-)
>  create mode 100644 hw/kvm/pci-assign.c
> 
> diff --git a/hw/kvm/Makefile.objs b/hw/kvm/Makefile.objs
> index 226497a..f620d7f 100644
> --- a/hw/kvm/Makefile.objs
> +++ b/hw/kvm/Makefile.objs
> @@ -1 +1 @@
> -obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o
> +obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o
> diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
> new file mode 100644
> index 0000000..9cce02c
> --- /dev/null
> +++ b/hw/kvm/pci-assign.c
> @@ -0,0 +1,1929 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + *
> + *  Assign a PCI device from the host to a guest VM.
> + *
> + *  Adapted for KVM by Qumranet.
> + *
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> + */
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <sys/io.h>
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include "hw/hw.h"
> +#include "hw/pc.h"
> +#include "qemu-error.h"
> +#include "console.h"
> +#include "hw/loader.h"
> +#include "monitor.h"
> +#include "range.h"
> +#include "sysemu.h"
> +#include "hw/pci.h"
> +#include "hw/msi.h"
> +#include "kvm_i386.h"
> +
> +#define MSIX_PAGE_SIZE 0x1000
> +
> +/* From linux/ioport.h */
> +#define IORESOURCE_IO       0x00000100  /* Resource type */
> +#define IORESOURCE_MEM      0x00000200
> +#define IORESOURCE_IRQ      0x00000400
> +#define IORESOURCE_DMA      0x00000800
> +#define IORESOURCE_PREFETCH 0x00002000  /* No side effects */
> +
> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
> +
> +#ifdef DEVICE_ASSIGNMENT_DEBUG
> +#define DEBUG(fmt, ...)                                       \
> +    do {                                                      \
> +        fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);  \
> +    } while (0)
> +#else
> +#define DEBUG(fmt, ...)
> +#endif
> +
> +typedef struct {
> +    int type;           /* Memory or port I/O */
> +    int valid;
> +    uint32_t base_addr;
> +    uint32_t size;    /* size of the region */
> +    int resource_fd;
> +} PCIRegion;
> +
> +typedef struct {
> +    uint8_t bus, dev, func; /* Bus inside domain, device and function */
> +    int irq;                /* IRQ number */
> +    uint16_t region_number; /* number of active regions */
> +
> +    /* Port I/O or MMIO Regions */
> +    PCIRegion regions[PCI_NUM_REGIONS - 1];
> +    int config_fd;
> +} PCIDevRegions;
> +
> +typedef struct {
> +    MemoryRegion container;
> +    MemoryRegion real_iomem;
> +    union {
> +        void *r_virtbase;    /* mmapped access address for memory regions */
> +        uint32_t r_baseport; /* the base guest port for I/O regions */
> +    } u;
> +    pcibus_t e_size;    /* emulated size of region in bytes */
> +    pcibus_t r_size;    /* real size of region in bytes */
> +    PCIRegion *region;
> +} AssignedDevRegion;
> +
> +#define ASSIGNED_DEVICE_PREFER_MSI_BIT  0
> +#define ASSIGNED_DEVICE_SHARE_INTX_BIT  1
> +
> +#define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
> +#define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT)
> +
> +typedef struct {
> +    uint32_t addr_lo;
> +    uint32_t addr_hi;
> +    uint32_t data;
> +    uint32_t ctrl;
> +} MSIXTableEntry;
> +
> +typedef enum AssignedIRQType {
> +    ASSIGNED_IRQ_NONE = 0,
> +    ASSIGNED_IRQ_INTX_HOST_INTX,
> +    ASSIGNED_IRQ_INTX_HOST_MSI,
> +    ASSIGNED_IRQ_MSI,
> +    ASSIGNED_IRQ_MSIX
> +} AssignedIRQType;
> +
> +typedef struct AssignedDevice {
> +    PCIDevice dev;
> +    PCIHostDeviceAddress host;
> +    uint32_t dev_id;
> +    uint32_t features;
> +    int intpin;

This field seems unused.

> +    AssignedDevRegion v_addrs[PCI_NUM_REGIONS - 1];
> +    PCIDevRegions real_device;
> +    PCIINTxRoute intx_route;
> +    AssignedIRQType assigned_irq_type;
> +    struct {
> +#define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
> +#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
> +        uint32_t available;
> +#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
> +#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
> +#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
> +        uint32_t state;
> +    } cap;
> +    uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE];
> +    int msi_virq_nr;
> +    int *msi_virq;
> +    MSIXTableEntry *msix_table;
> +    target_phys_addr_t msix_table_addr;
> +    uint16_t msix_max;
> +    MemoryRegion mmio;
> +    char *configfd_name;
> +    int32_t bootindex;
> +} AssignedDevice;
> +
> +static void assigned_dev_update_irq_routing(PCIDevice *dev);
> +
> +static void assigned_dev_load_option_rom(AssignedDevice *dev);
> +
> +static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
> +
> +static uint64_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
> +                                       target_phys_addr_t addr, int size,
> +                                       uint64_t *data)
> +{
> +    uint64_t val = 0;
> +    int fd = dev_region->region->resource_fd;
> +
> +    if (fd >= 0) {
> +        if (data) {
> +            DEBUG("pwrite data=%lx, size=%d, e_phys=%lx, addr=%lx\n",
> +                  *data, size, addr, addr);
> +            if (pwrite(fd, data, size, addr) != size) {
> +                fprintf(stderr, "%s - pwrite failed %s\n",
> +                        __func__, strerror(errno));
> +            }
> +        } else {
> +            if (pread(fd, &val, size, addr) != size) {
> +                fprintf(stderr, "%s - pread failed %s\n",
> +                        __func__, strerror(errno));
> +                val = (1UL << (size * 8)) - 1;
> +            }
> +            DEBUG("pread val=%lx, size=%d, e_phys=%lx, addr=%lx\n",
> +                  val, size, addr, addr);
> +        }
> +    } else {
> +        uint32_t port = addr + dev_region->u.r_baseport;
> +
> +        if (data) {
> +            DEBUG("out data=%lx, size=%d, e_phys=%lx, host=%x\n",
> +                  *data, size, addr, port);
> +            switch (size) {
> +            case 1:
> +                outb(*data, port);
> +                break;
> +            case 2:
> +                outw(*data, port);
> +                break;
> +            case 4:
> +                outl(*data, port);
> +                break;
> +            }
> +        } else {
> +            switch (size) {
> +            case 1:
> +                val = inb(port);
> +                break;
> +            case 2:
> +                val = inw(port);
> +                break;
> +            case 4:
> +                val = inl(port);
> +                break;
> +            }
> +            DEBUG("in data=%lx, size=%d, e_phys=%lx, host=%x\n",
> +                  val, size, addr, port);
> +        }
> +    }
> +    return val;
> +}
> +
> +static void assigned_dev_ioport_write(void *opaque, target_phys_addr_t addr,
> +                                      uint64_t data, unsigned size)
> +{
> +    assigned_dev_ioport_rw(opaque, addr, size, &data);
> +}
> +
> +static uint64_t assigned_dev_ioport_read(void *opaque,
> +                                         target_phys_addr_t addr, unsigned size)
> +{
> +    return assigned_dev_ioport_rw(opaque, addr, size, NULL);
> +}
> +
> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *in = d->u.r_virtbase + addr;
> +    uint32_t r;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *in = d->u.r_virtbase + addr;
> +    uint32_t r;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *in = d->u.r_virtbase + addr;
> +    uint32_t r;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *out = d->u.r_virtbase + addr;
> +
> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *out = d->u.r_virtbase + addr;
> +
> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *out = d->u.r_virtbase + addr;
> +
> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> +    *out = val;
> +}
> +
> +static const MemoryRegionOps slow_bar_ops = {
> +    .old_mmio = {
> +        .read = { slow_bar_readb, slow_bar_readw, slow_bar_readl, },
> +        .write = { slow_bar_writeb, slow_bar_writew, slow_bar_writel, },
> +    },
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +};
> +
> +static void assigned_dev_iomem_setup(PCIDevice *pci_dev, int region_num,
> +                                     pcibus_t e_size)
> +{
> +    AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> +
> +    if (e_size > 0) {
> +        memory_region_init(&region->container, "assigned-dev-container",
> +                           e_size);
> +        memory_region_add_subregion(&region->container, 0, &region->real_iomem);
> +
> +        /* deal with MSI-X MMIO page */
> +        if (real_region->base_addr <= r_dev->msix_table_addr &&
> +                real_region->base_addr + real_region->size >
> +                r_dev->msix_table_addr) {
> +            int offset = r_dev->msix_table_addr - real_region->base_addr;
> +
> +            memory_region_add_subregion_overlap(&region->container,
> +                                                offset,
> +                                                &r_dev->mmio,
> +                                                1);
> +        }
> +    }
> +}
> +
> +static const MemoryRegionOps assigned_dev_ioport_ops = {
> +    .read = assigned_dev_ioport_read,
> +    .write = assigned_dev_ioport_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +};
> +
> +static void assigned_dev_ioport_setup(PCIDevice *pci_dev, int region_num,
> +                                      pcibus_t size)
> +{
> +    AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +
> +    region->e_size = size;
> +    memory_region_init(&region->container, "assigned-dev-container", size);
> +    memory_region_init_io(&region->real_iomem, &assigned_dev_ioport_ops,
> +                          r_dev->v_addrs + region_num,
> +                          "assigned-dev-iomem", size);
> +    memory_region_add_subregion(&region->container, 0, &region->real_iomem);
> +}
> +
> +static uint32_t assigned_dev_pci_read(PCIDevice *d, int pos, int len)
> +{
> +    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
> +    uint32_t val;
> +    ssize_t ret;
> +    int fd = pci_dev->real_device.config_fd;
> +
> +again:
> +    ret = pread(fd, &val, len, pos);
> +    if (ret != len) {
> +        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
> +            goto again;
> +        }
> +
> +        hw_error("pci read failed, ret = %zd errno = %d\n", ret, errno);
> +    }
> +
> +    return val;
> +}
> +
> +static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
> +{
> +    return (uint8_t)assigned_dev_pci_read(d, pos, 1);
> +}
> +
> +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
> +{
> +    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
> +    ssize_t ret;
> +    int fd = pci_dev->real_device.config_fd;
> +
> +again:
> +    ret = pwrite(fd, &val, len, pos);
> +    if (ret != len) {
> +        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
> +            goto again;
> +        }
> +
> +        hw_error("pci write failed, ret = %zd errno = %d\n", ret, errno);
> +    }
> +}
> +
> +static void assigned_dev_emulate_config_read(AssignedDevice *dev,
> +                                             uint32_t offset, uint32_t len)
> +{
> +    memset(dev->emulate_config_read + offset, 0xff, len);
> +}
> +
> +static void assigned_dev_direct_config_read(AssignedDevice *dev,
> +                                            uint32_t offset, uint32_t len)
> +{
> +    memset(dev->emulate_config_read + offset, 0, len);
> +}
> +
> +static void assigned_dev_direct_config_write(AssignedDevice *dev,
> +                                             uint32_t offset, uint32_t len)
> +{
> +    memset(dev->emulate_config_write + offset, 0, len);
> +}
> +
> +static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start)
> +{
> +    int id;
> +    int max_cap = 48;
> +    int pos = start ? start : PCI_CAPABILITY_LIST;
> +    int status;
> +
> +    status = assigned_dev_pci_read_byte(d, PCI_STATUS);
> +    if ((status & PCI_STATUS_CAP_LIST) == 0) {
> +        return 0;
> +    }
> +
> +    while (max_cap--) {
> +        pos = assigned_dev_pci_read_byte(d, pos);
> +        if (pos < 0x40) {
> +            break;
> +        }
> +
> +        pos &= ~3;
> +        id = assigned_dev_pci_read_byte(d, pos + PCI_CAP_LIST_ID);
> +
> +        if (id == 0xff) {
> +            break;
> +        }
> +        if (id == cap) {
> +            return pos;
> +        }
> +
> +        pos += PCI_CAP_LIST_NEXT;
> +    }
> +    return 0;
> +}
> +
> +static int assigned_dev_register_regions(PCIRegion *io_regions,
> +                                         unsigned long regions_num,
> +                                         AssignedDevice *pci_dev)
> +{
> +    uint32_t i;
> +    PCIRegion *cur_region = io_regions;
> +
> +    for (i = 0; i < regions_num; i++, cur_region++) {
> +        if (!cur_region->valid) {
> +            continue;
> +        }
> +
> +        /* handle memory io regions */
> +        if (cur_region->type & IORESOURCE_MEM) {
> +            int t = cur_region->type & IORESOURCE_PREFETCH
> +                ? PCI_BASE_ADDRESS_MEM_PREFETCH
> +                : PCI_BASE_ADDRESS_SPACE_MEMORY;
> +
> +            /* map physical memory */
> +            pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size,
> +                                                    PROT_WRITE | PROT_READ,
> +                                                    MAP_SHARED,
> +                                                    cur_region->resource_fd,
> +                                                    (off_t)0);
> +
> +            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
> +                pci_dev->v_addrs[i].u.r_virtbase = NULL;
> +                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
> +                        "\n", __func__,
> +                        (uint32_t) (cur_region->base_addr));
> +                return -1;
> +            }
> +
> +            pci_dev->v_addrs[i].r_size = cur_region->size;
> +            pci_dev->v_addrs[i].e_size = 0;
> +
> +            /* add offset */
> +            pci_dev->v_addrs[i].u.r_virtbase +=
> +                (cur_region->base_addr & 0xFFF);
> +
> +            if (cur_region->size & 0xFFF) {
> +                fprintf(stderr, "PCI region %d at address 0x%llx "
> +                        "has size 0x%x, which is not a multiple of 4K. "
> +                        "You might experience some performance hit "
> +                        "due to that.\n",
> +                        i, (unsigned long long)cur_region->base_addr,
> +                        cur_region->size);
> +                memory_region_init_io(&pci_dev->v_addrs[i].real_iomem,
> +                                      &slow_bar_ops, &pci_dev->v_addrs[i],
> +                                      "assigned-dev-slow-bar",
> +                                      cur_region->size);
> +            } else {
> +                void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
> +                char name[32];
> +                snprintf(name, sizeof(name), "%s.bar%d",
> +                         object_get_typename(OBJECT(pci_dev)), i);
> +                memory_region_init_ram_ptr(&pci_dev->v_addrs[i].real_iomem,
> +                                           name, cur_region->size,
> +                                           virtbase);
> +                vmstate_register_ram(&pci_dev->v_addrs[i].real_iomem,
> +                                     &pci_dev->dev.qdev);
> +            }
> +
> +            assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size);
> +            pci_register_bar((PCIDevice *) pci_dev, i, t,
> +                             &pci_dev->v_addrs[i].container);
> +            continue;
> +        } else {
> +            /* handle port io regions */
> +            uint32_t val;
> +            int ret;
> +
> +            /* Test kernel support for ioport resource read/write.  Old
> +             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
> +             * so should return EINVAL for a 3 byte read */
> +            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
> +            if (ret >= 0) {
> +                fprintf(stderr, "Unexpected return from I/O port read: %d\n",
> +                        ret);
> +                abort();
> +            } else if (errno != EINVAL) {
> +                fprintf(stderr, "Kernel doesn't support ioport resource "
> +                                "access, hiding this region.\n");
> +                close(pci_dev->v_addrs[i].region->resource_fd);
> +                cur_region->valid = 0;
> +                continue;
> +            }
> +
> +            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
> +            pci_dev->v_addrs[i].r_size = cur_region->size;
> +            pci_dev->v_addrs[i].e_size = 0;
> +
> +            assigned_dev_ioport_setup(&pci_dev->dev, i, cur_region->size);
> +            pci_register_bar((PCIDevice *) pci_dev, i,
> +                             PCI_BASE_ADDRESS_SPACE_IO,
> +                             &pci_dev->v_addrs[i].container);
> +        }
> +    }
> +
> +    /* success */
> +    return 0;
> +}
> +
> +static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
> +{
> +    FILE *f;
> +    char name[128];
> +    long id;
> +
> +    snprintf(name, sizeof(name), "%s%s", devpath, idname);
> +    f = fopen(name, "r");
> +    if (f == NULL) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return -1;
> +    }
> +    if (fscanf(f, "%li\n", &id) == 1) {
> +        *val = id;
> +    } else {
> +        return -1;
> +    }
> +    fclose(f);
> +
> +    return 0;
> +}
> +
> +static int get_real_vendor_id(const char *devpath, uint16_t *val)
> +{
> +    return get_real_id(devpath, "vendor", val);
> +}
> +
> +static int get_real_device_id(const char *devpath, uint16_t *val)
> +{
> +    return get_real_id(devpath, "device", val);
> +}
> +
> +static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg,
> +                           uint8_t r_bus, uint8_t r_dev, uint8_t r_func)
> +{
> +    char dir[128], name[128];
> +    int fd, r = 0, v;
> +    FILE *f;
> +    unsigned long long start, end, size, flags;
> +    uint16_t id;
> +    PCIRegion *rp;
> +    PCIDevRegions *dev = &pci_dev->real_device;
> +
> +    dev->region_number = 0;
> +
> +    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
> +             r_seg, r_bus, r_dev, r_func);
> +
> +    snprintf(name, sizeof(name), "%sconfig", dir);
> +
> +    if (pci_dev->configfd_name && *pci_dev->configfd_name) {
> +        if (qemu_isdigit(pci_dev->configfd_name[0])) {
> +            dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0);
> +        } else {
> +            dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name);
> +            if (dev->config_fd < 0) {
> +                fprintf(stderr, "%s: (%s) unkown\n", __func__,
> +                        pci_dev->configfd_name);
> +                return 1;
> +            }
> +        }
> +    } else {
> +        dev->config_fd = open(name, O_RDWR);
> +
> +        if (dev->config_fd == -1) {
> +            fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +            return 1;
> +        }
> +    }
> +again:
> +    r = read(dev->config_fd, pci_dev->dev.config,
> +             pci_config_size(&pci_dev->dev));
> +    if (r < 0) {
> +        if (errno == EINTR || errno == EAGAIN) {
> +            goto again;
> +        }
> +        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
> +    }
> +
> +    /* Restore or clear multifunction, this is always controlled by qemu */
> +    if (pci_dev->dev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> +        pci_dev->dev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
> +    } else {
> +        pci_dev->dev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
> +    }
> +
> +    /* Clear host resource mapping info.  If we choose not to register a
> +     * BAR, such as might be the case with the option ROM, we can get
> +     * confusing, unwritable, residual addresses from the host here. */
> +    memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
> +    memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4);
> +
> +    snprintf(name, sizeof(name), "%sresource", dir);
> +
> +    f = fopen(name, "r");
> +    if (f == NULL) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return 1;
> +    }
> +
> +    for (r = 0; r < PCI_ROM_SLOT; r++) {
> +        if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3) {
> +            break;
> +        }
> +
> +        rp = dev->regions + r;
> +        rp->valid = 0;
> +        rp->resource_fd = -1;
> +        size = end - start + 1;
> +        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
> +        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) {
> +            continue;
> +        }
> +        if (flags & IORESOURCE_MEM) {
> +            flags &= ~IORESOURCE_IO;
> +        } else {
> +            flags &= ~IORESOURCE_PREFETCH;
> +        }
> +        snprintf(name, sizeof(name), "%sresource%d", dir, r);
> +        fd = open(name, O_RDWR);
> +        if (fd == -1) {
> +            continue;
> +        }
> +        rp->resource_fd = fd;
> +
> +        rp->type = flags;
> +        rp->valid = 1;
> +        rp->base_addr = start;
> +        rp->size = size;
> +        pci_dev->v_addrs[r].region = rp;
> +        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
> +              r, rp->size, start, rp->type, rp->resource_fd);
> +    }
> +
> +    fclose(f);
> +
> +    /* read and fill vendor ID */
> +    v = get_real_vendor_id(dir, &id);
> +    if (v) {
> +        return 1;
> +    }
> +    pci_dev->dev.config[0] = id & 0xff;
> +    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
> +
> +    /* read and fill device ID */
> +    v = get_real_device_id(dir, &id);
> +    if (v) {
> +        return 1;
> +    }
> +    pci_dev->dev.config[2] = id & 0xff;
> +    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
> +
> +    pci_word_test_and_clear_mask(pci_dev->emulate_config_write + PCI_COMMAND,
> +                                 PCI_COMMAND_MASTER | PCI_COMMAND_INTX_DISABLE);
> +
> +    dev->region_number = r;
> +    return 0;
> +}
> +
> +static void free_msi_virqs(AssignedDevice *dev)
> +{
> +    int i;
> +
> +    for (i = 0; i < dev->msi_virq_nr; i++) {
> +        if (dev->msi_virq[i] >= 0) {
> +            kvm_irqchip_release_virq(kvm_state, dev->msi_virq[i]);
> +            dev->msi_virq[i] = -1;
> +        }
> +    }
> +    g_free(dev->msi_virq);
> +    dev->msi_virq = NULL;
> +    dev->msi_virq_nr = 0;
> +}
> +
> +static void free_assigned_device(AssignedDevice *dev)
> +{
> +    int i;
> +
> +    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
> +        assigned_dev_unregister_msix_mmio(dev);
> +    }
> +    for (i = 0; i < dev->real_device.region_number; i++) {
> +        PCIRegion *pci_region = &dev->real_device.regions[i];
> +        AssignedDevRegion *region = &dev->v_addrs[i];
> +
> +        if (!pci_region->valid) {
> +            continue;
> +        }
> +        if (pci_region->type & IORESOURCE_IO) {
> +            if (region->u.r_baseport) {
> +                memory_region_del_subregion(&region->container,
> +                                            &region->real_iomem);
> +                memory_region_destroy(&region->real_iomem);
> +                memory_region_destroy(&region->container);
> +            }
> +        } else if (pci_region->type & IORESOURCE_MEM) {
> +            if (region->u.r_virtbase) {
> +                memory_region_del_subregion(&region->container,
> +                                            &region->real_iomem);
> +
> +                /* Remove MSI-X table subregion */
> +                if (pci_region->base_addr <= dev->msix_table_addr &&
> +                    pci_region->base_addr + pci_region->size >
> +                    dev->msix_table_addr) {
> +                    memory_region_del_subregion(&region->container,
> +                                                &dev->mmio);
> +                }
> +
> +                memory_region_destroy(&region->real_iomem);
> +                memory_region_destroy(&region->container);
> +                if (munmap(region->u.r_virtbase,
> +                           (pci_region->size + 0xFFF) & 0xFFFFF000)) {
> +                    fprintf(stderr,
> +                            "Failed to unmap assigned device region: %s\n",
> +                            strerror(errno));
> +                }
> +            }
> +        }
> +        if (pci_region->resource_fd >= 0) {
> +            close(pci_region->resource_fd);
> +        }
> +    }
> +
> +    if (dev->real_device.config_fd >= 0) {
> +        close(dev->real_device.config_fd);
> +    }
> +
> +    free_msi_virqs(dev);
> +}
> +
> +static void assign_failed_examine(AssignedDevice *dev)
> +{
> +    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> +    uint16_t vendor_id, device_id;
> +    int r;
> +
> +    sprintf(dir, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> +            dev->host.domain, dev->host.bus, dev->host.slot,
> +            dev->host.function);
> +
> +    sprintf(name, "%sdriver", dir);
> +
> +    r = readlink(name, driver, sizeof(driver));
> +    if ((r <= 0) || r >= sizeof(driver)) {
> +        goto fail;
> +    }
> +
> +    ns = strrchr(driver, '/');
> +    if (!ns) {
> +        goto fail;
> +    }
> +
> +    ns++;
> +
> +    if (get_real_vendor_id(dir, &vendor_id) ||
> +        get_real_device_id(dir, &device_id)) {
> +        goto fail;
> +    }
> +
> +    fprintf(stderr, "*** The driver '%s' is occupying your device "
> +                    "%04x:%02x:%02x.%x.\n",
> +            ns, dev->host.domain, dev->host.bus, dev->host.slot,
> +            dev->host.function);
> +    fprintf(stderr, "***\n");
> +    fprintf(stderr, "*** You can try the following commands to free it:\n");
> +    fprintf(stderr, "***\n");
> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> +                    "new_id\n", vendor_id, device_id);
> +    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> +                    "%s/unbind\n",
> +            dev->host.domain, dev->host.bus, dev->host.slot,
> +            dev->host.function, ns);
> +    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> +                    "pci-stub/bind\n",
> +            dev->host.domain, dev->host.bus, dev->host.slot,
> +            dev->host.function);
> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> +                    "/remove_id\n", vendor_id, device_id);
> +    fprintf(stderr, "***\n");
> +
> +    return;
> +
> +fail:
> +    fprintf(stderr, "Couldn't find out why.\n");
> +}
> +
> +static int assign_device(AssignedDevice *dev)
> +{
> +    uint32_t flags = KVM_DEV_ASSIGN_ENABLE_IOMMU;
> +    int r;
> +
> +    /* Only pass non-zero PCI segment to capable module */
> +    if (!kvm_check_extension(kvm_state, KVM_CAP_PCI_SEGMENT) &&
> +        dev->host.domain) {
> +        fprintf(stderr, "Can't assign device inside non-zero PCI segment "
> +                "as this KVM module doesn't support it.\n");
> +        return -ENODEV;
> +    }
> +
> +    if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) {
> +        fprintf(stderr, "No IOMMU found.  Unable to assign device \"%s\"\n",
> +                dev->dev.qdev.id);
> +        return -ENODEV;
> +    }
> +
> +    if (dev->features & ASSIGNED_DEVICE_SHARE_INTX_MASK &&
> +        kvm_has_intx_set_mask()) {
> +        flags |= KVM_DEV_ASSIGN_PCI_2_3;
> +    }
> +
> +    r = kvm_device_pci_assign(kvm_state, &dev->host, flags, &dev->dev_id);
> +    if (r < 0) {
> +        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> +                dev->dev.qdev.id, strerror(-r));
> +
> +        switch (r) {
> +        case -EBUSY:
> +            assign_failed_examine(dev);
> +            break;
> +        default:
> +            break;
> +        }
> +    }
> +    return r;
> +}
> +
> +static bool check_irqchip_in_kernel(void)
> +{
> +    if (kvm_irqchip_in_kernel()) {
> +        return true;
> +    }
> +    error_report("pci-assign: error: requires KVM with in-kernel irqchip "
> +                 "enabled");
> +    return false;
> +}
> +
> +static int assign_intx(AssignedDevice *dev)
> +{
> +    AssignedIRQType new_type;
> +    PCIINTxRoute intx_route;
> +    bool intx_host_msi;
> +    int r;
> +
> +    /* Interrupt PIN 0 means don't use INTx */
> +    if (assigned_dev_pci_read_byte(&dev->dev, PCI_INTERRUPT_PIN) == 0) {
> +        pci_device_set_intx_routing_notifier(&dev->dev, NULL);
> +        return 0;
> +    }
> +
> +    if (!check_irqchip_in_kernel()) {
> +        return -ENOTSUP;
> +    }
> +
> +    pci_device_set_intx_routing_notifier(&dev->dev,
> +                                         assigned_dev_update_irq_routing);
> +
> +    intx_route = pci_device_route_intx_to_irq(&dev->dev, 0);
> +    assert(intx_route.mode != PCI_INTX_INVERTED);
> +
> +    if (dev->intx_route.mode == intx_route.mode &&
> +        dev->intx_route.irq == intx_route.irq) {
> +        return 0;
> +    }
> +
> +    switch (dev->assigned_irq_type) {
> +    case ASSIGNED_IRQ_INTX_HOST_INTX:
> +    case ASSIGNED_IRQ_INTX_HOST_MSI:
> +        intx_host_msi = dev->assigned_irq_type == ASSIGNED_IRQ_INTX_HOST_MSI;
> +        r = kvm_device_intx_deassign(kvm_state, dev->dev_id, intx_host_msi);
> +        break;
> +    case ASSIGNED_IRQ_MSI:
> +        r = kvm_device_msi_deassign(kvm_state, dev->dev_id);
> +        break;
> +    case ASSIGNED_IRQ_MSIX:
> +        r = kvm_device_msix_deassign(kvm_state, dev->dev_id);
> +        break;
> +    default:
> +        r = 0;
> +        break;
> +    }
> +    if (r) {
> +        perror("assign_intx: deassign");
> +    }
> +    dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
> +
> +    if (intx_route.mode == PCI_INTX_DISABLED) {
> +        dev->intx_route = intx_route;
> +        return 0;
> +    }
> +
> +retry:

Better refactor this, factor code out and avoid
implementing loops using goto.

> +    if (dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK &&
> +        dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> +        intx_host_msi = true;
> +        new_type = ASSIGNED_IRQ_INTX_HOST_MSI;
> +    } else {
> +        intx_host_msi = false;
> +        new_type = ASSIGNED_IRQ_INTX_HOST_INTX;
> +    }
> +
> +    r = kvm_device_intx_assign(kvm_state, dev->dev_id, intx_host_msi,
> +                               intx_route.irq);
> +    if (r < 0) {
> +        if (r == -EIO && !(dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK) &&
> +            dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> +            /* Retry with host-side MSI. There might be an IRQ conflict and
> +             * either the kernel or the device doesn't support sharing. */
> +            fprintf(stderr,
> +                    "Host-side INTx sharing not supported, "
> +                    "using MSI instead.\n"
> +                    "Some devices do not to work properly in this mode.\n");
> +            dev->features |= ASSIGNED_DEVICE_PREFER_MSI_MASK;
> +            goto retry;
> +        }
> +        fprintf(stderr, "Failed to assign irq for \"%s\": %s\n",
> +                dev->dev.qdev.id, strerror(-r));
> +        fprintf(stderr, "Perhaps you are assigning a device "
> +                "that shares an IRQ with another device?\n");

This is now supported no? So arguably error message is wrong.
Also report errors with stderr here and elsewhere.

> +        return r;
> +    }
> +
> +    dev->intx_route = intx_route;
> +    dev->assigned_irq_type = new_type;
> +    return r;
> +}
> +
> +static void deassign_device(AssignedDevice *dev)
> +{
> +    int r;
> +
> +    r = kvm_device_pci_deassign(kvm_state, dev->dev_id);
> +    if (r < 0) {
> +        fprintf(stderr, "Failed to deassign device \"%s\" : %s\n",
> +                dev->dev.qdev.id, strerror(-r));

assert?

> +    }
> +}
> +
> +/* The pci config space got updated. Check if irq numbers have changed
> + * for our devices
> + */
> +static void assigned_dev_update_irq_routing(PCIDevice *dev)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, dev);
> +    Error *err = NULL;
> +    int r;
> +
> +    r = assign_intx(assigned_dev);
> +    if (r < 0) {
> +        qdev_unplug(&dev->qdev, &err);
> +        assert(!err);
> +    }
> +}
> +

Lots of stuff below duplicates logic in hw/msix.c and
hw/msi.c.

> +static void assigned_dev_update_msi(PCIDevice *pci_dev)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap +
> +                                     PCI_MSI_FLAGS);
> +    int r;
> +
> +    /* Some guests gratuitously disable MSI even if they're not using it,
> +     * try to catch this by only deassigning irqs if the guest is using
> +     * MSI or intends to start. */
> +    if (assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSI ||
> +        (ctrl_byte & PCI_MSI_FLAGS_ENABLE)) {
> +        r = kvm_device_msi_deassign(kvm_state, assigned_dev->dev_id);
> +        /* -ENXIO means no assigned irq */
> +        if (r && r != -ENXIO) {
> +            perror("assigned_dev_update_msi: deassign irq");
> +        }
> +
> +        free_msi_virqs(assigned_dev);
> +
> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
> +        pci_device_set_intx_routing_notifier(pci_dev, NULL);
> +    }
> +
> +    if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
> +        uint8_t *pos = pci_dev->config + pci_dev->msi_cap;
> +        MSIMessage msg;
> +        int virq;
> +
> +        msg.address = pci_get_long(pos + PCI_MSI_ADDRESS_LO);
> +        msg.data = pci_get_word(pos + PCI_MSI_DATA_32);
> +        virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        if (virq < 0) {
> +            perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route");
> +            return;
> +        }
> +
> +        assigned_dev->msi_virq = g_malloc(sizeof(*assigned_dev->msi_virq));
> +        assigned_dev->msi_virq_nr = 1;
> +        assigned_dev->msi_virq[0] = virq;
> +        if (kvm_device_msi_assign(kvm_state, assigned_dev->dev_id, virq) < 0) {
> +            perror("assigned_dev_update_msi: kvm_device_msi_assign");
> +        }
> +
> +        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
> +        assigned_dev->intx_route.irq = -1;
> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSI;
> +    } else {
> +        assign_intx(assigned_dev);
> +    }
> +}
> +
> +static bool msix_masked(MSIXTableEntry *entry)
> +{
> +    return (entry->ctrl & cpu_to_le32(0x1)) != 0;
> +}
> +
> +static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
> +{
> +    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint16_t entries_nr = 0;
> +    int i, r = 0;
> +    MSIXTableEntry *entry = adev->msix_table;
> +    MSIMessage msg;
> +
> +    /* Get the usable entry number for allocating */
> +    for (i = 0; i < adev->msix_max; i++, entry++) {
> +        if (msix_masked(entry)) {
> +            continue;
> +        }
> +        entries_nr++;
> +    }
> +
> +    DEBUG("MSI-X entries: %d\n", entries_nr);
> +
> +    /* It's valid to enable MSI-X with all entries masked */
> +    if (!entries_nr) {
> +        return 0;
> +    }
> +
> +    r = kvm_device_msix_init_vectors(kvm_state, adev->dev_id, entries_nr);
> +    if (r != 0) {
> +        fprintf(stderr, "fail to set MSI-X entry number for MSIX! %s\n",
> +                strerror(-r));
> +        return r;
> +    }
> +
> +    free_msi_virqs(adev);
> +
> +    adev->msi_virq_nr = adev->msix_max;
> +    adev->msi_virq = g_malloc(adev->msix_max * sizeof(*adev->msi_virq));
> +
> +    entry = adev->msix_table;
> +    for (i = 0; i < adev->msix_max; i++, entry++) {
> +        adev->msi_virq[i] = -1;
> +
> +        if (msix_masked(entry)) {
> +            continue;
> +        }
> +
> +        msg.address = entry->addr_lo | ((uint64_t)entry->addr_hi << 32);
> +        msg.data = entry->data;
> +        r = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        if (r < 0) {
> +            return r;
> +        }
> +        adev->msi_virq[i] = r;
> +
> +        DEBUG("MSI-X vector %d, gsi %d, addr %08x_%08x, data %08x\n", i,
> +              r, entry->addr_hi, entry->addr_lo, entry->data);
> +
> +        r = kvm_device_msix_set_vector(kvm_state, adev->dev_id, i,
> +                                       adev->msi_virq[i]);
> +        if (r) {
> +            fprintf(stderr, "fail to set MSI-X entry! %s\n", strerror(-r));
> +            break;
> +        }
> +    }
> +
> +    return r;
> +}
> +
> +static void assigned_dev_update_msix(PCIDevice *pci_dev)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint16_t ctrl_word = pci_get_word(pci_dev->config + pci_dev->msix_cap +
> +                                      PCI_MSIX_FLAGS);
> +    int r;
> +
> +    /* Some guests gratuitously disable MSIX even if they're not using it,
> +     * try to catch this by only deassigning irqs if the guest is using
> +     * MSIX or intends to start. */

Above comment seems very unclear.

> +    if ((assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSIX) ||
> +        (ctrl_word & PCI_MSIX_FLAGS_ENABLE)) {
> +        r = kvm_device_msix_deassign(kvm_state, assigned_dev->dev_id);
> +        /* -ENXIO means no assigned irq */
> +        if (r && r != -ENXIO) {
> +            perror("assigned_dev_update_msix: deassign irq");
> +        }
> +
> +        free_msi_virqs(assigned_dev);
> +
> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
> +        pci_device_set_intx_routing_notifier(pci_dev, NULL);
> +    }
> +
> +    if (ctrl_word & PCI_MSIX_FLAGS_ENABLE) {
> +        if (assigned_dev_update_msix_mmio(pci_dev) < 0) {
> +            perror("assigned_dev_update_msix_mmio");
> +            return;
> +        }
> +
> +        if (assigned_dev->msi_virq_nr > 0) {
> +            if (kvm_device_msix_assign(kvm_state, assigned_dev->dev_id) < 0) {
> +                perror("assigned_dev_enable_msix: assign irq");
> +                return;
> +            }
> +        }
> +        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
> +        assigned_dev->intx_route.irq = -1;
> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSIX;
> +    } else {
> +        assign_intx(assigned_dev);
> +    }
> +}
> +
> +static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
> +                                             uint32_t address, int len)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint32_t virt_val = pci_default_read_config(pci_dev, address, len);
> +    uint32_t real_val, emulate_mask, full_emulation_mask;
> +
> +    emulate_mask = 0;
> +    memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len);
> +    emulate_mask = le32_to_cpu(emulate_mask);
> +
> +    full_emulation_mask = 0xffffffff >> (32 - len * 8);
> +
> +    if (emulate_mask != full_emulation_mask) {
> +        real_val = assigned_dev_pci_read(pci_dev, address, len);
> +        return (virt_val & emulate_mask) | (real_val & ~emulate_mask);
> +    } else {
> +        return virt_val;
> +    }
> +}
> +
> +static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
> +                                          uint32_t val, int len)
> +{
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint16_t old_cmd = pci_get_word(pci_dev->config + PCI_COMMAND);
> +    uint32_t emulate_mask, full_emulation_mask;
> +    int ret;
> +
> +    pci_default_write_config(pci_dev, address, val, len);
> +
> +    if (kvm_has_intx_set_mask() &&
> +        range_covers_byte(address, len, PCI_COMMAND + 1)) {
> +        bool intx_masked = (pci_get_word(pci_dev->config + PCI_COMMAND) &
> +                            PCI_COMMAND_INTX_DISABLE);
> +
> +        if (intx_masked != !!(old_cmd & PCI_COMMAND_INTX_DISABLE)) {
> +            ret = kvm_device_intx_set_mask(kvm_state, assigned_dev->dev_id,
> +                                           intx_masked);
> +            if (ret) {
> +                perror("assigned_dev_pci_write_config: set intx mask");

error_report? hardware_error?

> +            }
> +        }
> +    }
> +    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> +        if (range_covers_byte(address, len,
> +                              pci_dev->msi_cap + PCI_MSI_FLAGS)) {
> +            assigned_dev_update_msi(pci_dev);
> +        }
> +    }
> +    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
> +        if (range_covers_byte(address, len,
> +                              pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) {
> +            assigned_dev_update_msix(pci_dev);
> +        }
> +    }
> +
> +    emulate_mask = 0;
> +    memcpy(&emulate_mask, assigned_dev->emulate_config_write + address, len);
> +    emulate_mask = le32_to_cpu(emulate_mask);
> +
> +    full_emulation_mask = 0xffffffff >> (32 - len * 8);
> +
> +    if (emulate_mask != full_emulation_mask) {
> +        if (emulate_mask) {
> +            val &= ~emulate_mask;
> +            val |= assigned_dev_pci_read(pci_dev, address, len) & emulate_mask;
> +        }
> +        assigned_dev_pci_write(pci_dev, address, val, len);
> +    }
> +}
> +
> +static void assigned_dev_setup_cap_read(AssignedDevice *dev, uint32_t offset,
> +                                        uint32_t len)
> +{
> +    assigned_dev_direct_config_read(dev, offset, len);
> +    assigned_dev_emulate_config_read(dev, offset + PCI_CAP_LIST_NEXT, 1);
> +}
> +
> +static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
> +{
> +    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    PCIRegion *pci_region = dev->real_device.regions;
> +    int ret, pos;
> +
> +    /* Clear initial capabilities pointer and status copied from hw */
> +    pci_set_byte(pci_dev->config + PCI_CAPABILITY_LIST, 0);
> +    pci_set_word(pci_dev->config + PCI_STATUS,
> +                 pci_get_word(pci_dev->config + PCI_STATUS) &
> +                 ~PCI_STATUS_CAP_LIST);
> +
> +    /* Expose MSI capability
> +     * MSI capability is the 1st capability in capability config */
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0);
> +    if (pos != 0 && kvm_check_extension(kvm_state, KVM_CAP_ASSIGN_DEV_IRQ)) {
> +        if (!check_irqchip_in_kernel()) {
> +            return -ENOTSUP;
> +        }
> +        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;

again code below has lots of logic duplicated from msix.c
pcie etc. 

> +        /* Only 32-bit/no-mask currently supported */
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSI, pos, 10);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        pci_dev->msi_cap = pos;
> +
> +        pci_set_word(pci_dev->config + pos + PCI_MSI_FLAGS,
> +                     pci_get_word(pci_dev->config + pos + PCI_MSI_FLAGS) &
> +                     PCI_MSI_FLAGS_QMASK);
> +        pci_set_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO, 0);
> +        pci_set_word(pci_dev->config + pos + PCI_MSI_DATA_32, 0);
> +
> +        /* Set writable fields */
> +        pci_set_word(pci_dev->wmask + pos + PCI_MSI_FLAGS,
> +                     PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
> +        pci_set_long(pci_dev->wmask + pos + PCI_MSI_ADDRESS_LO, 0xfffffffc);
> +        pci_set_word(pci_dev->wmask + pos + PCI_MSI_DATA_32, 0xffff);
> +    }
> +    /* Expose MSI-X capability */
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0);
> +    if (pos != 0 && kvm_device_msix_supported(kvm_state)) {
> +        int bar_nr;
> +        uint32_t msix_table_entry;
> +
> +        if (!check_irqchip_in_kernel()) {
> +            return -ENOTSUP;
> +        }
> +        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        pci_dev->msix_cap = pos;
> +
> +        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS,
> +                     pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
> +                     PCI_MSIX_FLAGS_QSIZE);
> +
> +        /* Only enable and function mask bits are writable */
> +        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
> +                     PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
> +
> +        msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE);
> +        bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK;
> +        msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK;
> +        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
> +        dev->msix_max = pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS);
> +        dev->msix_max &= PCI_MSIX_FLAGS_QSIZE;
> +        dev->msix_max += 1;
> +    }
> +
> +    /* Minimal PM support, nothing writable, device appears to NAK changes */
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0);
> +    if (pos) {
> +        uint16_t pmc;
> +
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, pos, PCI_PM_SIZEOF);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, PCI_PM_SIZEOF);
> +
> +        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
> +        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
> +        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
> +
> +        /* assign_device will bring the device up to D0, so we don't need
> +         * to worry about doing that ourselves here. */
> +        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL,
> +                     PCI_PM_CTRL_NO_SOFT_RESET);
> +
> +        pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0);
> +        pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0);
> +    }

Would be nice to move logic to hw/pcipm.c.

> +
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0);
> +    if (pos) {
> +        uint8_t version, size = 0;
> +        uint16_t type, devctl, lnksta;
> +        uint32_t devcap, lnkcap;
> +
> +        version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS);
> +        version &= PCI_EXP_FLAGS_VERS;
> +        if (version == 1) {
> +            size = 0x14;
> +        } else if (version == 2) {
> +            /*
> +             * Check for non-std size, accept reduced size to 0x34,
> +             * which is what bcm5761 implemented, violating the
> +             * PCIe v3.0 spec that regs should exist and be read as 0,
> +             * not optionally provided and shorten the struct size.
> +             */
> +            size = MIN(0x3c, PCI_CONFIG_SPACE_SIZE - pos);
> +            if (size < 0x34) {
> +                fprintf(stderr,
> +                        "%s: Invalid size PCIe cap-id 0x%x\n",
> +                        __func__, PCI_CAP_ID_EXP);
> +                return -EINVAL;
> +            } else if (size != 0x3c) {
> +                fprintf(stderr,
> +                        "WARNING, %s: PCIe cap-id 0x%x has "
> +                        "non-standard size 0x%x; std size should be 0x3c\n",
> +                         __func__, PCI_CAP_ID_EXP, size);
> +            }
> +        } else if (version == 0) {
> +            uint16_t vid, did;
> +            vid = pci_get_word(pci_dev->config + PCI_VENDOR_ID);
> +            did = pci_get_word(pci_dev->config + PCI_DEVICE_ID);
> +            if (vid == PCI_VENDOR_ID_INTEL && did == 0x10ed) {
> +                /*
> +                 * quirk for Intel 82599 VF with invalid PCIe capability
> +                 * version, should really be version 2 (same as PF)
> +                 */
> +                size = 0x3c;
> +            }
> +        }
> +
> +        if (size == 0) {
> +            fprintf(stderr,
> +                    "%s: Unsupported PCI express capability version %d\n",
> +                    __func__, version);
> +            return -EINVAL;
> +        }
> +
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_EXP, pos, size);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, size);
> +
> +        type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
> +        type = (type & PCI_EXP_FLAGS_TYPE) >> 4;
> +        if (type != PCI_EXP_TYPE_ENDPOINT &&
> +            type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) {
> +            fprintf(stderr,
> +                    "Device assignment only supports endpoint assignment, "
> +                    "device type %d\n", type);
> +            return -EINVAL;
> +        }
> +
> +        /* capabilities, pass existing read-only copy
> +         * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */
> +
> +        /* device capabilities: hide FLR */
> +        devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP);
> +        devcap &= ~PCI_EXP_DEVCAP_FLR;
> +        pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap);
> +
> +        /* device control: clear all error reporting enable bits, leaving
> +         *                 only a few host values.  Note, these are
> +         *                 all writable, but not passed to hw.
> +         */
> +        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
> +        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
> +                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
> +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
> +        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
> +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
> +
> +        /* Clear device status */
> +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0);
> +
> +        /* Link capabilities, expose links and latencues, clear reporting */
> +        lnkcap = pci_get_long(pci_dev->config + pos + PCI_EXP_LNKCAP);
> +        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
> +                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
> +                   PCI_EXP_LNKCAP_L1EL);
> +        pci_set_long(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
> +
> +        /* Link control, pass existing read-only copy.  Should be writable? */
> +
> +        /* Link status, only expose current speed and width */
> +        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
> +        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
> +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
> +
> +        if (version >= 2) {
> +            /* Slot capabilities, control, status - not needed for endpoints */
> +            pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0);
> +            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0);
> +            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0);
> +
> +            /* Root control, capabilities, status - not needed for endpoints */
> +            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0);
> +            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0);
> +            pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0);
> +
> +            /* Device capabilities/control 2, pass existing read-only copy */
> +            /* Link control 2, pass existing read-only copy */
> +        }
> +    }
> +
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0);
> +    if (pos) {
> +        uint16_t cmd;
> +        uint32_t status;
> +
> +        /* Only expose the minimum, 8 byte capability */
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_PCIX, pos, 8);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, 8);
> +
> +        /* Command register, clear upper bits, including extended modes */
> +        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
> +        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
> +                PCI_X_CMD_MAX_SPLIT);
> +        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
> +
> +        /* Status register, update with emulated PCI bus location, clear
> +         * error bits, leave the rest. */
> +        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
> +        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
> +        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
> +        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
> +                    PCI_X_STATUS_SPL_ERR);
> +        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
> +    }
> +
> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0);
> +    if (pos) {
> +        /* Direct R/W passthrough */
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, 8);
> +
> +        /* direct write for cap content */
> +        assigned_dev_direct_config_write(dev, pos + 2, 6);
> +    }
> +
> +    /* Devices can have multiple vendor capabilities, get them all */
> +    for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
> +        pos += PCI_CAP_LIST_NEXT) {
> +        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
> +        /* Direct R/W passthrough */
> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_VNDR, pos, len);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        assigned_dev_setup_cap_read(dev, pos, len);
> +
> +        /* direct write for cap content */
> +        assigned_dev_direct_config_write(dev, pos + 2, len - 2);
> +    }
> +
> +    /* If real and virtual capability list status bits differ, virtualize the
> +     * access. */
> +    if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_LIST) !=
> +        (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) &
> +         PCI_STATUS_CAP_LIST)) {
> +        dev->emulate_config_read[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
> +    }
> +
> +    return 0;
> +}
> +
> +static uint64_t msix_mmio_read(void *opaque, target_phys_addr_t addr,
> +                               unsigned size)
> +{
> +    AssignedDevice *adev = opaque;
> +    uint64_t val;
> +
> +    memcpy(&val, (void *)((uint8_t *)adev->msix_table + addr), size);
> +
> +    return val;
> +}
> +
> +static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
> +                            uint64_t val, unsigned size)
> +{
> +    AssignedDevice *adev = opaque;
> +    PCIDevice *pdev = &adev->dev;
> +    uint16_t ctrl;
> +    MSIXTableEntry orig;
> +    int i = addr >> 4;
> +
> +    if (i >= adev->msix_max) {
> +        return; /* Drop write */
> +    }
> +
> +    ctrl = pci_get_word(pdev->config + pdev->msix_cap + PCI_MSIX_FLAGS);
> +
> +    DEBUG("write to MSI-X table offset 0x%lx, val 0x%lx\n", addr, val);
> +
> +    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
> +        orig = adev->msix_table[i];
> +    }
> +
> +    memcpy((void *)((uint8_t *)adev->msix_table + addr), &val, size);
> +
> +    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
> +        MSIXTableEntry *entry = &adev->msix_table[i];
> +
> +        if (!msix_masked(&orig) && msix_masked(entry)) {
> +            /*
> +             * Vector masked, disable it
> +             *
> +             * XXX It's not clear if we can or should actually attempt
> +             * to mask or disable the interrupt.  KVM doesn't have
> +             * support for pending bits and kvm_assign_set_msix_entry
> +             * doesn't modify the device hardware mask.  Interrupts
> +             * while masked are simply not injected to the guest, so
> +             * are lost.  Can we get away with always injecting an
> +             * interrupt on unmask?
> +             */
> +        } else if (msix_masked(&orig) && !msix_masked(entry)) {
> +            /* Vector unmasked */
> +            if (i >= adev->msi_virq_nr || adev->msi_virq[i] < 0) {
> +                /* Previously unassigned vector, start from scratch */
> +                assigned_dev_update_msix(pdev);
> +                return;
> +            } else {
> +                /* Update an existing, previously masked vector */
> +                MSIMessage msg;
> +                int ret;
> +
> +                msg.address = entry->addr_lo |
> +                    ((uint64_t)entry->addr_hi << 32);
> +                msg.data = entry->data;
> +
> +                ret = kvm_irqchip_update_msi_route(kvm_state,
> +                                                   adev->msi_virq[i], msg);
> +                if (ret) {
> +                    fprintf(stderr,
> +                            "Error updating irq routing entry (%d)\n", ret);
> +                }
> +            }
> +        }
> +    }
> +}
> +
> +static const MemoryRegionOps msix_mmio_ops = {
> +    .read = msix_mmio_read,
> +    .write = msix_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    },
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    },
> +};
> +
> +static void msix_reset(AssignedDevice *dev)
> +{
> +    MSIXTableEntry *entry;
> +    int i;
> +
> +    if (!dev->msix_table) {
> +        return;
> +    }
> +
> +    memset(dev->msix_table, 0, MSIX_PAGE_SIZE);
> +
> +    for (i = 0, entry = dev->msix_table; i < dev->msix_max; i++, entry++) {
> +        entry->ctrl = cpu_to_le32(0x1); /* Masked */
> +    }
> +}
> +

Pls rename to assigned_dev_msix_reset.
I would prefer if this one comment is addressed before
merging.

> +static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
> +{
> +    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE,
> +                           MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
> +    if (dev->msix_table == MAP_FAILED) {
> +        fprintf(stderr, "fail allocate msix_table! %s\n", strerror(errno));
> +        return -EFAULT;
> +    }
> +
> +    msix_reset(dev);
> +
> +    memory_region_init_io(&dev->mmio, &msix_mmio_ops, dev,
> +                          "assigned-dev-msix", MSIX_PAGE_SIZE);
> +    return 0;
> +}
> +
> +static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
> +{
> +    if (!dev->msix_table) {
> +        return;
> +    }
> +
> +    memory_region_destroy(&dev->mmio);
> +
> +    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
> +        fprintf(stderr, "error unmapping msix_table! %s\n",
> +                strerror(errno));
> +    }
> +    dev->msix_table = NULL;
> +}
> +
> +static const VMStateDescription vmstate_assigned_device = {
> +    .name = "pci-assign",
> +    .unmigratable = 1,
> +};
> +
> +static void reset_assigned_device(DeviceState *dev)
> +{
> +    PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, dev);
> +    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    char reset_file[64];
> +    const char reset[] = "1";
> +    int fd, ret;
> +
> +    /*
> +     * If a guest is reset without being shutdown, MSI/MSI-X can still
> +     * be running.  We want to return the device to a known state on
> +     * reset, so disable those here.  We especially do not want MSI-X
> +     * enabled since it lives in MMIO space, which is about to get
> +     * disabled.
> +     */
> +    if (adev->assigned_irq_type == ASSIGNED_IRQ_MSIX) {
> +        uint16_t ctrl = pci_get_word(pci_dev->config +
> +                                     pci_dev->msix_cap + PCI_MSIX_FLAGS);
> +
> +        pci_set_word(pci_dev->config + pci_dev->msix_cap + PCI_MSIX_FLAGS,
> +                     ctrl & ~PCI_MSIX_FLAGS_ENABLE);
> +        assigned_dev_update_msix(pci_dev);
> +    } else if (adev->assigned_irq_type == ASSIGNED_IRQ_MSI) {
> +        uint8_t ctrl = pci_get_byte(pci_dev->config +
> +                                    pci_dev->msi_cap + PCI_MSI_FLAGS);
> +
> +        pci_set_byte(pci_dev->config + pci_dev->msi_cap + PCI_MSI_FLAGS,
> +                     ctrl & ~PCI_MSI_FLAGS_ENABLE);
> +        assigned_dev_update_msi(pci_dev);
> +    }
> +
> +    snprintf(reset_file, sizeof(reset_file),
> +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
> +             adev->host.domain, adev->host.bus, adev->host.slot,
> +             adev->host.function);
> +
> +    /*
> +     * Issue a device reset via pci-sysfs.  Note that we use write(2) here
> +     * and ignore the return value because some kernels have a bug that
> +     * returns 0 rather than bytes written on success, sending us into an
> +     * infinite retry loop using other write mechanisms.
> +     */
> +    fd = open(reset_file, O_WRONLY);
> +    if (fd != -1) {
> +        ret = write(fd, reset, strlen(reset));
> +        (void)ret;
> +        close(fd);
> +    }
> +
> +    /*
> +     * When a 0 is written to the bus master register, the device is logically
> +     * disconnected from the PCI bus. This avoids further DMA transfers.
> +     */
> +    assigned_dev_pci_write_config(pci_dev, PCI_COMMAND, 0, 1);
> +}
> +
> +static int assigned_initfn(struct PCIDevice *pci_dev)
> +{
> +    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint8_t e_intx;
> +    int r;
> +
> +    if (!kvm_enabled()) {
> +        error_report("pci-assign: error: requires KVM support");
> +        return -1;
> +    }
> +
> +    if (!dev->host.domain && !dev->host.bus && !dev->host.slot &&
> +        !dev->host.function) {
> +        error_report("pci-assign: error: no host device specified");
> +        return -1;
> +    }
> +
> +    /*
> +     * Set up basic config space access control. Will be further refined during
> +     * device initialization.
> +     */
> +    assigned_dev_emulate_config_read(dev, 0, PCI_CONFIG_SPACE_SIZE);
> +    assigned_dev_direct_config_read(dev, PCI_STATUS, 2);
> +    assigned_dev_direct_config_read(dev, PCI_REVISION_ID, 1);
> +    assigned_dev_direct_config_read(dev, PCI_CLASS_PROG, 3);
> +    assigned_dev_direct_config_read(dev, PCI_CACHE_LINE_SIZE, 1);
> +    assigned_dev_direct_config_read(dev, PCI_LATENCY_TIMER, 1);
> +    assigned_dev_direct_config_read(dev, PCI_BIST, 1);
> +    assigned_dev_direct_config_read(dev, PCI_CARDBUS_CIS, 4);
> +    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_VENDOR_ID, 2);
> +    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_ID, 2);
> +    assigned_dev_direct_config_read(dev, PCI_CAPABILITY_LIST + 1, 7);
> +    assigned_dev_direct_config_read(dev, PCI_MIN_GNT, 1);
> +    assigned_dev_direct_config_read(dev, PCI_MAX_LAT, 1);
> +    memcpy(dev->emulate_config_write, dev->emulate_config_read,
> +           sizeof(dev->emulate_config_read));
> +
> +    if (get_real_device(dev, dev->host.domain, dev->host.bus,
> +                        dev->host.slot, dev->host.function)) {
> +        error_report("pci-assign: Error: Couldn't get real device (%s)!",
> +                     dev->dev.qdev.id);
> +        goto out;
> +    }
> +
> +    if (assigned_device_pci_cap_init(pci_dev) < 0) {
> +        goto out;
> +    }
> +
> +    /* intercept MSI-X entry page in the MMIO */
> +    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
> +        if (assigned_dev_register_msix_mmio(dev)) {
> +            goto out;
> +        }
> +    }
> +
> +    /* handle real device's MMIO/PIO BARs */
> +    if (assigned_dev_register_regions(dev->real_device.regions,
> +                                      dev->real_device.region_number,
> +                                      dev)) {
> +        goto out;
> +    }
> +
> +    /* handle interrupt routing */

hardcoded offsets below.

> +    e_intx = dev->dev.config[0x3d] - 1;
> +    dev->intpin = e_intx;
> +    dev->intx_route.mode = PCI_INTX_DISABLED;
> +    dev->intx_route.irq = -1;
> +
> +    /* assign device to guest */
> +    r = assign_device(dev);
> +    if (r < 0) {
> +        goto out;
> +    }
> +
> +    /* assign legacy INTx to the device */
> +    r = assign_intx(dev);
> +    if (r < 0) {
> +        goto assigned_out;
> +    }
> +
> +    assigned_dev_load_option_rom(dev);
> +
> +    add_boot_device_path(dev->bootindex, &pci_dev->qdev, NULL);
> +
> +    return 0;
> +
> +assigned_out:
> +    deassign_device(dev);
> +out:
> +    free_assigned_device(dev);
> +    return -1;
> +}
> +
> +static void assigned_exitfn(struct PCIDevice *pci_dev)
> +{
> +    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +
> +    deassign_device(dev);
> +    free_assigned_device(dev);
> +}
> +
> +static Property da_properties[] = {

assigned_dev_properties.

> +    DEFINE_PROP_PCI_HOST_DEVADDR("host", AssignedDevice, host),
> +    DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
> +                    ASSIGNED_DEVICE_PREFER_MSI_BIT, false),
> +    DEFINE_PROP_BIT("share_intx", AssignedDevice, features,
> +                    ASSIGNED_DEVICE_SHARE_INTX_BIT, true),
> +    DEFINE_PROP_INT32("bootindex", AssignedDevice, bootindex, -1),
> +    DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void assign_class_init(ObjectClass *klass, void *data)
> +{
> +    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    k->init         = assigned_initfn;
> +    k->exit         = assigned_exitfn;
> +    k->config_read  = assigned_dev_pci_read_config;
> +    k->config_write = assigned_dev_pci_write_config;
> +    dc->props       = da_properties;
> +    dc->vmsd        = &vmstate_assigned_device;
> +    dc->reset       = reset_assigned_device;
> +}
> +
> +static TypeInfo assign_info = {
> +    .name               = "pci-assign",
> +    .parent             = TYPE_PCI_DEVICE,
> +    .instance_size      = sizeof(AssignedDevice),
> +    .class_init         = assign_class_init,
> +};
> +
> +static void assign_register_types(void)
> +{
> +    type_register_static(&assign_info);
> +}
> +
> +type_init(assign_register_types)
> +
> +/*
> + * Scan the assigned devices for the devices that have an option ROM, and then
> + * load the corresponding ROM data to RAM. If an error occurs while loading an
> + * option ROM, we just ignore that option ROM and continue with the next one.
> + */
> +static void assigned_dev_load_option_rom(AssignedDevice *dev)
> +{
> +    char name[32], rom_file[64];
> +    FILE *fp;
> +    uint8_t val;
> +    struct stat st;
> +    void *ptr;
> +
> +    /* If loading ROM from file, pci handles it */
> +    if (dev->dev.romfile || !dev->dev.rom_bar) {
> +        return;
> +    }
> +
> +    snprintf(rom_file, sizeof(rom_file),
> +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
> +             dev->host.domain, dev->host.bus, dev->host.slot,
> +             dev->host.function);
> +
> +    if (stat(rom_file, &st)) {
> +        return;
> +    }
> +
> +    if (access(rom_file, F_OK)) {
> +        fprintf(stderr, "pci-assign: Insufficient privileges for %s\n",
> +                rom_file);
> +        return;
> +    }
> +
> +    /* Write "1" to the ROM file to enable it */
> +    fp = fopen(rom_file, "r+");
> +    if (fp == NULL) {
> +        return;
> +    }
> +    val = 1;
> +    if (fwrite(&val, 1, 1, fp) != 1) {
> +        goto close_rom;
> +    }
> +    fseek(fp, 0, SEEK_SET);
> +
> +    snprintf(name, sizeof(name), "%s.rom",
> +            object_get_typename(OBJECT(dev)));
> +    memory_region_init_ram(&dev->dev.rom, name, st.st_size);
> +    vmstate_register_ram(&dev->dev.rom, &dev->dev.qdev);
> +    ptr = memory_region_get_ram_ptr(&dev->dev.rom);
> +    memset(ptr, 0xff, st.st_size);
> +
> +    if (!fread(ptr, 1, st.st_size, fp)) {
> +        fprintf(stderr, "pci-assign: Cannot read from host %s\n"
> +                "\tDevice option ROM contents are probably invalid "
> +                "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
> +                "or load from file with romfile=\n", rom_file);
> +        memory_region_destroy(&dev->dev.rom);
> +        goto close_rom;
> +    }
> +
> +    pci_register_bar(&dev->dev, PCI_ROM_SLOT, 0, &dev->dev.rom);
> +    dev->dev.has_rom = true;
> +close_rom:
> +    /* Write "0" to disable ROM */
> +    fseek(fp, 0, SEEK_SET);
> +    val = 0;
> +    if (!fwrite(&val, 1, 1, fp)) {
> +        DEBUG("%s\n", "Failed to disable pci-sysfs rom file");
> +    }
> +    fclose(fp);
> +}
> -- 
> 1.7.3.4
Blue Swirl Aug. 28, 2012, 5:01 p.m. UTC | #10
On Tue, Aug 28, 2012 at 7:35 AM, Michael Tokarev <mjt@tls.msk.ru> wrote:
> On 27.08.2012 22:56, Blue Swirl wrote:
> []
>>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>> +{
>>> +    AssignedDevRegion *d = opaque;
>>> +    uint8_t *in = d->u.r_virtbase + addr;
>>
>> Don't perform arithmetic with void pointers.
>
> There are a few places in common qemu code which does this for a very
> long time.  So I guess it is safe now.

It's a non-standard GCC extension.

>
> /mjt
Michael S. Tsirkin Aug. 28, 2012, 5:28 p.m. UTC | #11
On Tue, Aug 28, 2012 at 05:01:55PM +0000, Blue Swirl wrote:
> On Tue, Aug 28, 2012 at 7:35 AM, Michael Tokarev <mjt@tls.msk.ru> wrote:
> > On 27.08.2012 22:56, Blue Swirl wrote:
> > []
> >>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> >>> +{
> >>> +    AssignedDevRegion *d = opaque;
> >>> +    uint8_t *in = d->u.r_virtbase + addr;
> >>
> >> Don't perform arithmetic with void pointers.
> >
> > There are a few places in common qemu code which does this for a very
> > long time.  So I guess it is safe now.
> 
> It's a non-standard GCC extension.

So?  We use many other GCC extensions. grep for typeof.

Is there a work in progress to build GCC with visual studio?
If yes what are the chances KVM device assignment
will work on windows?

Look QEMU codebase is what it is. Unless you rework all existing
code to confirm to your taste, I do not see why you NACK valid new code
unless it confirms to same.

> >
> > /mjt
Blue Swirl Aug. 28, 2012, 5:38 p.m. UTC | #12
On Tue, Aug 28, 2012 at 5:28 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Tue, Aug 28, 2012 at 05:01:55PM +0000, Blue Swirl wrote:
>> On Tue, Aug 28, 2012 at 7:35 AM, Michael Tokarev <mjt@tls.msk.ru> wrote:
>> > On 27.08.2012 22:56, Blue Swirl wrote:
>> > []
>> >>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>> >>> +{
>> >>> +    AssignedDevRegion *d = opaque;
>> >>> +    uint8_t *in = d->u.r_virtbase + addr;
>> >>
>> >> Don't perform arithmetic with void pointers.
>> >
>> > There are a few places in common qemu code which does this for a very
>> > long time.  So I guess it is safe now.
>>
>> It's a non-standard GCC extension.
>
> So?  We use many other GCC extensions. grep for typeof.

Dependencies should not be introduced trivially. In this case, it's
pretty easy to avoid void pointer arithmetic as Jan's next version shows.

>
> Is there a work in progress to build GCC with visual studio?
> If yes what are the chances KVM device assignment
> will work on windows?

IIRC there was really a project to use KVM on Windows and another
project to build QEMU with MSVC.

>
> Look QEMU codebase is what it is. Unless you rework all existing
> code to confirm to your taste, I do not see why you NACK valid new code
> unless it confirms to same.

Yes, I'd be happy to fix the style with huge patches at once. But our
fearless leader does not agree, so we are stuck with the codebase
being what it is until it is fixed one step at a time.

>
>> >
>> > /mjt
Anthony Liguori Aug. 28, 2012, 7:31 p.m. UTC | #13
Blue Swirl <blauwirbel@gmail.com> writes:

> On Tue, Aug 28, 2012 at 5:28 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Tue, Aug 28, 2012 at 05:01:55PM +0000, Blue Swirl wrote:
>>> On Tue, Aug 28, 2012 at 7:35 AM, Michael Tokarev <mjt@tls.msk.ru> wrote:
>>> > On 27.08.2012 22:56, Blue Swirl wrote:
>>> > []
>>> >>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>> >>> +{
>>> >>> +    AssignedDevRegion *d = opaque;
>>> >>> +    uint8_t *in = d->u.r_virtbase + addr;
>>> >>
>>> >> Don't perform arithmetic with void pointers.
>>> >
>>> > There are a few places in common qemu code which does this for a very
>>> > long time.  So I guess it is safe now.
>>>
>>> It's a non-standard GCC extension.
>>
>> So?  We use many other GCC extensions. grep for typeof.
>
> Dependencies should not be introduced trivially. In this case, it's
> pretty easy to avoid void pointer arithmetic as Jan's next version
> shows.

The standard is vague with respect void arithmetic.  Most compilers
allow it.  A very good analysis of the standard can be found below.

http://stackoverflow.com/questions/3523145/pointer-arithmetic-for-void-pointer-in-c

BTW: can we please stop arguing about C standards.  If we currently are
using something in QEMU that's supported by clang and GCC, it's fine and
we ought to continue using it.

The reserved names actually did bite us when porting to a new platform.
But the only requirement for C extensions ought to be reasonable support
in GCC and clang.

I don't care at all about supporting proprietary compilers.

Regards,

Anthony Liguori

>
>>
>> Is there a work in progress to build GCC with visual studio?
>> If yes what are the chances KVM device assignment
>> will work on windows?
>
> IIRC there was really a project to use KVM on Windows and another
> project to build QEMU with MSVC.
>
>>
>> Look QEMU codebase is what it is. Unless you rework all existing
>> code to confirm to your taste, I do not see why you NACK valid new code
>> unless it confirms to same.
>
> Yes, I'd be happy to fix the style with huge patches at once. But our
> fearless leader does not agree, so we are stuck with the codebase
> being what it is until it is fixed one step at a time.
>
>>
>>> >
>>> > /mjt
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
malc Aug. 28, 2012, 7:49 p.m. UTC | #14
On Tue, 28 Aug 2012, Anthony Liguori wrote:

> Blue Swirl <blauwirbel@gmail.com> writes:
> 
> > On Tue, Aug 28, 2012 at 5:28 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> On Tue, Aug 28, 2012 at 05:01:55PM +0000, Blue Swirl wrote:
> >>> On Tue, Aug 28, 2012 at 7:35 AM, Michael Tokarev <mjt@tls.msk.ru> wrote:
> >>> > On 27.08.2012 22:56, Blue Swirl wrote:
> >>> > []
> >>> >>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> >>> >>> +{
> >>> >>> +    AssignedDevRegion *d = opaque;
> >>> >>> +    uint8_t *in = d->u.r_virtbase + addr;
> >>> >>
> >>> >> Don't perform arithmetic with void pointers.
> >>> >
> >>> > There are a few places in common qemu code which does this for a very
> >>> > long time.  So I guess it is safe now.
> >>>
> >>> It's a non-standard GCC extension.
> >>
> >> So?  We use many other GCC extensions. grep for typeof.
> >
> > Dependencies should not be introduced trivially. In this case, it's
> > pretty easy to avoid void pointer arithmetic as Jan's next version
> > shows.
> 
> The standard is vague with respect void arithmetic.  Most compilers
> allow it.  A very good analysis of the standard can be found below.
> 
> http://stackoverflow.com/questions/3523145/pointer-arithmetic-for-void-pointer-in-c
> 
> BTW: can we please stop arguing about C standards.  If we currently are
> using something in QEMU that's supported by clang and GCC, it's fine and
> we ought to continue using it.

No we can not stop arguing. Besides you are wrong.

[..snip..]
Blue Swirl Aug. 28, 2012, 8:06 p.m. UTC | #15
On Tue, Aug 28, 2012 at 7:31 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Blue Swirl <blauwirbel@gmail.com> writes:
>
>> On Tue, Aug 28, 2012 at 5:28 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>> On Tue, Aug 28, 2012 at 05:01:55PM +0000, Blue Swirl wrote:
>>>> On Tue, Aug 28, 2012 at 7:35 AM, Michael Tokarev <mjt@tls.msk.ru> wrote:
>>>> > On 27.08.2012 22:56, Blue Swirl wrote:
>>>> > []
>>>> >>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>>> >>> +{
>>>> >>> +    AssignedDevRegion *d = opaque;
>>>> >>> +    uint8_t *in = d->u.r_virtbase + addr;
>>>> >>
>>>> >> Don't perform arithmetic with void pointers.
>>>> >
>>>> > There are a few places in common qemu code which does this for a very
>>>> > long time.  So I guess it is safe now.
>>>>
>>>> It's a non-standard GCC extension.
>>>
>>> So?  We use many other GCC extensions. grep for typeof.
>>
>> Dependencies should not be introduced trivially. In this case, it's
>> pretty easy to avoid void pointer arithmetic as Jan's next version
>> shows.
>
> The standard is vague with respect void arithmetic.  Most compilers
> allow it.  A very good analysis of the standard can be found below.
>
> http://stackoverflow.com/questions/3523145/pointer-arithmetic-for-void-pointer-in-c

The analysis would seem to show that arithmetic may be acceptable, but
it doesn't say that void pointers must be treated like char pointers.
In my view, this would make sense:

char *cptr;
void *vptr;

Since
cptr++;
is equivalent to
cptr = (char *)((uintptr_t)cptr + sizeof(*cptr));

therefore

vptr++;
should be equivalent to
vptr = (void *)((uintptr_t)vptr + sizeof(*vptr));

That is, vptr++ should be equivalent to vptr += 0 because sizeof(void)
should be 0 if allowed.

>
> BTW: can we please stop arguing about C standards.  If we currently are
> using something in QEMU that's supported by clang and GCC, it's fine and
> we ought to continue using it.
>
> The reserved names actually did bite us when porting to a new platform.
> But the only requirement for C extensions ought to be reasonable support
> in GCC and clang.
>
> I don't care at all about supporting proprietary compilers.

We also don't have crowds banging doors with their money bags with a
need for such support.

>
> Regards,
>
> Anthony Liguori
>
>>
>>>
>>> Is there a work in progress to build GCC with visual studio?
>>> If yes what are the chances KVM device assignment
>>> will work on windows?
>>
>> IIRC there was really a project to use KVM on Windows and another
>> project to build QEMU with MSVC.
>>
>>>
>>> Look QEMU codebase is what it is. Unless you rework all existing
>>> code to confirm to your taste, I do not see why you NACK valid new code
>>> unless it confirms to same.
>>
>> Yes, I'd be happy to fix the style with huge patches at once. But our
>> fearless leader does not agree, so we are stuck with the codebase
>> being what it is until it is fixed one step at a time.
>>
>>>
>>>> >
>>>> > /mjt
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Maydell Aug. 28, 2012, 9:26 p.m. UTC | #16
On 27 August 2012 13:15, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2012-08-27 14:07, Andreas Färber wrote:
>> Am I correct to understand we compile this only for i386 / x86_64?
>
> This is correct.

Did we ever make a decision about whether architecture specific
KVM devices should all live in hw/kvm/ or in some directory
with the arch name in it? I guess they're all in hw/kvm/ at
the moment so they should stay there unless there's a pressing
reason to move them around...

Since this is arch-specific we should probably give the
resulting device a more specific name than "pci-assign",
which implies that it is (a) ok for any PCI system and
(b) not something that will be obsolete in the foreseen
future.

-- PMM
Anthony Liguori Aug. 28, 2012, 9:51 p.m. UTC | #17
Blue Swirl <blauwirbel@gmail.com> writes:

> On Tue, Aug 28, 2012 at 7:31 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> Blue Swirl <blauwirbel@gmail.com> writes:
>>
>>> On Tue, Aug 28, 2012 at 5:28 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>> On Tue, Aug 28, 2012 at 05:01:55PM +0000, Blue Swirl wrote:
>>>>> On Tue, Aug 28, 2012 at 7:35 AM, Michael Tokarev <mjt@tls.msk.ru> wrote:
>>>>> > On 27.08.2012 22:56, Blue Swirl wrote:
>>>>> > []
>>>>> >>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>>>> >>> +{
>>>>> >>> +    AssignedDevRegion *d = opaque;
>>>>> >>> +    uint8_t *in = d->u.r_virtbase + addr;
>>>>> >>
>>>>> >> Don't perform arithmetic with void pointers.
>>>>> >
>>>>> > There are a few places in common qemu code which does this for a very
>>>>> > long time.  So I guess it is safe now.
>>>>>
>>>>> It's a non-standard GCC extension.
>>>>
>>>> So?  We use many other GCC extensions. grep for typeof.
>>>
>>> Dependencies should not be introduced trivially. In this case, it's
>>> pretty easy to avoid void pointer arithmetic as Jan's next version
>>> shows.
>>
>> The standard is vague with respect void arithmetic.  Most compilers
>> allow it.  A very good analysis of the standard can be found below.
>>
>> http://stackoverflow.com/questions/3523145/pointer-arithmetic-for-void-pointer-in-c
>
> The analysis would seem to show that arithmetic may be acceptable, but
> it doesn't say that void pointers must be treated like char pointers.
> In my view, this would make sense:
>
> char *cptr;
> void *vptr;
>
> Since
> cptr++;
> is equivalent to
> cptr = (char *)((uintptr_t)cptr + sizeof(*cptr));
>
> therefore
>
> vptr++;
> should be equivalent to
> vptr = (void *)((uintptr_t)vptr + sizeof(*vptr));
> That is, vptr++ should be equivalent to vptr += 0 because sizeof(void)
> should be 0 if allowed.

sizeof(void) == 1

With GCC at least.

Regards,

Anthony Liguori

>>
>> Regards,
>>
>> Anthony Liguori
>>
>>>
>>>>
>>>> Is there a work in progress to build GCC with visual studio?
>>>> If yes what are the chances KVM device assignment
>>>> will work on windows?
>>>
>>> IIRC there was really a project to use KVM on Windows and another
>>> project to build QEMU with MSVC.
>>>
>>>>
>>>> Look QEMU codebase is what it is. Unless you rework all existing
>>>> code to confirm to your taste, I do not see why you NACK valid new code
>>>> unless it confirms to same.
>>>
>>> Yes, I'd be happy to fix the style with huge patches at once. But our
>>> fearless leader does not agree, so we are stuck with the codebase
>>> being what it is until it is fixed one step at a time.
>>>
>>>>
>>>>> >
>>>>> > /mjt
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Markus Armbruster Aug. 29, 2012, 8:27 a.m. UTC | #18
Anthony Liguori <anthony@codemonkey.ws> writes:

> Blue Swirl <blauwirbel@gmail.com> writes:
>
>> On Tue, Aug 28, 2012 at 5:28 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>> On Tue, Aug 28, 2012 at 05:01:55PM +0000, Blue Swirl wrote:
>>>> On Tue, Aug 28, 2012 at 7:35 AM, Michael Tokarev <mjt@tls.msk.ru> wrote:
>>>> > On 27.08.2012 22:56, Blue Swirl wrote:
>>>> > []
>>>> >>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>>> >>> +{
>>>> >>> +    AssignedDevRegion *d = opaque;
>>>> >>> +    uint8_t *in = d->u.r_virtbase + addr;
>>>> >>
>>>> >> Don't perform arithmetic with void pointers.
>>>> >
>>>> > There are a few places in common qemu code which does this for a very
>>>> > long time.  So I guess it is safe now.
>>>>
>>>> It's a non-standard GCC extension.
>>>
>>> So?  We use many other GCC extensions. grep for typeof.
>>
>> Dependencies should not be introduced trivially. In this case, it's
>> pretty easy to avoid void pointer arithmetic as Jan's next version
>> shows.
>
> The standard is vague with respect void arithmetic.  Most compilers
> allow it.  A very good analysis of the standard can be found below.
>
> http://stackoverflow.com/questions/3523145/pointer-arithmetic-for-void-pointer-in-c
>
> BTW: can we please stop arguing about C standards.  If we currently are
> using something in QEMU that's supported by clang and GCC, it's fine and
> we ought to continue using it.

Seconded.

I don't see a point in making contributors avoid non-problems that might
conceivably become trivial problems some day.  Especially when there's
no automated help with the avoiding.

[...]
Jan Kiszka Aug. 29, 2012, 8:47 a.m. UTC | #19
On 2012-08-28 23:26, Peter Maydell wrote:
> On 27 August 2012 13:15, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2012-08-27 14:07, Andreas Färber wrote:
>>> Am I correct to understand we compile this only for i386 / x86_64?
>>
>> This is correct.
> 
> Did we ever make a decision about whether architecture specific
> KVM devices should all live in hw/kvm/ or in some directory
> with the arch name in it? I guess they're all in hw/kvm/ at
> the moment so they should stay there unless there's a pressing
> reason to move them around...

Yep, that would be the plan. As long as there are x86 platform devices
under hw/kvm, I see no point putting this one elsewhere. It's trivially
moved when time has come.

> 
> Since this is arch-specific we should probably give the
> resulting device a more specific name than "pci-assign",
> which implies that it is (a) ok for any PCI system and
> (b) not something that will be obsolete in the foreseen
> future.

The name has to be like this for libvirt compatibility. I can provide it
as alias, though, calling the device "kvm-pci-assign" by default. Better?

Jan
Peter Maydell Aug. 29, 2012, 8:49 a.m. UTC | #20
On 29 August 2012 09:47, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2012-08-28 23:26, Peter Maydell wrote:
>> Since this is arch-specific we should probably give the
>> resulting device a more specific name than "pci-assign",
>> which implies that it is (a) ok for any PCI system and
>> (b) not something that will be obsolete in the foreseen
>> future.
>
> The name has to be like this for libvirt compatibility. I can provide it
> as alias, though, calling the device "kvm-pci-assign" by default. Better?

...is it likely to ever work for non-x86 PCI devices?

-- PMM
Jan Kiszka Aug. 29, 2012, 8:50 a.m. UTC | #21
On 2012-08-29 10:49, Peter Maydell wrote:
> On 29 August 2012 09:47, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2012-08-28 23:26, Peter Maydell wrote:
>>> Since this is arch-specific we should probably give the
>>> resulting device a more specific name than "pci-assign",
>>> which implies that it is (a) ok for any PCI system and
>>> (b) not something that will be obsolete in the foreseen
>>> future.
>>
>> The name has to be like this for libvirt compatibility. I can provide it
>> as alias, though, calling the device "kvm-pci-assign" by default. Better?
> 
> ...is it likely to ever work for non-x86 PCI devices?

Nope, but it also won't be exposed on them.

Jan
Jan Kiszka Aug. 29, 2012, 10:50 a.m. UTC | #22
On 2012-08-28 17:04, Michael S. Tsirkin wrote:
> On Mon, Aug 27, 2012 at 08:28:41AM +0200, Jan Kiszka wrote:
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> This adds PCI device assignment for i386 targets using the classic KVM
>> interfaces. This version is 100% identical to what is being maintained
>> in qemu-kvm for several years and is supported by libvirt as well. It is
>> expected to remain relevant for another couple of years until kernels
>> without full-features and performance-wise equivalent VFIO support are
>> obsolete.
>>
>> A refactoring to-do that should be done in-tree is to model MSI and
>> MSI-X support via the generic PCI layer, similar to what VFIO is already
>> doing for MSI-X. This should improve the correctness and clean up the
>> code from duplicate logic.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Don't see anything major to object to here,
> in particular because it's well-contained.
> It would be nice to add, in commit log,
> the commit ID from qemu-kvm which has code
> closest to this.

Done.

> 
> All comments below can be addressed either
> downstream before merging or upstream after
> this code is merged.
> 
>> ---
>>  hw/kvm/Makefile.objs |    2 +-
>>  hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 1930 insertions(+), 1 deletions(-)
>>  create mode 100644 hw/kvm/pci-assign.c
>>
>> diff --git a/hw/kvm/Makefile.objs b/hw/kvm/Makefile.objs
>> index 226497a..f620d7f 100644
>> --- a/hw/kvm/Makefile.objs
>> +++ b/hw/kvm/Makefile.objs
>> @@ -1 +1 @@
>> -obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o
>> +obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o
>> diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
>> new file mode 100644
>> index 0000000..9cce02c
>> --- /dev/null
>> +++ b/hw/kvm/pci-assign.c
>> @@ -0,0 +1,1929 @@
>> +/*
>> + * Copyright (c) 2007, Neocleus Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms and conditions of the GNU General Public License,
>> + * version 2, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
>> + * more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along with
>> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
>> + * Place - Suite 330, Boston, MA 02111-1307 USA.
>> + *
>> + *
>> + *  Assign a PCI device from the host to a guest VM.
>> + *
>> + *  Adapted for KVM by Qumranet.
>> + *
>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
>> + */
>> +#include <stdio.h>
>> +#include <unistd.h>
>> +#include <sys/io.h>
>> +#include <sys/mman.h>
>> +#include <sys/types.h>
>> +#include <sys/stat.h>
>> +#include "hw/hw.h"
>> +#include "hw/pc.h"
>> +#include "qemu-error.h"
>> +#include "console.h"
>> +#include "hw/loader.h"
>> +#include "monitor.h"
>> +#include "range.h"
>> +#include "sysemu.h"
>> +#include "hw/pci.h"
>> +#include "hw/msi.h"
>> +#include "kvm_i386.h"
>> +
>> +#define MSIX_PAGE_SIZE 0x1000
>> +
>> +/* From linux/ioport.h */
>> +#define IORESOURCE_IO       0x00000100  /* Resource type */
>> +#define IORESOURCE_MEM      0x00000200
>> +#define IORESOURCE_IRQ      0x00000400
>> +#define IORESOURCE_DMA      0x00000800
>> +#define IORESOURCE_PREFETCH 0x00002000  /* No side effects */
>> +
>> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
>> +
>> +#ifdef DEVICE_ASSIGNMENT_DEBUG
>> +#define DEBUG(fmt, ...)                                       \
>> +    do {                                                      \
>> +        fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);  \
>> +    } while (0)
>> +#else
>> +#define DEBUG(fmt, ...)
>> +#endif
>> +
>> +typedef struct {
>> +    int type;           /* Memory or port I/O */
>> +    int valid;
>> +    uint32_t base_addr;
>> +    uint32_t size;    /* size of the region */
>> +    int resource_fd;
>> +} PCIRegion;
>> +
>> +typedef struct {
>> +    uint8_t bus, dev, func; /* Bus inside domain, device and function */
>> +    int irq;                /* IRQ number */
>> +    uint16_t region_number; /* number of active regions */
>> +
>> +    /* Port I/O or MMIO Regions */
>> +    PCIRegion regions[PCI_NUM_REGIONS - 1];
>> +    int config_fd;
>> +} PCIDevRegions;
>> +
>> +typedef struct {
>> +    MemoryRegion container;
>> +    MemoryRegion real_iomem;
>> +    union {
>> +        void *r_virtbase;    /* mmapped access address for memory regions */
>> +        uint32_t r_baseport; /* the base guest port for I/O regions */
>> +    } u;
>> +    pcibus_t e_size;    /* emulated size of region in bytes */
>> +    pcibus_t r_size;    /* real size of region in bytes */
>> +    PCIRegion *region;
>> +} AssignedDevRegion;
>> +
>> +#define ASSIGNED_DEVICE_PREFER_MSI_BIT  0
>> +#define ASSIGNED_DEVICE_SHARE_INTX_BIT  1
>> +
>> +#define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
>> +#define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT)
>> +
>> +typedef struct {
>> +    uint32_t addr_lo;
>> +    uint32_t addr_hi;
>> +    uint32_t data;
>> +    uint32_t ctrl;
>> +} MSIXTableEntry;
>> +
>> +typedef enum AssignedIRQType {
>> +    ASSIGNED_IRQ_NONE = 0,
>> +    ASSIGNED_IRQ_INTX_HOST_INTX,
>> +    ASSIGNED_IRQ_INTX_HOST_MSI,
>> +    ASSIGNED_IRQ_MSI,
>> +    ASSIGNED_IRQ_MSIX
>> +} AssignedIRQType;
>> +
>> +typedef struct AssignedDevice {
>> +    PCIDevice dev;
>> +    PCIHostDeviceAddress host;
>> +    uint32_t dev_id;
>> +    uint32_t features;
>> +    int intpin;
> 
> This field seems unused.

Oops, that's a bug of recent refactorings. Also relevant for
qemu-kvm-1.2. Will fix and post a patch for the latter.

> 
>> +    AssignedDevRegion v_addrs[PCI_NUM_REGIONS - 1];
>> +    PCIDevRegions real_device;
>> +    PCIINTxRoute intx_route;
>> +    AssignedIRQType assigned_irq_type;
>> +    struct {
>> +#define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
>> +#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
>> +        uint32_t available;
>> +#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
>> +#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
>> +#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
>> +        uint32_t state;
>> +    } cap;
>> +    uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE];
>> +    uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE];
>> +    int msi_virq_nr;
>> +    int *msi_virq;
>> +    MSIXTableEntry *msix_table;
>> +    target_phys_addr_t msix_table_addr;
>> +    uint16_t msix_max;
>> +    MemoryRegion mmio;
>> +    char *configfd_name;
>> +    int32_t bootindex;
>> +} AssignedDevice;
>> +
>> +static void assigned_dev_update_irq_routing(PCIDevice *dev);
>> +
>> +static void assigned_dev_load_option_rom(AssignedDevice *dev);
>> +
>> +static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
>> +
>> +static uint64_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
>> +                                       target_phys_addr_t addr, int size,
>> +                                       uint64_t *data)
>> +{
>> +    uint64_t val = 0;
>> +    int fd = dev_region->region->resource_fd;
>> +
>> +    if (fd >= 0) {
>> +        if (data) {
>> +            DEBUG("pwrite data=%lx, size=%d, e_phys=%lx, addr=%lx\n",
>> +                  *data, size, addr, addr);
>> +            if (pwrite(fd, data, size, addr) != size) {
>> +                fprintf(stderr, "%s - pwrite failed %s\n",
>> +                        __func__, strerror(errno));
>> +            }
>> +        } else {
>> +            if (pread(fd, &val, size, addr) != size) {
>> +                fprintf(stderr, "%s - pread failed %s\n",
>> +                        __func__, strerror(errno));
>> +                val = (1UL << (size * 8)) - 1;
>> +            }
>> +            DEBUG("pread val=%lx, size=%d, e_phys=%lx, addr=%lx\n",
>> +                  val, size, addr, addr);
>> +        }
>> +    } else {
>> +        uint32_t port = addr + dev_region->u.r_baseport;
>> +
>> +        if (data) {
>> +            DEBUG("out data=%lx, size=%d, e_phys=%lx, host=%x\n",
>> +                  *data, size, addr, port);
>> +            switch (size) {
>> +            case 1:
>> +                outb(*data, port);
>> +                break;
>> +            case 2:
>> +                outw(*data, port);
>> +                break;
>> +            case 4:
>> +                outl(*data, port);
>> +                break;
>> +            }
>> +        } else {
>> +            switch (size) {
>> +            case 1:
>> +                val = inb(port);
>> +                break;
>> +            case 2:
>> +                val = inw(port);
>> +                break;
>> +            case 4:
>> +                val = inl(port);
>> +                break;
>> +            }
>> +            DEBUG("in data=%lx, size=%d, e_phys=%lx, host=%x\n",
>> +                  val, size, addr, port);
>> +        }
>> +    }
>> +    return val;
>> +}
>> +
>> +static void assigned_dev_ioport_write(void *opaque, target_phys_addr_t addr,
>> +                                      uint64_t data, unsigned size)
>> +{
>> +    assigned_dev_ioport_rw(opaque, addr, size, &data);
>> +}
>> +
>> +static uint64_t assigned_dev_ioport_read(void *opaque,
>> +                                         target_phys_addr_t addr, unsigned size)
>> +{
>> +    return assigned_dev_ioport_rw(opaque, addr, size, NULL);
>> +}
>> +
>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint8_t *in = d->u.r_virtbase + addr;
>> +    uint32_t r;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint16_t *in = d->u.r_virtbase + addr;
>> +    uint32_t r;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint32_t *in = d->u.r_virtbase + addr;
>> +    uint32_t r;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint8_t *out = d->u.r_virtbase + addr;
>> +
>> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint16_t *out = d->u.r_virtbase + addr;
>> +
>> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint32_t *out = d->u.r_virtbase + addr;
>> +
>> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static const MemoryRegionOps slow_bar_ops = {
>> +    .old_mmio = {
>> +        .read = { slow_bar_readb, slow_bar_readw, slow_bar_readl, },
>> +        .write = { slow_bar_writeb, slow_bar_writew, slow_bar_writel, },
>> +    },
>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>> +};
>> +
>> +static void assigned_dev_iomem_setup(PCIDevice *pci_dev, int region_num,
>> +                                     pcibus_t e_size)
>> +{
>> +    AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
>> +
>> +    if (e_size > 0) {
>> +        memory_region_init(&region->container, "assigned-dev-container",
>> +                           e_size);
>> +        memory_region_add_subregion(&region->container, 0, &region->real_iomem);
>> +
>> +        /* deal with MSI-X MMIO page */
>> +        if (real_region->base_addr <= r_dev->msix_table_addr &&
>> +                real_region->base_addr + real_region->size >
>> +                r_dev->msix_table_addr) {
>> +            int offset = r_dev->msix_table_addr - real_region->base_addr;
>> +
>> +            memory_region_add_subregion_overlap(&region->container,
>> +                                                offset,
>> +                                                &r_dev->mmio,
>> +                                                1);
>> +        }
>> +    }
>> +}
>> +
>> +static const MemoryRegionOps assigned_dev_ioport_ops = {
>> +    .read = assigned_dev_ioport_read,
>> +    .write = assigned_dev_ioport_write,
>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>> +};
>> +
>> +static void assigned_dev_ioport_setup(PCIDevice *pci_dev, int region_num,
>> +                                      pcibus_t size)
>> +{
>> +    AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +
>> +    region->e_size = size;
>> +    memory_region_init(&region->container, "assigned-dev-container", size);
>> +    memory_region_init_io(&region->real_iomem, &assigned_dev_ioport_ops,
>> +                          r_dev->v_addrs + region_num,
>> +                          "assigned-dev-iomem", size);
>> +    memory_region_add_subregion(&region->container, 0, &region->real_iomem);
>> +}
>> +
>> +static uint32_t assigned_dev_pci_read(PCIDevice *d, int pos, int len)
>> +{
>> +    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
>> +    uint32_t val;
>> +    ssize_t ret;
>> +    int fd = pci_dev->real_device.config_fd;
>> +
>> +again:
>> +    ret = pread(fd, &val, len, pos);
>> +    if (ret != len) {
>> +        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
>> +            goto again;
>> +        }
>> +
>> +        hw_error("pci read failed, ret = %zd errno = %d\n", ret, errno);
>> +    }
>> +
>> +    return val;
>> +}
>> +
>> +static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
>> +{
>> +    return (uint8_t)assigned_dev_pci_read(d, pos, 1);
>> +}
>> +
>> +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
>> +{
>> +    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
>> +    ssize_t ret;
>> +    int fd = pci_dev->real_device.config_fd;
>> +
>> +again:
>> +    ret = pwrite(fd, &val, len, pos);
>> +    if (ret != len) {
>> +        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
>> +            goto again;
>> +        }
>> +
>> +        hw_error("pci write failed, ret = %zd errno = %d\n", ret, errno);
>> +    }
>> +}
>> +
>> +static void assigned_dev_emulate_config_read(AssignedDevice *dev,
>> +                                             uint32_t offset, uint32_t len)
>> +{
>> +    memset(dev->emulate_config_read + offset, 0xff, len);
>> +}
>> +
>> +static void assigned_dev_direct_config_read(AssignedDevice *dev,
>> +                                            uint32_t offset, uint32_t len)
>> +{
>> +    memset(dev->emulate_config_read + offset, 0, len);
>> +}
>> +
>> +static void assigned_dev_direct_config_write(AssignedDevice *dev,
>> +                                             uint32_t offset, uint32_t len)
>> +{
>> +    memset(dev->emulate_config_write + offset, 0, len);
>> +}
>> +
>> +static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start)
>> +{
>> +    int id;
>> +    int max_cap = 48;
>> +    int pos = start ? start : PCI_CAPABILITY_LIST;
>> +    int status;
>> +
>> +    status = assigned_dev_pci_read_byte(d, PCI_STATUS);
>> +    if ((status & PCI_STATUS_CAP_LIST) == 0) {
>> +        return 0;
>> +    }
>> +
>> +    while (max_cap--) {
>> +        pos = assigned_dev_pci_read_byte(d, pos);
>> +        if (pos < 0x40) {
>> +            break;
>> +        }
>> +
>> +        pos &= ~3;
>> +        id = assigned_dev_pci_read_byte(d, pos + PCI_CAP_LIST_ID);
>> +
>> +        if (id == 0xff) {
>> +            break;
>> +        }
>> +        if (id == cap) {
>> +            return pos;
>> +        }
>> +
>> +        pos += PCI_CAP_LIST_NEXT;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static int assigned_dev_register_regions(PCIRegion *io_regions,
>> +                                         unsigned long regions_num,
>> +                                         AssignedDevice *pci_dev)
>> +{
>> +    uint32_t i;
>> +    PCIRegion *cur_region = io_regions;
>> +
>> +    for (i = 0; i < regions_num; i++, cur_region++) {
>> +        if (!cur_region->valid) {
>> +            continue;
>> +        }
>> +
>> +        /* handle memory io regions */
>> +        if (cur_region->type & IORESOURCE_MEM) {
>> +            int t = cur_region->type & IORESOURCE_PREFETCH
>> +                ? PCI_BASE_ADDRESS_MEM_PREFETCH
>> +                : PCI_BASE_ADDRESS_SPACE_MEMORY;
>> +
>> +            /* map physical memory */
>> +            pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size,
>> +                                                    PROT_WRITE | PROT_READ,
>> +                                                    MAP_SHARED,
>> +                                                    cur_region->resource_fd,
>> +                                                    (off_t)0);
>> +
>> +            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
>> +                pci_dev->v_addrs[i].u.r_virtbase = NULL;
>> +                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
>> +                        "\n", __func__,
>> +                        (uint32_t) (cur_region->base_addr));
>> +                return -1;
>> +            }
>> +
>> +            pci_dev->v_addrs[i].r_size = cur_region->size;
>> +            pci_dev->v_addrs[i].e_size = 0;
>> +
>> +            /* add offset */
>> +            pci_dev->v_addrs[i].u.r_virtbase +=
>> +                (cur_region->base_addr & 0xFFF);
>> +
>> +            if (cur_region->size & 0xFFF) {
>> +                fprintf(stderr, "PCI region %d at address 0x%llx "
>> +                        "has size 0x%x, which is not a multiple of 4K. "
>> +                        "You might experience some performance hit "
>> +                        "due to that.\n",
>> +                        i, (unsigned long long)cur_region->base_addr,
>> +                        cur_region->size);
>> +                memory_region_init_io(&pci_dev->v_addrs[i].real_iomem,
>> +                                      &slow_bar_ops, &pci_dev->v_addrs[i],
>> +                                      "assigned-dev-slow-bar",
>> +                                      cur_region->size);
>> +            } else {
>> +                void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
>> +                char name[32];
>> +                snprintf(name, sizeof(name), "%s.bar%d",
>> +                         object_get_typename(OBJECT(pci_dev)), i);
>> +                memory_region_init_ram_ptr(&pci_dev->v_addrs[i].real_iomem,
>> +                                           name, cur_region->size,
>> +                                           virtbase);
>> +                vmstate_register_ram(&pci_dev->v_addrs[i].real_iomem,
>> +                                     &pci_dev->dev.qdev);
>> +            }
>> +
>> +            assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size);
>> +            pci_register_bar((PCIDevice *) pci_dev, i, t,
>> +                             &pci_dev->v_addrs[i].container);
>> +            continue;
>> +        } else {
>> +            /* handle port io regions */
>> +            uint32_t val;
>> +            int ret;
>> +
>> +            /* Test kernel support for ioport resource read/write.  Old
>> +             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
>> +             * so should return EINVAL for a 3 byte read */
>> +            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
>> +            if (ret >= 0) {
>> +                fprintf(stderr, "Unexpected return from I/O port read: %d\n",
>> +                        ret);
>> +                abort();
>> +            } else if (errno != EINVAL) {
>> +                fprintf(stderr, "Kernel doesn't support ioport resource "
>> +                                "access, hiding this region.\n");
>> +                close(pci_dev->v_addrs[i].region->resource_fd);
>> +                cur_region->valid = 0;
>> +                continue;
>> +            }
>> +
>> +            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
>> +            pci_dev->v_addrs[i].r_size = cur_region->size;
>> +            pci_dev->v_addrs[i].e_size = 0;
>> +
>> +            assigned_dev_ioport_setup(&pci_dev->dev, i, cur_region->size);
>> +            pci_register_bar((PCIDevice *) pci_dev, i,
>> +                             PCI_BASE_ADDRESS_SPACE_IO,
>> +                             &pci_dev->v_addrs[i].container);
>> +        }
>> +    }
>> +
>> +    /* success */
>> +    return 0;
>> +}
>> +
>> +static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
>> +{
>> +    FILE *f;
>> +    char name[128];
>> +    long id;
>> +
>> +    snprintf(name, sizeof(name), "%s%s", devpath, idname);
>> +    f = fopen(name, "r");
>> +    if (f == NULL) {
>> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
>> +        return -1;
>> +    }
>> +    if (fscanf(f, "%li\n", &id) == 1) {
>> +        *val = id;
>> +    } else {
>> +        return -1;
>> +    }
>> +    fclose(f);
>> +
>> +    return 0;
>> +}
>> +
>> +static int get_real_vendor_id(const char *devpath, uint16_t *val)
>> +{
>> +    return get_real_id(devpath, "vendor", val);
>> +}
>> +
>> +static int get_real_device_id(const char *devpath, uint16_t *val)
>> +{
>> +    return get_real_id(devpath, "device", val);
>> +}
>> +
>> +static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg,
>> +                           uint8_t r_bus, uint8_t r_dev, uint8_t r_func)
>> +{
>> +    char dir[128], name[128];
>> +    int fd, r = 0, v;
>> +    FILE *f;
>> +    unsigned long long start, end, size, flags;
>> +    uint16_t id;
>> +    PCIRegion *rp;
>> +    PCIDevRegions *dev = &pci_dev->real_device;
>> +
>> +    dev->region_number = 0;
>> +
>> +    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
>> +             r_seg, r_bus, r_dev, r_func);
>> +
>> +    snprintf(name, sizeof(name), "%sconfig", dir);
>> +
>> +    if (pci_dev->configfd_name && *pci_dev->configfd_name) {
>> +        if (qemu_isdigit(pci_dev->configfd_name[0])) {
>> +            dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0);
>> +        } else {
>> +            dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name);
>> +            if (dev->config_fd < 0) {
>> +                fprintf(stderr, "%s: (%s) unkown\n", __func__,
>> +                        pci_dev->configfd_name);
>> +                return 1;
>> +            }
>> +        }
>> +    } else {
>> +        dev->config_fd = open(name, O_RDWR);
>> +
>> +        if (dev->config_fd == -1) {
>> +            fprintf(stderr, "%s: %s: %m\n", __func__, name);
>> +            return 1;
>> +        }
>> +    }
>> +again:
>> +    r = read(dev->config_fd, pci_dev->dev.config,
>> +             pci_config_size(&pci_dev->dev));
>> +    if (r < 0) {
>> +        if (errno == EINTR || errno == EAGAIN) {
>> +            goto again;
>> +        }
>> +        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
>> +    }
>> +
>> +    /* Restore or clear multifunction, this is always controlled by qemu */
>> +    if (pci_dev->dev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
>> +        pci_dev->dev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
>> +    } else {
>> +        pci_dev->dev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
>> +    }
>> +
>> +    /* Clear host resource mapping info.  If we choose not to register a
>> +     * BAR, such as might be the case with the option ROM, we can get
>> +     * confusing, unwritable, residual addresses from the host here. */
>> +    memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
>> +    memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4);
>> +
>> +    snprintf(name, sizeof(name), "%sresource", dir);
>> +
>> +    f = fopen(name, "r");
>> +    if (f == NULL) {
>> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
>> +        return 1;
>> +    }
>> +
>> +    for (r = 0; r < PCI_ROM_SLOT; r++) {
>> +        if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3) {
>> +            break;
>> +        }
>> +
>> +        rp = dev->regions + r;
>> +        rp->valid = 0;
>> +        rp->resource_fd = -1;
>> +        size = end - start + 1;
>> +        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
>> +        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) {
>> +            continue;
>> +        }
>> +        if (flags & IORESOURCE_MEM) {
>> +            flags &= ~IORESOURCE_IO;
>> +        } else {
>> +            flags &= ~IORESOURCE_PREFETCH;
>> +        }
>> +        snprintf(name, sizeof(name), "%sresource%d", dir, r);
>> +        fd = open(name, O_RDWR);
>> +        if (fd == -1) {
>> +            continue;
>> +        }
>> +        rp->resource_fd = fd;
>> +
>> +        rp->type = flags;
>> +        rp->valid = 1;
>> +        rp->base_addr = start;
>> +        rp->size = size;
>> +        pci_dev->v_addrs[r].region = rp;
>> +        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
>> +              r, rp->size, start, rp->type, rp->resource_fd);
>> +    }
>> +
>> +    fclose(f);
>> +
>> +    /* read and fill vendor ID */
>> +    v = get_real_vendor_id(dir, &id);
>> +    if (v) {
>> +        return 1;
>> +    }
>> +    pci_dev->dev.config[0] = id & 0xff;
>> +    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
>> +
>> +    /* read and fill device ID */
>> +    v = get_real_device_id(dir, &id);
>> +    if (v) {
>> +        return 1;
>> +    }
>> +    pci_dev->dev.config[2] = id & 0xff;
>> +    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
>> +
>> +    pci_word_test_and_clear_mask(pci_dev->emulate_config_write + PCI_COMMAND,
>> +                                 PCI_COMMAND_MASTER | PCI_COMMAND_INTX_DISABLE);
>> +
>> +    dev->region_number = r;
>> +    return 0;
>> +}
>> +
>> +static void free_msi_virqs(AssignedDevice *dev)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i < dev->msi_virq_nr; i++) {
>> +        if (dev->msi_virq[i] >= 0) {
>> +            kvm_irqchip_release_virq(kvm_state, dev->msi_virq[i]);
>> +            dev->msi_virq[i] = -1;
>> +        }
>> +    }
>> +    g_free(dev->msi_virq);
>> +    dev->msi_virq = NULL;
>> +    dev->msi_virq_nr = 0;
>> +}
>> +
>> +static void free_assigned_device(AssignedDevice *dev)
>> +{
>> +    int i;
>> +
>> +    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
>> +        assigned_dev_unregister_msix_mmio(dev);
>> +    }
>> +    for (i = 0; i < dev->real_device.region_number; i++) {
>> +        PCIRegion *pci_region = &dev->real_device.regions[i];
>> +        AssignedDevRegion *region = &dev->v_addrs[i];
>> +
>> +        if (!pci_region->valid) {
>> +            continue;
>> +        }
>> +        if (pci_region->type & IORESOURCE_IO) {
>> +            if (region->u.r_baseport) {
>> +                memory_region_del_subregion(&region->container,
>> +                                            &region->real_iomem);
>> +                memory_region_destroy(&region->real_iomem);
>> +                memory_region_destroy(&region->container);
>> +            }
>> +        } else if (pci_region->type & IORESOURCE_MEM) {
>> +            if (region->u.r_virtbase) {
>> +                memory_region_del_subregion(&region->container,
>> +                                            &region->real_iomem);
>> +
>> +                /* Remove MSI-X table subregion */
>> +                if (pci_region->base_addr <= dev->msix_table_addr &&
>> +                    pci_region->base_addr + pci_region->size >
>> +                    dev->msix_table_addr) {
>> +                    memory_region_del_subregion(&region->container,
>> +                                                &dev->mmio);
>> +                }
>> +
>> +                memory_region_destroy(&region->real_iomem);
>> +                memory_region_destroy(&region->container);
>> +                if (munmap(region->u.r_virtbase,
>> +                           (pci_region->size + 0xFFF) & 0xFFFFF000)) {
>> +                    fprintf(stderr,
>> +                            "Failed to unmap assigned device region: %s\n",
>> +                            strerror(errno));
>> +                }
>> +            }
>> +        }
>> +        if (pci_region->resource_fd >= 0) {
>> +            close(pci_region->resource_fd);
>> +        }
>> +    }
>> +
>> +    if (dev->real_device.config_fd >= 0) {
>> +        close(dev->real_device.config_fd);
>> +    }
>> +
>> +    free_msi_virqs(dev);
>> +}
>> +
>> +static void assign_failed_examine(AssignedDevice *dev)
>> +{
>> +    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
>> +    uint16_t vendor_id, device_id;
>> +    int r;
>> +
>> +    sprintf(dir, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
>> +            dev->host.domain, dev->host.bus, dev->host.slot,
>> +            dev->host.function);
>> +
>> +    sprintf(name, "%sdriver", dir);
>> +
>> +    r = readlink(name, driver, sizeof(driver));
>> +    if ((r <= 0) || r >= sizeof(driver)) {
>> +        goto fail;
>> +    }
>> +
>> +    ns = strrchr(driver, '/');
>> +    if (!ns) {
>> +        goto fail;
>> +    }
>> +
>> +    ns++;
>> +
>> +    if (get_real_vendor_id(dir, &vendor_id) ||
>> +        get_real_device_id(dir, &device_id)) {
>> +        goto fail;
>> +    }
>> +
>> +    fprintf(stderr, "*** The driver '%s' is occupying your device "
>> +                    "%04x:%02x:%02x.%x.\n",
>> +            ns, dev->host.domain, dev->host.bus, dev->host.slot,
>> +            dev->host.function);
>> +    fprintf(stderr, "***\n");
>> +    fprintf(stderr, "*** You can try the following commands to free it:\n");
>> +    fprintf(stderr, "***\n");
>> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
>> +                    "new_id\n", vendor_id, device_id);
>> +    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
>> +                    "%s/unbind\n",
>> +            dev->host.domain, dev->host.bus, dev->host.slot,
>> +            dev->host.function, ns);
>> +    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
>> +                    "pci-stub/bind\n",
>> +            dev->host.domain, dev->host.bus, dev->host.slot,
>> +            dev->host.function);
>> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
>> +                    "/remove_id\n", vendor_id, device_id);
>> +    fprintf(stderr, "***\n");
>> +
>> +    return;
>> +
>> +fail:
>> +    fprintf(stderr, "Couldn't find out why.\n");
>> +}
>> +
>> +static int assign_device(AssignedDevice *dev)
>> +{
>> +    uint32_t flags = KVM_DEV_ASSIGN_ENABLE_IOMMU;
>> +    int r;
>> +
>> +    /* Only pass non-zero PCI segment to capable module */
>> +    if (!kvm_check_extension(kvm_state, KVM_CAP_PCI_SEGMENT) &&
>> +        dev->host.domain) {
>> +        fprintf(stderr, "Can't assign device inside non-zero PCI segment "
>> +                "as this KVM module doesn't support it.\n");
>> +        return -ENODEV;
>> +    }
>> +
>> +    if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) {
>> +        fprintf(stderr, "No IOMMU found.  Unable to assign device \"%s\"\n",
>> +                dev->dev.qdev.id);
>> +        return -ENODEV;
>> +    }
>> +
>> +    if (dev->features & ASSIGNED_DEVICE_SHARE_INTX_MASK &&
>> +        kvm_has_intx_set_mask()) {
>> +        flags |= KVM_DEV_ASSIGN_PCI_2_3;
>> +    }
>> +
>> +    r = kvm_device_pci_assign(kvm_state, &dev->host, flags, &dev->dev_id);
>> +    if (r < 0) {
>> +        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
>> +                dev->dev.qdev.id, strerror(-r));
>> +
>> +        switch (r) {
>> +        case -EBUSY:
>> +            assign_failed_examine(dev);
>> +            break;
>> +        default:
>> +            break;
>> +        }
>> +    }
>> +    return r;
>> +}
>> +
>> +static bool check_irqchip_in_kernel(void)
>> +{
>> +    if (kvm_irqchip_in_kernel()) {
>> +        return true;
>> +    }
>> +    error_report("pci-assign: error: requires KVM with in-kernel irqchip "
>> +                 "enabled");
>> +    return false;
>> +}
>> +
>> +static int assign_intx(AssignedDevice *dev)
>> +{
>> +    AssignedIRQType new_type;
>> +    PCIINTxRoute intx_route;
>> +    bool intx_host_msi;
>> +    int r;
>> +
>> +    /* Interrupt PIN 0 means don't use INTx */
>> +    if (assigned_dev_pci_read_byte(&dev->dev, PCI_INTERRUPT_PIN) == 0) {
>> +        pci_device_set_intx_routing_notifier(&dev->dev, NULL);
>> +        return 0;
>> +    }
>> +
>> +    if (!check_irqchip_in_kernel()) {
>> +        return -ENOTSUP;
>> +    }
>> +
>> +    pci_device_set_intx_routing_notifier(&dev->dev,
>> +                                         assigned_dev_update_irq_routing);
>> +
>> +    intx_route = pci_device_route_intx_to_irq(&dev->dev, 0);
>> +    assert(intx_route.mode != PCI_INTX_INVERTED);
>> +
>> +    if (dev->intx_route.mode == intx_route.mode &&
>> +        dev->intx_route.irq == intx_route.irq) {
>> +        return 0;
>> +    }
>> +
>> +    switch (dev->assigned_irq_type) {
>> +    case ASSIGNED_IRQ_INTX_HOST_INTX:
>> +    case ASSIGNED_IRQ_INTX_HOST_MSI:
>> +        intx_host_msi = dev->assigned_irq_type == ASSIGNED_IRQ_INTX_HOST_MSI;
>> +        r = kvm_device_intx_deassign(kvm_state, dev->dev_id, intx_host_msi);
>> +        break;
>> +    case ASSIGNED_IRQ_MSI:
>> +        r = kvm_device_msi_deassign(kvm_state, dev->dev_id);
>> +        break;
>> +    case ASSIGNED_IRQ_MSIX:
>> +        r = kvm_device_msix_deassign(kvm_state, dev->dev_id);
>> +        break;
>> +    default:
>> +        r = 0;
>> +        break;
>> +    }
>> +    if (r) {
>> +        perror("assign_intx: deassign");
>> +    }
>> +    dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
>> +
>> +    if (intx_route.mode == PCI_INTX_DISABLED) {
>> +        dev->intx_route = intx_route;
>> +        return 0;
>> +    }
>> +
>> +retry:
> 
> Better refactor this, factor code out and avoid
> implementing loops using goto.

Will do in upstream. As I told Blue, I'd like to do all the bigger
refactorings over QEMU so that a) users can finally switch to a
full-featured upstream QEMU (I already received feedback regarding this)
and b) we can stop testing two repositories.

> 
>> +    if (dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK &&
>> +        dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
>> +        intx_host_msi = true;
>> +        new_type = ASSIGNED_IRQ_INTX_HOST_MSI;
>> +    } else {
>> +        intx_host_msi = false;
>> +        new_type = ASSIGNED_IRQ_INTX_HOST_INTX;
>> +    }
>> +
>> +    r = kvm_device_intx_assign(kvm_state, dev->dev_id, intx_host_msi,
>> +                               intx_route.irq);
>> +    if (r < 0) {
>> +        if (r == -EIO && !(dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK) &&
>> +            dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
>> +            /* Retry with host-side MSI. There might be an IRQ conflict and
>> +             * either the kernel or the device doesn't support sharing. */
>> +            fprintf(stderr,
>> +                    "Host-side INTx sharing not supported, "
>> +                    "using MSI instead.\n"
>> +                    "Some devices do not to work properly in this mode.\n");
>> +            dev->features |= ASSIGNED_DEVICE_PREFER_MSI_MASK;
>> +            goto retry;
>> +        }
>> +        fprintf(stderr, "Failed to assign irq for \"%s\": %s\n",
>> +                dev->dev.qdev.id, strerror(-r));
>> +        fprintf(stderr, "Perhaps you are assigning a device "
>> +                "that shares an IRQ with another device?\n");
> 
> This is now supported no? So arguably error message is wrong.
> Also report errors with stderr here and elsewhere.

The error will remain valid as a) not all kernels support sharing and b)
neither all devices do.

Will check what to do with all the error reporting. It requires more
than blind search&replace.

> 
>> +        return r;
>> +    }
>> +
>> +    dev->intx_route = intx_route;
>> +    dev->assigned_irq_type = new_type;
>> +    return r;
>> +}
>> +
>> +static void deassign_device(AssignedDevice *dev)
>> +{
>> +    int r;
>> +
>> +    r = kvm_device_pci_deassign(kvm_state, dev->dev_id);
>> +    if (r < 0) {
>> +        fprintf(stderr, "Failed to deassign device \"%s\" : %s\n",
>> +                dev->dev.qdev.id, strerror(-r));
> 
> assert?

Yes, makes more sense. Will change.

> 
>> +    }
>> +}
>> +
>> +/* The pci config space got updated. Check if irq numbers have changed
>> + * for our devices
>> + */
>> +static void assigned_dev_update_irq_routing(PCIDevice *dev)
>> +{
>> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, dev);
>> +    Error *err = NULL;
>> +    int r;
>> +
>> +    r = assign_intx(assigned_dev);
>> +    if (r < 0) {
>> +        qdev_unplug(&dev->qdev, &err);
>> +        assert(!err);
>> +    }
>> +}
>> +
> 
> Lots of stuff below duplicates logic in hw/msix.c and
> hw/msi.c.

See commit message for the roadmap.

> 
>> +static void assigned_dev_update_msi(PCIDevice *pci_dev)
>> +{
>> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap +
>> +                                     PCI_MSI_FLAGS);
>> +    int r;
>> +
>> +    /* Some guests gratuitously disable MSI even if they're not using it,
>> +     * try to catch this by only deassigning irqs if the guest is using
>> +     * MSI or intends to start. */
>> +    if (assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSI ||
>> +        (ctrl_byte & PCI_MSI_FLAGS_ENABLE)) {
>> +        r = kvm_device_msi_deassign(kvm_state, assigned_dev->dev_id);
>> +        /* -ENXIO means no assigned irq */
>> +        if (r && r != -ENXIO) {
>> +            perror("assigned_dev_update_msi: deassign irq");
>> +        }
>> +
>> +        free_msi_virqs(assigned_dev);
>> +
>> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
>> +        pci_device_set_intx_routing_notifier(pci_dev, NULL);
>> +    }
>> +
>> +    if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
>> +        uint8_t *pos = pci_dev->config + pci_dev->msi_cap;
>> +        MSIMessage msg;
>> +        int virq;
>> +
>> +        msg.address = pci_get_long(pos + PCI_MSI_ADDRESS_LO);
>> +        msg.data = pci_get_word(pos + PCI_MSI_DATA_32);
>> +        virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +        if (virq < 0) {
>> +            perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route");
>> +            return;
>> +        }
>> +
>> +        assigned_dev->msi_virq = g_malloc(sizeof(*assigned_dev->msi_virq));
>> +        assigned_dev->msi_virq_nr = 1;
>> +        assigned_dev->msi_virq[0] = virq;
>> +        if (kvm_device_msi_assign(kvm_state, assigned_dev->dev_id, virq) < 0) {
>> +            perror("assigned_dev_update_msi: kvm_device_msi_assign");
>> +        }
>> +
>> +        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
>> +        assigned_dev->intx_route.irq = -1;
>> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSI;
>> +    } else {
>> +        assign_intx(assigned_dev);
>> +    }
>> +}
>> +
>> +static bool msix_masked(MSIXTableEntry *entry)
>> +{
>> +    return (entry->ctrl & cpu_to_le32(0x1)) != 0;
>> +}
>> +
>> +static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
>> +{
>> +    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    uint16_t entries_nr = 0;
>> +    int i, r = 0;
>> +    MSIXTableEntry *entry = adev->msix_table;
>> +    MSIMessage msg;
>> +
>> +    /* Get the usable entry number for allocating */
>> +    for (i = 0; i < adev->msix_max; i++, entry++) {
>> +        if (msix_masked(entry)) {
>> +            continue;
>> +        }
>> +        entries_nr++;
>> +    }
>> +
>> +    DEBUG("MSI-X entries: %d\n", entries_nr);
>> +
>> +    /* It's valid to enable MSI-X with all entries masked */
>> +    if (!entries_nr) {
>> +        return 0;
>> +    }
>> +
>> +    r = kvm_device_msix_init_vectors(kvm_state, adev->dev_id, entries_nr);
>> +    if (r != 0) {
>> +        fprintf(stderr, "fail to set MSI-X entry number for MSIX! %s\n",
>> +                strerror(-r));
>> +        return r;
>> +    }
>> +
>> +    free_msi_virqs(adev);
>> +
>> +    adev->msi_virq_nr = adev->msix_max;
>> +    adev->msi_virq = g_malloc(adev->msix_max * sizeof(*adev->msi_virq));
>> +
>> +    entry = adev->msix_table;
>> +    for (i = 0; i < adev->msix_max; i++, entry++) {
>> +        adev->msi_virq[i] = -1;
>> +
>> +        if (msix_masked(entry)) {
>> +            continue;
>> +        }
>> +
>> +        msg.address = entry->addr_lo | ((uint64_t)entry->addr_hi << 32);
>> +        msg.data = entry->data;
>> +        r = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +        if (r < 0) {
>> +            return r;
>> +        }
>> +        adev->msi_virq[i] = r;
>> +
>> +        DEBUG("MSI-X vector %d, gsi %d, addr %08x_%08x, data %08x\n", i,
>> +              r, entry->addr_hi, entry->addr_lo, entry->data);
>> +
>> +        r = kvm_device_msix_set_vector(kvm_state, adev->dev_id, i,
>> +                                       adev->msi_virq[i]);
>> +        if (r) {
>> +            fprintf(stderr, "fail to set MSI-X entry! %s\n", strerror(-r));
>> +            break;
>> +        }
>> +    }
>> +
>> +    return r;
>> +}
>> +
>> +static void assigned_dev_update_msix(PCIDevice *pci_dev)
>> +{
>> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    uint16_t ctrl_word = pci_get_word(pci_dev->config + pci_dev->msix_cap +
>> +                                      PCI_MSIX_FLAGS);
>> +    int r;
>> +
>> +    /* Some guests gratuitously disable MSIX even if they're not using it,
>> +     * try to catch this by only deassigning irqs if the guest is using
>> +     * MSIX or intends to start. */
> 
> Above comment seems very unclear.

Yes, also to me. :)

> 
>> +    if ((assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSIX) ||
>> +        (ctrl_word & PCI_MSIX_FLAGS_ENABLE)) {
>> +        r = kvm_device_msix_deassign(kvm_state, assigned_dev->dev_id);
>> +        /* -ENXIO means no assigned irq */
>> +        if (r && r != -ENXIO) {
>> +            perror("assigned_dev_update_msix: deassign irq");
>> +        }
>> +
>> +        free_msi_virqs(assigned_dev);
>> +
>> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
>> +        pci_device_set_intx_routing_notifier(pci_dev, NULL);
>> +    }
>> +
>> +    if (ctrl_word & PCI_MSIX_FLAGS_ENABLE) {
>> +        if (assigned_dev_update_msix_mmio(pci_dev) < 0) {
>> +            perror("assigned_dev_update_msix_mmio");
>> +            return;
>> +        }
>> +
>> +        if (assigned_dev->msi_virq_nr > 0) {
>> +            if (kvm_device_msix_assign(kvm_state, assigned_dev->dev_id) < 0) {
>> +                perror("assigned_dev_enable_msix: assign irq");
>> +                return;
>> +            }
>> +        }
>> +        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
>> +        assigned_dev->intx_route.irq = -1;
>> +        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSIX;
>> +    } else {
>> +        assign_intx(assigned_dev);
>> +    }
>> +}
>> +
>> +static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
>> +                                             uint32_t address, int len)
>> +{
>> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    uint32_t virt_val = pci_default_read_config(pci_dev, address, len);
>> +    uint32_t real_val, emulate_mask, full_emulation_mask;
>> +
>> +    emulate_mask = 0;
>> +    memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len);
>> +    emulate_mask = le32_to_cpu(emulate_mask);
>> +
>> +    full_emulation_mask = 0xffffffff >> (32 - len * 8);
>> +
>> +    if (emulate_mask != full_emulation_mask) {
>> +        real_val = assigned_dev_pci_read(pci_dev, address, len);
>> +        return (virt_val & emulate_mask) | (real_val & ~emulate_mask);
>> +    } else {
>> +        return virt_val;
>> +    }
>> +}
>> +
>> +static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
>> +                                          uint32_t val, int len)
>> +{
>> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    uint16_t old_cmd = pci_get_word(pci_dev->config + PCI_COMMAND);
>> +    uint32_t emulate_mask, full_emulation_mask;
>> +    int ret;
>> +
>> +    pci_default_write_config(pci_dev, address, val, len);
>> +
>> +    if (kvm_has_intx_set_mask() &&
>> +        range_covers_byte(address, len, PCI_COMMAND + 1)) {
>> +        bool intx_masked = (pci_get_word(pci_dev->config + PCI_COMMAND) &
>> +                            PCI_COMMAND_INTX_DISABLE);
>> +
>> +        if (intx_masked != !!(old_cmd & PCI_COMMAND_INTX_DISABLE)) {
>> +            ret = kvm_device_intx_set_mask(kvm_state, assigned_dev->dev_id,
>> +                                           intx_masked);
>> +            if (ret) {
>> +                perror("assigned_dev_pci_write_config: set intx mask");
> 
> error_report? hardware_error?
> 
>> +            }
>> +        }
>> +    }
>> +    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
>> +        if (range_covers_byte(address, len,
>> +                              pci_dev->msi_cap + PCI_MSI_FLAGS)) {
>> +            assigned_dev_update_msi(pci_dev);
>> +        }
>> +    }
>> +    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
>> +        if (range_covers_byte(address, len,
>> +                              pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) {
>> +            assigned_dev_update_msix(pci_dev);
>> +        }
>> +    }
>> +
>> +    emulate_mask = 0;
>> +    memcpy(&emulate_mask, assigned_dev->emulate_config_write + address, len);
>> +    emulate_mask = le32_to_cpu(emulate_mask);
>> +
>> +    full_emulation_mask = 0xffffffff >> (32 - len * 8);
>> +
>> +    if (emulate_mask != full_emulation_mask) {
>> +        if (emulate_mask) {
>> +            val &= ~emulate_mask;
>> +            val |= assigned_dev_pci_read(pci_dev, address, len) & emulate_mask;
>> +        }
>> +        assigned_dev_pci_write(pci_dev, address, val, len);
>> +    }
>> +}
>> +
>> +static void assigned_dev_setup_cap_read(AssignedDevice *dev, uint32_t offset,
>> +                                        uint32_t len)
>> +{
>> +    assigned_dev_direct_config_read(dev, offset, len);
>> +    assigned_dev_emulate_config_read(dev, offset + PCI_CAP_LIST_NEXT, 1);
>> +}
>> +
>> +static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
>> +{
>> +    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    PCIRegion *pci_region = dev->real_device.regions;
>> +    int ret, pos;
>> +
>> +    /* Clear initial capabilities pointer and status copied from hw */
>> +    pci_set_byte(pci_dev->config + PCI_CAPABILITY_LIST, 0);
>> +    pci_set_word(pci_dev->config + PCI_STATUS,
>> +                 pci_get_word(pci_dev->config + PCI_STATUS) &
>> +                 ~PCI_STATUS_CAP_LIST);
>> +
>> +    /* Expose MSI capability
>> +     * MSI capability is the 1st capability in capability config */
>> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0);
>> +    if (pos != 0 && kvm_check_extension(kvm_state, KVM_CAP_ASSIGN_DEV_IRQ)) {
>> +        if (!check_irqchip_in_kernel()) {
>> +            return -ENOTSUP;
>> +        }
>> +        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;
> 
> again code below has lots of logic duplicated from msix.c
> pcie etc. 
> 
>> +        /* Only 32-bit/no-mask currently supported */
>> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSI, pos, 10);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +        pci_dev->msi_cap = pos;
>> +
>> +        pci_set_word(pci_dev->config + pos + PCI_MSI_FLAGS,
>> +                     pci_get_word(pci_dev->config + pos + PCI_MSI_FLAGS) &
>> +                     PCI_MSI_FLAGS_QMASK);
>> +        pci_set_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO, 0);
>> +        pci_set_word(pci_dev->config + pos + PCI_MSI_DATA_32, 0);
>> +
>> +        /* Set writable fields */
>> +        pci_set_word(pci_dev->wmask + pos + PCI_MSI_FLAGS,
>> +                     PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
>> +        pci_set_long(pci_dev->wmask + pos + PCI_MSI_ADDRESS_LO, 0xfffffffc);
>> +        pci_set_word(pci_dev->wmask + pos + PCI_MSI_DATA_32, 0xffff);
>> +    }
>> +    /* Expose MSI-X capability */
>> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0);
>> +    if (pos != 0 && kvm_device_msix_supported(kvm_state)) {
>> +        int bar_nr;
>> +        uint32_t msix_table_entry;
>> +
>> +        if (!check_irqchip_in_kernel()) {
>> +            return -ENOTSUP;
>> +        }
>> +        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
>> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +        pci_dev->msix_cap = pos;
>> +
>> +        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS,
>> +                     pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
>> +                     PCI_MSIX_FLAGS_QSIZE);
>> +
>> +        /* Only enable and function mask bits are writable */
>> +        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
>> +                     PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
>> +
>> +        msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE);
>> +        bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK;
>> +        msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK;
>> +        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
>> +        dev->msix_max = pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS);
>> +        dev->msix_max &= PCI_MSIX_FLAGS_QSIZE;
>> +        dev->msix_max += 1;
>> +    }
>> +
>> +    /* Minimal PM support, nothing writable, device appears to NAK changes */
>> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0);
>> +    if (pos) {
>> +        uint16_t pmc;
>> +
>> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, pos, PCI_PM_SIZEOF);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        assigned_dev_setup_cap_read(dev, pos, PCI_PM_SIZEOF);
>> +
>> +        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
>> +        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
>> +        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
>> +
>> +        /* assign_device will bring the device up to D0, so we don't need
>> +         * to worry about doing that ourselves here. */
>> +        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL,
>> +                     PCI_PM_CTRL_NO_SOFT_RESET);
>> +
>> +        pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0);
>> +        pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0);
>> +    }
> 
> Would be nice to move logic to hw/pcipm.c.
> 
>> +
>> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0);
>> +    if (pos) {
>> +        uint8_t version, size = 0;
>> +        uint16_t type, devctl, lnksta;
>> +        uint32_t devcap, lnkcap;
>> +
>> +        version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS);
>> +        version &= PCI_EXP_FLAGS_VERS;
>> +        if (version == 1) {
>> +            size = 0x14;
>> +        } else if (version == 2) {
>> +            /*
>> +             * Check for non-std size, accept reduced size to 0x34,
>> +             * which is what bcm5761 implemented, violating the
>> +             * PCIe v3.0 spec that regs should exist and be read as 0,
>> +             * not optionally provided and shorten the struct size.
>> +             */
>> +            size = MIN(0x3c, PCI_CONFIG_SPACE_SIZE - pos);
>> +            if (size < 0x34) {
>> +                fprintf(stderr,
>> +                        "%s: Invalid size PCIe cap-id 0x%x\n",
>> +                        __func__, PCI_CAP_ID_EXP);
>> +                return -EINVAL;
>> +            } else if (size != 0x3c) {
>> +                fprintf(stderr,
>> +                        "WARNING, %s: PCIe cap-id 0x%x has "
>> +                        "non-standard size 0x%x; std size should be 0x3c\n",
>> +                         __func__, PCI_CAP_ID_EXP, size);
>> +            }
>> +        } else if (version == 0) {
>> +            uint16_t vid, did;
>> +            vid = pci_get_word(pci_dev->config + PCI_VENDOR_ID);
>> +            did = pci_get_word(pci_dev->config + PCI_DEVICE_ID);
>> +            if (vid == PCI_VENDOR_ID_INTEL && did == 0x10ed) {
>> +                /*
>> +                 * quirk for Intel 82599 VF with invalid PCIe capability
>> +                 * version, should really be version 2 (same as PF)
>> +                 */
>> +                size = 0x3c;
>> +            }
>> +        }
>> +
>> +        if (size == 0) {
>> +            fprintf(stderr,
>> +                    "%s: Unsupported PCI express capability version %d\n",
>> +                    __func__, version);
>> +            return -EINVAL;
>> +        }
>> +
>> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_EXP, pos, size);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        assigned_dev_setup_cap_read(dev, pos, size);
>> +
>> +        type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
>> +        type = (type & PCI_EXP_FLAGS_TYPE) >> 4;
>> +        if (type != PCI_EXP_TYPE_ENDPOINT &&
>> +            type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) {
>> +            fprintf(stderr,
>> +                    "Device assignment only supports endpoint assignment, "
>> +                    "device type %d\n", type);
>> +            return -EINVAL;
>> +        }
>> +
>> +        /* capabilities, pass existing read-only copy
>> +         * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */
>> +
>> +        /* device capabilities: hide FLR */
>> +        devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP);
>> +        devcap &= ~PCI_EXP_DEVCAP_FLR;
>> +        pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap);
>> +
>> +        /* device control: clear all error reporting enable bits, leaving
>> +         *                 only a few host values.  Note, these are
>> +         *                 all writable, but not passed to hw.
>> +         */
>> +        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
>> +        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
>> +                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
>> +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
>> +        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
>> +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
>> +
>> +        /* Clear device status */
>> +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0);
>> +
>> +        /* Link capabilities, expose links and latencues, clear reporting */
>> +        lnkcap = pci_get_long(pci_dev->config + pos + PCI_EXP_LNKCAP);
>> +        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
>> +                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
>> +                   PCI_EXP_LNKCAP_L1EL);
>> +        pci_set_long(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
>> +
>> +        /* Link control, pass existing read-only copy.  Should be writable? */
>> +
>> +        /* Link status, only expose current speed and width */
>> +        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
>> +        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
>> +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
>> +
>> +        if (version >= 2) {
>> +            /* Slot capabilities, control, status - not needed for endpoints */
>> +            pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0);
>> +            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0);
>> +            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0);
>> +
>> +            /* Root control, capabilities, status - not needed for endpoints */
>> +            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0);
>> +            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0);
>> +            pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0);
>> +
>> +            /* Device capabilities/control 2, pass existing read-only copy */
>> +            /* Link control 2, pass existing read-only copy */
>> +        }
>> +    }
>> +
>> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0);
>> +    if (pos) {
>> +        uint16_t cmd;
>> +        uint32_t status;
>> +
>> +        /* Only expose the minimum, 8 byte capability */
>> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_PCIX, pos, 8);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        assigned_dev_setup_cap_read(dev, pos, 8);
>> +
>> +        /* Command register, clear upper bits, including extended modes */
>> +        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
>> +        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
>> +                PCI_X_CMD_MAX_SPLIT);
>> +        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
>> +
>> +        /* Status register, update with emulated PCI bus location, clear
>> +         * error bits, leave the rest. */
>> +        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
>> +        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
>> +        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
>> +        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
>> +                    PCI_X_STATUS_SPL_ERR);
>> +        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
>> +    }
>> +
>> +    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0);
>> +    if (pos) {
>> +        /* Direct R/W passthrough */
>> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        assigned_dev_setup_cap_read(dev, pos, 8);
>> +
>> +        /* direct write for cap content */
>> +        assigned_dev_direct_config_write(dev, pos + 2, 6);
>> +    }
>> +
>> +    /* Devices can have multiple vendor capabilities, get them all */
>> +    for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
>> +        pos += PCI_CAP_LIST_NEXT) {
>> +        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
>> +        /* Direct R/W passthrough */
>> +        ret = pci_add_capability(pci_dev, PCI_CAP_ID_VNDR, pos, len);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        assigned_dev_setup_cap_read(dev, pos, len);
>> +
>> +        /* direct write for cap content */
>> +        assigned_dev_direct_config_write(dev, pos + 2, len - 2);
>> +    }
>> +
>> +    /* If real and virtual capability list status bits differ, virtualize the
>> +     * access. */
>> +    if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_LIST) !=
>> +        (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) &
>> +         PCI_STATUS_CAP_LIST)) {
>> +        dev->emulate_config_read[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static uint64_t msix_mmio_read(void *opaque, target_phys_addr_t addr,
>> +                               unsigned size)
>> +{
>> +    AssignedDevice *adev = opaque;
>> +    uint64_t val;
>> +
>> +    memcpy(&val, (void *)((uint8_t *)adev->msix_table + addr), size);
>> +
>> +    return val;
>> +}
>> +
>> +static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
>> +                            uint64_t val, unsigned size)
>> +{
>> +    AssignedDevice *adev = opaque;
>> +    PCIDevice *pdev = &adev->dev;
>> +    uint16_t ctrl;
>> +    MSIXTableEntry orig;
>> +    int i = addr >> 4;
>> +
>> +    if (i >= adev->msix_max) {
>> +        return; /* Drop write */
>> +    }
>> +
>> +    ctrl = pci_get_word(pdev->config + pdev->msix_cap + PCI_MSIX_FLAGS);
>> +
>> +    DEBUG("write to MSI-X table offset 0x%lx, val 0x%lx\n", addr, val);
>> +
>> +    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
>> +        orig = adev->msix_table[i];
>> +    }
>> +
>> +    memcpy((void *)((uint8_t *)adev->msix_table + addr), &val, size);
>> +
>> +    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
>> +        MSIXTableEntry *entry = &adev->msix_table[i];
>> +
>> +        if (!msix_masked(&orig) && msix_masked(entry)) {
>> +            /*
>> +             * Vector masked, disable it
>> +             *
>> +             * XXX It's not clear if we can or should actually attempt
>> +             * to mask or disable the interrupt.  KVM doesn't have
>> +             * support for pending bits and kvm_assign_set_msix_entry
>> +             * doesn't modify the device hardware mask.  Interrupts
>> +             * while masked are simply not injected to the guest, so
>> +             * are lost.  Can we get away with always injecting an
>> +             * interrupt on unmask?
>> +             */
>> +        } else if (msix_masked(&orig) && !msix_masked(entry)) {
>> +            /* Vector unmasked */
>> +            if (i >= adev->msi_virq_nr || adev->msi_virq[i] < 0) {
>> +                /* Previously unassigned vector, start from scratch */
>> +                assigned_dev_update_msix(pdev);
>> +                return;
>> +            } else {
>> +                /* Update an existing, previously masked vector */
>> +                MSIMessage msg;
>> +                int ret;
>> +
>> +                msg.address = entry->addr_lo |
>> +                    ((uint64_t)entry->addr_hi << 32);
>> +                msg.data = entry->data;
>> +
>> +                ret = kvm_irqchip_update_msi_route(kvm_state,
>> +                                                   adev->msi_virq[i], msg);
>> +                if (ret) {
>> +                    fprintf(stderr,
>> +                            "Error updating irq routing entry (%d)\n", ret);
>> +                }
>> +            }
>> +        }
>> +    }
>> +}
>> +
>> +static const MemoryRegionOps msix_mmio_ops = {
>> +    .read = msix_mmio_read,
>> +    .write = msix_mmio_write,
>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>> +    .valid = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +    },
>> +    .impl = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +    },
>> +};
>> +
>> +static void msix_reset(AssignedDevice *dev)
>> +{
>> +    MSIXTableEntry *entry;
>> +    int i;
>> +
>> +    if (!dev->msix_table) {
>> +        return;
>> +    }
>> +
>> +    memset(dev->msix_table, 0, MSIX_PAGE_SIZE);
>> +
>> +    for (i = 0, entry = dev->msix_table; i < dev->msix_max; i++, entry++) {
>> +        entry->ctrl = cpu_to_le32(0x1); /* Masked */
>> +    }
>> +}
>> +
> 
> Pls rename to assigned_dev_msix_reset.
> I would prefer if this one comment is addressed before
> merging.

Done here already, also for other msix_* functions and structs.

> 
>> +static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
>> +{
>> +    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE,
>> +                           MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
>> +    if (dev->msix_table == MAP_FAILED) {
>> +        fprintf(stderr, "fail allocate msix_table! %s\n", strerror(errno));
>> +        return -EFAULT;
>> +    }
>> +
>> +    msix_reset(dev);
>> +
>> +    memory_region_init_io(&dev->mmio, &msix_mmio_ops, dev,
>> +                          "assigned-dev-msix", MSIX_PAGE_SIZE);
>> +    return 0;
>> +}
>> +
>> +static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
>> +{
>> +    if (!dev->msix_table) {
>> +        return;
>> +    }
>> +
>> +    memory_region_destroy(&dev->mmio);
>> +
>> +    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
>> +        fprintf(stderr, "error unmapping msix_table! %s\n",
>> +                strerror(errno));
>> +    }
>> +    dev->msix_table = NULL;
>> +}
>> +
>> +static const VMStateDescription vmstate_assigned_device = {
>> +    .name = "pci-assign",
>> +    .unmigratable = 1,
>> +};
>> +
>> +static void reset_assigned_device(DeviceState *dev)
>> +{
>> +    PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, dev);
>> +    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    char reset_file[64];
>> +    const char reset[] = "1";
>> +    int fd, ret;
>> +
>> +    /*
>> +     * If a guest is reset without being shutdown, MSI/MSI-X can still
>> +     * be running.  We want to return the device to a known state on
>> +     * reset, so disable those here.  We especially do not want MSI-X
>> +     * enabled since it lives in MMIO space, which is about to get
>> +     * disabled.
>> +     */
>> +    if (adev->assigned_irq_type == ASSIGNED_IRQ_MSIX) {
>> +        uint16_t ctrl = pci_get_word(pci_dev->config +
>> +                                     pci_dev->msix_cap + PCI_MSIX_FLAGS);
>> +
>> +        pci_set_word(pci_dev->config + pci_dev->msix_cap + PCI_MSIX_FLAGS,
>> +                     ctrl & ~PCI_MSIX_FLAGS_ENABLE);
>> +        assigned_dev_update_msix(pci_dev);
>> +    } else if (adev->assigned_irq_type == ASSIGNED_IRQ_MSI) {
>> +        uint8_t ctrl = pci_get_byte(pci_dev->config +
>> +                                    pci_dev->msi_cap + PCI_MSI_FLAGS);
>> +
>> +        pci_set_byte(pci_dev->config + pci_dev->msi_cap + PCI_MSI_FLAGS,
>> +                     ctrl & ~PCI_MSI_FLAGS_ENABLE);
>> +        assigned_dev_update_msi(pci_dev);
>> +    }
>> +
>> +    snprintf(reset_file, sizeof(reset_file),
>> +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
>> +             adev->host.domain, adev->host.bus, adev->host.slot,
>> +             adev->host.function);
>> +
>> +    /*
>> +     * Issue a device reset via pci-sysfs.  Note that we use write(2) here
>> +     * and ignore the return value because some kernels have a bug that
>> +     * returns 0 rather than bytes written on success, sending us into an
>> +     * infinite retry loop using other write mechanisms.
>> +     */
>> +    fd = open(reset_file, O_WRONLY);
>> +    if (fd != -1) {
>> +        ret = write(fd, reset, strlen(reset));
>> +        (void)ret;
>> +        close(fd);
>> +    }
>> +
>> +    /*
>> +     * When a 0 is written to the bus master register, the device is logically
>> +     * disconnected from the PCI bus. This avoids further DMA transfers.
>> +     */
>> +    assigned_dev_pci_write_config(pci_dev, PCI_COMMAND, 0, 1);
>> +}
>> +
>> +static int assigned_initfn(struct PCIDevice *pci_dev)
>> +{
>> +    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +    uint8_t e_intx;
>> +    int r;
>> +
>> +    if (!kvm_enabled()) {
>> +        error_report("pci-assign: error: requires KVM support");
>> +        return -1;
>> +    }
>> +
>> +    if (!dev->host.domain && !dev->host.bus && !dev->host.slot &&
>> +        !dev->host.function) {
>> +        error_report("pci-assign: error: no host device specified");
>> +        return -1;
>> +    }
>> +
>> +    /*
>> +     * Set up basic config space access control. Will be further refined during
>> +     * device initialization.
>> +     */
>> +    assigned_dev_emulate_config_read(dev, 0, PCI_CONFIG_SPACE_SIZE);
>> +    assigned_dev_direct_config_read(dev, PCI_STATUS, 2);
>> +    assigned_dev_direct_config_read(dev, PCI_REVISION_ID, 1);
>> +    assigned_dev_direct_config_read(dev, PCI_CLASS_PROG, 3);
>> +    assigned_dev_direct_config_read(dev, PCI_CACHE_LINE_SIZE, 1);
>> +    assigned_dev_direct_config_read(dev, PCI_LATENCY_TIMER, 1);
>> +    assigned_dev_direct_config_read(dev, PCI_BIST, 1);
>> +    assigned_dev_direct_config_read(dev, PCI_CARDBUS_CIS, 4);
>> +    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_VENDOR_ID, 2);
>> +    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_ID, 2);
>> +    assigned_dev_direct_config_read(dev, PCI_CAPABILITY_LIST + 1, 7);
>> +    assigned_dev_direct_config_read(dev, PCI_MIN_GNT, 1);
>> +    assigned_dev_direct_config_read(dev, PCI_MAX_LAT, 1);
>> +    memcpy(dev->emulate_config_write, dev->emulate_config_read,
>> +           sizeof(dev->emulate_config_read));
>> +
>> +    if (get_real_device(dev, dev->host.domain, dev->host.bus,
>> +                        dev->host.slot, dev->host.function)) {
>> +        error_report("pci-assign: Error: Couldn't get real device (%s)!",
>> +                     dev->dev.qdev.id);
>> +        goto out;
>> +    }
>> +
>> +    if (assigned_device_pci_cap_init(pci_dev) < 0) {
>> +        goto out;
>> +    }
>> +
>> +    /* intercept MSI-X entry page in the MMIO */
>> +    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
>> +        if (assigned_dev_register_msix_mmio(dev)) {
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /* handle real device's MMIO/PIO BARs */
>> +    if (assigned_dev_register_regions(dev->real_device.regions,
>> +                                      dev->real_device.region_number,
>> +                                      dev)) {
>> +        goto out;
>> +    }
>> +
>> +    /* handle interrupt routing */
> 
> hardcoded offsets below.

Will fix.

> 
>> +    e_intx = dev->dev.config[0x3d] - 1;
>> +    dev->intpin = e_intx;
>> +    dev->intx_route.mode = PCI_INTX_DISABLED;
>> +    dev->intx_route.irq = -1;
>> +
>> +    /* assign device to guest */
>> +    r = assign_device(dev);
>> +    if (r < 0) {
>> +        goto out;
>> +    }
>> +
>> +    /* assign legacy INTx to the device */
>> +    r = assign_intx(dev);
>> +    if (r < 0) {
>> +        goto assigned_out;
>> +    }
>> +
>> +    assigned_dev_load_option_rom(dev);
>> +
>> +    add_boot_device_path(dev->bootindex, &pci_dev->qdev, NULL);
>> +
>> +    return 0;
>> +
>> +assigned_out:
>> +    deassign_device(dev);
>> +out:
>> +    free_assigned_device(dev);
>> +    return -1;
>> +}
>> +
>> +static void assigned_exitfn(struct PCIDevice *pci_dev)
>> +{
>> +    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
>> +
>> +    deassign_device(dev);
>> +    free_assigned_device(dev);
>> +}
>> +
>> +static Property da_properties[] = {
> 
> assigned_dev_properties.

Ok.

> 
>> +    DEFINE_PROP_PCI_HOST_DEVADDR("host", AssignedDevice, host),
>> +    DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
>> +                    ASSIGNED_DEVICE_PREFER_MSI_BIT, false),
>> +    DEFINE_PROP_BIT("share_intx", AssignedDevice, features,
>> +                    ASSIGNED_DEVICE_SHARE_INTX_BIT, true),
>> +    DEFINE_PROP_INT32("bootindex", AssignedDevice, bootindex, -1),
>> +    DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
>> +    DEFINE_PROP_END_OF_LIST(),
>> +};
>> +
>> +static void assign_class_init(ObjectClass *klass, void *data)
>> +{
>> +    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +
>> +    k->init         = assigned_initfn;
>> +    k->exit         = assigned_exitfn;
>> +    k->config_read  = assigned_dev_pci_read_config;
>> +    k->config_write = assigned_dev_pci_write_config;
>> +    dc->props       = da_properties;
>> +    dc->vmsd        = &vmstate_assigned_device;
>> +    dc->reset       = reset_assigned_device;
>> +}
>> +
>> +static TypeInfo assign_info = {
>> +    .name               = "pci-assign",
>> +    .parent             = TYPE_PCI_DEVICE,
>> +    .instance_size      = sizeof(AssignedDevice),
>> +    .class_init         = assign_class_init,
>> +};
>> +
>> +static void assign_register_types(void)
>> +{
>> +    type_register_static(&assign_info);
>> +}
>> +
>> +type_init(assign_register_types)
>> +
>> +/*
>> + * Scan the assigned devices for the devices that have an option ROM, and then
>> + * load the corresponding ROM data to RAM. If an error occurs while loading an
>> + * option ROM, we just ignore that option ROM and continue with the next one.
>> + */
>> +static void assigned_dev_load_option_rom(AssignedDevice *dev)
>> +{
>> +    char name[32], rom_file[64];
>> +    FILE *fp;
>> +    uint8_t val;
>> +    struct stat st;
>> +    void *ptr;
>> +
>> +    /* If loading ROM from file, pci handles it */
>> +    if (dev->dev.romfile || !dev->dev.rom_bar) {
>> +        return;
>> +    }
>> +
>> +    snprintf(rom_file, sizeof(rom_file),
>> +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
>> +             dev->host.domain, dev->host.bus, dev->host.slot,
>> +             dev->host.function);
>> +
>> +    if (stat(rom_file, &st)) {
>> +        return;
>> +    }
>> +
>> +    if (access(rom_file, F_OK)) {
>> +        fprintf(stderr, "pci-assign: Insufficient privileges for %s\n",
>> +                rom_file);
>> +        return;
>> +    }
>> +
>> +    /* Write "1" to the ROM file to enable it */
>> +    fp = fopen(rom_file, "r+");
>> +    if (fp == NULL) {
>> +        return;
>> +    }
>> +    val = 1;
>> +    if (fwrite(&val, 1, 1, fp) != 1) {
>> +        goto close_rom;
>> +    }
>> +    fseek(fp, 0, SEEK_SET);
>> +
>> +    snprintf(name, sizeof(name), "%s.rom",
>> +            object_get_typename(OBJECT(dev)));
>> +    memory_region_init_ram(&dev->dev.rom, name, st.st_size);
>> +    vmstate_register_ram(&dev->dev.rom, &dev->dev.qdev);
>> +    ptr = memory_region_get_ram_ptr(&dev->dev.rom);
>> +    memset(ptr, 0xff, st.st_size);
>> +
>> +    if (!fread(ptr, 1, st.st_size, fp)) {
>> +        fprintf(stderr, "pci-assign: Cannot read from host %s\n"
>> +                "\tDevice option ROM contents are probably invalid "
>> +                "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
>> +                "or load from file with romfile=\n", rom_file);
>> +        memory_region_destroy(&dev->dev.rom);
>> +        goto close_rom;
>> +    }
>> +
>> +    pci_register_bar(&dev->dev, PCI_ROM_SLOT, 0, &dev->dev.rom);
>> +    dev->dev.has_rom = true;
>> +close_rom:
>> +    /* Write "0" to disable ROM */
>> +    fseek(fp, 0, SEEK_SET);
>> +    val = 0;
>> +    if (!fwrite(&val, 1, 1, fp)) {
>> +        DEBUG("%s\n", "Failed to disable pci-sysfs rom file");
>> +    }
>> +    fclose(fp);
>> +}
>> -- 
>> 1.7.3.4

Thanks for the review,
Jan
Andreas Färber Aug. 29, 2012, 2:08 p.m. UTC | #23
Am 28.08.2012 14:57, schrieb Anthony Liguori:
> Andreas Färber <afaerber@suse.de> writes:
> 
>> Hi,
>>
>> Am 27.08.2012 08:28, schrieb Jan Kiszka:
>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>
>>> This adds PCI device assignment for i386 targets using the classic KVM
>>> interfaces. This version is 100% identical to what is being maintained
>>> in qemu-kvm for several years and is supported by libvirt as well. It is
>>> expected to remain relevant for another couple of years until kernels
>>> without full-features and performance-wise equivalent VFIO support are
>>> obsolete.
>>>
>>> A refactoring to-do that should be done in-tree is to model MSI and
>>> MSI-X support via the generic PCI layer, similar to what VFIO is already
>>> doing for MSI-X. This should improve the correctness and clean up the
>>> code from duplicate logic.
>>>
>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>> ---
>>>  hw/kvm/Makefile.objs |    2 +-
>>>  hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 1930 insertions(+), 1 deletions(-)
>>>  create mode 100644 hw/kvm/pci-assign.c
>> [...]
>>> diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
>>> new file mode 100644
>>> index 0000000..9cce02c
>>> --- /dev/null
>>> +++ b/hw/kvm/pci-assign.c
>>> @@ -0,0 +1,1929 @@
>>> +/*
>>> + * Copyright (c) 2007, Neocleus Corporation.
>>> + *
>>> + * This program is free software; you can redistribute it and/or modify it
>>> + * under the terms and conditions of the GNU General Public License,
>>> + * version 2, as published by the Free Software Foundation.
>>
>> The downside of accepting this into qemu.git is that it gets us a huge
>> blob of GPLv2-only code without history of contributors for GPLv2+
>> relicensing...
> 
> That is 100% okay.

Why? The way this is being submitted I don't see why we should treat
Jan's patch any different from a patch by IBM or Samsung where we've
asked folks to fix the license to comply with what I thought was our new
policy (it does not even contain a from-x-on-GPLv2+ notice).

If we want to get rid of qemu-kvm by pulling this in nontheless (for
which I do have sympathies from a packaging perspective) why not instead
do a git-merge from qemu-kvm so that we have the full file history at
hands inside qemu.git?

Regards,
Andreas
Anthony Liguori Aug. 29, 2012, 6:32 p.m. UTC | #24
Andreas Färber <afaerber@suse.de> writes:

> Am 28.08.2012 14:57, schrieb Anthony Liguori:
>> Andreas Färber <afaerber@suse.de> writes:
>> 
>>> Hi,
>>>
>>> Am 27.08.2012 08:28, schrieb Jan Kiszka:
>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>
>>>> This adds PCI device assignment for i386 targets using the classic KVM
>>>> interfaces. This version is 100% identical to what is being maintained
>>>> in qemu-kvm for several years and is supported by libvirt as well. It is
>>>> expected to remain relevant for another couple of years until kernels
>>>> without full-features and performance-wise equivalent VFIO support are
>>>> obsolete.
>>>>
>>>> A refactoring to-do that should be done in-tree is to model MSI and
>>>> MSI-X support via the generic PCI layer, similar to what VFIO is already
>>>> doing for MSI-X. This should improve the correctness and clean up the
>>>> code from duplicate logic.
>>>>
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>> ---
>>>>  hw/kvm/Makefile.objs |    2 +-
>>>>  hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 1930 insertions(+), 1 deletions(-)
>>>>  create mode 100644 hw/kvm/pci-assign.c
>>> [...]
>>>> diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
>>>> new file mode 100644
>>>> index 0000000..9cce02c
>>>> --- /dev/null
>>>> +++ b/hw/kvm/pci-assign.c
>>>> @@ -0,0 +1,1929 @@
>>>> +/*
>>>> + * Copyright (c) 2007, Neocleus Corporation.
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify it
>>>> + * under the terms and conditions of the GNU General Public License,
>>>> + * version 2, as published by the Free Software Foundation.
>>>
>>> The downside of accepting this into qemu.git is that it gets us a huge
>>> blob of GPLv2-only code without history of contributors for GPLv2+
>>> relicensing...
>> 
>> That is 100% okay.
>
> Why? The way this is being submitted I don't see why we should treat
> Jan's patch any different from a patch by IBM or Samsung where we've
> asked folks to fix the license to comply with what I thought was our new
> policy (it does not even contain a from-x-on-GPLv2+ notice).

We are not rejecting patches just because aren't GPLv3 compatible.

I'm okay with people trying to make QEMU GPLv2+ but we're not going to
reject code that cannot be practically made GPLv2+.  Making as much code
as possible GPLv2+ means more projects can share code from QEMU.

I have no desire to make QEMU GPLv3.  I said this many times in the
past.

GPLv2 only is fine for contributions.

Regards,

Anthony Liguori

>
> If we want to get rid of qemu-kvm by pulling this in nontheless (for
> which I do have sympathies from a packaging perspective) why not instead
> do a git-merge from qemu-kvm so that we have the full file history at
> hands inside qemu.git?
>
> Regards,
> Andreas
>
> -- 
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
Blue Swirl Sept. 1, 2012, 9:20 a.m. UTC | #25
On Tue, Aug 28, 2012 at 9:51 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Blue Swirl <blauwirbel@gmail.com> writes:
>
>> On Tue, Aug 28, 2012 at 7:31 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>> Blue Swirl <blauwirbel@gmail.com> writes:
>>>
>>>> On Tue, Aug 28, 2012 at 5:28 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>> On Tue, Aug 28, 2012 at 05:01:55PM +0000, Blue Swirl wrote:
>>>>>> On Tue, Aug 28, 2012 at 7:35 AM, Michael Tokarev <mjt@tls.msk.ru> wrote:
>>>>>> > On 27.08.2012 22:56, Blue Swirl wrote:
>>>>>> > []
>>>>>> >>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>>>>> >>> +{
>>>>>> >>> +    AssignedDevRegion *d = opaque;
>>>>>> >>> +    uint8_t *in = d->u.r_virtbase + addr;
>>>>>> >>
>>>>>> >> Don't perform arithmetic with void pointers.
>>>>>> >
>>>>>> > There are a few places in common qemu code which does this for a very
>>>>>> > long time.  So I guess it is safe now.
>>>>>>
>>>>>> It's a non-standard GCC extension.
>>>>>
>>>>> So?  We use many other GCC extensions. grep for typeof.
>>>>
>>>> Dependencies should not be introduced trivially. In this case, it's
>>>> pretty easy to avoid void pointer arithmetic as Jan's next version
>>>> shows.
>>>
>>> The standard is vague with respect void arithmetic.  Most compilers
>>> allow it.  A very good analysis of the standard can be found below.
>>>
>>> http://stackoverflow.com/questions/3523145/pointer-arithmetic-for-void-pointer-in-c
>>
>> The analysis would seem to show that arithmetic may be acceptable, but
>> it doesn't say that void pointers must be treated like char pointers.
>> In my view, this would make sense:
>>
>> char *cptr;
>> void *vptr;
>>
>> Since
>> cptr++;
>> is equivalent to
>> cptr = (char *)((uintptr_t)cptr + sizeof(*cptr));
>>
>> therefore
>>
>> vptr++;
>> should be equivalent to
>> vptr = (void *)((uintptr_t)vptr + sizeof(*vptr));
>> That is, vptr++ should be equivalent to vptr += 0 because sizeof(void)
>> should be 0 if allowed.
>
> sizeof(void) == 1
>
> With GCC at least.

It's not valid C (0 is just how I think it should be if allowed). Also
GCC can reject it even with std=gnu89 (default, C89 with GNU
extensions):
$ cat void.c
unsigned long x = sizeof(void);
$ gcc -pedantic void.c -c
void.c:1: warning: invalid application of 'sizeof' to a void type

> Regards,
>
> Anthony Liguori
>
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>>
>>>>>
>>>>> Is there a work in progress to build GCC with visual studio?
>>>>> If yes what are the chances KVM device assignment
>>>>> will work on windows?
>>>>
>>>> IIRC there was really a project to use KVM on Windows and another
>>>> project to build QEMU with MSVC.
>>>>
>>>>>
>>>>> Look QEMU codebase is what it is. Unless you rework all existing
>>>>> code to confirm to your taste, I do not see why you NACK valid new code
>>>>> unless it confirms to same.
>>>>
>>>> Yes, I'd be happy to fix the style with huge patches at once. But our
>>>> fearless leader does not agree, so we are stuck with the codebase
>>>> being what it is until it is fixed one step at a time.
>>>>
>>>>>
>>>>>> >
>>>>>> > /mjt
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Sept. 3, 2012, 3:59 p.m. UTC | #26
On 08/29/2012 11:49 AM, Peter Maydell wrote:
> On 29 August 2012 09:47, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2012-08-28 23:26, Peter Maydell wrote:
>>> Since this is arch-specific we should probably give the
>>> resulting device a more specific name than "pci-assign",
>>> which implies that it is (a) ok for any PCI system and
>>> (b) not something that will be obsolete in the foreseen
>>> future.
>>
>> The name has to be like this for libvirt compatibility. I can provide it
>> as alias, though, calling the device "kvm-pci-assign" by default. Better?
> 
> ...is it likely to ever work for non-x86 PCI devices?

It used to work for ia64 before it died.  So it's not architecture
specific by design.
Avi Kivity Sept. 3, 2012, 4:06 p.m. UTC | #27
On 08/28/2012 03:30 AM, Jan Kiszka wrote:
>> 
>> Maybe add case 8: and default: with abort(), also below.
> 
> PIO is never 8 bytes long, the generic layer protects us.

Note: eventually the pio space will be mapped directly to mmio (instead
of being bounced via cpu_inb() in the bridge's mmio handler), and so we
will have to add this restriction to the memory core.
Avi Kivity Sept. 3, 2012, 4:14 p.m. UTC | #28
On 08/29/2012 11:27 AM, Markus Armbruster wrote:
> 
> I don't see a point in making contributors avoid non-problems that might
> conceivably become trivial problems some day.  Especially when there's
> no automated help with the avoiding.

-Wpointer-arith
Blue Swirl Sept. 3, 2012, 7:32 p.m. UTC | #29
On Mon, Sep 3, 2012 at 4:14 PM, Avi Kivity <avi@redhat.com> wrote:
> On 08/29/2012 11:27 AM, Markus Armbruster wrote:
>>
>> I don't see a point in making contributors avoid non-problems that might
>> conceivably become trivial problems some day.  Especially when there's
>> no automated help with the avoiding.
>
> -Wpointer-arith

+1

>
>
>
> --
> error compiling committee.c: too many arguments to function
Alex Williamson Sept. 4, 2012, 3:31 a.m. UTC | #30
On Mon, 2012-09-03 at 18:59 +0300, Avi Kivity wrote:
> On 08/29/2012 11:49 AM, Peter Maydell wrote:
> > On 29 August 2012 09:47, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> >> On 2012-08-28 23:26, Peter Maydell wrote:
> >>> Since this is arch-specific we should probably give the
> >>> resulting device a more specific name than "pci-assign",
> >>> which implies that it is (a) ok for any PCI system and
> >>> (b) not something that will be obsolete in the foreseen
> >>> future.
> >>
> >> The name has to be like this for libvirt compatibility. I can provide it
> >> as alias, though, calling the device "kvm-pci-assign" by default. Better?
> > 
> > ...is it likely to ever work for non-x86 PCI devices?
> 
> It used to work for ia64 before it died.  So it's not architecture
> specific by design.

Caveat; ia64 is a different architecture, but it was on Intel VT-d
chipsets, which therefore supported the IOMMU API and ia64 maps guests
the same way.  This is the same problem I ran into trying to name the
vfio iommu driver for x86.  It's not really x86-specific, but it's got
to look and smell pretty similar.  Thanks,

Alex
Avi Kivity Sept. 4, 2012, 8:32 a.m. UTC | #31
On 09/03/2012 10:32 PM, Blue Swirl wrote:
> On Mon, Sep 3, 2012 at 4:14 PM, Avi Kivity <avi@redhat.com> wrote:
>> On 08/29/2012 11:27 AM, Markus Armbruster wrote:
>>>
>>> I don't see a point in making contributors avoid non-problems that might
>>> conceivably become trivial problems some day.  Especially when there's
>>> no automated help with the avoiding.
>>
>> -Wpointer-arith
> 
> +1

FWIW, I'm not in favour of enabling it, just pointing out that it
exists.  In general I prefer avoiding unnecessary use of extensions, but
in this case the extension is trivial and improves readability.
Blue Swirl Sept. 4, 2012, 7:27 p.m. UTC | #32
On Tue, Sep 4, 2012 at 8:32 AM, Avi Kivity <avi@redhat.com> wrote:
> On 09/03/2012 10:32 PM, Blue Swirl wrote:
>> On Mon, Sep 3, 2012 at 4:14 PM, Avi Kivity <avi@redhat.com> wrote:
>>> On 08/29/2012 11:27 AM, Markus Armbruster wrote:
>>>>
>>>> I don't see a point in making contributors avoid non-problems that might
>>>> conceivably become trivial problems some day.  Especially when there's
>>>> no automated help with the avoiding.
>>>
>>> -Wpointer-arith
>>
>> +1
>
> FWIW, I'm not in favour of enabling it, just pointing out that it
> exists.  In general I prefer avoiding unnecessary use of extensions, but
> in this case the extension is trivial and improves readability.

Void pointers are not so type safe as uint8_t pointers. There's also
little difference in readability between those in my opinion.

>
>
> --
> error compiling committee.c: too many arguments to function
Anthony Liguori Sept. 4, 2012, 9 p.m. UTC | #33
Andreas Färber <afaerber@suse.de> writes:

> Am 28.08.2012 14:57, schrieb Anthony Liguori:
>> Andreas Färber <afaerber@suse.de> writes:
>> 
>>> Hi,
>>>
>>> Am 27.08.2012 08:28, schrieb Jan Kiszka:
>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>
>>>> This adds PCI device assignment for i386 targets using the classic KVM
>>>> interfaces. This version is 100% identical to what is being maintained
>>>> in qemu-kvm for several years and is supported by libvirt as well. It is
>>>> expected to remain relevant for another couple of years until kernels
>>>> without full-features and performance-wise equivalent VFIO support are
>>>> obsolete.
>>>>
>>>> A refactoring to-do that should be done in-tree is to model MSI and
>>>> MSI-X support via the generic PCI layer, similar to what VFIO is already
>>>> doing for MSI-X. This should improve the correctness and clean up the
>>>> code from duplicate logic.
>>>>
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>> ---
>>>>  hw/kvm/Makefile.objs |    2 +-
>>>>  hw/kvm/pci-assign.c  | 1929 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 1930 insertions(+), 1 deletions(-)
>>>>  create mode 100644 hw/kvm/pci-assign.c
>>> [...]
>>>> diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
>>>> new file mode 100644
>>>> index 0000000..9cce02c
>>>> --- /dev/null
>>>> +++ b/hw/kvm/pci-assign.c
>>>> @@ -0,0 +1,1929 @@
>>>> +/*
>>>> + * Copyright (c) 2007, Neocleus Corporation.
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify it
>>>> + * under the terms and conditions of the GNU General Public License,
>>>> + * version 2, as published by the Free Software Foundation.
>>>
>>> The downside of accepting this into qemu.git is that it gets us a huge
>>> blob of GPLv2-only code without history of contributors for GPLv2+
>>> relicensing...
>> 
>> That is 100% okay.
>
> Why? The way this is being submitted I don't see why we should treat
> Jan's patch any different from a patch by IBM or Samsung where we've
> asked folks to fix the license to comply with what I thought was our new
> policy (it does not even contain a from-x-on-GPLv2+ notice).

Asking is one thing.  Requiring is another.

I would prefer that people submitted GPLv2+, but I don't think it should
be a hard requirement.  It means, among other things, that we cannot
accept most code that originates from the Linux kernel.

Regards,

Anthony Liguori
Michael S. Tsirkin Sept. 4, 2012, 9:28 p.m. UTC | #34
On Tue, Sep 04, 2012 at 07:27:32PM +0000, Blue Swirl wrote:
> On Tue, Sep 4, 2012 at 8:32 AM, Avi Kivity <avi@redhat.com> wrote:
> > On 09/03/2012 10:32 PM, Blue Swirl wrote:
> >> On Mon, Sep 3, 2012 at 4:14 PM, Avi Kivity <avi@redhat.com> wrote:
> >>> On 08/29/2012 11:27 AM, Markus Armbruster wrote:
> >>>>
> >>>> I don't see a point in making contributors avoid non-problems that might
> >>>> conceivably become trivial problems some day.  Especially when there's
> >>>> no automated help with the avoiding.
> >>>
> >>> -Wpointer-arith
> >>
> >> +1
> >
> > FWIW, I'm not in favour of enabling it, just pointing out that it
> > exists.  In general I prefer avoiding unnecessary use of extensions, but
> > in this case the extension is trivial and improves readability.
> 
> Void pointers are not so type safe as uint8_t pointers.

casts are even worse.

> There's also
> little difference in readability between those in my opinion.

here too, casts are worse for readability.

> >
> >
> > --
> > error compiling committee.c: too many arguments to function
Avi Kivity Sept. 5, 2012, 3:26 p.m. UTC | #35
On 09/05/2012 12:00 AM, Anthony Liguori wrote:
>>
>> Why? The way this is being submitted I don't see why we should treat
>> Jan's patch any different from a patch by IBM or Samsung where we've
>> asked folks to fix the license to comply with what I thought was our new
>> policy (it does not even contain a from-x-on-GPLv2+ notice).
> 
> Asking is one thing.  Requiring is another.
> 
> I would prefer that people submitted GPLv2+, but I don't think it should
> be a hard requirement.  It means, among other things, that we cannot
> accept most code that originates from the Linux kernel.

We could extend this to "require unless there is a reason to grant an
exception" if we wanted to (not saying I know whether we want to or not).
Michael S. Tsirkin Sept. 5, 2012, 3:29 p.m. UTC | #36
On Wed, Sep 05, 2012 at 06:26:54PM +0300, Avi Kivity wrote:
> On 09/05/2012 12:00 AM, Anthony Liguori wrote:
> >>
> >> Why? The way this is being submitted I don't see why we should treat
> >> Jan's patch any different from a patch by IBM or Samsung where we've
> >> asked folks to fix the license to comply with what I thought was our new
> >> policy (it does not even contain a from-x-on-GPLv2+ notice).
> > 
> > Asking is one thing.  Requiring is another.
> > 
> > I would prefer that people submitted GPLv2+, but I don't think it should
> > be a hard requirement.  It means, among other things, that we cannot
> > accept most code that originates from the Linux kernel.
> 
> We could extend this to "require unless there is a reason to grant an
> exception" if we wanted to (not saying I know whether we want to or not).

Would be nice to add a clarification in the header: people
tend to copy boilerplate around.

> 
> -- 
> error compiling committee.c: too many arguments to function
Anthony Liguori Sept. 5, 2012, 3:41 p.m. UTC | #37
Avi Kivity <avi@redhat.com> writes:

> On 09/05/2012 12:00 AM, Anthony Liguori wrote:
>>>
>>> Why? The way this is being submitted I don't see why we should treat
>>> Jan's patch any different from a patch by IBM or Samsung where we've
>>> asked folks to fix the license to comply with what I thought was our new
>>> policy (it does not even contain a from-x-on-GPLv2+ notice).
>> 
>> Asking is one thing.  Requiring is another.
>> 
>> I would prefer that people submitted GPLv2+, but I don't think it should
>> be a hard requirement.  It means, among other things, that we cannot
>> accept most code that originates from the Linux kernel.
>
> We could extend this to "require unless there is a reason to grant an
> exception" if we wanted to (not saying I know whether we want to or
> not).

I don't want QEMU to be GPLv3.  I don't like the terms of the GPLv3.

I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
projects, GPLv2+ enables that.

But if new code is coming in and happens to be under GPLv2, that just
means that the contribution cannot be used outside of QEMU in a GPLv3
project.  That's fine and that's a decision for the submitter to make.

Regards,

Anthony Liguori

>
>
> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity Sept. 5, 2012, 3:52 p.m. UTC | #38
On 09/05/2012 06:41 PM, Anthony Liguori wrote:
> Avi Kivity <avi@redhat.com> writes:
> 
>> On 09/05/2012 12:00 AM, Anthony Liguori wrote:
>>>>
>>>> Why? The way this is being submitted I don't see why we should treat
>>>> Jan's patch any different from a patch by IBM or Samsung where we've
>>>> asked folks to fix the license to comply with what I thought was our new
>>>> policy (it does not even contain a from-x-on-GPLv2+ notice).
>>> 
>>> Asking is one thing.  Requiring is another.
>>> 
>>> I would prefer that people submitted GPLv2+, but I don't think it should
>>> be a hard requirement.  It means, among other things, that we cannot
>>> accept most code that originates from the Linux kernel.
>>
>> We could extend this to "require unless there is a reason to grant an
>> exception" if we wanted to (not saying I know whether we want to or
>> not).
> 
> I don't want QEMU to be GPLv3.  I don't like the terms of the GPLv3.
> 
> I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
> projects, GPLv2+ enables that.
> 
> But if new code is coming in and happens to be under GPLv2, that just
> means that the contribution cannot be used outside of QEMU in a GPLv3
> project.  That's fine and that's a decision for the submitter to make.

Makes sense.
Blue Swirl Sept. 5, 2012, 7:04 p.m. UTC | #39
On Wed, Sep 5, 2012 at 3:41 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Avi Kivity <avi@redhat.com> writes:
>
>> On 09/05/2012 12:00 AM, Anthony Liguori wrote:
>>>>
>>>> Why? The way this is being submitted I don't see why we should treat
>>>> Jan's patch any different from a patch by IBM or Samsung where we've
>>>> asked folks to fix the license to comply with what I thought was our new
>>>> policy (it does not even contain a from-x-on-GPLv2+ notice).
>>>
>>> Asking is one thing.  Requiring is another.
>>>
>>> I would prefer that people submitted GPLv2+, but I don't think it should
>>> be a hard requirement.  It means, among other things, that we cannot
>>> accept most code that originates from the Linux kernel.
>>
>> We could extend this to "require unless there is a reason to grant an
>> exception" if we wanted to (not saying I know whether we want to or
>> not).
>
> I don't want QEMU to be GPLv3.  I don't like the terms of the GPLv3.
>
> I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
> projects, GPLv2+ enables that.

The advantage of 100% GPLv2+ (or other GPLv3 compatible) would be that
QEMU could share code from GPLv3 projects, specifically latest
binutils. Reinventing a disassembler for ever growing x86 assembly is
no fun.

>
> But if new code is coming in and happens to be under GPLv2, that just
> means that the contribution cannot be used outside of QEMU in a GPLv3
> project.  That's fine and that's a decision for the submitter to make.

This policy means that we are locked in with GPLv2.

>
> Regards,
>
> Anthony Liguori
>
>>
>>
>> --
>> error compiling committee.c: too many arguments to function
Blue Swirl Sept. 5, 2012, 7:09 p.m. UTC | #40
On Tue, Sep 4, 2012 at 9:28 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Tue, Sep 04, 2012 at 07:27:32PM +0000, Blue Swirl wrote:
>> On Tue, Sep 4, 2012 at 8:32 AM, Avi Kivity <avi@redhat.com> wrote:
>> > On 09/03/2012 10:32 PM, Blue Swirl wrote:
>> >> On Mon, Sep 3, 2012 at 4:14 PM, Avi Kivity <avi@redhat.com> wrote:
>> >>> On 08/29/2012 11:27 AM, Markus Armbruster wrote:
>> >>>>
>> >>>> I don't see a point in making contributors avoid non-problems that might
>> >>>> conceivably become trivial problems some day.  Especially when there's
>> >>>> no automated help with the avoiding.
>> >>>
>> >>> -Wpointer-arith
>> >>
>> >> +1
>> >
>> > FWIW, I'm not in favour of enabling it, just pointing out that it
>> > exists.  In general I prefer avoiding unnecessary use of extensions, but
>> > in this case the extension is trivial and improves readability.
>>
>> Void pointers are not so type safe as uint8_t pointers.
>
> casts are even worse.
>
>> There's also
>> little difference in readability between those in my opinion.
>
> here too, casts are worse for readability.

I agree they are bad in both accounts, but in most cases it is
possible to use different types consistently (like uint8_t * or char *
instead of void *) without adding casts.

>
>> >
>> >
>> > --
>> > error compiling committee.c: too many arguments to function
Anthony Liguori Sept. 5, 2012, 7:22 p.m. UTC | #41
Blue Swirl <blauwirbel@gmail.com> writes:

> On Wed, Sep 5, 2012 at 3:41 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> Avi Kivity <avi@redhat.com> writes:
>>
>>> On 09/05/2012 12:00 AM, Anthony Liguori wrote:
>>>>>
>>>>> Why? The way this is being submitted I don't see why we should treat
>>>>> Jan's patch any different from a patch by IBM or Samsung where we've
>>>>> asked folks to fix the license to comply with what I thought was our new
>>>>> policy (it does not even contain a from-x-on-GPLv2+ notice).
>>>>
>>>> Asking is one thing.  Requiring is another.
>>>>
>>>> I would prefer that people submitted GPLv2+, but I don't think it should
>>>> be a hard requirement.  It means, among other things, that we cannot
>>>> accept most code that originates from the Linux kernel.
>>>
>>> We could extend this to "require unless there is a reason to grant an
>>> exception" if we wanted to (not saying I know whether we want to or
>>> not).
>>
>> I don't want QEMU to be GPLv3.  I don't like the terms of the GPLv3.
>>
>> I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
>> projects, GPLv2+ enables that.
>
> The advantage of 100% GPLv2+ (or other GPLv3 compatible) would be that
> QEMU could share code from GPLv3 projects, specifically latest
> binutils. Reinventing a disassembler for ever growing x86 assembly is
> no fun.

But we can't share code with Linux (like for virtio).

Yes, the GPLv3 sucks and FSF screwed up massively not making it v2
compatible.

Regards,

Anthony Liguori

>
>>
>> But if new code is coming in and happens to be under GPLv2, that just
>> means that the contribution cannot be used outside of QEMU in a GPLv3
>> project.  That's fine and that's a decision for the submitter to make.
>
> This policy means that we are locked in with GPLv2.
>
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>>
>>>
>>> --
>>> error compiling committee.c: too many arguments to function
Eric Blake Sept. 5, 2012, 7:24 p.m. UTC | #42
On 09/05/2012 01:04 PM, Blue Swirl wrote:
>> I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
>> projects, GPLv2+ enables that.
> 
> The advantage of 100% GPLv2+ (or other GPLv3 compatible) would be that
> QEMU could share code from GPLv3 projects, specifically latest
> binutils. Reinventing a disassembler for ever growing x86 assembly is
> no fun.

Not quite right.

If qemu is 100% GPLv2+ and binutils is GPLv3+, then binutils can borrow
code from qemu and the result is that binutils is still GPLv3+; but in
the converse direction, if qemu borrows code from binutils then qemu is
no longer 100% GPLv2+ but becomes GPLv3+ by tainting.

That is, requesting GPLv2+ allows qemu code to be reused elsewhere, but
does not help qemu import external code that is not already GPLv2+.

> 
>>
>> But if new code is coming in and happens to be under GPLv2, that just
>> means that the contribution cannot be used outside of QEMU in a GPLv3
>> project.  That's fine and that's a decision for the submitter to make.
> 
> This policy means that we are locked in with GPLv2.

I'm afraid we're already locked at GPLv2 (and not GPLv2+), for good or
for bad.
Blue Swirl Sept. 5, 2012, 7:38 p.m. UTC | #43
On Wed, Sep 5, 2012 at 7:22 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Blue Swirl <blauwirbel@gmail.com> writes:
>
>> On Wed, Sep 5, 2012 at 3:41 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>> Avi Kivity <avi@redhat.com> writes:
>>>
>>>> On 09/05/2012 12:00 AM, Anthony Liguori wrote:
>>>>>>
>>>>>> Why? The way this is being submitted I don't see why we should treat
>>>>>> Jan's patch any different from a patch by IBM or Samsung where we've
>>>>>> asked folks to fix the license to comply with what I thought was our new
>>>>>> policy (it does not even contain a from-x-on-GPLv2+ notice).
>>>>>
>>>>> Asking is one thing.  Requiring is another.
>>>>>
>>>>> I would prefer that people submitted GPLv2+, but I don't think it should
>>>>> be a hard requirement.  It means, among other things, that we cannot
>>>>> accept most code that originates from the Linux kernel.
>>>>
>>>> We could extend this to "require unless there is a reason to grant an
>>>> exception" if we wanted to (not saying I know whether we want to or
>>>> not).
>>>
>>> I don't want QEMU to be GPLv3.  I don't like the terms of the GPLv3.
>>>
>>> I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
>>> projects, GPLv2+ enables that.
>>
>> The advantage of 100% GPLv2+ (or other GPLv3 compatible) would be that
>> QEMU could share code from GPLv3 projects, specifically latest
>> binutils. Reinventing a disassembler for ever growing x86 assembly is
>> no fun.
>
> But we can't share code with Linux (like for virtio).

It's a tradeoff between reimplementing disassembler without using
binutils vs. reimplementing virtio without using Linux. Both have
their problems and both are growing areas. Disassembler is a bit
smaller and the basic function does not ever change.

>
> Yes, the GPLv3 sucks and FSF screwed up massively not making it v2
> compatible.

I sort of agree. They had their reasons, of course. Too bad binutils
licensing is fully controlled by FSF, for us it would be enough if
they had some sort of dual licensing scheme (GPLv3 + BSD for example)
in place.

>
> Regards,
>
> Anthony Liguori
>
>>
>>>
>>> But if new code is coming in and happens to be under GPLv2, that just
>>> means that the contribution cannot be used outside of QEMU in a GPLv3
>>> project.  That's fine and that's a decision for the submitter to make.
>>
>> This policy means that we are locked in with GPLv2.
>>
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>>
>>>>
>>>> --
>>>> error compiling committee.c: too many arguments to function
Blue Swirl Sept. 5, 2012, 7:43 p.m. UTC | #44
On Wed, Sep 5, 2012 at 7:24 PM, Eric Blake <eblake@redhat.com> wrote:
> On 09/05/2012 01:04 PM, Blue Swirl wrote:
>>> I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
>>> projects, GPLv2+ enables that.
>>
>> The advantage of 100% GPLv2+ (or other GPLv3 compatible) would be that
>> QEMU could share code from GPLv3 projects, specifically latest
>> binutils. Reinventing a disassembler for ever growing x86 assembly is
>> no fun.
>
> Not quite right.
>
> If qemu is 100% GPLv2+ and binutils is GPLv3+, then binutils can borrow
> code from qemu and the result is that binutils is still GPLv3+; but in
> the converse direction, if qemu borrows code from binutils then qemu is
> no longer 100% GPLv2+ but becomes GPLv3+ by tainting.

I don't see how this disagrees with what I wrote. GPLv2+ QEMU sharing
code from GPLv3 would of course become GPLv3.

>
> That is, requesting GPLv2+ allows qemu code to be reused elsewhere, but
> does not help qemu import external code that is not already GPLv2+.

Unless we demanded relicensing to GPLv2+ for all GPLv2 QEMU code and
forbid new GPLv2 entries.

>
>>
>>>
>>> But if new code is coming in and happens to be under GPLv2, that just
>>> means that the contribution cannot be used outside of QEMU in a GPLv3
>>> project.  That's fine and that's a decision for the submitter to make.
>>
>> This policy means that we are locked in with GPLv2.
>
> I'm afraid we're already locked at GPLv2 (and not GPLv2+), for good or
> for bad.
>
> --
> Eric Blake   eblake@redhat.com    +1-919-301-3266
> Libvirt virtualization library http://libvirt.org
>
Anthony Liguori Sept. 5, 2012, 8:46 p.m. UTC | #45
Blue Swirl <blauwirbel@gmail.com> writes:

> On Wed, Sep 5, 2012 at 7:22 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> Blue Swirl <blauwirbel@gmail.com> writes:
>>
>>> On Wed, Sep 5, 2012 at 3:41 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>>> Avi Kivity <avi@redhat.com> writes:
>>>>
>>>>> On 09/05/2012 12:00 AM, Anthony Liguori wrote:
>>>>>>>
>>>>>>> Why? The way this is being submitted I don't see why we should treat
>>>>>>> Jan's patch any different from a patch by IBM or Samsung where we've
>>>>>>> asked folks to fix the license to comply with what I thought was our new
>>>>>>> policy (it does not even contain a from-x-on-GPLv2+ notice).
>>>>>>
>>>>>> Asking is one thing.  Requiring is another.
>>>>>>
>>>>>> I would prefer that people submitted GPLv2+, but I don't think it should
>>>>>> be a hard requirement.  It means, among other things, that we cannot
>>>>>> accept most code that originates from the Linux kernel.
>>>>>
>>>>> We could extend this to "require unless there is a reason to grant an
>>>>> exception" if we wanted to (not saying I know whether we want to or
>>>>> not).
>>>>
>>>> I don't want QEMU to be GPLv3.  I don't like the terms of the GPLv3.
>>>>
>>>> I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
>>>> projects, GPLv2+ enables that.
>>>
>>> The advantage of 100% GPLv2+ (or other GPLv3 compatible) would be that
>>> QEMU could share code from GPLv3 projects, specifically latest
>>> binutils. Reinventing a disassembler for ever growing x86 assembly is
>>> no fun.
>>
>> But we can't share code with Linux (like for virtio).
>
> It's a tradeoff between reimplementing disassembler without using
> binutils vs. reimplementing virtio without using Linux. Both have
> their problems and both are growing areas. Disassembler is a bit
> smaller and the basic function does not ever change.

Since binutils is the project with a copyright controlled by a single
entity that decided to change their license such that it was no longer
compatible with QEMU, as far as I'm concerned, it's a no brainer.  We
should care more about working with projects that are willing to
cooperate with us.

What do we do when the FSF comes out with the GPLv4 and relicenses again
in an incompatible fashion?  Do we do this exercise every couple of
years?

>> Yes, the GPLv3 sucks and FSF screwed up massively not making it v2
>> compatible.
>
> I sort of agree. They had their reasons, of course. Too bad binutils
> licensing is fully controlled by FSF, for us it would be enough if
> they had some sort of dual licensing scheme (GPLv3 + BSD for example)
> in place.

I think it's clear that FSF controlled projects are not good places to
try to share code with...

Regards,

Anthony Liguori

>
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>>
>>>>
>>>> But if new code is coming in and happens to be under GPLv2, that just
>>>> means that the contribution cannot be used outside of QEMU in a GPLv3
>>>> project.  That's fine and that's a decision for the submitter to make.
>>>
>>> This policy means that we are locked in with GPLv2.
>>>
>>>>
>>>> Regards,
>>>>
>>>> Anthony Liguori
>>>>
>>>>>
>>>>>
>>>>> --
>>>>> error compiling committee.c: too many arguments to function
Alexander Graf Sept. 6, 2012, 3:42 a.m. UTC | #46
On 05.09.2012, at 15:38, Blue Swirl wrote:

> On Wed, Sep 5, 2012 at 7:22 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> Blue Swirl <blauwirbel@gmail.com> writes:
>> 
>>> On Wed, Sep 5, 2012 at 3:41 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>>> Avi Kivity <avi@redhat.com> writes:
>>>> 
>>>>> On 09/05/2012 12:00 AM, Anthony Liguori wrote:
>>>>>>> 
>>>>>>> Why? The way this is being submitted I don't see why we should treat
>>>>>>> Jan's patch any different from a patch by IBM or Samsung where we've
>>>>>>> asked folks to fix the license to comply with what I thought was our new
>>>>>>> policy (it does not even contain a from-x-on-GPLv2+ notice).
>>>>>> 
>>>>>> Asking is one thing.  Requiring is another.
>>>>>> 
>>>>>> I would prefer that people submitted GPLv2+, but I don't think it should
>>>>>> be a hard requirement.  It means, among other things, that we cannot
>>>>>> accept most code that originates from the Linux kernel.
>>>>> 
>>>>> We could extend this to "require unless there is a reason to grant an
>>>>> exception" if we wanted to (not saying I know whether we want to or
>>>>> not).
>>>> 
>>>> I don't want QEMU to be GPLv3.  I don't like the terms of the GPLv3.
>>>> 
>>>> I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
>>>> projects, GPLv2+ enables that.
>>> 
>>> The advantage of 100% GPLv2+ (or other GPLv3 compatible) would be that
>>> QEMU could share code from GPLv3 projects, specifically latest
>>> binutils. Reinventing a disassembler for ever growing x86 assembly is
>>> no fun.
>> 
>> But we can't share code with Linux (like for virtio).
> 
> It's a tradeoff between reimplementing disassembler without using
> binutils vs. reimplementing virtio without using Linux. Both have
> their problems and both are growing areas. Disassembler is a bit
> smaller and the basic function does not ever change.
> 
>> 
>> Yes, the GPLv3 sucks and FSF screwed up massively not making it v2
>> compatible.
> 
> I sort of agree. They had their reasons, of course. Too bad binutils
> licensing is fully controlled by FSF, for us it would be enough if
> they had some sort of dual licensing scheme (GPLv3 + BSD for example)
> in place.

What do the BSD guys do here? They want to have a disassembler too that works across all different sorts of architectures, no?


Alex
Avi Kivity Sept. 6, 2012, 8:44 a.m. UTC | #47
On 09/05/2012 10:04 PM, Blue Swirl wrote:
>
> Reinventing a disassembler for ever growing x86 assembly is
> no fun.

We can try linking to a disassembler library.  I use udis86 to
disassemble instructions in kvm tracepoints
(http://udis86.git.sourceforge.net/git/gitweb.cgi?p=udis86/udis86;a=shortlog),
it's maintained but not heavily so.

Of course for non-x86 we'd need to continue using binutils; this is
about copying code vs. libraries, not about licensing.
Blue Swirl Sept. 8, 2012, 7:54 a.m. UTC | #48
On Thu, Sep 6, 2012 at 3:42 AM, Alexander Graf <agraf@suse.de> wrote:
>
> On 05.09.2012, at 15:38, Blue Swirl wrote:
>
>> On Wed, Sep 5, 2012 at 7:22 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>> Blue Swirl <blauwirbel@gmail.com> writes:
>>>
>>>> On Wed, Sep 5, 2012 at 3:41 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>>>> Avi Kivity <avi@redhat.com> writes:
>>>>>
>>>>>> On 09/05/2012 12:00 AM, Anthony Liguori wrote:
>>>>>>>>
>>>>>>>> Why? The way this is being submitted I don't see why we should treat
>>>>>>>> Jan's patch any different from a patch by IBM or Samsung where we've
>>>>>>>> asked folks to fix the license to comply with what I thought was our new
>>>>>>>> policy (it does not even contain a from-x-on-GPLv2+ notice).
>>>>>>>
>>>>>>> Asking is one thing.  Requiring is another.
>>>>>>>
>>>>>>> I would prefer that people submitted GPLv2+, but I don't think it should
>>>>>>> be a hard requirement.  It means, among other things, that we cannot
>>>>>>> accept most code that originates from the Linux kernel.
>>>>>>
>>>>>> We could extend this to "require unless there is a reason to grant an
>>>>>> exception" if we wanted to (not saying I know whether we want to or
>>>>>> not).
>>>>>
>>>>> I don't want QEMU to be GPLv3.  I don't like the terms of the GPLv3.
>>>>>
>>>>> I don't mind GPLv2+, if people want to share code from QEMU in GPLv3
>>>>> projects, GPLv2+ enables that.
>>>>
>>>> The advantage of 100% GPLv2+ (or other GPLv3 compatible) would be that
>>>> QEMU could share code from GPLv3 projects, specifically latest
>>>> binutils. Reinventing a disassembler for ever growing x86 assembly is
>>>> no fun.
>>>
>>> But we can't share code with Linux (like for virtio).
>>
>> It's a tradeoff between reimplementing disassembler without using
>> binutils vs. reimplementing virtio without using Linux. Both have
>> their problems and both are growing areas. Disassembler is a bit
>> smaller and the basic function does not ever change.
>>
>>>
>>> Yes, the GPLv3 sucks and FSF screwed up massively not making it v2
>>> compatible.
>>
>> I sort of agree. They had their reasons, of course. Too bad binutils
>> licensing is fully controlled by FSF, for us it would be enough if
>> they had some sort of dual licensing scheme (GPLv3 + BSD for example)
>> in place.
>
> What do the BSD guys do here? They want to have a disassembler too that works across all different sorts of architectures, no?

There's at least GDB and DDD. The DDB kernel debugger contains a
disassembler for several architectures:
http://fxr.watson.org/fxr/ident?v=NETBSD&i=db_disasm

At least cris, lm32, microblaze, unicore32 and s390x are still missing
and I don't know if sh3 equals sh4. For some of those, maybe current
code from old binutils will be good enough forever.

It looks like the most recent change for x86 is from 2009 and there's
no support for even MMX so it does not look very potential way to
handle the x86 instruction set growth.

>
>
> Alex
>
Blue Swirl Sept. 8, 2012, 8:06 a.m. UTC | #49
On Thu, Sep 6, 2012 at 8:44 AM, Avi Kivity <avi@redhat.com> wrote:
> On 09/05/2012 10:04 PM, Blue Swirl wrote:
>>
>> Reinventing a disassembler for ever growing x86 assembly is
>> no fun.
>
> We can try linking to a disassembler library.  I use udis86 to
> disassemble instructions in kvm tracepoints
> (http://udis86.git.sourceforge.net/git/gitweb.cgi?p=udis86/udis86;a=shortlog),
> it's maintained but not heavily so.

I think commonality with KVM would be preferred. The library looks
neat and based on changelog, more actively developed than BSD DDB.

>
> Of course for non-x86 we'd need to continue using binutils; this is
> about copying code vs. libraries, not about licensing.

For most architectures, pre-GPLv3 binutils is good enough since the
instruction set does not change anymore. Maybe only PPC and Sparc64
still change besides x86. New CPUs types more recent than 2007 will
have problems.

>
>
> --
> error compiling committee.c: too many arguments to function
Alexander Graf Sept. 8, 2012, 9:28 a.m. UTC | #50
On 08.09.2012, at 10:06, Blue Swirl <blauwirbel@gmail.com> wrote:

> On Thu, Sep 6, 2012 at 8:44 AM, Avi Kivity <avi@redhat.com> wrote:
>> On 09/05/2012 10:04 PM, Blue Swirl wrote:
>>> 
>>> Reinventing a disassembler for ever growing x86 assembly is
>>> no fun.
>> 
>> We can try linking to a disassembler library.  I use udis86 to
>> disassemble instructions in kvm tracepoints
>> (http://udis86.git.sourceforge.net/git/gitweb.cgi?p=udis86/udis86;a=shortlog),
>> it's maintained but not heavily so.
> 
> I think commonality with KVM would be preferred. The library looks
> neat and based on changelog, more actively developed than BSD DDB.
> 
>> 
>> Of course for non-x86 we'd need to continue using binutils; this is
>> about copying code vs. libraries, not about licensing.
> 
> For most architectures, pre-GPLv3 binutils is good enough since the
> instruction set does not change anymore. Maybe only PPC and Sparc64
> still change besides x86. New CPUs types more recent than 2007 will
> have problems.

Alternatively we could try to run the disassembler in a different process, right?

Alex

> 
>> 
>> 
>> --
>> error compiling committee.c: too many arguments to function
>
Blue Swirl Sept. 8, 2012, 10:16 a.m. UTC | #51
On Sat, Sep 8, 2012 at 9:28 AM, Alexander Graf <agraf@suse.de> wrote:
>
>
> On 08.09.2012, at 10:06, Blue Swirl <blauwirbel@gmail.com> wrote:
>
>> On Thu, Sep 6, 2012 at 8:44 AM, Avi Kivity <avi@redhat.com> wrote:
>>> On 09/05/2012 10:04 PM, Blue Swirl wrote:
>>>>
>>>> Reinventing a disassembler for ever growing x86 assembly is
>>>> no fun.
>>>
>>> We can try linking to a disassembler library.  I use udis86 to
>>> disassemble instructions in kvm tracepoints
>>> (http://udis86.git.sourceforge.net/git/gitweb.cgi?p=udis86/udis86;a=shortlog),
>>> it's maintained but not heavily so.
>>
>> I think commonality with KVM would be preferred. The library looks
>> neat and based on changelog, more actively developed than BSD DDB.
>>
>>>
>>> Of course for non-x86 we'd need to continue using binutils; this is
>>> about copying code vs. libraries, not about licensing.
>>
>> For most architectures, pre-GPLv3 binutils is good enough since the
>> instruction set does not change anymore. Maybe only PPC and Sparc64
>> still change besides x86. New CPUs types more recent than 2007 will
>> have problems.
>
> Alternatively we could try to run the disassembler in a different process, right?

For qemu.log this would be doable and even improve performance since
only binary data would be transferred.

But for monitor disassembly command x/i it may be too clumsy. There's
some overlap with GDB support, so maybe we could deprecate monitor
disassembly.

>
> Alex
>
>>
>>>
>>>
>>> --
>>> error compiling committee.c: too many arguments to function
>>
Alexander Graf Sept. 8, 2012, 12:13 p.m. UTC | #52
On 08.09.2012, at 12:16, Blue Swirl <blauwirbel@gmail.com> wrote:

> On Sat, Sep 8, 2012 at 9:28 AM, Alexander Graf <agraf@suse.de> wrote:
>> 
>> 
>> On 08.09.2012, at 10:06, Blue Swirl <blauwirbel@gmail.com> wrote:
>> 
>>> On Thu, Sep 6, 2012 at 8:44 AM, Avi Kivity <avi@redhat.com> wrote:
>>>> On 09/05/2012 10:04 PM, Blue Swirl wrote:
>>>>> 
>>>>> Reinventing a disassembler for ever growing x86 assembly is
>>>>> no fun.
>>>> 
>>>> We can try linking to a disassembler library.  I use udis86 to
>>>> disassemble instructions in kvm tracepoints
>>>> (http://udis86.git.sourceforge.net/git/gitweb.cgi?p=udis86/udis86;a=shortlog),
>>>> it's maintained but not heavily so.
>>> 
>>> I think commonality with KVM would be preferred. The library looks
>>> neat and based on changelog, more actively developed than BSD DDB.
>>> 
>>>> 
>>>> Of course for non-x86 we'd need to continue using binutils; this is
>>>> about copying code vs. libraries, not about licensing.
>>> 
>>> For most architectures, pre-GPLv3 binutils is good enough since the
>>> instruction set does not change anymore. Maybe only PPC and Sparc64
>>> still change besides x86. New CPUs types more recent than 2007 will
>>> have problems.
>> 
>> Alternatively we could try to run the disassembler in a different process, right?
> 
> For qemu.log this would be doable and even improve performance since
> only binary data would be transferred.
> 
> But for monitor disassembly command x/i it may be too clumsy.

Why would it be clumsy? We'd have to make sure we are communicating synchronously with the daemon, but apart from that it shouldn't be too different from the log, no?

> There's
> some overlap with GDB support, so maybe we could deprecate monitor
> disassembly.

I really like the way the monitor goes through special v->p lookup, as it's a lot easier to debug...

Alex

> 
>> 
>> Alex
>> 
>>> 
>>>> 
>>>> 
>>>> --
>>>> error compiling committee.c: too many arguments to function
>>>
Blue Swirl Sept. 8, 2012, 12:30 p.m. UTC | #53
On Sat, Sep 8, 2012 at 12:13 PM, Alexander Graf <agraf@suse.de> wrote:
>
>
> On 08.09.2012, at 12:16, Blue Swirl <blauwirbel@gmail.com> wrote:
>
>> On Sat, Sep 8, 2012 at 9:28 AM, Alexander Graf <agraf@suse.de> wrote:
>>>
>>>
>>> On 08.09.2012, at 10:06, Blue Swirl <blauwirbel@gmail.com> wrote:
>>>
>>>> On Thu, Sep 6, 2012 at 8:44 AM, Avi Kivity <avi@redhat.com> wrote:
>>>>> On 09/05/2012 10:04 PM, Blue Swirl wrote:
>>>>>>
>>>>>> Reinventing a disassembler for ever growing x86 assembly is
>>>>>> no fun.
>>>>>
>>>>> We can try linking to a disassembler library.  I use udis86 to
>>>>> disassemble instructions in kvm tracepoints
>>>>> (http://udis86.git.sourceforge.net/git/gitweb.cgi?p=udis86/udis86;a=shortlog),
>>>>> it's maintained but not heavily so.
>>>>
>>>> I think commonality with KVM would be preferred. The library looks
>>>> neat and based on changelog, more actively developed than BSD DDB.
>>>>
>>>>>
>>>>> Of course for non-x86 we'd need to continue using binutils; this is
>>>>> about copying code vs. libraries, not about licensing.
>>>>
>>>> For most architectures, pre-GPLv3 binutils is good enough since the
>>>> instruction set does not change anymore. Maybe only PPC and Sparc64
>>>> still change besides x86. New CPUs types more recent than 2007 will
>>>> have problems.
>>>
>>> Alternatively we could try to run the disassembler in a different process, right?
>>
>> For qemu.log this would be doable and even improve performance since
>> only binary data would be transferred.
>>
>> But for monitor disassembly command x/i it may be too clumsy.
>
> Why would it be clumsy? We'd have to make sure we are communicating synchronously with the daemon, but apart from that it shouldn't be too different from the log, no?

The log file should be written as binary which the disassembly tool
could read. The daemon would probably slow down execution back to
original speed since it would be writing ASCII, though this mode could
be supported too.

>
>> There's
>> some overlap with GDB support, so maybe we could deprecate monitor
>> disassembly.
>
> I really like the way the monitor goes through special v->p lookup, as it's a lot easier to debug...

GDB could be taught new tricks, but that would not help users with old dogs.

>
> Alex
>
>>
>>>
>>> Alex
>>>
>>>>
>>>>>
>>>>>
>>>>> --
>>>>> error compiling committee.c: too many arguments to function
>>>>
Alexander Graf Sept. 8, 2012, 2:59 p.m. UTC | #54
On 08.09.2012, at 14:30, Blue Swirl <blauwirbel@gmail.com> wrote:

> On Sat, Sep 8, 2012 at 12:13 PM, Alexander Graf <agraf@suse.de> wrote:
>> 
>> 
>> On 08.09.2012, at 12:16, Blue Swirl <blauwirbel@gmail.com> wrote:
>> 
>>> On Sat, Sep 8, 2012 at 9:28 AM, Alexander Graf <agraf@suse.de> wrote:
>>>> 
>>>> 
>>>> On 08.09.2012, at 10:06, Blue Swirl <blauwirbel@gmail.com> wrote:
>>>> 
>>>>> On Thu, Sep 6, 2012 at 8:44 AM, Avi Kivity <avi@redhat.com> wrote:
>>>>>> On 09/05/2012 10:04 PM, Blue Swirl wrote:
>>>>>>> 
>>>>>>> Reinventing a disassembler for ever growing x86 assembly is
>>>>>>> no fun.
>>>>>> 
>>>>>> We can try linking to a disassembler library.  I use udis86 to
>>>>>> disassemble instructions in kvm tracepoints
>>>>>> (http://udis86.git.sourceforge.net/git/gitweb.cgi?p=udis86/udis86;a=shortlog),
>>>>>> it's maintained but not heavily so.
>>>>> 
>>>>> I think commonality with KVM would be preferred. The library looks
>>>>> neat and based on changelog, more actively developed than BSD DDB.
>>>>> 
>>>>>> 
>>>>>> Of course for non-x86 we'd need to continue using binutils; this is
>>>>>> about copying code vs. libraries, not about licensing.
>>>>> 
>>>>> For most architectures, pre-GPLv3 binutils is good enough since the
>>>>> instruction set does not change anymore. Maybe only PPC and Sparc64
>>>>> still change besides x86. New CPUs types more recent than 2007 will
>>>>> have problems.
>>>> 
>>>> Alternatively we could try to run the disassembler in a different process, right?
>>> 
>>> For qemu.log this would be doable and even improve performance since
>>> only binary data would be transferred.
>>> 
>>> But for monitor disassembly command x/i it may be too clumsy.
>> 
>> Why would it be clumsy? We'd have to make sure we are communicating synchronously with the daemon, but apart from that it shouldn't be too different from the log, no?
> 
> The log file should be written as binary which the disassembly tool
> could read.

The log file contains a lot more information than just the diassembly. You get cpu state dumps, tcg op dumps, and above all there are a very big amount of log writing bits throughout the code for debug purposes that write plain ascii.

Do you think it's worth creating a 2-step process out of this? I was more thinking along the lines of a second process that qemu would spawn when log file is active / on monitor command which then would get binary opcodes voa a pipe and returns ascii disassembly that qemu cam use again.

That second program could even be built as part of our build process, right? We would then be able to pull in gplv3 code from binutils into that program, but keep it out of the main project.


Alex
Andreas Färber Sept. 10, 2012, 3:33 p.m. UTC | #55
Am 05.09.2012 22:46, schrieb Anthony Liguori:
> What do we do when the FSF comes out with the GPLv4 and relicenses again
> in an incompatible fashion?  Do we do this exercise every couple of
> years?

That's exactly why I suggested GPLv2+ because it was supposed to be a
preparation for the future. I'm pretty sure I mentioned the potential
GPLv4 back then!

Andreas
diff mbox

Patch

diff --git a/hw/kvm/Makefile.objs b/hw/kvm/Makefile.objs
index 226497a..f620d7f 100644
--- a/hw/kvm/Makefile.objs
+++ b/hw/kvm/Makefile.objs
@@ -1 +1 @@ 
-obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o
+obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o
diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
new file mode 100644
index 0000000..9cce02c
--- /dev/null
+++ b/hw/kvm/pci-assign.c
@@ -0,0 +1,1929 @@ 
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/io.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "hw/hw.h"
+#include "hw/pc.h"
+#include "qemu-error.h"
+#include "console.h"
+#include "hw/loader.h"
+#include "monitor.h"
+#include "range.h"
+#include "sysemu.h"
+#include "hw/pci.h"
+#include "hw/msi.h"
+#include "kvm_i386.h"
+
+#define MSIX_PAGE_SIZE 0x1000
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO       0x00000100  /* Resource type */
+#define IORESOURCE_MEM      0x00000200
+#define IORESOURCE_IRQ      0x00000400
+#define IORESOURCE_DMA      0x00000800
+#define IORESOURCE_PREFETCH 0x00002000  /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, ...)                                       \
+    do {                                                      \
+        fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);  \
+    } while (0)
+#else
+#define DEBUG(fmt, ...)
+#endif
+
+typedef struct {
+    int type;           /* Memory or port I/O */
+    int valid;
+    uint32_t base_addr;
+    uint32_t size;    /* size of the region */
+    int resource_fd;
+} PCIRegion;
+
+typedef struct {
+    uint8_t bus, dev, func; /* Bus inside domain, device and function */
+    int irq;                /* IRQ number */
+    uint16_t region_number; /* number of active regions */
+
+    /* Port I/O or MMIO Regions */
+    PCIRegion regions[PCI_NUM_REGIONS - 1];
+    int config_fd;
+} PCIDevRegions;
+
+typedef struct {
+    MemoryRegion container;
+    MemoryRegion real_iomem;
+    union {
+        void *r_virtbase;    /* mmapped access address for memory regions */
+        uint32_t r_baseport; /* the base guest port for I/O regions */
+    } u;
+    pcibus_t e_size;    /* emulated size of region in bytes */
+    pcibus_t r_size;    /* real size of region in bytes */
+    PCIRegion *region;
+} AssignedDevRegion;
+
+#define ASSIGNED_DEVICE_PREFER_MSI_BIT  0
+#define ASSIGNED_DEVICE_SHARE_INTX_BIT  1
+
+#define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
+#define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT)
+
+typedef struct {
+    uint32_t addr_lo;
+    uint32_t addr_hi;
+    uint32_t data;
+    uint32_t ctrl;
+} MSIXTableEntry;
+
+typedef enum AssignedIRQType {
+    ASSIGNED_IRQ_NONE = 0,
+    ASSIGNED_IRQ_INTX_HOST_INTX,
+    ASSIGNED_IRQ_INTX_HOST_MSI,
+    ASSIGNED_IRQ_MSI,
+    ASSIGNED_IRQ_MSIX
+} AssignedIRQType;
+
+typedef struct AssignedDevice {
+    PCIDevice dev;
+    PCIHostDeviceAddress host;
+    uint32_t dev_id;
+    uint32_t features;
+    int intpin;
+    AssignedDevRegion v_addrs[PCI_NUM_REGIONS - 1];
+    PCIDevRegions real_device;
+    PCIINTxRoute intx_route;
+    AssignedIRQType assigned_irq_type;
+    struct {
+#define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
+#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
+        uint32_t available;
+#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
+#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
+#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
+        uint32_t state;
+    } cap;
+    uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE];
+    uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE];
+    int msi_virq_nr;
+    int *msi_virq;
+    MSIXTableEntry *msix_table;
+    target_phys_addr_t msix_table_addr;
+    uint16_t msix_max;
+    MemoryRegion mmio;
+    char *configfd_name;
+    int32_t bootindex;
+} AssignedDevice;
+
+static void assigned_dev_update_irq_routing(PCIDevice *dev);
+
+static void assigned_dev_load_option_rom(AssignedDevice *dev);
+
+static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
+
+static uint64_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
+                                       target_phys_addr_t addr, int size,
+                                       uint64_t *data)
+{
+    uint64_t val = 0;
+    int fd = dev_region->region->resource_fd;
+
+    if (fd >= 0) {
+        if (data) {
+            DEBUG("pwrite data=%lx, size=%d, e_phys=%lx, addr=%lx\n",
+                  *data, size, addr, addr);
+            if (pwrite(fd, data, size, addr) != size) {
+                fprintf(stderr, "%s - pwrite failed %s\n",
+                        __func__, strerror(errno));
+            }
+        } else {
+            if (pread(fd, &val, size, addr) != size) {
+                fprintf(stderr, "%s - pread failed %s\n",
+                        __func__, strerror(errno));
+                val = (1UL << (size * 8)) - 1;
+            }
+            DEBUG("pread val=%lx, size=%d, e_phys=%lx, addr=%lx\n",
+                  val, size, addr, addr);
+        }
+    } else {
+        uint32_t port = addr + dev_region->u.r_baseport;
+
+        if (data) {
+            DEBUG("out data=%lx, size=%d, e_phys=%lx, host=%x\n",
+                  *data, size, addr, port);
+            switch (size) {
+            case 1:
+                outb(*data, port);
+                break;
+            case 2:
+                outw(*data, port);
+                break;
+            case 4:
+                outl(*data, port);
+                break;
+            }
+        } else {
+            switch (size) {
+            case 1:
+                val = inb(port);
+                break;
+            case 2:
+                val = inw(port);
+                break;
+            case 4:
+                val = inl(port);
+                break;
+            }
+            DEBUG("in data=%lx, size=%d, e_phys=%lx, host=%x\n",
+                  val, size, addr, port);
+        }
+    }
+    return val;
+}
+
+static void assigned_dev_ioport_write(void *opaque, target_phys_addr_t addr,
+                                      uint64_t data, unsigned size)
+{
+    assigned_dev_ioport_rw(opaque, addr, size, &data);
+}
+
+static uint64_t assigned_dev_ioport_read(void *opaque,
+                                         target_phys_addr_t addr, unsigned size)
+{
+    return assigned_dev_ioport_rw(opaque, addr, size, NULL);
+}
+
+static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *in = d->u.r_virtbase + addr;
+    uint32_t r;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *in = d->u.r_virtbase + addr;
+    uint32_t r;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *in = d->u.r_virtbase + addr;
+    uint32_t r;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *out = d->u.r_virtbase + addr;
+
+    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *out = d->u.r_virtbase + addr;
+
+    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *out = d->u.r_virtbase + addr;
+
+    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
+    *out = val;
+}
+
+static const MemoryRegionOps slow_bar_ops = {
+    .old_mmio = {
+        .read = { slow_bar_readb, slow_bar_readw, slow_bar_readl, },
+        .write = { slow_bar_writeb, slow_bar_writew, slow_bar_writel, },
+    },
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void assigned_dev_iomem_setup(PCIDevice *pci_dev, int region_num,
+                                     pcibus_t e_size)
+{
+    AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
+
+    if (e_size > 0) {
+        memory_region_init(&region->container, "assigned-dev-container",
+                           e_size);
+        memory_region_add_subregion(&region->container, 0, &region->real_iomem);
+
+        /* deal with MSI-X MMIO page */
+        if (real_region->base_addr <= r_dev->msix_table_addr &&
+                real_region->base_addr + real_region->size >
+                r_dev->msix_table_addr) {
+            int offset = r_dev->msix_table_addr - real_region->base_addr;
+
+            memory_region_add_subregion_overlap(&region->container,
+                                                offset,
+                                                &r_dev->mmio,
+                                                1);
+        }
+    }
+}
+
+static const MemoryRegionOps assigned_dev_ioport_ops = {
+    .read = assigned_dev_ioport_read,
+    .write = assigned_dev_ioport_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void assigned_dev_ioport_setup(PCIDevice *pci_dev, int region_num,
+                                      pcibus_t size)
+{
+    AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+
+    region->e_size = size;
+    memory_region_init(&region->container, "assigned-dev-container", size);
+    memory_region_init_io(&region->real_iomem, &assigned_dev_ioport_ops,
+                          r_dev->v_addrs + region_num,
+                          "assigned-dev-iomem", size);
+    memory_region_add_subregion(&region->container, 0, &region->real_iomem);
+}
+
+static uint32_t assigned_dev_pci_read(PCIDevice *d, int pos, int len)
+{
+    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
+    uint32_t val;
+    ssize_t ret;
+    int fd = pci_dev->real_device.config_fd;
+
+again:
+    ret = pread(fd, &val, len, pos);
+    if (ret != len) {
+        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
+            goto again;
+        }
+
+        hw_error("pci read failed, ret = %zd errno = %d\n", ret, errno);
+    }
+
+    return val;
+}
+
+static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
+{
+    return (uint8_t)assigned_dev_pci_read(d, pos, 1);
+}
+
+static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
+{
+    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
+    ssize_t ret;
+    int fd = pci_dev->real_device.config_fd;
+
+again:
+    ret = pwrite(fd, &val, len, pos);
+    if (ret != len) {
+        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
+            goto again;
+        }
+
+        hw_error("pci write failed, ret = %zd errno = %d\n", ret, errno);
+    }
+}
+
+static void assigned_dev_emulate_config_read(AssignedDevice *dev,
+                                             uint32_t offset, uint32_t len)
+{
+    memset(dev->emulate_config_read + offset, 0xff, len);
+}
+
+static void assigned_dev_direct_config_read(AssignedDevice *dev,
+                                            uint32_t offset, uint32_t len)
+{
+    memset(dev->emulate_config_read + offset, 0, len);
+}
+
+static void assigned_dev_direct_config_write(AssignedDevice *dev,
+                                             uint32_t offset, uint32_t len)
+{
+    memset(dev->emulate_config_write + offset, 0, len);
+}
+
+static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start)
+{
+    int id;
+    int max_cap = 48;
+    int pos = start ? start : PCI_CAPABILITY_LIST;
+    int status;
+
+    status = assigned_dev_pci_read_byte(d, PCI_STATUS);
+    if ((status & PCI_STATUS_CAP_LIST) == 0) {
+        return 0;
+    }
+
+    while (max_cap--) {
+        pos = assigned_dev_pci_read_byte(d, pos);
+        if (pos < 0x40) {
+            break;
+        }
+
+        pos &= ~3;
+        id = assigned_dev_pci_read_byte(d, pos + PCI_CAP_LIST_ID);
+
+        if (id == 0xff) {
+            break;
+        }
+        if (id == cap) {
+            return pos;
+        }
+
+        pos += PCI_CAP_LIST_NEXT;
+    }
+    return 0;
+}
+
+static int assigned_dev_register_regions(PCIRegion *io_regions,
+                                         unsigned long regions_num,
+                                         AssignedDevice *pci_dev)
+{
+    uint32_t i;
+    PCIRegion *cur_region = io_regions;
+
+    for (i = 0; i < regions_num; i++, cur_region++) {
+        if (!cur_region->valid) {
+            continue;
+        }
+
+        /* handle memory io regions */
+        if (cur_region->type & IORESOURCE_MEM) {
+            int t = cur_region->type & IORESOURCE_PREFETCH
+                ? PCI_BASE_ADDRESS_MEM_PREFETCH
+                : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+            /* map physical memory */
+            pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size,
+                                                    PROT_WRITE | PROT_READ,
+                                                    MAP_SHARED,
+                                                    cur_region->resource_fd,
+                                                    (off_t)0);
+
+            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
+                pci_dev->v_addrs[i].u.r_virtbase = NULL;
+                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
+                        "\n", __func__,
+                        (uint32_t) (cur_region->base_addr));
+                return -1;
+            }
+
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            /* add offset */
+            pci_dev->v_addrs[i].u.r_virtbase +=
+                (cur_region->base_addr & 0xFFF);
+
+            if (cur_region->size & 0xFFF) {
+                fprintf(stderr, "PCI region %d at address 0x%llx "
+                        "has size 0x%x, which is not a multiple of 4K. "
+                        "You might experience some performance hit "
+                        "due to that.\n",
+                        i, (unsigned long long)cur_region->base_addr,
+                        cur_region->size);
+                memory_region_init_io(&pci_dev->v_addrs[i].real_iomem,
+                                      &slow_bar_ops, &pci_dev->v_addrs[i],
+                                      "assigned-dev-slow-bar",
+                                      cur_region->size);
+            } else {
+                void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
+                char name[32];
+                snprintf(name, sizeof(name), "%s.bar%d",
+                         object_get_typename(OBJECT(pci_dev)), i);
+                memory_region_init_ram_ptr(&pci_dev->v_addrs[i].real_iomem,
+                                           name, cur_region->size,
+                                           virtbase);
+                vmstate_register_ram(&pci_dev->v_addrs[i].real_iomem,
+                                     &pci_dev->dev.qdev);
+            }
+
+            assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size);
+            pci_register_bar((PCIDevice *) pci_dev, i, t,
+                             &pci_dev->v_addrs[i].container);
+            continue;
+        } else {
+            /* handle port io regions */
+            uint32_t val;
+            int ret;
+
+            /* Test kernel support for ioport resource read/write.  Old
+             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
+             * so should return EINVAL for a 3 byte read */
+            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
+            if (ret >= 0) {
+                fprintf(stderr, "Unexpected return from I/O port read: %d\n",
+                        ret);
+                abort();
+            } else if (errno != EINVAL) {
+                fprintf(stderr, "Kernel doesn't support ioport resource "
+                                "access, hiding this region.\n");
+                close(pci_dev->v_addrs[i].region->resource_fd);
+                cur_region->valid = 0;
+                continue;
+            }
+
+            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            assigned_dev_ioport_setup(&pci_dev->dev, i, cur_region->size);
+            pci_register_bar((PCIDevice *) pci_dev, i,
+                             PCI_BASE_ADDRESS_SPACE_IO,
+                             &pci_dev->v_addrs[i].container);
+        }
+    }
+
+    /* success */
+    return 0;
+}
+
+static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
+{
+    FILE *f;
+    char name[128];
+    long id;
+
+    snprintf(name, sizeof(name), "%s%s", devpath, idname);
+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return -1;
+    }
+    if (fscanf(f, "%li\n", &id) == 1) {
+        *val = id;
+    } else {
+        return -1;
+    }
+    fclose(f);
+
+    return 0;
+}
+
+static int get_real_vendor_id(const char *devpath, uint16_t *val)
+{
+    return get_real_id(devpath, "vendor", val);
+}
+
+static int get_real_device_id(const char *devpath, uint16_t *val)
+{
+    return get_real_id(devpath, "device", val);
+}
+
+static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg,
+                           uint8_t r_bus, uint8_t r_dev, uint8_t r_func)
+{
+    char dir[128], name[128];
+    int fd, r = 0, v;
+    FILE *f;
+    unsigned long long start, end, size, flags;
+    uint16_t id;
+    PCIRegion *rp;
+    PCIDevRegions *dev = &pci_dev->real_device;
+
+    dev->region_number = 0;
+
+    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
+             r_seg, r_bus, r_dev, r_func);
+
+    snprintf(name, sizeof(name), "%sconfig", dir);
+
+    if (pci_dev->configfd_name && *pci_dev->configfd_name) {
+        if (qemu_isdigit(pci_dev->configfd_name[0])) {
+            dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0);
+        } else {
+            dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name);
+            if (dev->config_fd < 0) {
+                fprintf(stderr, "%s: (%s) unkown\n", __func__,
+                        pci_dev->configfd_name);
+                return 1;
+            }
+        }
+    } else {
+        dev->config_fd = open(name, O_RDWR);
+
+        if (dev->config_fd == -1) {
+            fprintf(stderr, "%s: %s: %m\n", __func__, name);
+            return 1;
+        }
+    }
+again:
+    r = read(dev->config_fd, pci_dev->dev.config,
+             pci_config_size(&pci_dev->dev));
+    if (r < 0) {
+        if (errno == EINTR || errno == EAGAIN) {
+            goto again;
+        }
+        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
+    }
+
+    /* Restore or clear multifunction, this is always controlled by qemu */
+    if (pci_dev->dev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+        pci_dev->dev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
+    } else {
+        pci_dev->dev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
+    }
+
+    /* Clear host resource mapping info.  If we choose not to register a
+     * BAR, such as might be the case with the option ROM, we can get
+     * confusing, unwritable, residual addresses from the host here. */
+    memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
+    memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4);
+
+    snprintf(name, sizeof(name), "%sresource", dir);
+
+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+
+    for (r = 0; r < PCI_ROM_SLOT; r++) {
+        if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3) {
+            break;
+        }
+
+        rp = dev->regions + r;
+        rp->valid = 0;
+        rp->resource_fd = -1;
+        size = end - start + 1;
+        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) {
+            continue;
+        }
+        if (flags & IORESOURCE_MEM) {
+            flags &= ~IORESOURCE_IO;
+        } else {
+            flags &= ~IORESOURCE_PREFETCH;
+        }
+        snprintf(name, sizeof(name), "%sresource%d", dir, r);
+        fd = open(name, O_RDWR);
+        if (fd == -1) {
+            continue;
+        }
+        rp->resource_fd = fd;
+
+        rp->type = flags;
+        rp->valid = 1;
+        rp->base_addr = start;
+        rp->size = size;
+        pci_dev->v_addrs[r].region = rp;
+        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
+              r, rp->size, start, rp->type, rp->resource_fd);
+    }
+
+    fclose(f);
+
+    /* read and fill vendor ID */
+    v = get_real_vendor_id(dir, &id);
+    if (v) {
+        return 1;
+    }
+    pci_dev->dev.config[0] = id & 0xff;
+    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
+
+    /* read and fill device ID */
+    v = get_real_device_id(dir, &id);
+    if (v) {
+        return 1;
+    }
+    pci_dev->dev.config[2] = id & 0xff;
+    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
+
+    pci_word_test_and_clear_mask(pci_dev->emulate_config_write + PCI_COMMAND,
+                                 PCI_COMMAND_MASTER | PCI_COMMAND_INTX_DISABLE);
+
+    dev->region_number = r;
+    return 0;
+}
+
+static void free_msi_virqs(AssignedDevice *dev)
+{
+    int i;
+
+    for (i = 0; i < dev->msi_virq_nr; i++) {
+        if (dev->msi_virq[i] >= 0) {
+            kvm_irqchip_release_virq(kvm_state, dev->msi_virq[i]);
+            dev->msi_virq[i] = -1;
+        }
+    }
+    g_free(dev->msi_virq);
+    dev->msi_virq = NULL;
+    dev->msi_virq_nr = 0;
+}
+
+static void free_assigned_device(AssignedDevice *dev)
+{
+    int i;
+
+    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
+        assigned_dev_unregister_msix_mmio(dev);
+    }
+    for (i = 0; i < dev->real_device.region_number; i++) {
+        PCIRegion *pci_region = &dev->real_device.regions[i];
+        AssignedDevRegion *region = &dev->v_addrs[i];
+
+        if (!pci_region->valid) {
+            continue;
+        }
+        if (pci_region->type & IORESOURCE_IO) {
+            if (region->u.r_baseport) {
+                memory_region_del_subregion(&region->container,
+                                            &region->real_iomem);
+                memory_region_destroy(&region->real_iomem);
+                memory_region_destroy(&region->container);
+            }
+        } else if (pci_region->type & IORESOURCE_MEM) {
+            if (region->u.r_virtbase) {
+                memory_region_del_subregion(&region->container,
+                                            &region->real_iomem);
+
+                /* Remove MSI-X table subregion */
+                if (pci_region->base_addr <= dev->msix_table_addr &&
+                    pci_region->base_addr + pci_region->size >
+                    dev->msix_table_addr) {
+                    memory_region_del_subregion(&region->container,
+                                                &dev->mmio);
+                }
+
+                memory_region_destroy(&region->real_iomem);
+                memory_region_destroy(&region->container);
+                if (munmap(region->u.r_virtbase,
+                           (pci_region->size + 0xFFF) & 0xFFFFF000)) {
+                    fprintf(stderr,
+                            "Failed to unmap assigned device region: %s\n",
+                            strerror(errno));
+                }
+            }
+        }
+        if (pci_region->resource_fd >= 0) {
+            close(pci_region->resource_fd);
+        }
+    }
+
+    if (dev->real_device.config_fd >= 0) {
+        close(dev->real_device.config_fd);
+    }
+
+    free_msi_virqs(dev);
+}
+
+static void assign_failed_examine(AssignedDevice *dev)
+{
+    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
+    uint16_t vendor_id, device_id;
+    int r;
+
+    sprintf(dir, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+            dev->host.domain, dev->host.bus, dev->host.slot,
+            dev->host.function);
+
+    sprintf(name, "%sdriver", dir);
+
+    r = readlink(name, driver, sizeof(driver));
+    if ((r <= 0) || r >= sizeof(driver)) {
+        goto fail;
+    }
+
+    ns = strrchr(driver, '/');
+    if (!ns) {
+        goto fail;
+    }
+
+    ns++;
+
+    if (get_real_vendor_id(dir, &vendor_id) ||
+        get_real_device_id(dir, &device_id)) {
+        goto fail;
+    }
+
+    fprintf(stderr, "*** The driver '%s' is occupying your device "
+                    "%04x:%02x:%02x.%x.\n",
+            ns, dev->host.domain, dev->host.bus, dev->host.slot,
+            dev->host.function);
+    fprintf(stderr, "***\n");
+    fprintf(stderr, "*** You can try the following commands to free it:\n");
+    fprintf(stderr, "***\n");
+    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
+                    "new_id\n", vendor_id, device_id);
+    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
+                    "%s/unbind\n",
+            dev->host.domain, dev->host.bus, dev->host.slot,
+            dev->host.function, ns);
+    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
+                    "pci-stub/bind\n",
+            dev->host.domain, dev->host.bus, dev->host.slot,
+            dev->host.function);
+    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
+                    "/remove_id\n", vendor_id, device_id);
+    fprintf(stderr, "***\n");
+
+    return;
+
+fail:
+    fprintf(stderr, "Couldn't find out why.\n");
+}
+
+static int assign_device(AssignedDevice *dev)
+{
+    uint32_t flags = KVM_DEV_ASSIGN_ENABLE_IOMMU;
+    int r;
+
+    /* Only pass non-zero PCI segment to capable module */
+    if (!kvm_check_extension(kvm_state, KVM_CAP_PCI_SEGMENT) &&
+        dev->host.domain) {
+        fprintf(stderr, "Can't assign device inside non-zero PCI segment "
+                "as this KVM module doesn't support it.\n");
+        return -ENODEV;
+    }
+
+    if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) {
+        fprintf(stderr, "No IOMMU found.  Unable to assign device \"%s\"\n",
+                dev->dev.qdev.id);
+        return -ENODEV;
+    }
+
+    if (dev->features & ASSIGNED_DEVICE_SHARE_INTX_MASK &&
+        kvm_has_intx_set_mask()) {
+        flags |= KVM_DEV_ASSIGN_PCI_2_3;
+    }
+
+    r = kvm_device_pci_assign(kvm_state, &dev->host, flags, &dev->dev_id);
+    if (r < 0) {
+        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
+                dev->dev.qdev.id, strerror(-r));
+
+        switch (r) {
+        case -EBUSY:
+            assign_failed_examine(dev);
+            break;
+        default:
+            break;
+        }
+    }
+    return r;
+}
+
+static bool check_irqchip_in_kernel(void)
+{
+    if (kvm_irqchip_in_kernel()) {
+        return true;
+    }
+    error_report("pci-assign: error: requires KVM with in-kernel irqchip "
+                 "enabled");
+    return false;
+}
+
+static int assign_intx(AssignedDevice *dev)
+{
+    AssignedIRQType new_type;
+    PCIINTxRoute intx_route;
+    bool intx_host_msi;
+    int r;
+
+    /* Interrupt PIN 0 means don't use INTx */
+    if (assigned_dev_pci_read_byte(&dev->dev, PCI_INTERRUPT_PIN) == 0) {
+        pci_device_set_intx_routing_notifier(&dev->dev, NULL);
+        return 0;
+    }
+
+    if (!check_irqchip_in_kernel()) {
+        return -ENOTSUP;
+    }
+
+    pci_device_set_intx_routing_notifier(&dev->dev,
+                                         assigned_dev_update_irq_routing);
+
+    intx_route = pci_device_route_intx_to_irq(&dev->dev, 0);
+    assert(intx_route.mode != PCI_INTX_INVERTED);
+
+    if (dev->intx_route.mode == intx_route.mode &&
+        dev->intx_route.irq == intx_route.irq) {
+        return 0;
+    }
+
+    switch (dev->assigned_irq_type) {
+    case ASSIGNED_IRQ_INTX_HOST_INTX:
+    case ASSIGNED_IRQ_INTX_HOST_MSI:
+        intx_host_msi = dev->assigned_irq_type == ASSIGNED_IRQ_INTX_HOST_MSI;
+        r = kvm_device_intx_deassign(kvm_state, dev->dev_id, intx_host_msi);
+        break;
+    case ASSIGNED_IRQ_MSI:
+        r = kvm_device_msi_deassign(kvm_state, dev->dev_id);
+        break;
+    case ASSIGNED_IRQ_MSIX:
+        r = kvm_device_msix_deassign(kvm_state, dev->dev_id);
+        break;
+    default:
+        r = 0;
+        break;
+    }
+    if (r) {
+        perror("assign_intx: deassign");
+    }
+    dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
+
+    if (intx_route.mode == PCI_INTX_DISABLED) {
+        dev->intx_route = intx_route;
+        return 0;
+    }
+
+retry:
+    if (dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK &&
+        dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
+        intx_host_msi = true;
+        new_type = ASSIGNED_IRQ_INTX_HOST_MSI;
+    } else {
+        intx_host_msi = false;
+        new_type = ASSIGNED_IRQ_INTX_HOST_INTX;
+    }
+
+    r = kvm_device_intx_assign(kvm_state, dev->dev_id, intx_host_msi,
+                               intx_route.irq);
+    if (r < 0) {
+        if (r == -EIO && !(dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK) &&
+            dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
+            /* Retry with host-side MSI. There might be an IRQ conflict and
+             * either the kernel or the device doesn't support sharing. */
+            fprintf(stderr,
+                    "Host-side INTx sharing not supported, "
+                    "using MSI instead.\n"
+                    "Some devices do not to work properly in this mode.\n");
+            dev->features |= ASSIGNED_DEVICE_PREFER_MSI_MASK;
+            goto retry;
+        }
+        fprintf(stderr, "Failed to assign irq for \"%s\": %s\n",
+                dev->dev.qdev.id, strerror(-r));
+        fprintf(stderr, "Perhaps you are assigning a device "
+                "that shares an IRQ with another device?\n");
+        return r;
+    }
+
+    dev->intx_route = intx_route;
+    dev->assigned_irq_type = new_type;
+    return r;
+}
+
+static void deassign_device(AssignedDevice *dev)
+{
+    int r;
+
+    r = kvm_device_pci_deassign(kvm_state, dev->dev_id);
+    if (r < 0) {
+        fprintf(stderr, "Failed to deassign device \"%s\" : %s\n",
+                dev->dev.qdev.id, strerror(-r));
+    }
+}
+
+/* The pci config space got updated. Check if irq numbers have changed
+ * for our devices
+ */
+static void assigned_dev_update_irq_routing(PCIDevice *dev)
+{
+    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, dev);
+    Error *err = NULL;
+    int r;
+
+    r = assign_intx(assigned_dev);
+    if (r < 0) {
+        qdev_unplug(&dev->qdev, &err);
+        assert(!err);
+    }
+}
+
+static void assigned_dev_update_msi(PCIDevice *pci_dev)
+{
+    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap +
+                                     PCI_MSI_FLAGS);
+    int r;
+
+    /* Some guests gratuitously disable MSI even if they're not using it,
+     * try to catch this by only deassigning irqs if the guest is using
+     * MSI or intends to start. */
+    if (assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSI ||
+        (ctrl_byte & PCI_MSI_FLAGS_ENABLE)) {
+        r = kvm_device_msi_deassign(kvm_state, assigned_dev->dev_id);
+        /* -ENXIO means no assigned irq */
+        if (r && r != -ENXIO) {
+            perror("assigned_dev_update_msi: deassign irq");
+        }
+
+        free_msi_virqs(assigned_dev);
+
+        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
+        pci_device_set_intx_routing_notifier(pci_dev, NULL);
+    }
+
+    if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
+        uint8_t *pos = pci_dev->config + pci_dev->msi_cap;
+        MSIMessage msg;
+        int virq;
+
+        msg.address = pci_get_long(pos + PCI_MSI_ADDRESS_LO);
+        msg.data = pci_get_word(pos + PCI_MSI_DATA_32);
+        virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+        if (virq < 0) {
+            perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route");
+            return;
+        }
+
+        assigned_dev->msi_virq = g_malloc(sizeof(*assigned_dev->msi_virq));
+        assigned_dev->msi_virq_nr = 1;
+        assigned_dev->msi_virq[0] = virq;
+        if (kvm_device_msi_assign(kvm_state, assigned_dev->dev_id, virq) < 0) {
+            perror("assigned_dev_update_msi: kvm_device_msi_assign");
+        }
+
+        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
+        assigned_dev->intx_route.irq = -1;
+        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSI;
+    } else {
+        assign_intx(assigned_dev);
+    }
+}
+
+static bool msix_masked(MSIXTableEntry *entry)
+{
+    return (entry->ctrl & cpu_to_le32(0x1)) != 0;
+}
+
+static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
+{
+    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    uint16_t entries_nr = 0;
+    int i, r = 0;
+    MSIXTableEntry *entry = adev->msix_table;
+    MSIMessage msg;
+
+    /* Get the usable entry number for allocating */
+    for (i = 0; i < adev->msix_max; i++, entry++) {
+        if (msix_masked(entry)) {
+            continue;
+        }
+        entries_nr++;
+    }
+
+    DEBUG("MSI-X entries: %d\n", entries_nr);
+
+    /* It's valid to enable MSI-X with all entries masked */
+    if (!entries_nr) {
+        return 0;
+    }
+
+    r = kvm_device_msix_init_vectors(kvm_state, adev->dev_id, entries_nr);
+    if (r != 0) {
+        fprintf(stderr, "fail to set MSI-X entry number for MSIX! %s\n",
+                strerror(-r));
+        return r;
+    }
+
+    free_msi_virqs(adev);
+
+    adev->msi_virq_nr = adev->msix_max;
+    adev->msi_virq = g_malloc(adev->msix_max * sizeof(*adev->msi_virq));
+
+    entry = adev->msix_table;
+    for (i = 0; i < adev->msix_max; i++, entry++) {
+        adev->msi_virq[i] = -1;
+
+        if (msix_masked(entry)) {
+            continue;
+        }
+
+        msg.address = entry->addr_lo | ((uint64_t)entry->addr_hi << 32);
+        msg.data = entry->data;
+        r = kvm_irqchip_add_msi_route(kvm_state, msg);
+        if (r < 0) {
+            return r;
+        }
+        adev->msi_virq[i] = r;
+
+        DEBUG("MSI-X vector %d, gsi %d, addr %08x_%08x, data %08x\n", i,
+              r, entry->addr_hi, entry->addr_lo, entry->data);
+
+        r = kvm_device_msix_set_vector(kvm_state, adev->dev_id, i,
+                                       adev->msi_virq[i]);
+        if (r) {
+            fprintf(stderr, "fail to set MSI-X entry! %s\n", strerror(-r));
+            break;
+        }
+    }
+
+    return r;
+}
+
+static void assigned_dev_update_msix(PCIDevice *pci_dev)
+{
+    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    uint16_t ctrl_word = pci_get_word(pci_dev->config + pci_dev->msix_cap +
+                                      PCI_MSIX_FLAGS);
+    int r;
+
+    /* Some guests gratuitously disable MSIX even if they're not using it,
+     * try to catch this by only deassigning irqs if the guest is using
+     * MSIX or intends to start. */
+    if ((assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSIX) ||
+        (ctrl_word & PCI_MSIX_FLAGS_ENABLE)) {
+        r = kvm_device_msix_deassign(kvm_state, assigned_dev->dev_id);
+        /* -ENXIO means no assigned irq */
+        if (r && r != -ENXIO) {
+            perror("assigned_dev_update_msix: deassign irq");
+        }
+
+        free_msi_virqs(assigned_dev);
+
+        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
+        pci_device_set_intx_routing_notifier(pci_dev, NULL);
+    }
+
+    if (ctrl_word & PCI_MSIX_FLAGS_ENABLE) {
+        if (assigned_dev_update_msix_mmio(pci_dev) < 0) {
+            perror("assigned_dev_update_msix_mmio");
+            return;
+        }
+
+        if (assigned_dev->msi_virq_nr > 0) {
+            if (kvm_device_msix_assign(kvm_state, assigned_dev->dev_id) < 0) {
+                perror("assigned_dev_enable_msix: assign irq");
+                return;
+            }
+        }
+        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
+        assigned_dev->intx_route.irq = -1;
+        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSIX;
+    } else {
+        assign_intx(assigned_dev);
+    }
+}
+
+static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
+                                             uint32_t address, int len)
+{
+    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    uint32_t virt_val = pci_default_read_config(pci_dev, address, len);
+    uint32_t real_val, emulate_mask, full_emulation_mask;
+
+    emulate_mask = 0;
+    memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len);
+    emulate_mask = le32_to_cpu(emulate_mask);
+
+    full_emulation_mask = 0xffffffff >> (32 - len * 8);
+
+    if (emulate_mask != full_emulation_mask) {
+        real_val = assigned_dev_pci_read(pci_dev, address, len);
+        return (virt_val & emulate_mask) | (real_val & ~emulate_mask);
+    } else {
+        return virt_val;
+    }
+}
+
+static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
+                                          uint32_t val, int len)
+{
+    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    uint16_t old_cmd = pci_get_word(pci_dev->config + PCI_COMMAND);
+    uint32_t emulate_mask, full_emulation_mask;
+    int ret;
+
+    pci_default_write_config(pci_dev, address, val, len);
+
+    if (kvm_has_intx_set_mask() &&
+        range_covers_byte(address, len, PCI_COMMAND + 1)) {
+        bool intx_masked = (pci_get_word(pci_dev->config + PCI_COMMAND) &
+                            PCI_COMMAND_INTX_DISABLE);
+
+        if (intx_masked != !!(old_cmd & PCI_COMMAND_INTX_DISABLE)) {
+            ret = kvm_device_intx_set_mask(kvm_state, assigned_dev->dev_id,
+                                           intx_masked);
+            if (ret) {
+                perror("assigned_dev_pci_write_config: set intx mask");
+            }
+        }
+    }
+    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
+        if (range_covers_byte(address, len,
+                              pci_dev->msi_cap + PCI_MSI_FLAGS)) {
+            assigned_dev_update_msi(pci_dev);
+        }
+    }
+    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
+        if (range_covers_byte(address, len,
+                              pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) {
+            assigned_dev_update_msix(pci_dev);
+        }
+    }
+
+    emulate_mask = 0;
+    memcpy(&emulate_mask, assigned_dev->emulate_config_write + address, len);
+    emulate_mask = le32_to_cpu(emulate_mask);
+
+    full_emulation_mask = 0xffffffff >> (32 - len * 8);
+
+    if (emulate_mask != full_emulation_mask) {
+        if (emulate_mask) {
+            val &= ~emulate_mask;
+            val |= assigned_dev_pci_read(pci_dev, address, len) & emulate_mask;
+        }
+        assigned_dev_pci_write(pci_dev, address, val, len);
+    }
+}
+
+static void assigned_dev_setup_cap_read(AssignedDevice *dev, uint32_t offset,
+                                        uint32_t len)
+{
+    assigned_dev_direct_config_read(dev, offset, len);
+    assigned_dev_emulate_config_read(dev, offset + PCI_CAP_LIST_NEXT, 1);
+}
+
+static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
+{
+    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    PCIRegion *pci_region = dev->real_device.regions;
+    int ret, pos;
+
+    /* Clear initial capabilities pointer and status copied from hw */
+    pci_set_byte(pci_dev->config + PCI_CAPABILITY_LIST, 0);
+    pci_set_word(pci_dev->config + PCI_STATUS,
+                 pci_get_word(pci_dev->config + PCI_STATUS) &
+                 ~PCI_STATUS_CAP_LIST);
+
+    /* Expose MSI capability
+     * MSI capability is the 1st capability in capability config */
+    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0);
+    if (pos != 0 && kvm_check_extension(kvm_state, KVM_CAP_ASSIGN_DEV_IRQ)) {
+        if (!check_irqchip_in_kernel()) {
+            return -ENOTSUP;
+        }
+        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;
+        /* Only 32-bit/no-mask currently supported */
+        ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSI, pos, 10);
+        if (ret < 0) {
+            return ret;
+        }
+        pci_dev->msi_cap = pos;
+
+        pci_set_word(pci_dev->config + pos + PCI_MSI_FLAGS,
+                     pci_get_word(pci_dev->config + pos + PCI_MSI_FLAGS) &
+                     PCI_MSI_FLAGS_QMASK);
+        pci_set_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO, 0);
+        pci_set_word(pci_dev->config + pos + PCI_MSI_DATA_32, 0);
+
+        /* Set writable fields */
+        pci_set_word(pci_dev->wmask + pos + PCI_MSI_FLAGS,
+                     PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
+        pci_set_long(pci_dev->wmask + pos + PCI_MSI_ADDRESS_LO, 0xfffffffc);
+        pci_set_word(pci_dev->wmask + pos + PCI_MSI_DATA_32, 0xffff);
+    }
+    /* Expose MSI-X capability */
+    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0);
+    if (pos != 0 && kvm_device_msix_supported(kvm_state)) {
+        int bar_nr;
+        uint32_t msix_table_entry;
+
+        if (!check_irqchip_in_kernel()) {
+            return -ENOTSUP;
+        }
+        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
+        ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12);
+        if (ret < 0) {
+            return ret;
+        }
+        pci_dev->msix_cap = pos;
+
+        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS,
+                     pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
+                     PCI_MSIX_FLAGS_QSIZE);
+
+        /* Only enable and function mask bits are writable */
+        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
+                     PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
+
+        msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE);
+        bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK;
+        msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK;
+        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
+        dev->msix_max = pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS);
+        dev->msix_max &= PCI_MSIX_FLAGS_QSIZE;
+        dev->msix_max += 1;
+    }
+
+    /* Minimal PM support, nothing writable, device appears to NAK changes */
+    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0);
+    if (pos) {
+        uint16_t pmc;
+
+        ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, pos, PCI_PM_SIZEOF);
+        if (ret < 0) {
+            return ret;
+        }
+
+        assigned_dev_setup_cap_read(dev, pos, PCI_PM_SIZEOF);
+
+        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
+        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
+        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
+
+        /* assign_device will bring the device up to D0, so we don't need
+         * to worry about doing that ourselves here. */
+        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL,
+                     PCI_PM_CTRL_NO_SOFT_RESET);
+
+        pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0);
+        pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0);
+    }
+
+    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0);
+    if (pos) {
+        uint8_t version, size = 0;
+        uint16_t type, devctl, lnksta;
+        uint32_t devcap, lnkcap;
+
+        version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS);
+        version &= PCI_EXP_FLAGS_VERS;
+        if (version == 1) {
+            size = 0x14;
+        } else if (version == 2) {
+            /*
+             * Check for non-std size, accept reduced size to 0x34,
+             * which is what bcm5761 implemented, violating the
+             * PCIe v3.0 spec that regs should exist and be read as 0,
+             * not optionally provided and shorten the struct size.
+             */
+            size = MIN(0x3c, PCI_CONFIG_SPACE_SIZE - pos);
+            if (size < 0x34) {
+                fprintf(stderr,
+                        "%s: Invalid size PCIe cap-id 0x%x\n",
+                        __func__, PCI_CAP_ID_EXP);
+                return -EINVAL;
+            } else if (size != 0x3c) {
+                fprintf(stderr,
+                        "WARNING, %s: PCIe cap-id 0x%x has "
+                        "non-standard size 0x%x; std size should be 0x3c\n",
+                         __func__, PCI_CAP_ID_EXP, size);
+            }
+        } else if (version == 0) {
+            uint16_t vid, did;
+            vid = pci_get_word(pci_dev->config + PCI_VENDOR_ID);
+            did = pci_get_word(pci_dev->config + PCI_DEVICE_ID);
+            if (vid == PCI_VENDOR_ID_INTEL && did == 0x10ed) {
+                /*
+                 * quirk for Intel 82599 VF with invalid PCIe capability
+                 * version, should really be version 2 (same as PF)
+                 */
+                size = 0x3c;
+            }
+        }
+
+        if (size == 0) {
+            fprintf(stderr,
+                    "%s: Unsupported PCI express capability version %d\n",
+                    __func__, version);
+            return -EINVAL;
+        }
+
+        ret = pci_add_capability(pci_dev, PCI_CAP_ID_EXP, pos, size);
+        if (ret < 0) {
+            return ret;
+        }
+
+        assigned_dev_setup_cap_read(dev, pos, size);
+
+        type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
+        type = (type & PCI_EXP_FLAGS_TYPE) >> 4;
+        if (type != PCI_EXP_TYPE_ENDPOINT &&
+            type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) {
+            fprintf(stderr,
+                    "Device assignment only supports endpoint assignment, "
+                    "device type %d\n", type);
+            return -EINVAL;
+        }
+
+        /* capabilities, pass existing read-only copy
+         * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */
+
+        /* device capabilities: hide FLR */
+        devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP);
+        devcap &= ~PCI_EXP_DEVCAP_FLR;
+        pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap);
+
+        /* device control: clear all error reporting enable bits, leaving
+         *                 only a few host values.  Note, these are
+         *                 all writable, but not passed to hw.
+         */
+        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
+        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
+                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
+        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
+        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
+        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
+
+        /* Clear device status */
+        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0);
+
+        /* Link capabilities, expose links and latencues, clear reporting */
+        lnkcap = pci_get_long(pci_dev->config + pos + PCI_EXP_LNKCAP);
+        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
+                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
+                   PCI_EXP_LNKCAP_L1EL);
+        pci_set_long(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
+
+        /* Link control, pass existing read-only copy.  Should be writable? */
+
+        /* Link status, only expose current speed and width */
+        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
+        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
+        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
+
+        if (version >= 2) {
+            /* Slot capabilities, control, status - not needed for endpoints */
+            pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0);
+            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0);
+            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0);
+
+            /* Root control, capabilities, status - not needed for endpoints */
+            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0);
+            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0);
+            pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0);
+
+            /* Device capabilities/control 2, pass existing read-only copy */
+            /* Link control 2, pass existing read-only copy */
+        }
+    }
+
+    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0);
+    if (pos) {
+        uint16_t cmd;
+        uint32_t status;
+
+        /* Only expose the minimum, 8 byte capability */
+        ret = pci_add_capability(pci_dev, PCI_CAP_ID_PCIX, pos, 8);
+        if (ret < 0) {
+            return ret;
+        }
+
+        assigned_dev_setup_cap_read(dev, pos, 8);
+
+        /* Command register, clear upper bits, including extended modes */
+        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
+        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
+                PCI_X_CMD_MAX_SPLIT);
+        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
+
+        /* Status register, update with emulated PCI bus location, clear
+         * error bits, leave the rest. */
+        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
+        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
+        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
+        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
+                    PCI_X_STATUS_SPL_ERR);
+        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
+    }
+
+    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0);
+    if (pos) {
+        /* Direct R/W passthrough */
+        ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8);
+        if (ret < 0) {
+            return ret;
+        }
+
+        assigned_dev_setup_cap_read(dev, pos, 8);
+
+        /* direct write for cap content */
+        assigned_dev_direct_config_write(dev, pos + 2, 6);
+    }
+
+    /* Devices can have multiple vendor capabilities, get them all */
+    for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
+        pos += PCI_CAP_LIST_NEXT) {
+        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
+        /* Direct R/W passthrough */
+        ret = pci_add_capability(pci_dev, PCI_CAP_ID_VNDR, pos, len);
+        if (ret < 0) {
+            return ret;
+        }
+
+        assigned_dev_setup_cap_read(dev, pos, len);
+
+        /* direct write for cap content */
+        assigned_dev_direct_config_write(dev, pos + 2, len - 2);
+    }
+
+    /* If real and virtual capability list status bits differ, virtualize the
+     * access. */
+    if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_LIST) !=
+        (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) &
+         PCI_STATUS_CAP_LIST)) {
+        dev->emulate_config_read[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
+    }
+
+    return 0;
+}
+
+static uint64_t msix_mmio_read(void *opaque, target_phys_addr_t addr,
+                               unsigned size)
+{
+    AssignedDevice *adev = opaque;
+    uint64_t val;
+
+    memcpy(&val, (void *)((uint8_t *)adev->msix_table + addr), size);
+
+    return val;
+}
+
+static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
+                            uint64_t val, unsigned size)
+{
+    AssignedDevice *adev = opaque;
+    PCIDevice *pdev = &adev->dev;
+    uint16_t ctrl;
+    MSIXTableEntry orig;
+    int i = addr >> 4;
+
+    if (i >= adev->msix_max) {
+        return; /* Drop write */
+    }
+
+    ctrl = pci_get_word(pdev->config + pdev->msix_cap + PCI_MSIX_FLAGS);
+
+    DEBUG("write to MSI-X table offset 0x%lx, val 0x%lx\n", addr, val);
+
+    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
+        orig = adev->msix_table[i];
+    }
+
+    memcpy((void *)((uint8_t *)adev->msix_table + addr), &val, size);
+
+    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
+        MSIXTableEntry *entry = &adev->msix_table[i];
+
+        if (!msix_masked(&orig) && msix_masked(entry)) {
+            /*
+             * Vector masked, disable it
+             *
+             * XXX It's not clear if we can or should actually attempt
+             * to mask or disable the interrupt.  KVM doesn't have
+             * support for pending bits and kvm_assign_set_msix_entry
+             * doesn't modify the device hardware mask.  Interrupts
+             * while masked are simply not injected to the guest, so
+             * are lost.  Can we get away with always injecting an
+             * interrupt on unmask?
+             */
+        } else if (msix_masked(&orig) && !msix_masked(entry)) {
+            /* Vector unmasked */
+            if (i >= adev->msi_virq_nr || adev->msi_virq[i] < 0) {
+                /* Previously unassigned vector, start from scratch */
+                assigned_dev_update_msix(pdev);
+                return;
+            } else {
+                /* Update an existing, previously masked vector */
+                MSIMessage msg;
+                int ret;
+
+                msg.address = entry->addr_lo |
+                    ((uint64_t)entry->addr_hi << 32);
+                msg.data = entry->data;
+
+                ret = kvm_irqchip_update_msi_route(kvm_state,
+                                                   adev->msi_virq[i], msg);
+                if (ret) {
+                    fprintf(stderr,
+                            "Error updating irq routing entry (%d)\n", ret);
+                }
+            }
+        }
+    }
+}
+
+static const MemoryRegionOps msix_mmio_ops = {
+    .read = msix_mmio_read,
+    .write = msix_mmio_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+    },
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+    },
+};
+
+static void msix_reset(AssignedDevice *dev)
+{
+    MSIXTableEntry *entry;
+    int i;
+
+    if (!dev->msix_table) {
+        return;
+    }
+
+    memset(dev->msix_table, 0, MSIX_PAGE_SIZE);
+
+    for (i = 0, entry = dev->msix_table; i < dev->msix_max; i++, entry++) {
+        entry->ctrl = cpu_to_le32(0x1); /* Masked */
+    }
+}
+
+static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
+{
+    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE,
+                           MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
+    if (dev->msix_table == MAP_FAILED) {
+        fprintf(stderr, "fail allocate msix_table! %s\n", strerror(errno));
+        return -EFAULT;
+    }
+
+    msix_reset(dev);
+
+    memory_region_init_io(&dev->mmio, &msix_mmio_ops, dev,
+                          "assigned-dev-msix", MSIX_PAGE_SIZE);
+    return 0;
+}
+
+static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
+{
+    if (!dev->msix_table) {
+        return;
+    }
+
+    memory_region_destroy(&dev->mmio);
+
+    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
+        fprintf(stderr, "error unmapping msix_table! %s\n",
+                strerror(errno));
+    }
+    dev->msix_table = NULL;
+}
+
+static const VMStateDescription vmstate_assigned_device = {
+    .name = "pci-assign",
+    .unmigratable = 1,
+};
+
+static void reset_assigned_device(DeviceState *dev)
+{
+    PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, dev);
+    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    char reset_file[64];
+    const char reset[] = "1";
+    int fd, ret;
+
+    /*
+     * If a guest is reset without being shutdown, MSI/MSI-X can still
+     * be running.  We want to return the device to a known state on
+     * reset, so disable those here.  We especially do not want MSI-X
+     * enabled since it lives in MMIO space, which is about to get
+     * disabled.
+     */
+    if (adev->assigned_irq_type == ASSIGNED_IRQ_MSIX) {
+        uint16_t ctrl = pci_get_word(pci_dev->config +
+                                     pci_dev->msix_cap + PCI_MSIX_FLAGS);
+
+        pci_set_word(pci_dev->config + pci_dev->msix_cap + PCI_MSIX_FLAGS,
+                     ctrl & ~PCI_MSIX_FLAGS_ENABLE);
+        assigned_dev_update_msix(pci_dev);
+    } else if (adev->assigned_irq_type == ASSIGNED_IRQ_MSI) {
+        uint8_t ctrl = pci_get_byte(pci_dev->config +
+                                    pci_dev->msi_cap + PCI_MSI_FLAGS);
+
+        pci_set_byte(pci_dev->config + pci_dev->msi_cap + PCI_MSI_FLAGS,
+                     ctrl & ~PCI_MSI_FLAGS_ENABLE);
+        assigned_dev_update_msi(pci_dev);
+    }
+
+    snprintf(reset_file, sizeof(reset_file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    /*
+     * Issue a device reset via pci-sysfs.  Note that we use write(2) here
+     * and ignore the return value because some kernels have a bug that
+     * returns 0 rather than bytes written on success, sending us into an
+     * infinite retry loop using other write mechanisms.
+     */
+    fd = open(reset_file, O_WRONLY);
+    if (fd != -1) {
+        ret = write(fd, reset, strlen(reset));
+        (void)ret;
+        close(fd);
+    }
+
+    /*
+     * When a 0 is written to the bus master register, the device is logically
+     * disconnected from the PCI bus. This avoids further DMA transfers.
+     */
+    assigned_dev_pci_write_config(pci_dev, PCI_COMMAND, 0, 1);
+}
+
+static int assigned_initfn(struct PCIDevice *pci_dev)
+{
+    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    uint8_t e_intx;
+    int r;
+
+    if (!kvm_enabled()) {
+        error_report("pci-assign: error: requires KVM support");
+        return -1;
+    }
+
+    if (!dev->host.domain && !dev->host.bus && !dev->host.slot &&
+        !dev->host.function) {
+        error_report("pci-assign: error: no host device specified");
+        return -1;
+    }
+
+    /*
+     * Set up basic config space access control. Will be further refined during
+     * device initialization.
+     */
+    assigned_dev_emulate_config_read(dev, 0, PCI_CONFIG_SPACE_SIZE);
+    assigned_dev_direct_config_read(dev, PCI_STATUS, 2);
+    assigned_dev_direct_config_read(dev, PCI_REVISION_ID, 1);
+    assigned_dev_direct_config_read(dev, PCI_CLASS_PROG, 3);
+    assigned_dev_direct_config_read(dev, PCI_CACHE_LINE_SIZE, 1);
+    assigned_dev_direct_config_read(dev, PCI_LATENCY_TIMER, 1);
+    assigned_dev_direct_config_read(dev, PCI_BIST, 1);
+    assigned_dev_direct_config_read(dev, PCI_CARDBUS_CIS, 4);
+    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_VENDOR_ID, 2);
+    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_ID, 2);
+    assigned_dev_direct_config_read(dev, PCI_CAPABILITY_LIST + 1, 7);
+    assigned_dev_direct_config_read(dev, PCI_MIN_GNT, 1);
+    assigned_dev_direct_config_read(dev, PCI_MAX_LAT, 1);
+    memcpy(dev->emulate_config_write, dev->emulate_config_read,
+           sizeof(dev->emulate_config_read));
+
+    if (get_real_device(dev, dev->host.domain, dev->host.bus,
+                        dev->host.slot, dev->host.function)) {
+        error_report("pci-assign: Error: Couldn't get real device (%s)!",
+                     dev->dev.qdev.id);
+        goto out;
+    }
+
+    if (assigned_device_pci_cap_init(pci_dev) < 0) {
+        goto out;
+    }
+
+    /* intercept MSI-X entry page in the MMIO */
+    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
+        if (assigned_dev_register_msix_mmio(dev)) {
+            goto out;
+        }
+    }
+
+    /* handle real device's MMIO/PIO BARs */
+    if (assigned_dev_register_regions(dev->real_device.regions,
+                                      dev->real_device.region_number,
+                                      dev)) {
+        goto out;
+    }
+
+    /* handle interrupt routing */
+    e_intx = dev->dev.config[0x3d] - 1;
+    dev->intpin = e_intx;
+    dev->intx_route.mode = PCI_INTX_DISABLED;
+    dev->intx_route.irq = -1;
+
+    /* assign device to guest */
+    r = assign_device(dev);
+    if (r < 0) {
+        goto out;
+    }
+
+    /* assign legacy INTx to the device */
+    r = assign_intx(dev);
+    if (r < 0) {
+        goto assigned_out;
+    }
+
+    assigned_dev_load_option_rom(dev);
+
+    add_boot_device_path(dev->bootindex, &pci_dev->qdev, NULL);
+
+    return 0;
+
+assigned_out:
+    deassign_device(dev);
+out:
+    free_assigned_device(dev);
+    return -1;
+}
+
+static void assigned_exitfn(struct PCIDevice *pci_dev)
+{
+    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+
+    deassign_device(dev);
+    free_assigned_device(dev);
+}
+
+static Property da_properties[] = {
+    DEFINE_PROP_PCI_HOST_DEVADDR("host", AssignedDevice, host),
+    DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
+                    ASSIGNED_DEVICE_PREFER_MSI_BIT, false),
+    DEFINE_PROP_BIT("share_intx", AssignedDevice, features,
+                    ASSIGNED_DEVICE_SHARE_INTX_BIT, true),
+    DEFINE_PROP_INT32("bootindex", AssignedDevice, bootindex, -1),
+    DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void assign_class_init(ObjectClass *klass, void *data)
+{
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    k->init         = assigned_initfn;
+    k->exit         = assigned_exitfn;
+    k->config_read  = assigned_dev_pci_read_config;
+    k->config_write = assigned_dev_pci_write_config;
+    dc->props       = da_properties;
+    dc->vmsd        = &vmstate_assigned_device;
+    dc->reset       = reset_assigned_device;
+}
+
+static TypeInfo assign_info = {
+    .name               = "pci-assign",
+    .parent             = TYPE_PCI_DEVICE,
+    .instance_size      = sizeof(AssignedDevice),
+    .class_init         = assign_class_init,
+};
+
+static void assign_register_types(void)
+{
+    type_register_static(&assign_info);
+}
+
+type_init(assign_register_types)
+
+/*
+ * Scan the assigned devices for the devices that have an option ROM, and then
+ * load the corresponding ROM data to RAM. If an error occurs while loading an
+ * option ROM, we just ignore that option ROM and continue with the next one.
+ */
+static void assigned_dev_load_option_rom(AssignedDevice *dev)
+{
+    char name[32], rom_file[64];
+    FILE *fp;
+    uint8_t val;
+    struct stat st;
+    void *ptr;
+
+    /* If loading ROM from file, pci handles it */
+    if (dev->dev.romfile || !dev->dev.rom_bar) {
+        return;
+    }
+
+    snprintf(rom_file, sizeof(rom_file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
+             dev->host.domain, dev->host.bus, dev->host.slot,
+             dev->host.function);
+
+    if (stat(rom_file, &st)) {
+        return;
+    }
+
+    if (access(rom_file, F_OK)) {
+        fprintf(stderr, "pci-assign: Insufficient privileges for %s\n",
+                rom_file);
+        return;
+    }
+
+    /* Write "1" to the ROM file to enable it */
+    fp = fopen(rom_file, "r+");
+    if (fp == NULL) {
+        return;
+    }
+    val = 1;
+    if (fwrite(&val, 1, 1, fp) != 1) {
+        goto close_rom;
+    }
+    fseek(fp, 0, SEEK_SET);
+
+    snprintf(name, sizeof(name), "%s.rom",
+            object_get_typename(OBJECT(dev)));
+    memory_region_init_ram(&dev->dev.rom, name, st.st_size);
+    vmstate_register_ram(&dev->dev.rom, &dev->dev.qdev);
+    ptr = memory_region_get_ram_ptr(&dev->dev.rom);
+    memset(ptr, 0xff, st.st_size);
+
+    if (!fread(ptr, 1, st.st_size, fp)) {
+        fprintf(stderr, "pci-assign: Cannot read from host %s\n"
+                "\tDevice option ROM contents are probably invalid "
+                "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
+                "or load from file with romfile=\n", rom_file);
+        memory_region_destroy(&dev->dev.rom);
+        goto close_rom;
+    }
+
+    pci_register_bar(&dev->dev, PCI_ROM_SLOT, 0, &dev->dev.rom);
+    dev->dev.has_rom = true;
+close_rom:
+    /* Write "0" to disable ROM */
+    fseek(fp, 0, SEEK_SET);
+    val = 0;
+    if (!fwrite(&val, 1, 1, fp)) {
+        DEBUG("%s\n", "Failed to disable pci-sysfs rom file");
+    }
+    fclose(fp);
+}