Patchwork [v3,01/13] msi: implemented msi.

login
register
mail settings
Submitter Isaku Yamahata
Date Sept. 15, 2010, 5:38 a.m.
Message ID <1ec66452b4ff87dca4b76052787aa8ca0b924de5.1284528424.git.yamahata@valinux.co.jp>
Download mbox | patch
Permalink /patch/64787/
State New
Headers show

Comments

Isaku Yamahata - Sept. 15, 2010, 5:38 a.m.
implemented msi support functions.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>

---
Changes v2 -> v3:
- improved comment wording.
- simplified shift/ffs dance.

Changes v1 -> v2:
- opencode some oneline helper function/macros for readability
- use ffs where appropriate
- rename some functions/variables as suggested.
- added assert()
- 1 -> 1U
- clear INTx# when MSI is enabled
- clear pending bits for freed vectors.
- check the requested number of vectors.
---
 Makefile.objs |    2 +-
 hw/msi.c      |  358 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/msi.h      |   41 +++++++
 hw/pci.h      |   10 +-
 4 files changed, 407 insertions(+), 4 deletions(-)
 create mode 100644 hw/msi.c
 create mode 100644 hw/msi.h
Michael S. Tsirkin - Sept. 15, 2010, 1:03 p.m.
On Wed, Sep 15, 2010 at 02:38:14PM +0900, Isaku Yamahata wrote:
> implemented msi support functions.
> 
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
> 
> ---
> Changes v2 -> v3:
> - improved comment wording.
> - simplified shift/ffs dance.
> 
> Changes v1 -> v2:
> - opencode some oneline helper function/macros for readability
> - use ffs where appropriate
> - rename some functions/variables as suggested.
> - added assert()
> - 1 -> 1U
> - clear INTx# when MSI is enabled
> - clear pending bits for freed vectors.
> - check the requested number of vectors.
> ---
>  Makefile.objs |    2 +-
>  hw/msi.c      |  358 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/msi.h      |   41 +++++++
>  hw/pci.h      |   10 +-
>  4 files changed, 407 insertions(+), 4 deletions(-)
>  create mode 100644 hw/msi.c
>  create mode 100644 hw/msi.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 594894b..5f5a4c5 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -186,7 +186,7 @@ hw-obj-$(CONFIG_PIIX4) += piix4.o
>  # PCI watchdog devices
>  hw-obj-y += wdt_i6300esb.o
>  
> -hw-obj-y += msix.o
> +hw-obj-y += msix.o msi.o
>  
>  # PCI network cards
>  hw-obj-y += ne2000.o
> diff --git a/hw/msi.c b/hw/msi.c
> new file mode 100644
> index 0000000..65c163f
> --- /dev/null
> +++ b/hw/msi.c
> @@ -0,0 +1,358 @@
> +/*
> + * msi.c
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "msi.h"
> +
> +/* Eventually those constants should go to Linux pci_regs.h */
> +#define PCI_MSI_PENDING_32      0x10
> +#define PCI_MSI_PENDING_64      0x14
> +
> +/* PCI_MSI_ADDRESS_LO */
> +#define PCI_MSI_ADDRESS_LO_MASK         (~0x3)
> +
> +/* If we get rid of cap allocator, we won't need those. */
> +#define PCI_MSI_32_SIZEOF       0x0a
> +#define PCI_MSI_64_SIZEOF       0x0e
> +#define PCI_MSI_32M_SIZEOF      0x14
> +#define PCI_MSI_64M_SIZEOF      0x18
> +
> +/* If we get rid of cap allocator, we won't need this. */
> +static inline uint8_t msi_cap_sizeof(uint16_t flags)
> +{
> +    switch (flags & (PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT)) {
> +    case PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT:
> +        return PCI_MSI_64M_SIZEOF;
> +    case PCI_MSI_FLAGS_64BIT:
> +        return PCI_MSI_64_SIZEOF;
> +    case PCI_MSI_FLAGS_MASKBIT:
> +        return PCI_MSI_32M_SIZEOF;
> +    case 0:
> +        return PCI_MSI_32_SIZEOF;
> +    default:
> +        abort();
> +        break;
> +    }
> +    return 0;
> +}
> +
> +//#define MSI_DEBUG
> +
> +#ifdef MSI_DEBUG
> +# define MSI_DPRINTF(fmt, ...)                                          \
> +    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
> +#else
> +# define MSI_DPRINTF(fmt, ...)  do { } while (0)
> +#endif
> +#define MSI_DEV_PRINTF(dev, fmt, ...)                                   \
> +    MSI_DPRINTF("%s:%x " fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
> +
> +static inline uint8_t msi_nr_vectors(uint16_t flags)
> +{
> +    return 1U <<
> +        ((flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1));
> +}
> +
> +static inline uint8_t msi_flags_off(const PCIDevice* dev)
> +{
> +    return dev->msi_cap + PCI_MSI_FLAGS;
> +}
> +
> +static inline uint8_t msi_address_lo_off(const PCIDevice* dev)
> +{
> +    return dev->msi_cap + PCI_MSI_ADDRESS_LO;
> +}
> +
> +static inline uint8_t msi_address_hi_off(const PCIDevice* dev)
> +{
> +    return dev->msi_cap + PCI_MSI_ADDRESS_HI;
> +}
> +
> +static inline uint8_t msi_data_off(const PCIDevice* dev, bool msi64bit)
> +{
> +    return dev->msi_cap + (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32);
> +}
> +
> +static inline uint8_t msi_mask_off(const PCIDevice* dev, bool msi64bit)
> +{
> +    return dev->msi_cap + (msi64bit ? PCI_MSI_MASK_64 : PCI_MSI_MASK_32);
> +}
> +
> +static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
> +{
> +    return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
> +}
> +
> +bool msi_enabled(const PCIDevice *dev)
> +{
> +    return msi_present(dev) &&
> +        (pci_get_word(dev->config + msi_flags_off(dev)) &
> +         PCI_MSI_FLAGS_ENABLE);
> +}
> +
> +int msi_init(struct PCIDevice *dev, uint8_t offset,
> +             uint8_t nr_vectors, bool msi64bit, bool msi_per_vector_mask)

I think that you want simply unsigned nr_vectors and unsigned vector here
and elsewhere (e.g. same for return type of nr_vectors_allocated).
There's no advanatage to u8 here that I can see: the value is not 0-255
as with offset, so it does not help readability, and it does make you
use weird macros to print values instead of plain %x.
Generally better use standard types when width is not relevant.
This makes it easier to notice where it *is*.
Likewise most functions here should just work with unsigned.
config handlers are an exception as they assume specific
signature and value must have specific size (32 bit) for things to work.


> +{
> +    uint8_t vectors_order;
> +    uint16_t flags;
> +    uint8_t cap_size;
> +    int config_offset;
> +    MSI_DEV_PRINTF(dev,
> +                   "init offset: 0x%"PRIx8" vector: %"PRId8
> +                   " 64bit %d mask %d\n",
> +                   offset, nr_vectors, msi64bit, msi_per_vector_mask);
> +
> +    assert(!(nr_vectors & (nr_vectors - 1)));   /* power of 2 */
> +    assert(nr_vectors > 0);
> +    assert(nr_vectors <= 32);   /* the nr of MSI vectors is up to 32 */
> +    vectors_order = ffs(nr_vectors) - 1;
> +
> +    flags = vectors_order << (ffs(PCI_MSI_FLAGS_QMASK) - 1);
> +    if (msi64bit) {
> +        flags |= PCI_MSI_FLAGS_64BIT;
> +    }
> +    if (msi_per_vector_mask) {
> +        flags |= PCI_MSI_FLAGS_MASKBIT;
> +    }
> +
> +    cap_size = msi_cap_sizeof(flags);
> +    config_offset = pci_add_capability(dev, PCI_CAP_ID_MSI, offset, cap_size);
> +    if (config_offset < 0) {
> +        return config_offset;
> +    }
> +
> +    dev->msi_cap = config_offset;
> +    dev->cap_present |= QEMU_PCI_CAP_MSI;
> +
> +    pci_set_word(dev->config + msi_flags_off(dev), flags);
> +    pci_set_word(dev->wmask + msi_flags_off(dev),
> +                 PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
> +    pci_set_long(dev->wmask + msi_address_lo_off(dev),
> +                 PCI_MSI_ADDRESS_LO_MASK);
> +    if (msi64bit) {
> +        pci_set_long(dev->wmask + msi_address_hi_off(dev), 0xffffffff);
> +    }
> +    pci_set_word(dev->wmask + msi_data_off(dev, msi64bit), 0xffff);
> +
> +    if (msi_per_vector_mask) {
> +        pci_set_long(dev->wmask + msi_mask_off(dev, msi64bit),
> +                     (1U << nr_vectors) - 1);

this seems wrong. shift by 32 is undefined in C, isn't it?
You want this I think:
	0xffffffff >> (32 - nr_vectors)

> +    }
> +    return config_offset;
> +}
> +
> +void msi_uninit(struct PCIDevice *dev)
> +{
> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> +    uint8_t cap_size = msi_cap_sizeof(flags);
> +    pci_del_capability(dev, PCI_CAP_ID_MSIX, cap_size);
> +    MSI_DEV_PRINTF(dev, "uninit\n");
> +}
> +
> +void msi_reset(PCIDevice *dev)
> +{
> +    uint16_t flags;
> +    bool msi64bit;
> +
> +    flags = pci_get_word(dev->config + msi_flags_off(dev));
> +    flags &= ~(PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
> +    msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> +
> +    pci_set_word(dev->config + msi_flags_off(dev), flags);
> +    pci_set_long(dev->config + msi_address_lo_off(dev), 0);
> +    if (msi64bit) {
> +        pci_set_long(dev->config + msi_address_hi_off(dev), 0);
> +    }
> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), 0);
> +    if (flags & PCI_MSI_FLAGS_MASKBIT) {
> +        pci_set_long(dev->config + msi_mask_off(dev, msi64bit), 0);
> +        pci_set_long(dev->config + msi_pending_off(dev, msi64bit), 0);
> +    }
> +    MSI_DEV_PRINTF(dev, "reset\n");
> +}
> +
> +static bool msi_is_masked(const PCIDevice *dev, uint8_t vector)
> +{
> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> +    uint32_t mask;
> +
> +    if (!(flags & PCI_MSI_FLAGS_MASKBIT)) {
> +        return false;
> +    }
> +
> +    mask = pci_get_long(dev->config +
> +                        msi_mask_off(dev, flags & PCI_MSI_FLAGS_64BIT));
> +    return mask & (1U << vector);
> +}
> +
> +static void msi_set_pending(PCIDevice *dev, uint8_t vector)
> +{
> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> +    uint32_t pending;
> +
> +    assert(flags & PCI_MSI_FLAGS_MASKBIT);
> +
> +    pending = pci_get_long(dev->config + msi_pending_off(dev, msi64bit));
> +    pending |= 1U << vector;
> +    pci_set_long(dev->config + msi_pending_off(dev, msi64bit), pending);
> +    MSI_DEV_PRINTF(dev, "pending vector 0x%"PRIx8"\n", vector);
> +}
> +
> +void msi_notify(PCIDevice *dev, uint8_t vector)
> +{
> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> +    uint8_t nr_vectors = msi_nr_vectors(flags);
> +    uint64_t address;
> +    uint32_t data;
> +
> +    assert(vector < nr_vectors);
> +    if (msi_is_masked(dev, vector)) {
> +        msi_set_pending(dev, vector);
> +        return;
> +    }
> +
> +    if (msi64bit){
> +        address = pci_get_quad(dev->config + msi_address_lo_off(dev));
> +    } else {
> +        address = pci_get_long(dev->config + msi_address_lo_off(dev));
> +    }
> +
> +    /* upper bit 31:16 is zero */
> +    data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
> +    if (nr_vectors > 1) {
> +        data &= ~(nr_vectors - 1);
> +        data |= vector;
> +    }
> +
> +    MSI_DEV_PRINTF(dev,
> +                   "notify vector 0x%"PRIx8
> +                   " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
> +                   vector, address, data);
> +    stl_phys(address, data);
> +}
> +
> +/* call this function after updating configs by pci_default_write_config(). */
> +void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len)
> +{
> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> +    bool msi_per_vector_mask = flags & PCI_MSI_FLAGS_MASKBIT;
> +    uint8_t nr_vectors;
> +    uint8_t log_num_vecs;
> +    uint8_t log_max_vecs;
> +    uint8_t vector;
> +    uint32_t pending;
> +    int i;
> +
> +#ifdef MSI_DEBUG
> +    if (ranges_overlap(addr, len, dev->msi_cap, msi_cap_sizeof(flags))) {
> +        MSI_DEV_PRINTF(dev, "addr 0x%"PRIx32" val 0x%"PRIx32" len %d\n",
> +                       addr, val, len);
> +        MSI_DEV_PRINTF(dev, "ctrl: 0x%"PRIx16" address: 0x%"PRIx32,
> +                       flags,
> +                       pci_get_long(dev->config + msi_address_lo_off(dev)));
> +        if (msi64bit) {
> +            fprintf(stderr, " addrss-hi: 0x%"PRIx32,
> +                    pci_get_long(dev->config + msi_address_hi_off(dev)));
> +        }
> +        fprintf(stderr, " data: 0x%"PRIx16,
> +                pci_get_word(dev->config + msi_data_off(dev, msi64bit)));
> +        if (flags & PCI_MSI_FLAGS_MASKBIT) {
> +            fprintf(stderr, " mask 0x%"PRIx32" pending 0x%"PRIx32,
> +                    pci_get_long(dev->config + msi_mask_off(dev, msi64bit)),
> +                    pci_get_long(dev->config + msi_pending_off(dev, msi64bit)));
> +        }
> +        fprintf(stderr, "\n");
> +    }
> +#endif
> +
> +    /* Are we modified? */
> +    if (!(ranges_overlap(addr, len, msi_flags_off(dev), 2) ||
> +          (msi_per_vector_mask &&
> +           ranges_overlap(addr, len, msi_mask_off(dev, msi64bit), 4)))) {
> +        return;
> +    }
> +
> +    if (!(flags & PCI_MSI_FLAGS_ENABLE)) {
> +        return;
> +    }
> +
> +    /*
> +     * Now MSI is enabled, clear INTx# interrupts.
> +     * the driver is prohibited from writing enable bit to mask
> +     * a service request. But the guest OS could do this.
> +     * So we just discard the interrupts as moderate fallback.
> +     *
> +     * 6.8.3.3. Enabling Operation
> +     *   While enabled for MSI or MSI-X operation, a function is prohibited
> +     *   from using its INTx# pin (if implemented) to request
> +     *   service (MSI, MSI-X, and INTx# are mutually exclusive).
> +     */
> +    for (i = 0; i < PCI_NUM_PINS; ++i) {
> +        qemu_set_irq(dev->irq[i], 0);
> +    }
> +
> +    /*
> +     * nr_vectors might be set bigger than capable. So clamp it.
> +     * This is not legal by spec, so we can do anything we like,
> +     * just don't crash the host
> +     */
> +    log_num_vecs =
> +        (flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1);
> +    log_max_vecs =
> +        (flags & PCI_MSI_FLAGS_QMASK) >> (ffs(PCI_MSI_FLAGS_QMASK) - 1);
> +    if (log_num_vecs > log_max_vecs) {
> +        flags &= ~PCI_MSI_FLAGS_QSIZE;
> +        flags |= log_max_vecs << (ffs(PCI_MSI_FLAGS_QSIZE) - 1);
> +        pci_set_word(dev->config + msi_flags_off(dev), flags);
> +    }
> +
> +    if (!msi_per_vector_mask) {
> +        /* if per vector masking isn't supported,
> +           there is no pending interrupt. */
> +        return;
> +    }
> +
> +    nr_vectors = msi_nr_vectors(flags);
> +
> +    /* This will discard pending interrupts, if any. */
> +    pending = pci_get_long(dev->config + msi_pending_off(dev, msi64bit));
> +    pending &= (1U << nr_vectors) - 1;

as above, this is wrong for nr_vectors == 32
You want:
	0xffffffff >> nr_vectors

> +    pci_set_long(dev->config + msi_pending_off(dev, msi64bit), pending);
> +
> +    /* deliver pending interrupts which are unmasked */
> +    for (vector = 0; vector < nr_vectors; ++vector) {
> +        if (msi_is_masked(dev, vector) || !(pending & (1U << vector))) {
> +            continue;
> +        }
> +
> +        pending &= ~(1U << vector);
> +        pci_set_long(dev->config + msi_pending_off(dev, msi64bit),
> +                     pending);
> +        msi_notify(dev, vector);
> +    }
> +}
> +
> +uint8_t msi_nr_vectors_allocated(const PCIDevice *dev)
> +{
> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> +    return msi_nr_vectors(flags);
> +}
> diff --git a/hw/msi.h b/hw/msi.h
> new file mode 100644
> index 0000000..eac9c78
> --- /dev/null
> +++ b/hw/msi.h
> @@ -0,0 +1,41 @@
> +/*
> + * msi.h
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef QEMU_MSI_H
> +#define QEMU_MSI_H
> +
> +#include "qemu-common.h"
> +#include "pci.h"
> +
> +bool msi_enabled(const PCIDevice *dev);
> +int msi_init(struct PCIDevice *dev, uint8_t offset,
> +             uint8_t nr_vectors, bool msi64bit, bool msi_per_vector_mask);
> +void msi_uninit(struct PCIDevice *dev);
> +void msi_reset(PCIDevice *dev);
> +void msi_notify(PCIDevice *dev, uint8_t vector);
> +void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len);
> +uint8_t msi_nr_vectors_allocated(const PCIDevice *dev);
> +
> +static inline bool msi_present(const PCIDevice *dev)
> +{
> +    return dev->cap_present & QEMU_PCI_CAP_MSI;
> +}
> +
> +#endif /* QEMU_MSI_H */
> diff --git a/hw/pci.h b/hw/pci.h
> index 1c6075e..3879708 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -109,11 +109,12 @@ typedef struct PCIIORegion {
>  
>  /* Bits in cap_present field. */
>  enum {
> -    QEMU_PCI_CAP_MSIX = 0x1,
> -    QEMU_PCI_CAP_EXPRESS = 0x2,
> +    QEMU_PCI_CAP_MSI = 0x1,
> +    QEMU_PCI_CAP_MSIX = 0x2,
> +    QEMU_PCI_CAP_EXPRESS = 0x4,
>  
>      /* multifunction capable device */
> -#define QEMU_PCI_CAP_MULTIFUNCTION_BITNR        2
> +#define QEMU_PCI_CAP_MULTIFUNCTION_BITNR        3
>      QEMU_PCI_CAP_MULTIFUNCTION = (1 << QEMU_PCI_CAP_MULTIFUNCTION_BITNR),
>  };
>  
> @@ -168,6 +169,9 @@ struct PCIDevice {
>      /* Version id needed for VMState */
>      int32_t version_id;
>  
> +    /* Offset of MSI capability in config space */
> +    uint8_t msi_cap;
> +
>      /* Location of option rom */
>      char *romfile;
>      ram_addr_t rom_offset;
> -- 
> 1.7.1.1

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 594894b..5f5a4c5 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -186,7 +186,7 @@  hw-obj-$(CONFIG_PIIX4) += piix4.o
 # PCI watchdog devices
 hw-obj-y += wdt_i6300esb.o
 
-hw-obj-y += msix.o
+hw-obj-y += msix.o msi.o
 
 # PCI network cards
 hw-obj-y += ne2000.o
diff --git a/hw/msi.c b/hw/msi.c
new file mode 100644
index 0000000..65c163f
--- /dev/null
+++ b/hw/msi.c
@@ -0,0 +1,358 @@ 
+/*
+ * msi.c
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "msi.h"
+
+/* Eventually those constants should go to Linux pci_regs.h */
+#define PCI_MSI_PENDING_32      0x10
+#define PCI_MSI_PENDING_64      0x14
+
+/* PCI_MSI_ADDRESS_LO */
+#define PCI_MSI_ADDRESS_LO_MASK         (~0x3)
+
+/* If we get rid of cap allocator, we won't need those. */
+#define PCI_MSI_32_SIZEOF       0x0a
+#define PCI_MSI_64_SIZEOF       0x0e
+#define PCI_MSI_32M_SIZEOF      0x14
+#define PCI_MSI_64M_SIZEOF      0x18
+
+/* If we get rid of cap allocator, we won't need this. */
+static inline uint8_t msi_cap_sizeof(uint16_t flags)
+{
+    switch (flags & (PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT)) {
+    case PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT:
+        return PCI_MSI_64M_SIZEOF;
+    case PCI_MSI_FLAGS_64BIT:
+        return PCI_MSI_64_SIZEOF;
+    case PCI_MSI_FLAGS_MASKBIT:
+        return PCI_MSI_32M_SIZEOF;
+    case 0:
+        return PCI_MSI_32_SIZEOF;
+    default:
+        abort();
+        break;
+    }
+    return 0;
+}
+
+//#define MSI_DEBUG
+
+#ifdef MSI_DEBUG
+# define MSI_DPRINTF(fmt, ...)                                          \
+    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
+#else
+# define MSI_DPRINTF(fmt, ...)  do { } while (0)
+#endif
+#define MSI_DEV_PRINTF(dev, fmt, ...)                                   \
+    MSI_DPRINTF("%s:%x " fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
+
+static inline uint8_t msi_nr_vectors(uint16_t flags)
+{
+    return 1U <<
+        ((flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1));
+}
+
+static inline uint8_t msi_flags_off(const PCIDevice* dev)
+{
+    return dev->msi_cap + PCI_MSI_FLAGS;
+}
+
+static inline uint8_t msi_address_lo_off(const PCIDevice* dev)
+{
+    return dev->msi_cap + PCI_MSI_ADDRESS_LO;
+}
+
+static inline uint8_t msi_address_hi_off(const PCIDevice* dev)
+{
+    return dev->msi_cap + PCI_MSI_ADDRESS_HI;
+}
+
+static inline uint8_t msi_data_off(const PCIDevice* dev, bool msi64bit)
+{
+    return dev->msi_cap + (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32);
+}
+
+static inline uint8_t msi_mask_off(const PCIDevice* dev, bool msi64bit)
+{
+    return dev->msi_cap + (msi64bit ? PCI_MSI_MASK_64 : PCI_MSI_MASK_32);
+}
+
+static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
+{
+    return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
+}
+
+bool msi_enabled(const PCIDevice *dev)
+{
+    return msi_present(dev) &&
+        (pci_get_word(dev->config + msi_flags_off(dev)) &
+         PCI_MSI_FLAGS_ENABLE);
+}
+
+int msi_init(struct PCIDevice *dev, uint8_t offset,
+             uint8_t nr_vectors, bool msi64bit, bool msi_per_vector_mask)
+{
+    uint8_t vectors_order;
+    uint16_t flags;
+    uint8_t cap_size;
+    int config_offset;
+    MSI_DEV_PRINTF(dev,
+                   "init offset: 0x%"PRIx8" vector: %"PRId8
+                   " 64bit %d mask %d\n",
+                   offset, nr_vectors, msi64bit, msi_per_vector_mask);
+
+    assert(!(nr_vectors & (nr_vectors - 1)));   /* power of 2 */
+    assert(nr_vectors > 0);
+    assert(nr_vectors <= 32);   /* the nr of MSI vectors is up to 32 */
+    vectors_order = ffs(nr_vectors) - 1;
+
+    flags = vectors_order << (ffs(PCI_MSI_FLAGS_QMASK) - 1);
+    if (msi64bit) {
+        flags |= PCI_MSI_FLAGS_64BIT;
+    }
+    if (msi_per_vector_mask) {
+        flags |= PCI_MSI_FLAGS_MASKBIT;
+    }
+
+    cap_size = msi_cap_sizeof(flags);
+    config_offset = pci_add_capability(dev, PCI_CAP_ID_MSI, offset, cap_size);
+    if (config_offset < 0) {
+        return config_offset;
+    }
+
+    dev->msi_cap = config_offset;
+    dev->cap_present |= QEMU_PCI_CAP_MSI;
+
+    pci_set_word(dev->config + msi_flags_off(dev), flags);
+    pci_set_word(dev->wmask + msi_flags_off(dev),
+                 PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
+    pci_set_long(dev->wmask + msi_address_lo_off(dev),
+                 PCI_MSI_ADDRESS_LO_MASK);
+    if (msi64bit) {
+        pci_set_long(dev->wmask + msi_address_hi_off(dev), 0xffffffff);
+    }
+    pci_set_word(dev->wmask + msi_data_off(dev, msi64bit), 0xffff);
+
+    if (msi_per_vector_mask) {
+        pci_set_long(dev->wmask + msi_mask_off(dev, msi64bit),
+                     (1U << nr_vectors) - 1);
+    }
+    return config_offset;
+}
+
+void msi_uninit(struct PCIDevice *dev)
+{
+    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
+    uint8_t cap_size = msi_cap_sizeof(flags);
+    pci_del_capability(dev, PCI_CAP_ID_MSIX, cap_size);
+    MSI_DEV_PRINTF(dev, "uninit\n");
+}
+
+void msi_reset(PCIDevice *dev)
+{
+    uint16_t flags;
+    bool msi64bit;
+
+    flags = pci_get_word(dev->config + msi_flags_off(dev));
+    flags &= ~(PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
+    msi64bit = flags & PCI_MSI_FLAGS_64BIT;
+
+    pci_set_word(dev->config + msi_flags_off(dev), flags);
+    pci_set_long(dev->config + msi_address_lo_off(dev), 0);
+    if (msi64bit) {
+        pci_set_long(dev->config + msi_address_hi_off(dev), 0);
+    }
+    pci_set_word(dev->config + msi_data_off(dev, msi64bit), 0);
+    if (flags & PCI_MSI_FLAGS_MASKBIT) {
+        pci_set_long(dev->config + msi_mask_off(dev, msi64bit), 0);
+        pci_set_long(dev->config + msi_pending_off(dev, msi64bit), 0);
+    }
+    MSI_DEV_PRINTF(dev, "reset\n");
+}
+
+static bool msi_is_masked(const PCIDevice *dev, uint8_t vector)
+{
+    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
+    uint32_t mask;
+
+    if (!(flags & PCI_MSI_FLAGS_MASKBIT)) {
+        return false;
+    }
+
+    mask = pci_get_long(dev->config +
+                        msi_mask_off(dev, flags & PCI_MSI_FLAGS_64BIT));
+    return mask & (1U << vector);
+}
+
+static void msi_set_pending(PCIDevice *dev, uint8_t vector)
+{
+    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
+    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
+    uint32_t pending;
+
+    assert(flags & PCI_MSI_FLAGS_MASKBIT);
+
+    pending = pci_get_long(dev->config + msi_pending_off(dev, msi64bit));
+    pending |= 1U << vector;
+    pci_set_long(dev->config + msi_pending_off(dev, msi64bit), pending);
+    MSI_DEV_PRINTF(dev, "pending vector 0x%"PRIx8"\n", vector);
+}
+
+void msi_notify(PCIDevice *dev, uint8_t vector)
+{
+    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
+    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
+    uint8_t nr_vectors = msi_nr_vectors(flags);
+    uint64_t address;
+    uint32_t data;
+
+    assert(vector < nr_vectors);
+    if (msi_is_masked(dev, vector)) {
+        msi_set_pending(dev, vector);
+        return;
+    }
+
+    if (msi64bit){
+        address = pci_get_quad(dev->config + msi_address_lo_off(dev));
+    } else {
+        address = pci_get_long(dev->config + msi_address_lo_off(dev));
+    }
+
+    /* upper bit 31:16 is zero */
+    data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
+    if (nr_vectors > 1) {
+        data &= ~(nr_vectors - 1);
+        data |= vector;
+    }
+
+    MSI_DEV_PRINTF(dev,
+                   "notify vector 0x%"PRIx8
+                   " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
+                   vector, address, data);
+    stl_phys(address, data);
+}
+
+/* call this function after updating configs by pci_default_write_config(). */
+void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len)
+{
+    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
+    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
+    bool msi_per_vector_mask = flags & PCI_MSI_FLAGS_MASKBIT;
+    uint8_t nr_vectors;
+    uint8_t log_num_vecs;
+    uint8_t log_max_vecs;
+    uint8_t vector;
+    uint32_t pending;
+    int i;
+
+#ifdef MSI_DEBUG
+    if (ranges_overlap(addr, len, dev->msi_cap, msi_cap_sizeof(flags))) {
+        MSI_DEV_PRINTF(dev, "addr 0x%"PRIx32" val 0x%"PRIx32" len %d\n",
+                       addr, val, len);
+        MSI_DEV_PRINTF(dev, "ctrl: 0x%"PRIx16" address: 0x%"PRIx32,
+                       flags,
+                       pci_get_long(dev->config + msi_address_lo_off(dev)));
+        if (msi64bit) {
+            fprintf(stderr, " addrss-hi: 0x%"PRIx32,
+                    pci_get_long(dev->config + msi_address_hi_off(dev)));
+        }
+        fprintf(stderr, " data: 0x%"PRIx16,
+                pci_get_word(dev->config + msi_data_off(dev, msi64bit)));
+        if (flags & PCI_MSI_FLAGS_MASKBIT) {
+            fprintf(stderr, " mask 0x%"PRIx32" pending 0x%"PRIx32,
+                    pci_get_long(dev->config + msi_mask_off(dev, msi64bit)),
+                    pci_get_long(dev->config + msi_pending_off(dev, msi64bit)));
+        }
+        fprintf(stderr, "\n");
+    }
+#endif
+
+    /* Are we modified? */
+    if (!(ranges_overlap(addr, len, msi_flags_off(dev), 2) ||
+          (msi_per_vector_mask &&
+           ranges_overlap(addr, len, msi_mask_off(dev, msi64bit), 4)))) {
+        return;
+    }
+
+    if (!(flags & PCI_MSI_FLAGS_ENABLE)) {
+        return;
+    }
+
+    /*
+     * Now MSI is enabled, clear INTx# interrupts.
+     * the driver is prohibited from writing enable bit to mask
+     * a service request. But the guest OS could do this.
+     * So we just discard the interrupts as moderate fallback.
+     *
+     * 6.8.3.3. Enabling Operation
+     *   While enabled for MSI or MSI-X operation, a function is prohibited
+     *   from using its INTx# pin (if implemented) to request
+     *   service (MSI, MSI-X, and INTx# are mutually exclusive).
+     */
+    for (i = 0; i < PCI_NUM_PINS; ++i) {
+        qemu_set_irq(dev->irq[i], 0);
+    }
+
+    /*
+     * nr_vectors might be set bigger than capable. So clamp it.
+     * This is not legal by spec, so we can do anything we like,
+     * just don't crash the host
+     */
+    log_num_vecs =
+        (flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1);
+    log_max_vecs =
+        (flags & PCI_MSI_FLAGS_QMASK) >> (ffs(PCI_MSI_FLAGS_QMASK) - 1);
+    if (log_num_vecs > log_max_vecs) {
+        flags &= ~PCI_MSI_FLAGS_QSIZE;
+        flags |= log_max_vecs << (ffs(PCI_MSI_FLAGS_QSIZE) - 1);
+        pci_set_word(dev->config + msi_flags_off(dev), flags);
+    }
+
+    if (!msi_per_vector_mask) {
+        /* if per vector masking isn't supported,
+           there is no pending interrupt. */
+        return;
+    }
+
+    nr_vectors = msi_nr_vectors(flags);
+
+    /* This will discard pending interrupts, if any. */
+    pending = pci_get_long(dev->config + msi_pending_off(dev, msi64bit));
+    pending &= (1U << nr_vectors) - 1;
+    pci_set_long(dev->config + msi_pending_off(dev, msi64bit), pending);
+
+    /* deliver pending interrupts which are unmasked */
+    for (vector = 0; vector < nr_vectors; ++vector) {
+        if (msi_is_masked(dev, vector) || !(pending & (1U << vector))) {
+            continue;
+        }
+
+        pending &= ~(1U << vector);
+        pci_set_long(dev->config + msi_pending_off(dev, msi64bit),
+                     pending);
+        msi_notify(dev, vector);
+    }
+}
+
+uint8_t msi_nr_vectors_allocated(const PCIDevice *dev)
+{
+    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
+    return msi_nr_vectors(flags);
+}
diff --git a/hw/msi.h b/hw/msi.h
new file mode 100644
index 0000000..eac9c78
--- /dev/null
+++ b/hw/msi.h
@@ -0,0 +1,41 @@ 
+/*
+ * msi.h
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_MSI_H
+#define QEMU_MSI_H
+
+#include "qemu-common.h"
+#include "pci.h"
+
+bool msi_enabled(const PCIDevice *dev);
+int msi_init(struct PCIDevice *dev, uint8_t offset,
+             uint8_t nr_vectors, bool msi64bit, bool msi_per_vector_mask);
+void msi_uninit(struct PCIDevice *dev);
+void msi_reset(PCIDevice *dev);
+void msi_notify(PCIDevice *dev, uint8_t vector);
+void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len);
+uint8_t msi_nr_vectors_allocated(const PCIDevice *dev);
+
+static inline bool msi_present(const PCIDevice *dev)
+{
+    return dev->cap_present & QEMU_PCI_CAP_MSI;
+}
+
+#endif /* QEMU_MSI_H */
diff --git a/hw/pci.h b/hw/pci.h
index 1c6075e..3879708 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -109,11 +109,12 @@  typedef struct PCIIORegion {
 
 /* Bits in cap_present field. */
 enum {
-    QEMU_PCI_CAP_MSIX = 0x1,
-    QEMU_PCI_CAP_EXPRESS = 0x2,
+    QEMU_PCI_CAP_MSI = 0x1,
+    QEMU_PCI_CAP_MSIX = 0x2,
+    QEMU_PCI_CAP_EXPRESS = 0x4,
 
     /* multifunction capable device */
-#define QEMU_PCI_CAP_MULTIFUNCTION_BITNR        2
+#define QEMU_PCI_CAP_MULTIFUNCTION_BITNR        3
     QEMU_PCI_CAP_MULTIFUNCTION = (1 << QEMU_PCI_CAP_MULTIFUNCTION_BITNR),
 };
 
@@ -168,6 +169,9 @@  struct PCIDevice {
     /* Version id needed for VMState */
     int32_t version_id;
 
+    /* Offset of MSI capability in config space */
+    uint8_t msi_cap;
+
     /* Location of option rom */
     char *romfile;
     ram_addr_t rom_offset;