Patchwork [07/14] msi: implemented msi.

login
register
mail settings
Submitter Isaku Yamahata
Date Sept. 6, 2010, 7:46 a.m.
Message ID <1d7d4f07f2fb76258de5cd1e1c5e147778988a71.1283759074.git.yamahata@valinux.co.jp>
Download mbox | patch
Permalink /patch/63886/
State New
Headers show

Comments

Isaku Yamahata - Sept. 6, 2010, 7:46 a.m.
implemented msi support functions.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
---
 Makefile.objs |    2 +-
 hw/msi.c      |  362 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/msi.h      |   41 +++++++
 hw/pci.h      |    5 +
 4 files changed, 409 insertions(+), 1 deletions(-)
 create mode 100644 hw/msi.c
 create mode 100644 hw/msi.h
Michael S. Tsirkin - Sept. 6, 2010, 9:44 a.m.
On Mon, Sep 06, 2010 at 04:46:21PM +0900, Isaku Yamahata wrote:
> implemented msi support functions.
> 
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>


Good stuff. Below I suggest a couple of minor correctness issues,
and a couple of API changes.  Below are also some suggestions to improve
readability a bit: mostly there are too many one-liners used
only once or twice, with names not informative enough to guess what they
do, so I suggest we opencode them as one ends up reading the source
anyway.

Thanks!

> ---
>  Makefile.objs |    2 +-
>  hw/msi.c      |  362 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/msi.h      |   41 +++++++
>  hw/pci.h      |    5 +
>  4 files changed, 409 insertions(+), 1 deletions(-)
>  create mode 100644 hw/msi.c
>  create mode 100644 hw/msi.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 594894b..5f5a4c5 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -186,7 +186,7 @@ hw-obj-$(CONFIG_PIIX4) += piix4.o
>  # PCI watchdog devices
>  hw-obj-y += wdt_i6300esb.o
>  
> -hw-obj-y += msix.o
> +hw-obj-y += msix.o msi.o
>  
>  # PCI network cards
>  hw-obj-y += ne2000.o
> diff --git a/hw/msi.c b/hw/msi.c
> new file mode 100644
> index 0000000..dbe89ee
> --- /dev/null
> +++ b/hw/msi.c
> @@ -0,0 +1,362 @@
> +/*
> + * msi.c
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "msi.h"
> +
> +#define PCI_MSI_PENDING_32      0x10
> +#define PCI_MSI_PENDING_64      0x14
> +

Longer term, pls try to add this to linux, btw.
If you manage to we'll then move this to pci_regs.h

> +/* PCI_MSI_FLAGS */
> +#define PCI_MSI_FLAGS_QSIZE_SHIFT       4
> +#define PCI_MSI_FLAGS_QMASK_SHIFT       1

Use ffs on macros from pci_regs.h :
PCI_MSI_FLAGS_QSIZE_SHIFT ->
(ffs(PCI_MSI_FLAGS_QSIZE) - 1)
as there's a single user, we don't need a macro then.

> +
> +/* PCI_MSI_ADDRESS_LO */
> +#define PCI_MSI_ADDRESS_LO_RESERVED_MASK        0x3

Better to have the valid bits than invalid bits:
#define PCI_MSI_ADDRESS_LO_MASK        (~0x3)
And then use directly.
again, we can try adding this to linux long term.


> +
> +#define PCI_MSI_32_SIZEOF       0x0a
> +#define PCI_MSI_64_SIZEOF       0x0e
> +#define PCI_MSI_32M_SIZEOF      0x14
> +#define PCI_MSI_64M_SIZEOF      0x18

Note to self: if we get rid of cap allocator, we won't need the above.

> +
> +#define MSI_DEBUG
> +#ifdef MSI_DEBUG
> +# define MSI_DPRINTF(fmt, ...)                                          \
> +    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
> +#else
> +# define MSI_DPRINTF(fmt, ...)  do { } while (0)
> +#endif
> +#define MSI_DEV_PRINTF(dev, fmt, ...)                                   \
> +    MSI_DPRINTF("%s:%x " fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
> +
> +static inline bool msi_test_bit(uint32_t bitmap, uint8_t bit)
> +{
> +    return bitmap & (1 << bit);
> +}
> +
> +static inline void msi_set_bit(uint32_t *bitmap, uint8_t bit)
> +{
> +    *bitmap |= (1 << bit);
> +}
> +
> +static inline void msi_clear_bit(uint32_t *bitmap, uint8_t bit)
> +{
> +    *bitmap &= ~(1 << bit);
> +}


The above 3 are much better open-coded - all they do is simple math.
Also, I think we should use 1U << - the above is signed and so it
will cause integer overflow when bit is 31.

> +
> +

Two empty lines -> one

> +/* multiple vector only suport power of 2 and up to 32 */
> +static inline uint8_t ilog2(uint8_t nr_vector)
> +{
> +    switch (nr_vector) {
> +    case 1:
> +        return 0;
> +    case 2:
> +        return 1;
> +    case 4:
> +        return 2;
> +    case 8:
> +        return 3;
> +    case 16:
> +        return 4;
> +    case 32:
> +        return 5;
> +    default:
> +        abort();

Not intuitive to abort in a mathematical function IMO.

> +        break;
> +    }
> +    return 0;
> +}
> +

Please just use ffs opencoded, there's a single user.
ilog -> ffs(n) - 1;
and do this after range-checking as suggested below.

> +static inline bool is_mask_bit_support(uint16_t control)
> +{
> +    return control & PCI_MSI_FLAGS_MASKBIT;
> +}
> +
> +static inline bool is_64bit_address(uint16_t control)
> +{
> +    return control & PCI_MSI_FLAGS_64BIT;
> +}
> +

These 2 function names are especially confusing.
The above 2 are better open-coded.

> +static inline uint8_t msi_vector_order(uint16_t control)
> +{
> +    return (control & PCI_MSI_FLAGS_QSIZE) >> PCI_MSI_FLAGS_QSIZE_SHIFT;
> +}
> +

above only used once? better opencoded

> +static inline uint8_t msi_nr_vector(uint16_t control)
> +{
> +    return 1 << msi_vector_order(control);
> +}
> +

Rename msi_nr_vectors.


> +static inline bool msi_control_enabled(uint16_t control)
> +{
> +    return control & PCI_MSI_FLAGS_ENABLE;
> +}
> +

above is better opencoded.

> +static inline uint8_t msi_cap_sizeof(uint16_t control)
> +{
> +    switch (control & (PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT)) {
> +    case PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT:
> +        return PCI_MSI_64M_SIZEOF;
> +    case PCI_MSI_FLAGS_64BIT:
> +        return PCI_MSI_64_SIZEOF;
> +    case PCI_MSI_FLAGS_MASKBIT:
> +        return PCI_MSI_32M_SIZEOF;
> +    case 0:
> +        return PCI_MSI_32_SIZEOF;
> +    default:
> +        abort();
> +        break;
> +    }
> +    return 0;
> +}
> +

Note to self: this will go away too if we get rid of cap allocator.

> +static inline uint16_t msi_control_reg(const PCIDevice* dev)
> +{
> +    return dev->msi_cap + PCI_MSI_FLAGS;
> +}
> +
> +static inline uint32_t msi_lower_address_reg(const PCIDevice* dev)
> +{
> +    return dev->msi_cap + PCI_MSI_ADDRESS_LO;
> +}
> +
> +static inline uint32_t msi_upper_address_reg(const PCIDevice* dev)
> +{
> +    return dev->msi_cap + PCI_MSI_ADDRESS_HI;
> +}
> +

I think above are better open-coded - a bit less sure here as I
understand you want consistency between registers.
In any case, I think names should be consistent with macros
and be explicit in that they return an offset:
e.g. msi_lower_address_reg -> msi_address_lo_off
Also, return uint8_t and not uint32_t: these registers
always live in a traditional config space.


> +static inline uint16_t msi_data_reg(const PCIDevice* dev, bool is64bit)

msi64bit would be a better name than is64bit

> +{
> +    return dev->msi_cap + (is64bit ?  PCI_MSI_DATA_64 : PCI_MSI_DATA_32);
> +}
> +
> +static inline uint32_t msi_mask_reg(const PCIDevice* dev, bool is64bit)
> +{
> +    return dev->msi_cap + (is64bit ? PCI_MSI_MASK_64 : PCI_MSI_MASK_32);
> +}
> +
> +static inline uint32_t msi_pending_reg(const PCIDevice* dev, bool is64bit)
> +{
> +    return dev->msi_cap + (is64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
> +}
> +
> +bool msi_enabled(const PCIDevice *dev)
> +{
> +    return msi_present(dev) &&
> +        msi_control_enabled(pci_get_word(dev->config + msi_control_reg(dev)));
> +}
> +
> +int msi_init(struct PCIDevice *dev, uint8_t offset,
> +             uint8_t nr_vector, uint16_t flags)

rename nr_vector -> nr_vectors here and elsewhere

> +{
> +    uint8_t vector_order = ilog2(nr_vector);
> +    bool is64bit = is_64bit_address(flags);
> +    uint8_t cap_size;
> +    int config_offset;


maybe add assert on nr_vector: should be
	assert(!((nr_vector - 1) & nr_vector));
	assert(nr_vector > 0);
	assert(nr_vector <= 32);


> +    MSI_DEV_PRINTF(dev,
> +                   "init offset: 0x%"PRIx8" vector: %"PRId8
> +                   " flags 0x%"PRIx16"\n", offset, nr_vector, flags);
> +
> +    flags &= PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT;
> +    flags |= (vector_order << PCI_MSI_FLAGS_QMASK_SHIFT) & PCI_MSI_FLAGS_QMASK;
> +
> +    cap_size = msi_cap_sizeof(flags);
> +    config_offset = pci_add_capability(dev, PCI_CAP_ID_MSI, offset, cap_size);
> +    if (config_offset < 0) {
> +        return config_offset;
> +    }
> +
> +    dev->msi_cap = config_offset;
> +    dev->cap_present |= QEMU_PCI_CAP_MSI;
> +
> +    pci_set_word(dev->config + msi_control_reg(dev), flags);
> +    pci_set_word(dev->wmask + msi_control_reg(dev),
> +                 PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
> +    pci_set_long(dev->wmask + msi_lower_address_reg(dev),
> +                 ~PCI_MSI_ADDRESS_LO_RESERVED_MASK);
> +    if (is64bit) {
> +        pci_set_long(dev->wmask + msi_upper_address_reg(dev), 0xffffffff);
> +    }
> +    pci_set_word(dev->wmask + msi_data_reg(dev, is64bit), 0xffff);
> +
> +    if (is_mask_bit_support(flags)) {
> +        pci_set_long(dev->wmask + msi_mask_reg(dev, is64bit),
> +                     (1 << nr_vector) - 1);

1 is signed. I think this should be 1U << nr_vector, otherwise
you get signed integer overflow for nr >= 31.

> +    }
> +    return config_offset;
> +}
> +
> +void msi_uninit(struct PCIDevice *dev)
> +{
> +    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
> +    uint8_t cap_size = msi_cap_sizeof(control);
> +    pci_del_capability(dev, PCI_CAP_ID_MSIX, cap_size);
> +    MSI_DEV_PRINTF(dev, "uninit\n");
> +}
> +
> +void msi_reset(PCIDevice *dev)
> +{
> +    uint16_t control;
> +    bool is64bit;
> +
> +    control = pci_get_word(dev->config + msi_control_reg(dev));
> +    control &= ~(PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
> +    is64bit = is_64bit_address(control);
> +
> +    pci_set_word(dev->config + msi_control_reg(dev), control);
> +    pci_set_long(dev->config + msi_lower_address_reg(dev), 0);
> +    if (is64bit) {
> +        pci_set_long(dev->config + msi_upper_address_reg(dev), 0);
> +    }
> +    pci_set_word(dev->config + msi_data_reg(dev, is64bit), 0);
> +    if (is_mask_bit_support(control)) {
> +        pci_set_long(dev->config + msi_mask_reg(dev, is64bit), 0);
> +        pci_set_long(dev->config + msi_pending_reg(dev, is64bit), 0);
> +    }
> +    MSI_DEV_PRINTF(dev, "reset\n");
> +}
> +
> +static bool msi_is_masked(const PCIDevice *dev, uint8_t vector)
> +{
> +    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
> +    uint32_t mask;
> +
> +    if (!is_mask_bit_support(control)) {
> +        return false;
> +    }
> +
> +    mask = pci_get_long(dev->config +
> +                        msi_mask_reg(dev, is_64bit_address(control)));
> +    return msi_test_bit(mask, vector);
> +}
> +
> +static void msi_set_pending(PCIDevice *dev, uint8_t vector)
> +{
> +    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
> +    bool is64bit = is_64bit_address(control);
> +    uint32_t pending;
> +
> +    assert(is_mask_bit_support(control));
> +
> +    pending = pci_get_long(dev->config + msi_pending_reg(dev, is64bit));
> +    msi_set_bit(&pending, vector);
> +    pci_set_long(dev->config + msi_pending_reg(dev, is64bit), pending);
> +    MSI_DEV_PRINTF(dev, "pending vector 0x%"PRIx8"\n", vector);
> +}
> +
> +void msi_notify(PCIDevice *dev, uint8_t vector)
> +{
> +    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
> +    bool is64bit = is_64bit_address(control);
> +    uint8_t nr_vector = msi_nr_vector(control);
> +    uint64_t address;
> +    uint32_t data;
> +
> +    assert(vector < nr_vector);
> +    if (msi_is_masked(dev, vector)) {
> +        msi_set_pending(dev, vector);
> +        return;
> +    }
> +
> +    if (is64bit){
> +        address = pci_get_quad(dev->config + msi_lower_address_reg(dev));
> +    } else {
> +        address = pci_get_long(dev->config + msi_lower_address_reg(dev));
> +    }
> +
> +    /* upper bit 31:16 is zero */
> +    data = pci_get_word(dev->config + msi_data_reg(dev, is64bit));
> +    if (nr_vector > 1) {
> +        data &= ~(nr_vector - 1);
> +        data |= vector;
> +    }
> +
> +    MSI_DEV_PRINTF(dev,
> +                   "notify vector 0x%"PRIx8
> +                   " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
> +                   vector, address, data);
> +    stl_phys(address, data);
> +}
> +
> +/* call this function after updating configs by pci_default_write_config() */
> +void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len)
> +{
> +    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
> +    bool is64bit = is_64bit_address(control);
> +    uint8_t nr_vector = msi_nr_vector(control);
> +
> +#ifdef MSI_DEBUG
> +    uint8_t cap_size = msi_cap_sizeof(control & (PCI_MSI_FLAGS_MASKBIT |
> +                                                 PCI_MSI_FLAGS_64BIT));
> +    if (ranges_overlap(addr, len, dev->msi_cap, cap_size)) {
> +        MSI_DEV_PRINTF(dev, "addr 0x%"PRIx32" val 0x%"PRIx32" len %d\n",
> +                       addr, val, len);
> +        MSI_DEV_PRINTF(dev, "ctrl: 0x%"PRIx16" address: 0x%"PRIx32,
> +                       control,
> +                       pci_get_long(dev->config + msi_lower_address_reg(dev)));
> +        if (is64bit) {
> +            fprintf(stderr, " addrss-hi: 0x%"PRIx32,
> +                    pci_get_long(dev->config + msi_upper_address_reg(dev)));
> +        }
> +        fprintf(stderr, " data: 0x%"PRIx16,
> +                pci_get_word(dev->config + msi_data_reg(dev, is64bit)));
> +        if (is_mask_bit_support(control)) {
> +            fprintf(stderr, " mask 0x%"PRIx32" pending 0x%"PRIx32,
> +                    pci_get_long(dev->config + msi_mask_reg(dev, is64bit)),
> +                    pci_get_long(dev->config + msi_pending_reg(dev, is64bit)));
> +        }
> +        fprintf(stderr, "\n");
> +    }
> +#endif
> +
> +    if (!msi_control_enabled(control)) {

Should not we clear pending bits here?

> +        return;
> +    }

We must also clear INTx# interrupts when msi is enabled.

> +
> +    if (!is_mask_bit_support(control)) {
> +        /* if per vector masking isn't support,
> +           there is no pending interrupt. */
> +        return;
> +    }
> +

I think we also need to clear pending bits if # of vectors
is reduced.

We need to also decide what to do if software tries to
enable more vectors than supported. I suggest setting
allocated vectors to nr_vectors in this case.


> +    if (range_covers_byte(addr, len, msi_control_reg(dev)) || /* MSI enable */
> +        ranges_overlap(addr, len, msi_mask_reg(dev, is64bit), 4)) {

I think it is better to reduce nesting.
Reverse the condition and return if this is not an msi write.

> +        uint32_t pending =
> +            pci_get_long(dev->config + msi_pending_reg(dev, is64bit));
> +        uint8_t vector;
> +
> +        /* deliver pending interrupts which are unmasked */
> +        for (vector = 0; vector < nr_vector; ++vector) {
> +            if (msi_is_masked(dev, vector) || !msi_test_bit(pending, vector)) {

I am confused. This is called after mask is updated, right?
So msi_is_masked means vector was masked, not unmasked?
I think the logic is reversed here.

> +                continue;
> +            }
> +
> +            msi_clear_bit(&pending, vector);
> +            pci_set_long(dev->config + msi_pending_reg(dev, is64bit), pending);
> +            msi_notify(dev, vector);
> +        }
> +    }
> +}
> +
> +uint8_t msi_nr_allocated_vector(const PCIDevice *dev)

rename msi_nr_vectors_allocated to match spec.

> +{
> +    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
> +    return msi_nr_vector(control);
> +}
> diff --git a/hw/msi.h b/hw/msi.h
> new file mode 100644
> index 0000000..1131c6e
> --- /dev/null
> +++ b/hw/msi.h
> @@ -0,0 +1,41 @@
> +/*
> + * msi.h
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef QEMU_MSI_H
> +#define QEMU_MSI_H
> +
> +#include "qemu-common.h"
> +#include "pci.h"
> +
> +bool msi_enabled(const PCIDevice *dev);
> +int msi_init(struct PCIDevice *dev, uint8_t offset,
> +             uint8_t nr_vector, uint16_t flags);

So users need to encode flags? Where do they get them?
And to encode, we need registers internal to msi.c or duplicate ...
Would not it be better to pass msi_64_bit and msi_per_vector_mask?


> +void msi_uninit(struct PCIDevice *dev);
> +void msi_reset(PCIDevice *dev);
> +void msi_notify(PCIDevice *dev, uint8_t vector);
> +void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len);
> +uint8_t msi_nr_allocated_vector(const PCIDevice *dev);
> +
> +static inline bool msi_present(const PCIDevice *dev)
> +{
> +    return dev->cap_present & QEMU_PCI_CAP_MSI;
> +}
> +
> +#endif /* QEMU_MSI_H */
> diff --git a/hw/pci.h b/hw/pci.h
> index 2b4c318..9387a03 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -115,6 +115,8 @@ enum {
>      /* multifunction capable device */
>  #define QEMU_PCI_CAP_MULTIFUNCTION_BITNR        2
>      QEMU_PCI_CAP_MULTIFUNCTION = (1 << QEMU_PCI_CAP_MULTIFUNCTION_BITNR),
> +
> +    QEMU_PCI_CAP_MSI = 0x4,

These are internal bits so we do not need to preserve any values.
Put this cap right before or right after MSIX please.

>  };
>  
>  struct PCIDevice {
> @@ -168,6 +170,9 @@ struct PCIDevice {
>      /* Version id needed for VMState */
>      int32_t version_id;
>  
> +    /* Offset of MSI capability in config space */
> +    uint8_t msi_cap;
> +
>      /* Location of option rom */
>      char *romfile;
>      ram_addr_t rom_offset;
> -- 
> 1.7.1.1
Isaku Yamahata - Sept. 8, 2010, 7:43 a.m.
Thank you for through review.

On Mon, Sep 06, 2010 at 12:44:16PM +0300, Michael S. Tsirkin wrote:
> > +        uint32_t pending =
> > +            pci_get_long(dev->config + msi_pending_reg(dev, is64bit));
> > +        uint8_t vector;
> > +
> > +        /* deliver pending interrupts which are unmasked */
> > +        for (vector = 0; vector < nr_vector; ++vector) {
> > +            if (msi_is_masked(dev, vector) || !msi_test_bit(pending, vector)) {
> 
> I am confused. This is called after mask is updated, right?
> So msi_is_masked means vector was masked, not unmasked?
> I think the logic is reversed here.

I suppose you were missing the following continue.


> > +                continue;
                     ^^^^^^^^^ Here

> > +            }
> > +
> > +            msi_clear_bit(&pending, vector);
> > +            pci_set_long(dev->config + msi_pending_reg(dev, is64bit), pending);
> > +            msi_notify(dev, vector);
> > +        }
> > +    }
> > +}
> > +
> > +uint8_t msi_nr_allocated_vector(const PCIDevice *dev)
Michael S. Tsirkin - Sept. 8, 2010, 7:51 a.m.
On Wed, Sep 08, 2010 at 04:43:27PM +0900, Isaku Yamahata wrote:
> Thank you for through review.
> 
> On Mon, Sep 06, 2010 at 12:44:16PM +0300, Michael S. Tsirkin wrote:
> > > +        uint32_t pending =
> > > +            pci_get_long(dev->config + msi_pending_reg(dev, is64bit));
> > > +        uint8_t vector;
> > > +
> > > +        /* deliver pending interrupts which are unmasked */
> > > +        for (vector = 0; vector < nr_vector; ++vector) {
> > > +            if (msi_is_masked(dev, vector) || !msi_test_bit(pending, vector)) {
> > 
> > I am confused. This is called after mask is updated, right?
> > So msi_is_masked means vector was masked, not unmasked?
> > I think the logic is reversed here.
> 
> I suppose you were missing the following continue.
> 
> 
> > > +                continue;
>                      ^^^^^^^^^ Here

I see. You are right. The block is small enough to maybe make it worthwhile
to revert the logic and avoid continue. It's up to you though,
there's no bug here.

> > > +            }
> > > +
> > > +            msi_clear_bit(&pending, vector);
> > > +            pci_set_long(dev->config + msi_pending_reg(dev, is64bit), pending);
> > > +            msi_notify(dev, vector);
> > > +        }
> > > +    }
> > > +}
> > > +
> > > +uint8_t msi_nr_allocated_vector(const PCIDevice *dev)
> 
> -- 
> yamahata

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 594894b..5f5a4c5 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -186,7 +186,7 @@  hw-obj-$(CONFIG_PIIX4) += piix4.o
 # PCI watchdog devices
 hw-obj-y += wdt_i6300esb.o
 
-hw-obj-y += msix.o
+hw-obj-y += msix.o msi.o
 
 # PCI network cards
 hw-obj-y += ne2000.o
diff --git a/hw/msi.c b/hw/msi.c
new file mode 100644
index 0000000..dbe89ee
--- /dev/null
+++ b/hw/msi.c
@@ -0,0 +1,362 @@ 
+/*
+ * msi.c
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "msi.h"
+
+#define PCI_MSI_PENDING_32      0x10
+#define PCI_MSI_PENDING_64      0x14
+
+/* PCI_MSI_FLAGS */
+#define PCI_MSI_FLAGS_QSIZE_SHIFT       4
+#define PCI_MSI_FLAGS_QMASK_SHIFT       1
+
+/* PCI_MSI_ADDRESS_LO */
+#define PCI_MSI_ADDRESS_LO_RESERVED_MASK        0x3
+
+#define PCI_MSI_32_SIZEOF       0x0a
+#define PCI_MSI_64_SIZEOF       0x0e
+#define PCI_MSI_32M_SIZEOF      0x14
+#define PCI_MSI_64M_SIZEOF      0x18
+
+#define MSI_DEBUG
+#ifdef MSI_DEBUG
+# define MSI_DPRINTF(fmt, ...)                                          \
+    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
+#else
+# define MSI_DPRINTF(fmt, ...)  do { } while (0)
+#endif
+#define MSI_DEV_PRINTF(dev, fmt, ...)                                   \
+    MSI_DPRINTF("%s:%x " fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
+
+static inline bool msi_test_bit(uint32_t bitmap, uint8_t bit)
+{
+    return bitmap & (1 << bit);
+}
+
+static inline void msi_set_bit(uint32_t *bitmap, uint8_t bit)
+{
+    *bitmap |= (1 << bit);
+}
+
+static inline void msi_clear_bit(uint32_t *bitmap, uint8_t bit)
+{
+    *bitmap &= ~(1 << bit);
+}
+
+
+/* multiple vector only suport power of 2 and up to 32 */
+static inline uint8_t ilog2(uint8_t nr_vector)
+{
+    switch (nr_vector) {
+    case 1:
+        return 0;
+    case 2:
+        return 1;
+    case 4:
+        return 2;
+    case 8:
+        return 3;
+    case 16:
+        return 4;
+    case 32:
+        return 5;
+    default:
+        abort();
+        break;
+    }
+    return 0;
+}
+
+static inline bool is_mask_bit_support(uint16_t control)
+{
+    return control & PCI_MSI_FLAGS_MASKBIT;
+}
+
+static inline bool is_64bit_address(uint16_t control)
+{
+    return control & PCI_MSI_FLAGS_64BIT;
+}
+
+static inline uint8_t msi_vector_order(uint16_t control)
+{
+    return (control & PCI_MSI_FLAGS_QSIZE) >> PCI_MSI_FLAGS_QSIZE_SHIFT;
+}
+
+static inline uint8_t msi_nr_vector(uint16_t control)
+{
+    return 1 << msi_vector_order(control);
+}
+
+static inline bool msi_control_enabled(uint16_t control)
+{
+    return control & PCI_MSI_FLAGS_ENABLE;
+}
+
+static inline uint8_t msi_cap_sizeof(uint16_t control)
+{
+    switch (control & (PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT)) {
+    case PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT:
+        return PCI_MSI_64M_SIZEOF;
+    case PCI_MSI_FLAGS_64BIT:
+        return PCI_MSI_64_SIZEOF;
+    case PCI_MSI_FLAGS_MASKBIT:
+        return PCI_MSI_32M_SIZEOF;
+    case 0:
+        return PCI_MSI_32_SIZEOF;
+    default:
+        abort();
+        break;
+    }
+    return 0;
+}
+
+static inline uint16_t msi_control_reg(const PCIDevice* dev)
+{
+    return dev->msi_cap + PCI_MSI_FLAGS;
+}
+
+static inline uint32_t msi_lower_address_reg(const PCIDevice* dev)
+{
+    return dev->msi_cap + PCI_MSI_ADDRESS_LO;
+}
+
+static inline uint32_t msi_upper_address_reg(const PCIDevice* dev)
+{
+    return dev->msi_cap + PCI_MSI_ADDRESS_HI;
+}
+
+static inline uint16_t msi_data_reg(const PCIDevice* dev, bool is64bit)
+{
+    return dev->msi_cap + (is64bit ?  PCI_MSI_DATA_64 : PCI_MSI_DATA_32);
+}
+
+static inline uint32_t msi_mask_reg(const PCIDevice* dev, bool is64bit)
+{
+    return dev->msi_cap + (is64bit ? PCI_MSI_MASK_64 : PCI_MSI_MASK_32);
+}
+
+static inline uint32_t msi_pending_reg(const PCIDevice* dev, bool is64bit)
+{
+    return dev->msi_cap + (is64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
+}
+
+bool msi_enabled(const PCIDevice *dev)
+{
+    return msi_present(dev) &&
+        msi_control_enabled(pci_get_word(dev->config + msi_control_reg(dev)));
+}
+
+int msi_init(struct PCIDevice *dev, uint8_t offset,
+             uint8_t nr_vector, uint16_t flags)
+{
+    uint8_t vector_order = ilog2(nr_vector);
+    bool is64bit = is_64bit_address(flags);
+    uint8_t cap_size;
+    int config_offset;
+    MSI_DEV_PRINTF(dev,
+                   "init offset: 0x%"PRIx8" vector: %"PRId8
+                   " flags 0x%"PRIx16"\n", offset, nr_vector, flags);
+
+    flags &= PCI_MSI_FLAGS_MASKBIT | PCI_MSI_FLAGS_64BIT;
+    flags |= (vector_order << PCI_MSI_FLAGS_QMASK_SHIFT) & PCI_MSI_FLAGS_QMASK;
+
+    cap_size = msi_cap_sizeof(flags);
+    config_offset = pci_add_capability(dev, PCI_CAP_ID_MSI, offset, cap_size);
+    if (config_offset < 0) {
+        return config_offset;
+    }
+
+    dev->msi_cap = config_offset;
+    dev->cap_present |= QEMU_PCI_CAP_MSI;
+
+    pci_set_word(dev->config + msi_control_reg(dev), flags);
+    pci_set_word(dev->wmask + msi_control_reg(dev),
+                 PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
+    pci_set_long(dev->wmask + msi_lower_address_reg(dev),
+                 ~PCI_MSI_ADDRESS_LO_RESERVED_MASK);
+    if (is64bit) {
+        pci_set_long(dev->wmask + msi_upper_address_reg(dev), 0xffffffff);
+    }
+    pci_set_word(dev->wmask + msi_data_reg(dev, is64bit), 0xffff);
+
+    if (is_mask_bit_support(flags)) {
+        pci_set_long(dev->wmask + msi_mask_reg(dev, is64bit),
+                     (1 << nr_vector) - 1);
+    }
+    return config_offset;
+}
+
+void msi_uninit(struct PCIDevice *dev)
+{
+    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
+    uint8_t cap_size = msi_cap_sizeof(control);
+    pci_del_capability(dev, PCI_CAP_ID_MSIX, cap_size);
+    MSI_DEV_PRINTF(dev, "uninit\n");
+}
+
+void msi_reset(PCIDevice *dev)
+{
+    uint16_t control;
+    bool is64bit;
+
+    control = pci_get_word(dev->config + msi_control_reg(dev));
+    control &= ~(PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
+    is64bit = is_64bit_address(control);
+
+    pci_set_word(dev->config + msi_control_reg(dev), control);
+    pci_set_long(dev->config + msi_lower_address_reg(dev), 0);
+    if (is64bit) {
+        pci_set_long(dev->config + msi_upper_address_reg(dev), 0);
+    }
+    pci_set_word(dev->config + msi_data_reg(dev, is64bit), 0);
+    if (is_mask_bit_support(control)) {
+        pci_set_long(dev->config + msi_mask_reg(dev, is64bit), 0);
+        pci_set_long(dev->config + msi_pending_reg(dev, is64bit), 0);
+    }
+    MSI_DEV_PRINTF(dev, "reset\n");
+}
+
+static bool msi_is_masked(const PCIDevice *dev, uint8_t vector)
+{
+    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
+    uint32_t mask;
+
+    if (!is_mask_bit_support(control)) {
+        return false;
+    }
+
+    mask = pci_get_long(dev->config +
+                        msi_mask_reg(dev, is_64bit_address(control)));
+    return msi_test_bit(mask, vector);
+}
+
+static void msi_set_pending(PCIDevice *dev, uint8_t vector)
+{
+    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
+    bool is64bit = is_64bit_address(control);
+    uint32_t pending;
+
+    assert(is_mask_bit_support(control));
+
+    pending = pci_get_long(dev->config + msi_pending_reg(dev, is64bit));
+    msi_set_bit(&pending, vector);
+    pci_set_long(dev->config + msi_pending_reg(dev, is64bit), pending);
+    MSI_DEV_PRINTF(dev, "pending vector 0x%"PRIx8"\n", vector);
+}
+
+void msi_notify(PCIDevice *dev, uint8_t vector)
+{
+    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
+    bool is64bit = is_64bit_address(control);
+    uint8_t nr_vector = msi_nr_vector(control);
+    uint64_t address;
+    uint32_t data;
+
+    assert(vector < nr_vector);
+    if (msi_is_masked(dev, vector)) {
+        msi_set_pending(dev, vector);
+        return;
+    }
+
+    if (is64bit){
+        address = pci_get_quad(dev->config + msi_lower_address_reg(dev));
+    } else {
+        address = pci_get_long(dev->config + msi_lower_address_reg(dev));
+    }
+
+    /* upper bit 31:16 is zero */
+    data = pci_get_word(dev->config + msi_data_reg(dev, is64bit));
+    if (nr_vector > 1) {
+        data &= ~(nr_vector - 1);
+        data |= vector;
+    }
+
+    MSI_DEV_PRINTF(dev,
+                   "notify vector 0x%"PRIx8
+                   " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
+                   vector, address, data);
+    stl_phys(address, data);
+}
+
+/* call this function after updating configs by pci_default_write_config() */
+void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len)
+{
+    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
+    bool is64bit = is_64bit_address(control);
+    uint8_t nr_vector = msi_nr_vector(control);
+
+#ifdef MSI_DEBUG
+    uint8_t cap_size = msi_cap_sizeof(control & (PCI_MSI_FLAGS_MASKBIT |
+                                                 PCI_MSI_FLAGS_64BIT));
+    if (ranges_overlap(addr, len, dev->msi_cap, cap_size)) {
+        MSI_DEV_PRINTF(dev, "addr 0x%"PRIx32" val 0x%"PRIx32" len %d\n",
+                       addr, val, len);
+        MSI_DEV_PRINTF(dev, "ctrl: 0x%"PRIx16" address: 0x%"PRIx32,
+                       control,
+                       pci_get_long(dev->config + msi_lower_address_reg(dev)));
+        if (is64bit) {
+            fprintf(stderr, " addrss-hi: 0x%"PRIx32,
+                    pci_get_long(dev->config + msi_upper_address_reg(dev)));
+        }
+        fprintf(stderr, " data: 0x%"PRIx16,
+                pci_get_word(dev->config + msi_data_reg(dev, is64bit)));
+        if (is_mask_bit_support(control)) {
+            fprintf(stderr, " mask 0x%"PRIx32" pending 0x%"PRIx32,
+                    pci_get_long(dev->config + msi_mask_reg(dev, is64bit)),
+                    pci_get_long(dev->config + msi_pending_reg(dev, is64bit)));
+        }
+        fprintf(stderr, "\n");
+    }
+#endif
+
+    if (!msi_control_enabled(control)) {
+        return;
+    }
+
+    if (!is_mask_bit_support(control)) {
+        /* if per vector masking isn't support,
+           there is no pending interrupt. */
+        return;
+    }
+
+    if (range_covers_byte(addr, len, msi_control_reg(dev)) || /* MSI enable */
+        ranges_overlap(addr, len, msi_mask_reg(dev, is64bit), 4)) {
+        uint32_t pending =
+            pci_get_long(dev->config + msi_pending_reg(dev, is64bit));
+        uint8_t vector;
+
+        /* deliver pending interrupts which are unmasked */
+        for (vector = 0; vector < nr_vector; ++vector) {
+            if (msi_is_masked(dev, vector) || !msi_test_bit(pending, vector)) {
+                continue;
+            }
+
+            msi_clear_bit(&pending, vector);
+            pci_set_long(dev->config + msi_pending_reg(dev, is64bit), pending);
+            msi_notify(dev, vector);
+        }
+    }
+}
+
+uint8_t msi_nr_allocated_vector(const PCIDevice *dev)
+{
+    uint16_t control = pci_get_word(dev->config + msi_control_reg(dev));
+    return msi_nr_vector(control);
+}
diff --git a/hw/msi.h b/hw/msi.h
new file mode 100644
index 0000000..1131c6e
--- /dev/null
+++ b/hw/msi.h
@@ -0,0 +1,41 @@ 
+/*
+ * msi.h
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_MSI_H
+#define QEMU_MSI_H
+
+#include "qemu-common.h"
+#include "pci.h"
+
+bool msi_enabled(const PCIDevice *dev);
+int msi_init(struct PCIDevice *dev, uint8_t offset,
+             uint8_t nr_vector, uint16_t flags);
+void msi_uninit(struct PCIDevice *dev);
+void msi_reset(PCIDevice *dev);
+void msi_notify(PCIDevice *dev, uint8_t vector);
+void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len);
+uint8_t msi_nr_allocated_vector(const PCIDevice *dev);
+
+static inline bool msi_present(const PCIDevice *dev)
+{
+    return dev->cap_present & QEMU_PCI_CAP_MSI;
+}
+
+#endif /* QEMU_MSI_H */
diff --git a/hw/pci.h b/hw/pci.h
index 2b4c318..9387a03 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -115,6 +115,8 @@  enum {
     /* multifunction capable device */
 #define QEMU_PCI_CAP_MULTIFUNCTION_BITNR        2
     QEMU_PCI_CAP_MULTIFUNCTION = (1 << QEMU_PCI_CAP_MULTIFUNCTION_BITNR),
+
+    QEMU_PCI_CAP_MSI = 0x4,
 };
 
 struct PCIDevice {
@@ -168,6 +170,9 @@  struct PCIDevice {
     /* Version id needed for VMState */
     int32_t version_id;
 
+    /* Offset of MSI capability in config space */
+    uint8_t msi_cap;
+
     /* Location of option rom */
     char *romfile;
     ram_addr_t rom_offset;