diff mbox

[v6,5/6] Inter-VM shared memory PCI device

Message ID 1275687942-12312-6-git-send-email-cam@cs.ualberta.ca
State New
Headers show

Commit Message

Cam Macdonell June 4, 2010, 9:45 p.m. UTC
Support an inter-vm shared memory device that maps a shared-memory object as a
PCI device in the guest.  This patch also supports interrupts between guest by
communicating over a unix domain socket.  This patch applies to the qemu-kvm
repository.

    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]

Interrupts are supported between multiple VMs by using a shared memory server
by using a chardev socket.

    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
           [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
    -chardev socket,path=<path>,id=<id>

(shared memory server is qemu.git/contrib/ivshmem-server)

Sample programs and init scripts are in a git repo here:

    www.gitorious.org/nahanni
---
 Makefile.target |    3 +
 hw/ivshmem.c    |  852 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-char.c     |    6 +
 qemu-char.h     |    3 +
 qemu-doc.texi   |   43 +++
 5 files changed, 907 insertions(+), 0 deletions(-)
 create mode 100644 hw/ivshmem.c

Comments

Blue Swirl June 5, 2010, 9:44 a.m. UTC | #1
On Fri, Jun 4, 2010 at 9:45 PM, Cam Macdonell <cam@cs.ualberta.ca> wrote:
> Support an inter-vm shared memory device that maps a shared-memory object as a
> PCI device in the guest.  This patch also supports interrupts between guest by
> communicating over a unix domain socket.  This patch applies to the qemu-kvm
> repository.
>
>    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>
> Interrupts are supported between multiple VMs by using a shared memory server
> by using a chardev socket.
>
>    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>           [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
>    -chardev socket,path=<path>,id=<id>
>
> (shared memory server is qemu.git/contrib/ivshmem-server)
>
> Sample programs and init scripts are in a git repo here:
>
>    www.gitorious.org/nahanni
> ---
>  Makefile.target |    3 +
>  hw/ivshmem.c    |  852 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  qemu-char.c     |    6 +
>  qemu-char.h     |    3 +
>  qemu-doc.texi   |   43 +++
>  5 files changed, 907 insertions(+), 0 deletions(-)
>  create mode 100644 hw/ivshmem.c
>
> diff --git a/Makefile.target b/Makefile.target
> index c4ba592..4888308 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -202,6 +202,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
>  obj-y += rtl8139.o
>  obj-y += e1000.o
>
> +# Inter-VM PCI shared memory
> +obj-y += ivshmem.o
> +

Can this be compiled once, simply by moving this to Makefile.objs
instead of Makefile.target? Also, because the code seems to be KVM
specific, it can't be compiled unconditionally but depending on at
least CONFIG_KVM and maybe CONFIG_EVENTFD.

Why is this KVM specific BTW, Posix SHM is available on many
platforms? What would happen if kvm_set_foobar functions were not
called when KVM is not being used? Is host eventfd support essential?

>  # Hardware support
>  obj-i386-y += vga.o
>  obj-i386-y += mc146818rtc.o i8259.o pc.o
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> new file mode 100644
> index 0000000..9057612
> --- /dev/null
> +++ b/hw/ivshmem.c
> @@ -0,0 +1,852 @@
> +/*
> + * Inter-VM Shared Memory PCI device.
> + *
> + * Author:
> + *      Cam Macdonell <cam@cs.ualberta.ca>
> + *
> + * Based On: cirrus_vga.c
> + *          Copyright (c) 2004 Fabrice Bellard
> + *          Copyright (c) 2004 Makoto Suzuki (suzu)
> + *
> + *      and rtl8139.c
> + *          Copyright (c) 2006 Igor Kovalenko
> + *
> + * This code is licensed under the GNU GPL v2.
> + */
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/io.h>
> +#include <sys/ioctl.h>
> +#include "hw.h"
> +#include "console.h"
> +#include "pc.h"
> +#include "pci.h"
> +#include "sysemu.h"
> +
> +#include "msix.h"
> +#include "qemu-kvm.h"
> +#include "libkvm.h"
> +
> +#include <sys/eventfd.h>
> +#include <sys/mman.h>
> +#include <sys/socket.h>
> +#include <sys/ioctl.h>
> +
> +#define IVSHMEM_IRQFD   0
> +#define IVSHMEM_MSI     1
> +
> +//#define DEBUG_IVSHMEM
> +#ifdef DEBUG_IVSHMEM
> +#define IVSHMEM_DPRINTF(fmt, args...)        \
> +    do {printf("IVSHMEM: " fmt, ##args); } while (0)

Please use __VA_ARGS__.

> +#else
> +#define IVSHMEM_DPRINTF(fmt, args...)
> +#endif
> +
> +typedef struct Peer {
> +    int nb_eventfds;
> +    int *eventfds;
> +} Peer;
> +
> +typedef struct EventfdEntry {
> +    PCIDevice *pdev;
> +    int vector;
> +} EventfdEntry;
> +
> +typedef struct IVShmemState {
> +    PCIDevice dev;
> +    uint32_t intrmask;
> +    uint32_t intrstatus;
> +    uint32_t doorbell;
> +
> +    CharDriverState ** eventfd_chr;

I'd remove the space between '**' and 'eventfd_chr', it's used inconsistently.

> +    CharDriverState * server_chr;
> +    int ivshmem_mmio_io_addr;
> +
> +    pcibus_t mmio_addr;
> +    pcibus_t shm_pci_addr;
> +    uint64_t ivshmem_offset;
> +    uint64_t ivshmem_size; /* size of shared memory region */
> +    int shm_fd; /* shared memory file descriptor */
> +
> +    Peer *peers;
> +    int nb_peers; /* how many guests we have space for */
> +    int max_peer; /* maximum numbered peer */
> +
> +    int vm_id;
> +    uint32_t vectors;
> +    uint32_t features;
> +    EventfdEntry *eventfd_table;
> +
> +    char * shmobj;
> +    char * sizearg;
> +    char * role;
> +} IVShmemState;
> +
> +/* registers for the Inter-VM shared memory device */
> +enum ivshmem_registers {
> +    IntrMask = 0,
> +    IntrStatus = 4,
> +    IVPosition = 8,
> +    Doorbell = 12,
> +};

IIRC these should be uppercase.

> +
> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
> +    return (ivs->features & (1 << feature));
> +}

Since this is the first version, do we need any features at this
point, can't we expect that all features are available now? Why does
the user need to specify the features?

To avoid a negative shift, I'd make 'feature' unsigned.

> +
> +static inline bool is_power_of_two(uint64_t x) {
> +    return (x & (x - 1)) == 0;
> +}
> +
> +static void ivshmem_map(PCIDevice *pci_dev, int region_num,
> +                    pcibus_t addr, pcibus_t size, int type)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> +    s->shm_pci_addr = addr;
> +
> +    if (s->ivshmem_offset > 0) {
> +        cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
> +                                                            s->ivshmem_offset);
> +        if (s->role && strncmp(s->role, "peer", 4) == 0) {
> +            IVSHMEM_DPRINTF("marking pages no migrate\n");
> +            cpu_mark_pages_no_migrate(s->ivshmem_offset, s->ivshmem_size);
> +        }
> +    }
> +
> +    IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = %u\n",
> +                (uint32_t)addr, (uint32_t)s->ivshmem_offset, (uint32_t)size);

Please use FMT_PCIBUS for addr and size and PRIu64 for s->ivshmem_offset.

> +
> +}
> +
> +/* accessing registers - based on rtl8139 */
> +static void ivshmem_update_irq(IVShmemState *s, int val)
> +{
> +    int isr;
> +    isr = (s->intrstatus & s->intrmask) & 0xffffffff;
> +
> +    /* don't print ISR resets */
> +    if (isr) {
> +        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
> +           isr ? 1 : 0, s->intrstatus, s->intrmask);
> +    }
> +
> +    qemu_set_irq(s->dev.irq[0], (isr != 0));
> +}
> +
> +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
> +{
> +    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
> +
> +    s->intrmask = val;
> +
> +    ivshmem_update_irq(s, val);
> +}
> +
> +static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
> +{
> +    uint32_t ret = s->intrmask;
> +
> +    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
> +
> +    return ret;
> +}
> +
> +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
> +{
> +    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
> +
> +    s->intrstatus = val;
> +
> +    ivshmem_update_irq(s, val);
> +    return;
> +}
> +
> +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
> +{
> +    uint32_t ret = s->intrstatus;
> +
> +    /* reading ISR clears all interrupts */
> +    s->intrstatus = 0;
> +
> +    ivshmem_update_irq(s, 0);
> +
> +    return ret;
> +}
> +
> +static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
> +{
> +
> +    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
> +}
> +
> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
> +{
> +    IVShmemState *s = opaque;
> +
> +    u_int64_t write_one = 1;

Please use uintNN_t instead of u_intNN_t.

> +    u_int16_t dest = val >> 16;
> +    u_int16_t vector = val & 0xff;
> +
> +    addr &= 0xfc;

I'd add a debug printf here, likewise for exit of ivshmem_io_readl().
When you do the merge (see below), the correct printf format for the
addresses will be TARGET_FMT_plx.

> +
> +    switch (addr)
> +    {
> +        case IntrMask:
> +            ivshmem_IntrMask_write(s, val);
> +            break;
> +
> +        case IntrStatus:
> +            ivshmem_IntrStatus_write(s, val);
> +            break;
> +
> +        case Doorbell:
> +            /* check that dest VM ID is reasonable */
> +            if ((dest < 0) || (dest > s->max_peer)) {
> +                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
> +                break;
> +            }
> +
> +            /* check doorbell range */
> +            if ((vector >= 0) && (vector < s->peers[dest].nb_eventfds)) {
> +                IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n",
> +                                                    write_one, dest, vector);

PRId64 for write_one, %ld is not enough on ILP32.

> +                if (write(s->peers[dest].eventfds[vector],
> +                                                    &(write_one), 8) != 8) {
> +                    IVSHMEM_DPRINTF("error writing to eventfd\n");
> +                }
> +            }
> +            break;
> +        default:
> +            IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
> +    }
> +}
> +
> +static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
> +{
> +    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
> +}
> +
> +static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
> +{
> +
> +    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
> +    return 0;
> +}
> +
> +static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
> +{
> +
> +    IVShmemState *s = opaque;
> +    uint32_t ret;
> +
> +    switch (addr)
> +    {
> +        case IntrMask:
> +            ret = ivshmem_IntrMask_read(s);
> +            break;
> +
> +        case IntrStatus:
> +            ret = ivshmem_IntrStatus_read(s);
> +            break;
> +
> +        case IVPosition:
> +            /* return my VM ID if the memory is mapped */
> +            if (s->shm_fd > 0) {
> +                ret = s->vm_id;
> +            } else {
> +                ret = -1;
> +            }
> +            break;
> +
> +        default:
> +            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
> +            ret = 0;
> +    }
> +
> +    return ret;
> +}
> +
> +static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
> +{
> +    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
> +
> +    return 0;
> +}
> +
> +static void ivshmem_mmio_writeb(void *opaque,
> +                                target_phys_addr_t addr, uint32_t val)
> +{
> +    ivshmem_io_writeb(opaque, addr & 0xFF, val);
> +}

This function and others below only performs a cast and useless
masking (the address passed is these days an offset from start of the
area). Please merge these to ivshmem_io_readl() etc.

> +
> +static void ivshmem_mmio_writew(void *opaque,
> +                                target_phys_addr_t addr, uint32_t val)
> +{
> +    ivshmem_io_writew(opaque, addr & 0xFF, val);
> +}
> +
> +static void ivshmem_mmio_writel(void *opaque,
> +                                target_phys_addr_t addr, uint32_t val)
> +{
> +    ivshmem_io_writel(opaque, addr & 0xFF, val);
> +}
> +
> +static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    return ivshmem_io_readb(opaque, addr & 0xFF);
> +}
> +
> +static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    uint32_t val = ivshmem_io_readw(opaque, addr & 0xFF);
> +    return val;
> +}
> +
> +static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    uint32_t val = ivshmem_io_readl(opaque, addr & 0xFF);
> +    return val;
> +}
> +
> +static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {

Please add 'const'.

> +    ivshmem_mmio_readb,
> +    ivshmem_mmio_readw,
> +    ivshmem_mmio_readl,
> +};
> +
> +static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
> +    ivshmem_mmio_writeb,
> +    ivshmem_mmio_writew,
> +    ivshmem_mmio_writel,
> +};
> +
> +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
> +{
> +    IVShmemState *s = opaque;
> +
> +    ivshmem_IntrStatus_write(s, *buf);
> +
> +    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
> +}
> +
> +static int ivshmem_can_receive(void * opaque)
> +{
> +    return 8;
> +}
> +
> +static void ivshmem_event(void *opaque, int event)
> +{
> +    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
> +}
> +
> +static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
> +
> +    EventfdEntry *entry = opaque;
> +    PCIDevice *pdev = entry->pdev;
> +
> +    IVSHMEM_DPRINTF("fake irqfd on vector %p %d\n", pdev, entry->vector);
> +    msix_notify(pdev, entry->vector);
> +}
> +
> +static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
> +                                                                    int vector)
> +{
> +    /* create a event character device based on the passed eventfd */
> +    IVShmemState *s = opaque;
> +    CharDriverState * chr;
> +
> +    chr = qemu_chr_open_eventfd(eventfd);
> +
> +    if (chr == NULL) {
> +        IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);

This should not be a DPRINTF.

> +        exit(-1);
> +    }
> +
> +    /* if MSI is supported we need multiple interrupts */
> +    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +        s->eventfd_table[vector].pdev = &s->dev;
> +        s->eventfd_table[vector].vector = vector;
> +
> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
> +                      ivshmem_event, &s->eventfd_table[vector]);
> +    } else {
> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
> +                      ivshmem_event, s);
> +    }
> +
> +    return chr;
> +
> +}
> +
> +static int check_shm_size(IVShmemState *s, int fd) {
> +    /* check that the guest isn't going to try and map more memory than the
> +     * the object has allocated return -1 to indicate error */
> +
> +    struct stat buf;
> +
> +    fstat(fd, &buf);
> +
> +    if (s->ivshmem_size > buf.st_size) {
> +        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
> +        fprintf(stderr, " than shared object size (%ld > %ld)\n",
> +                                          s->ivshmem_size, buf.st_size);

Please use PRIx64 for s->ivshmem_size, this will cause a warning on ILP32.

> +        return -1;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +/* create the shared memory BAR when we are not using the server, so we can
> + * create the BAR and map the memory immediately */
> +static void create_shared_memory_BAR(IVShmemState *s, int fd) {
> +
> +    void * ptr;
> +
> +    s->shm_fd = fd;
> +
> +    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
> +
> +    s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, ptr);

qemu_ram_map() does not exist in HEAD.

> +
> +    /* region for shared memory */
> +    pci_register_bar(&s->dev, 2, s->ivshmem_size,
> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
> +}
> +
> +static void close_guest_eventfds(IVShmemState *s, int posn)
> +{
> +    int i, guest_curr_max;
> +
> +    guest_curr_max = s->peers[posn].nb_eventfds;
> +
> +    for (i = 0; i < guest_curr_max; i++)
> +        close(s->peers[posn].eventfds[i]);

CODING_STYLE.

> +
> +    qemu_free(s->peers[posn].eventfds);
> +    s->peers[posn].nb_eventfds = 0;
> +}
> +
> +static void setup_ioeventfds(IVShmemState *s) {
> +
> +    int i, j;
> +
> +    for (i = 0; i <= s->max_peer; i++) {
> +        for (j = 0; j < s->peers[i].nb_eventfds; j++) {
> +            kvm_set_ioeventfd_mmio_long(s->peers[i].eventfds[j],
> +                    s->mmio_addr + Doorbell, (i << 16) | j, 1);
> +        }
> +    }
> +
> +    /* setup irqfd for this VM's eventfds */
> +    for (i = 0; i < s->vectors; i++) {
> +        kvm_set_irqfd(s->dev.msix_irq_entries[i].gsi,
> +                        s->peers[s->vm_id].eventfds[i], 1);

kvm_set_irqfd() does not exist in HEAD.

> +    }
> +}
> +
> +
> +/* this function increase the dynamic storage need to store data about other
> + * guests */
> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
> +
> +    int j, old_nb_alloc;
> +
> +    old_nb_alloc = s->nb_peers;
> +
> +    while (new_min_size >= s->nb_peers)
> +        s->nb_peers = s->nb_peers * 2;
> +
> +    IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nb_peers);
> +    s->peers = qemu_realloc(s->peers, s->nb_peers * sizeof(Peer));
> +
> +    if (s->peers == NULL) {
> +        fprintf(stderr, "Allocation error - exiting\n");
> +        exit(1);
> +    }

qemu_realloc will never return zero.

> +
> +    /* zero out new pointers */
> +    for (j = old_nb_alloc; j < s->nb_peers; j++) {
> +        s->peers[j].eventfds = NULL;
> +        s->peers[j].nb_eventfds = 0;
> +    }
> +}
> +
> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
> +{
> +    IVShmemState *s = opaque;
> +    int incoming_fd, tmp_fd;
> +    int guest_curr_max;
> +    long incoming_posn;
> +
> +    memcpy(&incoming_posn, buf, sizeof(long));
> +    /* pick off s->server_chr->msgfd and store it, posn should accompany msg */
> +    tmp_fd = qemu_chr_get_msgfd(s->server_chr);
> +    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
> +
> +    /* make sure we have enough space for this guest */
> +    if (incoming_posn >= s->nb_peers) {
> +        increase_dynamic_storage(s, incoming_posn);
> +    }
> +
> +    if (tmp_fd == -1) {
> +        /* if posn is positive and unseen before then this is our posn*/
> +        if ((incoming_posn >= 0) && (s->peers[incoming_posn].eventfds == NULL)) {
> +            /* receive our posn */
> +            s->vm_id = incoming_posn;
> +            return;
> +        } else {
> +            /* otherwise an fd == -1 means an existing guest has gone away */
> +            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
> +            close_guest_eventfds(s, incoming_posn);
> +            return;
> +        }
> +    }
> +
> +    /* because of the implementation of get_msgfd, we need a dup */
> +    incoming_fd = dup(tmp_fd);
> +
> +    if (incoming_fd == -1) {
> +        fprintf(stderr, "could not allocate file descriptor %s\n",
> +                                                            strerror(errno));
> +        return;
> +    }
> +
> +    /* if the position is -1, then it's shared memory region fd */
> +    if (incoming_posn == -1) {
> +
> +        void * map_ptr;
> +
> +        s->max_peer = 0;
> +
> +        if (check_shm_size(s, incoming_fd) == -1) {
> +            exit(-1);
> +        }
> +
> +        /* mmap the region and map into the BAR2 */
> +        map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED,
> +                                                                incoming_fd, 0);
> +        s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, map_ptr);
> +
> +        IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = %u\n",
> +                        (uint32_t)s->shm_pci_addr, (uint32_t)s->ivshmem_offset,
> +                        (uint32_t)s->ivshmem_size);
> +
> +        if (s->shm_pci_addr > 0) {
> +            /* map memory into BAR2 */
> +            cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
> +                                                            s->ivshmem_offset);
> +            if (s->role && strncmp(s->role, "peer", 4) == 0) {
> +                IVSHMEM_DPRINTF("marking pages no migrate\n");
> +                cpu_mark_pages_no_migrate(s->ivshmem_offset, s->ivshmem_size);
> +            }
> +
> +        }
> +
> +        /* only store the fd if it is successfully mapped */
> +        s->shm_fd = incoming_fd;
> +
> +        return;
> +    }
> +
> +    /* each guest has an array of eventfds, and we keep track of how many
> +     * guests for each VM */
> +    guest_curr_max = s->peers[incoming_posn].nb_eventfds;
> +    if (guest_curr_max == 0) {
> +        /* one eventfd per MSI vector */
> +        s->peers[incoming_posn].eventfds = (int *) qemu_malloc(s->vectors *
> +                                                                sizeof(int));

Useless cast in C.

> +    }
> +
> +    /* this is an eventfd for a particular guest VM */
> +    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
> +                                                                incoming_fd);
> +    s->peers[incoming_posn].eventfds[guest_curr_max] = incoming_fd;
> +
> +    /* increment count for particular guest */
> +    s->peers[incoming_posn].nb_eventfds++;
> +
> +    /* keep track of the maximum VM ID */
> +    if (incoming_posn > s->max_peer) {
> +        s->max_peer = incoming_posn;
> +    }
> +
> +    if (incoming_posn == s->vm_id) {
> +        int vector = guest_curr_max;

Why add a new variable?

> +        if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +            /* initialize char device for callback
> +             * if this is one of my eventfds */
> +            s->eventfd_chr[vector] = create_eventfd_chr_device(s,
> +                       s->peers[s->vm_id].eventfds[vector], vector);
> +        }
> +    }
> +
> +    return;
> +}
> +
> +static void ivshmem_reset(DeviceState *d)
> +{
> +    return;

Nothing to do?

> +}
> +
> +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
> +                       pcibus_t addr, pcibus_t size, int type)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> +    s->mmio_addr = addr;
> +    cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);

The 0x400 should be #defined earlier. Why 0x400 since you are only
interested in the 0x100 first bytes?

> +
> +    /* ioeventfd and irqfd are enabled together,
> +     * so the flag IRQFD refers to both */
> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +        setup_ioeventfds(s);
> +    }
> +}
> +
> +static uint64_t ivshmem_get_size(IVShmemState * s) {
> +
> +    uint64_t value;
> +    char *ptr;
> +
> +    value = strtoul(s->sizearg, &ptr, 10);

I'd use strtoull() but the whole function should be suppressed, see below.

> +    switch (*ptr) {
> +        case 0: case 'M': case 'm':
> +            value <<= 20;
> +            break;
> +        case 'G': case 'g':
> +            value <<= 30;
> +            break;
> +        default:
> +            fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
> +            exit(1);
> +    }
> +
> +    /* BARs must be a power of 2 */
> +    if (!is_power_of_two(value)) {
> +        fprintf(stderr, "ivshmem: size must be power of 2\n");
> +        exit(1);
> +    }

Isn't the BAR check in pci.c enough?

> +
> +    return value;
> +
> +}
> +
> +static void ivshmem_setup_msi(IVShmemState * s) {
> +
> +    int i;
> +
> +    /* allocate the MSI-X vectors */
> +
> +    if (!msix_init(&s->dev, s->vectors, 1, 0)) {
> +        pci_register_bar(&s->dev, 1,
> +                         msix_bar_size(&s->dev),
> +                         PCI_BASE_ADDRESS_SPACE_MEMORY,
> +                         msix_mmio_map);
> +        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
> +    } else {
> +        IVSHMEM_DPRINTF("msix initialization failed\n");

Is this fatal considering the msix_vector_use() below?

> +    }
> +
> +    /* 'activate' the vectors */
> +    for (i = 0; i < s->vectors; i++) {
> +        msix_vector_use(&s->dev, i);
> +    }
> +
> +    /* if IRQFDs are not supported, we'll have to trigger the interrupts
> +     * via Qemu char devices */
> +    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +        /* for handling interrupts when IRQFD is not available */
> +        s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
> +    }
> +}
> +
> +static void ivshmem_save(QEMUFile* f, void *opaque)
> +{
> +    IVShmemState *proxy = opaque;
> +
> +    IVSHMEM_DPRINTF("ivshmem_save\n");
> +    pci_device_save(&proxy->dev, f);
> +
> +    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
> +        msix_save(&proxy->dev, f);
> +    } else {
> +        qemu_put_be32(f, proxy->intrstatus);
> +        qemu_put_be32(f, proxy->intrmask);
> +    }
> +
> +}

There may be VMState magic to handle conditional structures (or just
make the structures unconditional), so VMState should be used instead.

> +
> +static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
> +{
> +    IVSHMEM_DPRINTF("ivshmem_load\n");
> +
> +    IVShmemState *proxy = opaque;
> +    int ret, i;
> +

Missing version check.

> +    ret = pci_device_load(&proxy->dev, f);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
> +        msix_load(&proxy->dev, f);
> +        for (i = 0; i < proxy->vectors; i++) {
> +            msix_vector_use(&proxy->dev, i);
> +        }
> +    } else {
> +        proxy->intrstatus = qemu_get_be32(f);
> +        proxy->intrmask = qemu_get_be32(f);
> +    }
> +
> +    return 0;
> +}
> +
> +static int pci_ivshmem_init(PCIDevice *dev)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +    uint8_t *pci_conf;
> +
> +    if (s->sizearg == NULL)
> +        s->ivshmem_size = 4 << 20; /* 4 MB default */
> +    else {
> +        s->ivshmem_size = ivshmem_get_size(s);
> +    }
> +
> +    register_savevm("ivshmem", 0, 0, ivshmem_save, ivshmem_load, dev);
> +
> +    /* IRQFD requires MSI */
> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD) &&
> +        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
> +        exit(1);
> +    }
> +
> +    /* check that role is reasonable */
> +    if (s->role && !((strncmp(s->role, "peer", 5) == 0) ||
> +                        (strncmp(s->role, "master", 7) == 0))) {
> +        fprintf(stderr, "ivshmem: 'role' must be 'peer' or 'master'\n");
> +        exit(1);
> +    }

I'd add a scalar flag in IVShmemState for role so that further strcmps
are avoided.

> +
> +    pci_conf = s->dev.config;
> +    pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
> +    pci_conf[0x01] = 0x1a;
> +    pci_conf[0x02] = 0x10;
> +    pci_conf[0x03] = 0x11;

Please add the DID to hw/pci_ids.h and use pci_config_set_xyz() here.

> +    pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
> +    pci_conf[0x0a] = 0x00; /* RAM controller */
> +    pci_conf[0x0b] = 0x05;
> +    pci_conf[0x0e] = 0x00; /* header_type */
> +
> +    pci_conf[PCI_INTERRUPT_PIN] = 1;
> +
> +    s->shm_pci_addr = 0;
> +    s->ivshmem_offset = 0;
> +    s->shm_fd = 0;
> +
> +    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
> +                                    ivshmem_mmio_write, s);
> +    /* region for registers*/
> +    pci_register_bar(&s->dev, 0, 0x400,
> +                           PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
> +
> +    if ((s->server_chr != NULL) &&
> +                        (strncmp(s->server_chr->filename, "unix:", 5) == 0)) {
> +        /* if we get a UNIX socket as the parameter we will talk
> +         * to the ivshmem server to receive the memory region */
> +
> +        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
> +                                                    s->server_chr->filename);
> +
> +        if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +            ivshmem_setup_msi(s);
> +        }
> +
> +        /* we allocate enough space for 16 guests and grow as needed */
> +        s->nb_peers = 16;
> +        s->vm_id = -1;
> +
> +        /* allocate/initialize space for interrupt handling */
> +        s->peers = qemu_mallocz(s->nb_peers * sizeof(Peer));
> +
> +        pci_register_bar(&s->dev, 2, s->ivshmem_size,
> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
> +
> +        s->eventfd_chr = (CharDriverState **) qemu_mallocz(s->vectors *
> +                                                sizeof(CharDriverState *));

Useless cast in C.

> +
> +        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read,
> +                     ivshmem_event, s);
> +    } else {
> +        /* just map the file immediately, we're not using a server */
> +        int fd;
> +
> +        if (s->shmobj == NULL) {
> +            fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
> +        }

I'd rather have separate 'chardev' and 'shm_file' parameters. Then
'info qtree' could return more useful information about the chardev.

> +
> +        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
> +
> +        /* try opening with O_EXCL and if it succeeds zero the memory
> +         * by truncating to 0 */
> +        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
> +                        S_IRWXU|S_IRWXG|S_IRWXO)) > 0) {
> +           /* truncate file to length PCI device's memory */
> +            if (ftruncate(fd, s->ivshmem_size) != 0) {
> +                fprintf(stderr, "kvm_ivshmem: could not truncate shared file\n");

Why 'kvm_ivshmem'?

> +            }
> +
> +        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
> +                        S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
> +            fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
> +            exit(-1);
> +
> +        }
> +
> +        if (check_shm_size(s, fd) == -1) {
> +            exit(-1);
> +        }
> +
> +        create_shared_memory_BAR(s, fd);
> +
> +    }
> +
> +    return 0;
> +}
> +
> +static int pci_ivshmem_uninit(PCIDevice *dev)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +
> +    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
> +
> +    return 0;
> +}
> +
> +static PCIDeviceInfo ivshmem_info = {
> +    .qdev.name  = "ivshmem",
> +    .qdev.size  = sizeof(IVShmemState),
> +    .qdev.reset = ivshmem_reset,
> +    .init       = pci_ivshmem_init,
> +    .exit       = pci_ivshmem_uninit,
> +    .qdev.props = (Property[]) {
> +        DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
> +        DEFINE_PROP_STRING("size", IVShmemState, sizearg),

This should be scalar type, not string.

> +        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
> +        DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, false),
> +        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
> +        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
> +        DEFINE_PROP_STRING("role", IVShmemState, role),
> +        DEFINE_PROP_END_OF_LIST(),
> +    }
> +};
> +
> +static void ivshmem_register_devices(void)
> +{
> +    pci_qdev_register(&ivshmem_info);
> +}
> +
> +device_init(ivshmem_register_devices)
> diff --git a/qemu-char.c b/qemu-char.c
> index ac65a1c..b2e50d0 100644
> --- a/qemu-char.c
> +++ b/qemu-char.c
> @@ -2093,6 +2093,12 @@ static void tcp_chr_read(void *opaque)
>     }
>  }
>
> +CharDriverState *qemu_chr_open_eventfd(int eventfd){
> +
> +    return qemu_chr_open_fd(eventfd, eventfd);
> +
> +}
> +
>  static void tcp_chr_connect(void *opaque)
>  {
>     CharDriverState *chr = opaque;
> diff --git a/qemu-char.h b/qemu-char.h
> index e3a0783..6ea01ba 100644
> --- a/qemu-char.h
> +++ b/qemu-char.h
> @@ -94,6 +94,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
>  void qemu_chr_info(Monitor *mon, QObject **ret_data);
>  CharDriverState *qemu_chr_find(const char *name);
>
> +/* add an eventfd to the qemu devices that are polled */
> +CharDriverState *qemu_chr_open_eventfd(int eventfd);

Maybe this should be removed and just open coded with qemu_chr_open_fd.

> +
>  extern int term_escape_char;
>
>  /* async I/O support */
> diff --git a/qemu-doc.texi b/qemu-doc.texi
> index 6647b7b..24f8748 100644
> --- a/qemu-doc.texi
> +++ b/qemu-doc.texi
> @@ -706,6 +706,49 @@ Using the @option{-net socket} option, it is possible to make VLANs
>  that span several QEMU instances. See @ref{sec_invocation} to have a
>  basic example.
>
> +@section Other Devices
> +
> +@subsection Inter-VM Shared Memory device
> +
> +With KVM enabled on a Linux host, a shared memory device is available.  Guests
> +map a POSIX shared memory region into the guest as a PCI device that enables
> +zero-copy communication to the application level of the guests.  The basic
> +syntax is:
> +
> +@example
> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> +@end example
> +
> +If desired, interrupts can be sent between guest VMs accessing the same shared
> +memory region.  Interrupt support requires using a shared memory server and
> +using a chardev socket to connect to it.  The code for the shared memory server
> +is qemu.git/contrib/ivshmem-server.  An example syntax when using the shared
> +memory server is:
> +
> +@example
> +qemu -device ivshmem,size=<size in format accepted by -m>[,chardev=<id>]
> +                        [,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
> +qemu -chardev socket,path=<path>,id=<id>
> +@end example
> +
> +When using the server, the guest will be assigned a VM ID (>=0) that allows guests
> +using the same server to communicate via interrupts.  Guests can read their
> +VM ID from a device register (see example code).  Since receiving the shared
> +memory region from the server is asynchronous, there is a (small) chance the
> +guest may boot before the shared memory is attached.  To allow an application
> +to ensure shared memory is attached, the VM ID register will return -1 (an
> +invalid VM ID) until the memory is attached.  Once the shared memory is
> +attached, the VM ID will return the guest's valid VM ID.  With these semantics,
> +the guest application can check to ensure the shared memory is attached to the
> +guest before proceeding.
> +
> +The @option{role} argument can be set to either master or peer and will affect
> +how the shared memory is migrated.  With @option{role=master}, the guest will
> +copy the shared memory on migration to the destination host.  With
> +@option{role=peer}, the shared memory will not be copied on migration.  Only
> +one guest should be specified as
> +the master.
> +
>  @node direct_linux_boot
>  @section Direct Linux Boot
>
> --
> 1.6.3.2.198.g6096d
>
>
>
Avi Kivity June 6, 2010, 3:02 p.m. UTC | #2
On 06/05/2010 12:44 PM, Blue Swirl wrote:
> On Fri, Jun 4, 2010 at 9:45 PM, Cam Macdonell<cam@cs.ualberta.ca>  wrote:
>    
>> Support an inter-vm shared memory device that maps a shared-memory object as a
>> PCI device in the guest.  This patch also supports interrupts between guest by
>> communicating over a unix domain socket.  This patch applies to the qemu-kvm
>> repository.
>>
>>     -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>
>> Interrupts are supported between multiple VMs by using a shared memory server
>> by using a chardev socket.
>>
>>     -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>            [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
>>     -chardev socket,path=<path>,id=<id>
>>
>> (shared memory server is qemu.git/contrib/ivshmem-server)
>>
>> Sample programs and init scripts are in a git repo here:
>>
>>      
> Why is this KVM specific BTW, Posix SHM is available on many
> platforms? What would happen if kvm_set_foobar functions were not
> called when KVM is not being used? Is host eventfd support essential?
>    

It's not kvm specific, it's 
atomic-ops-on-shared-memory-are-visible-as-atomic-ops specific, which is 
currently only available with kvm.  When tcg gains true smp support (and 
not just against other tcg threads) this can work with tcg as well.

I guess that needs a host with at least 32/64 bit CAS for 32/64 bit 
targets respectively, and double that if the target has DCAS.  Not sure 
how targets with ll/sc can be implemented, especially if there are 
limits as to what can go in between.
Cam Macdonell June 7, 2010, 4:41 p.m. UTC | #3
On Sat, Jun 5, 2010 at 3:44 AM, Blue Swirl <blauwirbel@gmail.com> wrote:
> On Fri, Jun 4, 2010 at 9:45 PM, Cam Macdonell <cam@cs.ualberta.ca> wrote:
>> Support an inter-vm shared memory device that maps a shared-memory object as a
>> PCI device in the guest.  This patch also supports interrupts between guest by
>> communicating over a unix domain socket.  This patch applies to the qemu-kvm
>> repository.
>>
>>    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>
>> Interrupts are supported between multiple VMs by using a shared memory server
>> by using a chardev socket.
>>
>>    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>           [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
>>    -chardev socket,path=<path>,id=<id>
>>
>> (shared memory server is qemu.git/contrib/ivshmem-server)
>>
>> Sample programs and init scripts are in a git repo here:
>>
>>    www.gitorious.org/nahanni
>> ---
>>  Makefile.target |    3 +
>>  hw/ivshmem.c    |  852 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  qemu-char.c     |    6 +
>>  qemu-char.h     |    3 +
>>  qemu-doc.texi   |   43 +++
>>  5 files changed, 907 insertions(+), 0 deletions(-)
>>  create mode 100644 hw/ivshmem.c
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index c4ba592..4888308 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -202,6 +202,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
>>  obj-y += rtl8139.o
>>  obj-y += e1000.o
>>
>> +# Inter-VM PCI shared memory
>> +obj-y += ivshmem.o
>> +
>
> Can this be compiled once, simply by moving this to Makefile.objs
> instead of Makefile.target? Also, because the code seems to be KVM
> specific, it can't be compiled unconditionally but depending on at
> least CONFIG_KVM and maybe CONFIG_EVENTFD.

The device uses eventfds for signalling, but never creates the
eventfds itself as they are passed from a server using SCM_RIGHTS.
So, it does not depend on the eventfd API.  Its dependence on
irqfd/ioeventfd (the only KVM specific bits) are optional via the
command-line.

A runtime check for KVM is done for the reasons Avi mentioned.

>
> Why is this KVM specific BTW, Posix SHM is available on many
> platforms? What would happen if kvm_set_foobar functions were not
> called when KVM is not being used? Is host eventfd support essential?
>
>>  # Hardware support
>>  obj-i386-y += vga.o
>>  obj-i386-y += mc146818rtc.o i8259.o pc.o
>> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
>> new file mode 100644
>> index 0000000..9057612
>> --- /dev/null
>> +++ b/hw/ivshmem.c
>> @@ -0,0 +1,852 @@
>> +/*
>> + * Inter-VM Shared Memory PCI device.
>> + *
>> + * Author:
>> + *      Cam Macdonell <cam@cs.ualberta.ca>
>> + *
>> + * Based On: cirrus_vga.c
>> + *          Copyright (c) 2004 Fabrice Bellard
>> + *          Copyright (c) 2004 Makoto Suzuki (suzu)
>> + *
>> + *      and rtl8139.c
>> + *          Copyright (c) 2006 Igor Kovalenko
>> + *
>> + * This code is licensed under the GNU GPL v2.
>> + */
>> +#include <sys/mman.h>
>> +#include <sys/types.h>
>> +#include <sys/socket.h>
>> +#include <sys/io.h>
>> +#include <sys/ioctl.h>
>> +#include "hw.h"
>> +#include "console.h"
>> +#include "pc.h"
>> +#include "pci.h"
>> +#include "sysemu.h"
>> +
>> +#include "msix.h"
>> +#include "qemu-kvm.h"
>> +#include "libkvm.h"
>> +
>> +#include <sys/eventfd.h>
>> +#include <sys/mman.h>
>> +#include <sys/socket.h>
>> +#include <sys/ioctl.h>
>> +
>> +#define IVSHMEM_IRQFD   0
>> +#define IVSHMEM_MSI     1
>> +
>> +//#define DEBUG_IVSHMEM
>> +#ifdef DEBUG_IVSHMEM
>> +#define IVSHMEM_DPRINTF(fmt, args...)        \
>> +    do {printf("IVSHMEM: " fmt, ##args); } while (0)
>
> Please use __VA_ARGS__.
>
>> +#else
>> +#define IVSHMEM_DPRINTF(fmt, args...)
>> +#endif
>> +
>> +typedef struct Peer {
>> +    int nb_eventfds;
>> +    int *eventfds;
>> +} Peer;
>> +
>> +typedef struct EventfdEntry {
>> +    PCIDevice *pdev;
>> +    int vector;
>> +} EventfdEntry;
>> +
>> +typedef struct IVShmemState {
>> +    PCIDevice dev;
>> +    uint32_t intrmask;
>> +    uint32_t intrstatus;
>> +    uint32_t doorbell;
>> +
>> +    CharDriverState ** eventfd_chr;
>
> I'd remove the space between '**' and 'eventfd_chr', it's used inconsistently.
>
>> +    CharDriverState * server_chr;
>> +    int ivshmem_mmio_io_addr;
>> +
>> +    pcibus_t mmio_addr;
>> +    pcibus_t shm_pci_addr;
>> +    uint64_t ivshmem_offset;
>> +    uint64_t ivshmem_size; /* size of shared memory region */
>> +    int shm_fd; /* shared memory file descriptor */
>> +
>> +    Peer *peers;
>> +    int nb_peers; /* how many guests we have space for */
>> +    int max_peer; /* maximum numbered peer */
>> +
>> +    int vm_id;
>> +    uint32_t vectors;
>> +    uint32_t features;
>> +    EventfdEntry *eventfd_table;
>> +
>> +    char * shmobj;
>> +    char * sizearg;
>> +    char * role;
>> +} IVShmemState;
>> +
>> +/* registers for the Inter-VM shared memory device */
>> +enum ivshmem_registers {
>> +    IntrMask = 0,
>> +    IntrStatus = 4,
>> +    IVPosition = 8,
>> +    Doorbell = 12,
>> +};
>
> IIRC these should be uppercase.

I worked from rtl8139 which doesn't have them uppercase.  But doing a
quick search, I can see most devices do, I'll fix that.

>
>> +
>> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
>> +    return (ivs->features & (1 << feature));
>> +}
>
> Since this is the first version, do we need any features at this
> point, can't we expect that all features are available now? Why does
> the user need to specify the features?

Some features require host support such as irqfd/ioeventfds.  So
having features allows them to be disabled on the command-line (e.g.,
irqfd=off).

>
> To avoid a negative shift, I'd make 'feature' unsigned.
>
>> +
>> +static inline bool is_power_of_two(uint64_t x) {
>> +    return (x & (x - 1)) == 0;
>> +}
>> +
>> +static void ivshmem_map(PCIDevice *pci_dev, int region_num,
>> +                    pcibus_t addr, pcibus_t size, int type)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
>> +
>> +    s->shm_pci_addr = addr;
>> +
>> +    if (s->ivshmem_offset > 0) {
>> +        cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
>> +                                                            s->ivshmem_offset);
>> +        if (s->role && strncmp(s->role, "peer", 4) == 0) {
>> +            IVSHMEM_DPRINTF("marking pages no migrate\n");
>> +            cpu_mark_pages_no_migrate(s->ivshmem_offset, s->ivshmem_size);
>> +        }
>> +    }
>> +
>> +    IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = %u\n",
>> +                (uint32_t)addr, (uint32_t)s->ivshmem_offset, (uint32_t)size);
>
> Please use FMT_PCIBUS for addr and size and PRIu64 for s->ivshmem_offset.
>
>> +
>> +}
>> +
>> +/* accessing registers - based on rtl8139 */
>> +static void ivshmem_update_irq(IVShmemState *s, int val)
>> +{
>> +    int isr;
>> +    isr = (s->intrstatus & s->intrmask) & 0xffffffff;
>> +
>> +    /* don't print ISR resets */
>> +    if (isr) {
>> +        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
>> +           isr ? 1 : 0, s->intrstatus, s->intrmask);
>> +    }
>> +
>> +    qemu_set_irq(s->dev.irq[0], (isr != 0));
>> +}
>> +
>> +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
>> +{
>> +    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
>> +
>> +    s->intrmask = val;
>> +
>> +    ivshmem_update_irq(s, val);
>> +}
>> +
>> +static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
>> +{
>> +    uint32_t ret = s->intrmask;
>> +
>> +    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
>> +{
>> +    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
>> +
>> +    s->intrstatus = val;
>> +
>> +    ivshmem_update_irq(s, val);
>> +    return;
>> +}
>> +
>> +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
>> +{
>> +    uint32_t ret = s->intrstatus;
>> +
>> +    /* reading ISR clears all interrupts */
>> +    s->intrstatus = 0;
>> +
>> +    ivshmem_update_irq(s, 0);
>> +
>> +    return ret;
>> +}
>> +
>> +static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
>> +{
>> +
>> +    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
>> +}
>> +
>> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
>> +{
>> +    IVShmemState *s = opaque;
>> +
>> +    u_int64_t write_one = 1;
>
> Please use uintNN_t instead of u_intNN_t.
>
>> +    u_int16_t dest = val >> 16;
>> +    u_int16_t vector = val & 0xff;
>> +
>> +    addr &= 0xfc;
>
> I'd add a debug printf here, likewise for exit of ivshmem_io_readl().
> When you do the merge (see below), the correct printf format for the
> addresses will be TARGET_FMT_plx.
>
>> +
>> +    switch (addr)
>> +    {
>> +        case IntrMask:
>> +            ivshmem_IntrMask_write(s, val);
>> +            break;
>> +
>> +        case IntrStatus:
>> +            ivshmem_IntrStatus_write(s, val);
>> +            break;
>> +
>> +        case Doorbell:
>> +            /* check that dest VM ID is reasonable */
>> +            if ((dest < 0) || (dest > s->max_peer)) {
>> +                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
>> +                break;
>> +            }
>> +
>> +            /* check doorbell range */
>> +            if ((vector >= 0) && (vector < s->peers[dest].nb_eventfds)) {
>> +                IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n",
>> +                                                    write_one, dest, vector);
>
> PRId64 for write_one, %ld is not enough on ILP32.
>
>> +                if (write(s->peers[dest].eventfds[vector],
>> +                                                    &(write_one), 8) != 8) {
>> +                    IVSHMEM_DPRINTF("error writing to eventfd\n");
>> +                }
>> +            }
>> +            break;
>> +        default:
>> +            IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
>> +    }
>> +}
>> +
>> +static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
>> +{
>> +    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
>> +}
>> +
>> +static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
>> +{
>> +
>> +    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
>> +    return 0;
>> +}
>> +
>> +static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
>> +{
>> +
>> +    IVShmemState *s = opaque;
>> +    uint32_t ret;
>> +
>> +    switch (addr)
>> +    {
>> +        case IntrMask:
>> +            ret = ivshmem_IntrMask_read(s);
>> +            break;
>> +
>> +        case IntrStatus:
>> +            ret = ivshmem_IntrStatus_read(s);
>> +            break;
>> +
>> +        case IVPosition:
>> +            /* return my VM ID if the memory is mapped */
>> +            if (s->shm_fd > 0) {
>> +                ret = s->vm_id;
>> +            } else {
>> +                ret = -1;
>> +            }
>> +            break;
>> +
>> +        default:
>> +            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
>> +            ret = 0;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
>> +{
>> +    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
>> +
>> +    return 0;
>> +}
>> +
>> +static void ivshmem_mmio_writeb(void *opaque,
>> +                                target_phys_addr_t addr, uint32_t val)
>> +{
>> +    ivshmem_io_writeb(opaque, addr & 0xFF, val);
>> +}
>
> This function and others below only performs a cast and useless
> masking (the address passed is these days an offset from start of the
> area). Please merge these to ivshmem_io_readl() etc.

Ok, these are artifacts from basing my patch on rtl8139.c.  I'll remove them.

>
>> +
>> +static void ivshmem_mmio_writew(void *opaque,
>> +                                target_phys_addr_t addr, uint32_t val)
>> +{
>> +    ivshmem_io_writew(opaque, addr & 0xFF, val);
>> +}
>> +
>> +static void ivshmem_mmio_writel(void *opaque,
>> +                                target_phys_addr_t addr, uint32_t val)
>> +{
>> +    ivshmem_io_writel(opaque, addr & 0xFF, val);
>> +}
>> +
>> +static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
>> +{
>> +    return ivshmem_io_readb(opaque, addr & 0xFF);
>> +}
>> +
>> +static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
>> +{
>> +    uint32_t val = ivshmem_io_readw(opaque, addr & 0xFF);
>> +    return val;
>> +}
>> +
>> +static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
>> +{
>> +    uint32_t val = ivshmem_io_readl(opaque, addr & 0xFF);
>> +    return val;
>> +}
>> +
>> +static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
>
> Please add 'const'.
>
>> +    ivshmem_mmio_readb,
>> +    ivshmem_mmio_readw,
>> +    ivshmem_mmio_readl,
>> +};
>> +
>> +static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
>> +    ivshmem_mmio_writeb,
>> +    ivshmem_mmio_writew,
>> +    ivshmem_mmio_writel,
>> +};
>> +
>> +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
>> +{
>> +    IVShmemState *s = opaque;
>> +
>> +    ivshmem_IntrStatus_write(s, *buf);
>> +
>> +    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
>> +}
>> +
>> +static int ivshmem_can_receive(void * opaque)
>> +{
>> +    return 8;
>> +}
>> +
>> +static void ivshmem_event(void *opaque, int event)
>> +{
>> +    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
>> +}
>> +
>> +static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
>> +
>> +    EventfdEntry *entry = opaque;
>> +    PCIDevice *pdev = entry->pdev;
>> +
>> +    IVSHMEM_DPRINTF("fake irqfd on vector %p %d\n", pdev, entry->vector);
>> +    msix_notify(pdev, entry->vector);
>> +}
>> +
>> +static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
>> +                                                                    int vector)
>> +{
>> +    /* create a event character device based on the passed eventfd */
>> +    IVShmemState *s = opaque;
>> +    CharDriverState * chr;
>> +
>> +    chr = qemu_chr_open_eventfd(eventfd);
>> +
>> +    if (chr == NULL) {
>> +        IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);
>
> This should not be a DPRINTF.
>
>> +        exit(-1);
>> +    }
>> +
>> +    /* if MSI is supported we need multiple interrupts */
>> +    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
>> +        s->eventfd_table[vector].pdev = &s->dev;
>> +        s->eventfd_table[vector].vector = vector;
>> +
>> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
>> +                      ivshmem_event, &s->eventfd_table[vector]);
>> +    } else {
>> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
>> +                      ivshmem_event, s);
>> +    }
>> +
>> +    return chr;
>> +
>> +}
>> +
>> +static int check_shm_size(IVShmemState *s, int fd) {
>> +    /* check that the guest isn't going to try and map more memory than the
>> +     * the object has allocated return -1 to indicate error */
>> +
>> +    struct stat buf;
>> +
>> +    fstat(fd, &buf);
>> +
>> +    if (s->ivshmem_size > buf.st_size) {
>> +        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
>> +        fprintf(stderr, " than shared object size (%ld > %ld)\n",
>> +                                          s->ivshmem_size, buf.st_size);
>
> Please use PRIx64 for s->ivshmem_size, this will cause a warning on ILP32.
>
>> +        return -1;
>> +    } else {
>> +        return 0;
>> +    }
>> +}
>> +
>> +/* create the shared memory BAR when we are not using the server, so we can
>> + * create the BAR and map the memory immediately */
>> +static void create_shared_memory_BAR(IVShmemState *s, int fd) {
>> +
>> +    void * ptr;
>> +
>> +    s->shm_fd = fd;
>> +
>> +    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
>> +
>> +    s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, ptr);
>
> qemu_ram_map() does not exist in HEAD.
>

Ok, so qemu_ram_map() and kvm_set_irq() are in the KVM HEAD.  I had my
own version of both functions, but removed them when Marcelo's were
merged into KVM.

>> +
>> +    /* region for shared memory */
>> +    pci_register_bar(&s->dev, 2, s->ivshmem_size,
>> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
>> +}
>> +
>> +static void close_guest_eventfds(IVShmemState *s, int posn)
>> +{
>> +    int i, guest_curr_max;
>> +
>> +    guest_curr_max = s->peers[posn].nb_eventfds;
>> +
>> +    for (i = 0; i < guest_curr_max; i++)
>> +        close(s->peers[posn].eventfds[i]);
>
> CODING_STYLE.
>
>> +
>> +    qemu_free(s->peers[posn].eventfds);
>> +    s->peers[posn].nb_eventfds = 0;
>> +}
>> +
>> +static void setup_ioeventfds(IVShmemState *s) {
>> +
>> +    int i, j;
>> +
>> +    for (i = 0; i <= s->max_peer; i++) {
>> +        for (j = 0; j < s->peers[i].nb_eventfds; j++) {
>> +            kvm_set_ioeventfd_mmio_long(s->peers[i].eventfds[j],
>> +                    s->mmio_addr + Doorbell, (i << 16) | j, 1);
>> +        }
>> +    }
>> +
>> +    /* setup irqfd for this VM's eventfds */
>> +    for (i = 0; i < s->vectors; i++) {
>> +        kvm_set_irqfd(s->dev.msix_irq_entries[i].gsi,
>> +                        s->peers[s->vm_id].eventfds[i], 1);
>
> kvm_set_irqfd() does not exist in HEAD.
>
>> +    }
>> +}
>> +
>> +
>> +/* this function increase the dynamic storage need to store data about other
>> + * guests */
>> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
>> +
>> +    int j, old_nb_alloc;
>> +
>> +    old_nb_alloc = s->nb_peers;
>> +
>> +    while (new_min_size >= s->nb_peers)
>> +        s->nb_peers = s->nb_peers * 2;
>> +
>> +    IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nb_peers);
>> +    s->peers = qemu_realloc(s->peers, s->nb_peers * sizeof(Peer));
>> +
>> +    if (s->peers == NULL) {
>> +        fprintf(stderr, "Allocation error - exiting\n");
>> +        exit(1);
>> +    }
>
> qemu_realloc will never return zero.
>
>> +
>> +    /* zero out new pointers */
>> +    for (j = old_nb_alloc; j < s->nb_peers; j++) {
>> +        s->peers[j].eventfds = NULL;
>> +        s->peers[j].nb_eventfds = 0;
>> +    }
>> +}
>> +
>> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
>> +{
>> +    IVShmemState *s = opaque;
>> +    int incoming_fd, tmp_fd;
>> +    int guest_curr_max;
>> +    long incoming_posn;
>> +
>> +    memcpy(&incoming_posn, buf, sizeof(long));
>> +    /* pick off s->server_chr->msgfd and store it, posn should accompany msg */
>> +    tmp_fd = qemu_chr_get_msgfd(s->server_chr);
>> +    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
>> +
>> +    /* make sure we have enough space for this guest */
>> +    if (incoming_posn >= s->nb_peers) {
>> +        increase_dynamic_storage(s, incoming_posn);
>> +    }
>> +
>> +    if (tmp_fd == -1) {
>> +        /* if posn is positive and unseen before then this is our posn*/
>> +        if ((incoming_posn >= 0) && (s->peers[incoming_posn].eventfds == NULL)) {
>> +            /* receive our posn */
>> +            s->vm_id = incoming_posn;
>> +            return;
>> +        } else {
>> +            /* otherwise an fd == -1 means an existing guest has gone away */
>> +            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
>> +            close_guest_eventfds(s, incoming_posn);
>> +            return;
>> +        }
>> +    }
>> +
>> +    /* because of the implementation of get_msgfd, we need a dup */
>> +    incoming_fd = dup(tmp_fd);
>> +
>> +    if (incoming_fd == -1) {
>> +        fprintf(stderr, "could not allocate file descriptor %s\n",
>> +                                                            strerror(errno));
>> +        return;
>> +    }
>> +
>> +    /* if the position is -1, then it's shared memory region fd */
>> +    if (incoming_posn == -1) {
>> +
>> +        void * map_ptr;
>> +
>> +        s->max_peer = 0;
>> +
>> +        if (check_shm_size(s, incoming_fd) == -1) {
>> +            exit(-1);
>> +        }
>> +
>> +        /* mmap the region and map into the BAR2 */
>> +        map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED,
>> +                                                                incoming_fd, 0);
>> +        s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, map_ptr);
>> +
>> +        IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = %u\n",
>> +                        (uint32_t)s->shm_pci_addr, (uint32_t)s->ivshmem_offset,
>> +                        (uint32_t)s->ivshmem_size);
>> +
>> +        if (s->shm_pci_addr > 0) {
>> +            /* map memory into BAR2 */
>> +            cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
>> +                                                            s->ivshmem_offset);
>> +            if (s->role && strncmp(s->role, "peer", 4) == 0) {
>> +                IVSHMEM_DPRINTF("marking pages no migrate\n");
>> +                cpu_mark_pages_no_migrate(s->ivshmem_offset, s->ivshmem_size);
>> +            }
>> +
>> +        }
>> +
>> +        /* only store the fd if it is successfully mapped */
>> +        s->shm_fd = incoming_fd;
>> +
>> +        return;
>> +    }
>> +
>> +    /* each guest has an array of eventfds, and we keep track of how many
>> +     * guests for each VM */
>> +    guest_curr_max = s->peers[incoming_posn].nb_eventfds;
>> +    if (guest_curr_max == 0) {
>> +        /* one eventfd per MSI vector */
>> +        s->peers[incoming_posn].eventfds = (int *) qemu_malloc(s->vectors *
>> +                                                                sizeof(int));
>
> Useless cast in C.
>
>> +    }
>> +
>> +    /* this is an eventfd for a particular guest VM */
>> +    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
>> +                                                                incoming_fd);
>> +    s->peers[incoming_posn].eventfds[guest_curr_max] = incoming_fd;
>> +
>> +    /* increment count for particular guest */
>> +    s->peers[incoming_posn].nb_eventfds++;
>> +
>> +    /* keep track of the maximum VM ID */
>> +    if (incoming_posn > s->max_peer) {
>> +        s->max_peer = incoming_posn;
>> +    }
>> +
>> +    if (incoming_posn == s->vm_id) {
>> +        int vector = guest_curr_max;
>
> Why add a new variable?

for clarity, so when looking at the code it's clear that the current
maxium is a MSI vector.  Perhaps I'll rename guest_curr_max to achieve
this.

>
>> +        if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>> +            /* initialize char device for callback
>> +             * if this is one of my eventfds */
>> +            s->eventfd_chr[vector] = create_eventfd_chr_device(s,
>> +                       s->peers[s->vm_id].eventfds[vector], vector);
>> +        }
>> +    }
>> +
>> +    return;
>> +}
>> +
>> +static void ivshmem_reset(DeviceState *d)
>> +{
>> +    return;
>
> Nothing to do?

Oversight, will fix.

>
>> +}
>> +
>> +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
>> +                       pcibus_t addr, pcibus_t size, int type)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
>> +
>> +    s->mmio_addr = addr;
>> +    cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);
>
> The 0x400 should be #defined earlier. Why 0x400 since you are only
> interested in the 0x100 first bytes?

We bumped to 1MB for a previous version that had multiple doorbells.
ioeventfd masks eliminated made multiple doorbells unnecessary.  I'll
put it back to 0x100.

>
>> +
>> +    /* ioeventfd and irqfd are enabled together,
>> +     * so the flag IRQFD refers to both */
>> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>> +        setup_ioeventfds(s);
>> +    }
>> +}
>> +
>> +static uint64_t ivshmem_get_size(IVShmemState * s) {
>> +
>> +    uint64_t value;
>> +    char *ptr;
>> +
>> +    value = strtoul(s->sizearg, &ptr, 10);
>
> I'd use strtoull() but the whole function should be suppressed, see below.
>
>> +    switch (*ptr) {
>> +        case 0: case 'M': case 'm':
>> +            value <<= 20;
>> +            break;
>> +        case 'G': case 'g':
>> +            value <<= 30;
>> +            break;
>> +        default:
>> +            fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
>> +            exit(1);
>> +    }
>> +
>> +    /* BARs must be a power of 2 */
>> +    if (!is_power_of_two(value)) {
>> +        fprintf(stderr, "ivshmem: size must be power of 2\n");
>> +        exit(1);
>> +    }
>
> Isn't the BAR check in pci.c enough?

This would produce a clearer error that it is the ivshmem device that
is the problem.

>
>> +
>> +    return value;
>> +
>> +}
>> +
>> +static void ivshmem_setup_msi(IVShmemState * s) {
>> +
>> +    int i;
>> +
>> +    /* allocate the MSI-X vectors */
>> +
>> +    if (!msix_init(&s->dev, s->vectors, 1, 0)) {
>> +        pci_register_bar(&s->dev, 1,
>> +                         msix_bar_size(&s->dev),
>> +                         PCI_BASE_ADDRESS_SPACE_MEMORY,
>> +                         msix_mmio_map);
>> +        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
>> +    } else {
>> +        IVSHMEM_DPRINTF("msix initialization failed\n");
>
> Is this fatal considering the msix_vector_use() below?

Perhaps not, we could fall back to regular interrupts.  Could move the
msix_vector_use() into the "then" clause.

>
>> +    }
>> +
>> +    /* 'activate' the vectors */
>> +    for (i = 0; i < s->vectors; i++) {
>> +        msix_vector_use(&s->dev, i);
>> +    }
>> +
>> +    /* if IRQFDs are not supported, we'll have to trigger the interrupts
>> +     * via Qemu char devices */
>> +    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>> +        /* for handling interrupts when IRQFD is not available */
>> +        s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
>> +    }
>> +}
>> +
>> +static void ivshmem_save(QEMUFile* f, void *opaque)
>> +{
>> +    IVShmemState *proxy = opaque;
>> +
>> +    IVSHMEM_DPRINTF("ivshmem_save\n");
>> +    pci_device_save(&proxy->dev, f);
>> +
>> +    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
>> +        msix_save(&proxy->dev, f);
>> +    } else {
>> +        qemu_put_be32(f, proxy->intrstatus);
>> +        qemu_put_be32(f, proxy->intrmask);
>> +    }
>> +
>> +}
>
> There may be VMState magic to handle conditional structures (or just
> make the structures unconditional), so VMState should be used instead.

MSI-X requires the use of the old-style savevm/loadvm functions.
Should VMState and the load/save be used together?

>
>> +
>> +static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
>> +{
>> +    IVSHMEM_DPRINTF("ivshmem_load\n");
>> +
>> +    IVShmemState *proxy = opaque;
>> +    int ret, i;
>> +
>
> Missing version check.
>
>> +    ret = pci_device_load(&proxy->dev, f);
>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
>> +        msix_load(&proxy->dev, f);
>> +        for (i = 0; i < proxy->vectors; i++) {
>> +            msix_vector_use(&proxy->dev, i);
>> +        }
>> +    } else {
>> +        proxy->intrstatus = qemu_get_be32(f);
>> +        proxy->intrmask = qemu_get_be32(f);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int pci_ivshmem_init(PCIDevice *dev)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>> +    uint8_t *pci_conf;
>> +
>> +    if (s->sizearg == NULL)
>> +        s->ivshmem_size = 4 << 20; /* 4 MB default */
>> +    else {
>> +        s->ivshmem_size = ivshmem_get_size(s);
>> +    }
>> +
>> +    register_savevm("ivshmem", 0, 0, ivshmem_save, ivshmem_load, dev);
>> +
>> +    /* IRQFD requires MSI */
>> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD) &&
>> +        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
>> +        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
>> +        exit(1);
>> +    }
>> +
>> +    /* check that role is reasonable */
>> +    if (s->role && !((strncmp(s->role, "peer", 5) == 0) ||
>> +                        (strncmp(s->role, "master", 7) == 0))) {
>> +        fprintf(stderr, "ivshmem: 'role' must be 'peer' or 'master'\n");
>> +        exit(1);
>> +    }
>
> I'd add a scalar flag in IVShmemState for role so that further strcmps
> are avoided.
>
>> +
>> +    pci_conf = s->dev.config;
>> +    pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
>> +    pci_conf[0x01] = 0x1a;
>> +    pci_conf[0x02] = 0x10;
>> +    pci_conf[0x03] = 0x11;
>
> Please add the DID to hw/pci_ids.h and use pci_config_set_xyz() here.
>
>> +    pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
>> +    pci_conf[0x0a] = 0x00; /* RAM controller */
>> +    pci_conf[0x0b] = 0x05;
>> +    pci_conf[0x0e] = 0x00; /* header_type */
>> +
>> +    pci_conf[PCI_INTERRUPT_PIN] = 1;
>> +
>> +    s->shm_pci_addr = 0;
>> +    s->ivshmem_offset = 0;
>> +    s->shm_fd = 0;
>> +
>> +    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
>> +                                    ivshmem_mmio_write, s);
>> +    /* region for registers*/
>> +    pci_register_bar(&s->dev, 0, 0x400,
>> +                           PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
>> +
>> +    if ((s->server_chr != NULL) &&
>> +                        (strncmp(s->server_chr->filename, "unix:", 5) == 0)) {
>> +        /* if we get a UNIX socket as the parameter we will talk
>> +         * to the ivshmem server to receive the memory region */
>> +
>> +        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
>> +                                                    s->server_chr->filename);
>> +
>> +        if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
>> +            ivshmem_setup_msi(s);
>> +        }
>> +
>> +        /* we allocate enough space for 16 guests and grow as needed */
>> +        s->nb_peers = 16;
>> +        s->vm_id = -1;
>> +
>> +        /* allocate/initialize space for interrupt handling */
>> +        s->peers = qemu_mallocz(s->nb_peers * sizeof(Peer));
>> +
>> +        pci_register_bar(&s->dev, 2, s->ivshmem_size,
>> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
>> +
>> +        s->eventfd_chr = (CharDriverState **) qemu_mallocz(s->vectors *
>> +                                                sizeof(CharDriverState *));
>
> Useless cast in C.
>
>> +
>> +        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read,
>> +                     ivshmem_event, s);
>> +    } else {
>> +        /* just map the file immediately, we're not using a server */
>> +        int fd;
>> +
>> +        if (s->shmobj == NULL) {
>> +            fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
>> +        }
>
> I'd rather have separate 'chardev' and 'shm_file' parameters. Then
> 'info qtree' could return more useful information about the chardev.

can you elaborate?  the command-line currently must be one of the following

server case:
-ivshmem chardev=x,...
-chardev id=x,...

or

non-server case:
-ivshmem shm=<name>,...

>
>> +
>> +        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
>> +
>> +        /* try opening with O_EXCL and if it succeeds zero the memory
>> +         * by truncating to 0 */
>> +        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
>> +                        S_IRWXU|S_IRWXG|S_IRWXO)) > 0) {
>> +           /* truncate file to length PCI device's memory */
>> +            if (ftruncate(fd, s->ivshmem_size) != 0) {
>> +                fprintf(stderr, "kvm_ivshmem: could not truncate shared file\n");
>
> Why 'kvm_ivshmem'?

old name, will remove.

>
>> +            }
>> +
>> +        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
>> +                        S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
>> +            fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
>> +            exit(-1);
>> +
>> +        }
>> +
>> +        if (check_shm_size(s, fd) == -1) {
>> +            exit(-1);
>> +        }
>> +
>> +        create_shared_memory_BAR(s, fd);
>> +
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int pci_ivshmem_uninit(PCIDevice *dev)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>> +
>> +    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
>> +
>> +    return 0;
>> +}
>> +
>> +static PCIDeviceInfo ivshmem_info = {
>> +    .qdev.name  = "ivshmem",
>> +    .qdev.size  = sizeof(IVShmemState),
>> +    .qdev.reset = ivshmem_reset,
>> +    .init       = pci_ivshmem_init,
>> +    .exit       = pci_ivshmem_uninit,
>> +    .qdev.props = (Property[]) {
>> +        DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
>> +        DEFINE_PROP_STRING("size", IVShmemState, sizearg),
>
> This should be scalar type, not string.

but it needs to handle the 'm' and 'g' suffixes for memory sizes.

>
>> +        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
>> +        DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, false),
>> +        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
>> +        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
>> +        DEFINE_PROP_STRING("role", IVShmemState, role),
>> +        DEFINE_PROP_END_OF_LIST(),
>> +    }
>> +};
>> +
>> +static void ivshmem_register_devices(void)
>> +{
>> +    pci_qdev_register(&ivshmem_info);
>> +}
>> +
>> +device_init(ivshmem_register_devices)
>> diff --git a/qemu-char.c b/qemu-char.c
>> index ac65a1c..b2e50d0 100644
>> --- a/qemu-char.c
>> +++ b/qemu-char.c
>> @@ -2093,6 +2093,12 @@ static void tcp_chr_read(void *opaque)
>>     }
>>  }
>>
>> +CharDriverState *qemu_chr_open_eventfd(int eventfd){
>> +
>> +    return qemu_chr_open_fd(eventfd, eventfd);
>> +
>> +}
>> +
>>  static void tcp_chr_connect(void *opaque)
>>  {
>>     CharDriverState *chr = opaque;
>> diff --git a/qemu-char.h b/qemu-char.h
>> index e3a0783..6ea01ba 100644
>> --- a/qemu-char.h
>> +++ b/qemu-char.h
>> @@ -94,6 +94,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
>>  void qemu_chr_info(Monitor *mon, QObject **ret_data);
>>  CharDriverState *qemu_chr_find(const char *name);
>>
>> +/* add an eventfd to the qemu devices that are polled */
>> +CharDriverState *qemu_chr_open_eventfd(int eventfd);
>
> Maybe this should be removed and just open coded with qemu_chr_open_fd.

I'm indifferent, it just looked a little funny passing the parameter twice.

>
>> +
>>  extern int term_escape_char;
>>
>>  /* async I/O support */
>> diff --git a/qemu-doc.texi b/qemu-doc.texi
>> index 6647b7b..24f8748 100644
>> --- a/qemu-doc.texi
>> +++ b/qemu-doc.texi
>> @@ -706,6 +706,49 @@ Using the @option{-net socket} option, it is possible to make VLANs
>>  that span several QEMU instances. See @ref{sec_invocation} to have a
>>  basic example.
>>
>> +@section Other Devices
>> +
>> +@subsection Inter-VM Shared Memory device
>> +
>> +With KVM enabled on a Linux host, a shared memory device is available.  Guests
>> +map a POSIX shared memory region into the guest as a PCI device that enables
>> +zero-copy communication to the application level of the guests.  The basic
>> +syntax is:
>> +
>> +@example
>> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>> +@end example
>> +
>> +If desired, interrupts can be sent between guest VMs accessing the same shared
>> +memory region.  Interrupt support requires using a shared memory server and
>> +using a chardev socket to connect to it.  The code for the shared memory server
>> +is qemu.git/contrib/ivshmem-server.  An example syntax when using the shared
>> +memory server is:
>> +
>> +@example
>> +qemu -device ivshmem,size=<size in format accepted by -m>[,chardev=<id>]
>> +                        [,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
>> +qemu -chardev socket,path=<path>,id=<id>
>> +@end example
>> +
>> +When using the server, the guest will be assigned a VM ID (>=0) that allows guests
>> +using the same server to communicate via interrupts.  Guests can read their
>> +VM ID from a device register (see example code).  Since receiving the shared
>> +memory region from the server is asynchronous, there is a (small) chance the
>> +guest may boot before the shared memory is attached.  To allow an application
>> +to ensure shared memory is attached, the VM ID register will return -1 (an
>> +invalid VM ID) until the memory is attached.  Once the shared memory is
>> +attached, the VM ID will return the guest's valid VM ID.  With these semantics,
>> +the guest application can check to ensure the shared memory is attached to the
>> +guest before proceeding.
>> +
>> +The @option{role} argument can be set to either master or peer and will affect
>> +how the shared memory is migrated.  With @option{role=master}, the guest will
>> +copy the shared memory on migration to the destination host.  With
>> +@option{role=peer}, the shared memory will not be copied on migration.  Only
>> +one guest should be specified as
>> +the master.
>> +
>>  @node direct_linux_boot
>>  @section Direct Linux Boot
>>
>> --
>> 1.6.3.2.198.g6096d
>>
>>
>>
>
>
Blue Swirl June 9, 2010, 8:12 p.m. UTC | #4
On Mon, Jun 7, 2010 at 4:41 PM, Cam Macdonell <cam@cs.ualberta.ca> wrote:
> On Sat, Jun 5, 2010 at 3:44 AM, Blue Swirl <blauwirbel@gmail.com> wrote:
>> On Fri, Jun 4, 2010 at 9:45 PM, Cam Macdonell <cam@cs.ualberta.ca> wrote:
>>> Support an inter-vm shared memory device that maps a shared-memory object as a
>>> PCI device in the guest.  This patch also supports interrupts between guest by
>>> communicating over a unix domain socket.  This patch applies to the qemu-kvm
>>> repository.
>>>
>>>    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>>
>>> Interrupts are supported between multiple VMs by using a shared memory server
>>> by using a chardev socket.
>>>
>>>    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>>           [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
>>>    -chardev socket,path=<path>,id=<id>
>>>
>>> (shared memory server is qemu.git/contrib/ivshmem-server)
>>>
>>> Sample programs and init scripts are in a git repo here:
>>>
>>>    www.gitorious.org/nahanni
>>> ---
>>>  Makefile.target |    3 +
>>>  hw/ivshmem.c    |  852 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  qemu-char.c     |    6 +
>>>  qemu-char.h     |    3 +
>>>  qemu-doc.texi   |   43 +++
>>>  5 files changed, 907 insertions(+), 0 deletions(-)
>>>  create mode 100644 hw/ivshmem.c
>>>
>>> diff --git a/Makefile.target b/Makefile.target
>>> index c4ba592..4888308 100644
>>> --- a/Makefile.target
>>> +++ b/Makefile.target
>>> @@ -202,6 +202,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
>>>  obj-y += rtl8139.o
>>>  obj-y += e1000.o
>>>
>>> +# Inter-VM PCI shared memory
>>> +obj-y += ivshmem.o
>>> +
>>
>> Can this be compiled once, simply by moving this to Makefile.objs
>> instead of Makefile.target? Also, because the code seems to be KVM
>> specific, it can't be compiled unconditionally but depending on at
>> least CONFIG_KVM and maybe CONFIG_EVENTFD.
>
> The device uses eventfds for signalling, but never creates the
> eventfds itself as they are passed from a server using SCM_RIGHTS.
> So, it does not depend on the eventfd API.  Its dependence on
> irqfd/ioeventfd (the only KVM specific bits) are optional via the
> command-line.
>
> A runtime check for KVM is done for the reasons Avi mentioned.
>
>>
>> Why is this KVM specific BTW, Posix SHM is available on many
>> platforms? What would happen if kvm_set_foobar functions were not
>> called when KVM is not being used? Is host eventfd support essential?
>>
>>>  # Hardware support
>>>  obj-i386-y += vga.o
>>>  obj-i386-y += mc146818rtc.o i8259.o pc.o
>>> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
>>> new file mode 100644
>>> index 0000000..9057612
>>> --- /dev/null
>>> +++ b/hw/ivshmem.c
>>> @@ -0,0 +1,852 @@
>>> +/*
>>> + * Inter-VM Shared Memory PCI device.
>>> + *
>>> + * Author:
>>> + *      Cam Macdonell <cam@cs.ualberta.ca>
>>> + *
>>> + * Based On: cirrus_vga.c
>>> + *          Copyright (c) 2004 Fabrice Bellard
>>> + *          Copyright (c) 2004 Makoto Suzuki (suzu)
>>> + *
>>> + *      and rtl8139.c
>>> + *          Copyright (c) 2006 Igor Kovalenko
>>> + *
>>> + * This code is licensed under the GNU GPL v2.
>>> + */
>>> +#include <sys/mman.h>
>>> +#include <sys/types.h>
>>> +#include <sys/socket.h>
>>> +#include <sys/io.h>
>>> +#include <sys/ioctl.h>
>>> +#include "hw.h"
>>> +#include "console.h"
>>> +#include "pc.h"
>>> +#include "pci.h"
>>> +#include "sysemu.h"
>>> +
>>> +#include "msix.h"
>>> +#include "qemu-kvm.h"
>>> +#include "libkvm.h"
>>> +
>>> +#include <sys/eventfd.h>
>>> +#include <sys/mman.h>
>>> +#include <sys/socket.h>
>>> +#include <sys/ioctl.h>
>>> +
>>> +#define IVSHMEM_IRQFD   0
>>> +#define IVSHMEM_MSI     1
>>> +
>>> +//#define DEBUG_IVSHMEM
>>> +#ifdef DEBUG_IVSHMEM
>>> +#define IVSHMEM_DPRINTF(fmt, args...)        \
>>> +    do {printf("IVSHMEM: " fmt, ##args); } while (0)
>>
>> Please use __VA_ARGS__.
>>
>>> +#else
>>> +#define IVSHMEM_DPRINTF(fmt, args...)
>>> +#endif
>>> +
>>> +typedef struct Peer {
>>> +    int nb_eventfds;
>>> +    int *eventfds;
>>> +} Peer;
>>> +
>>> +typedef struct EventfdEntry {
>>> +    PCIDevice *pdev;
>>> +    int vector;
>>> +} EventfdEntry;
>>> +
>>> +typedef struct IVShmemState {
>>> +    PCIDevice dev;
>>> +    uint32_t intrmask;
>>> +    uint32_t intrstatus;
>>> +    uint32_t doorbell;
>>> +
>>> +    CharDriverState ** eventfd_chr;
>>
>> I'd remove the space between '**' and 'eventfd_chr', it's used inconsistently.
>>
>>> +    CharDriverState * server_chr;
>>> +    int ivshmem_mmio_io_addr;
>>> +
>>> +    pcibus_t mmio_addr;
>>> +    pcibus_t shm_pci_addr;
>>> +    uint64_t ivshmem_offset;
>>> +    uint64_t ivshmem_size; /* size of shared memory region */
>>> +    int shm_fd; /* shared memory file descriptor */
>>> +
>>> +    Peer *peers;
>>> +    int nb_peers; /* how many guests we have space for */
>>> +    int max_peer; /* maximum numbered peer */
>>> +
>>> +    int vm_id;
>>> +    uint32_t vectors;
>>> +    uint32_t features;
>>> +    EventfdEntry *eventfd_table;
>>> +
>>> +    char * shmobj;
>>> +    char * sizearg;
>>> +    char * role;
>>> +} IVShmemState;
>>> +
>>> +/* registers for the Inter-VM shared memory device */
>>> +enum ivshmem_registers {
>>> +    IntrMask = 0,
>>> +    IntrStatus = 4,
>>> +    IVPosition = 8,
>>> +    Doorbell = 12,
>>> +};
>>
>> IIRC these should be uppercase.
>
> I worked from rtl8139 which doesn't have them uppercase.  But doing a
> quick search, I can see most devices do, I'll fix that.
>
>>
>>> +
>>> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
>>> +    return (ivs->features & (1 << feature));
>>> +}
>>
>> Since this is the first version, do we need any features at this
>> point, can't we expect that all features are available now? Why does
>> the user need to specify the features?
>
> Some features require host support such as irqfd/ioeventfds.  So
> having features allows them to be disabled on the command-line (e.g.,
> irqfd=off).
>
>>
>> To avoid a negative shift, I'd make 'feature' unsigned.
>>
>>> +
>>> +static inline bool is_power_of_two(uint64_t x) {
>>> +    return (x & (x - 1)) == 0;
>>> +}
>>> +
>>> +static void ivshmem_map(PCIDevice *pci_dev, int region_num,
>>> +                    pcibus_t addr, pcibus_t size, int type)
>>> +{
>>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
>>> +
>>> +    s->shm_pci_addr = addr;
>>> +
>>> +    if (s->ivshmem_offset > 0) {
>>> +        cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
>>> +                                                            s->ivshmem_offset);
>>> +        if (s->role && strncmp(s->role, "peer", 4) == 0) {
>>> +            IVSHMEM_DPRINTF("marking pages no migrate\n");
>>> +            cpu_mark_pages_no_migrate(s->ivshmem_offset, s->ivshmem_size);
>>> +        }
>>> +    }
>>> +
>>> +    IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = %u\n",
>>> +                (uint32_t)addr, (uint32_t)s->ivshmem_offset, (uint32_t)size);
>>
>> Please use FMT_PCIBUS for addr and size and PRIu64 for s->ivshmem_offset.
>>
>>> +
>>> +}
>>> +
>>> +/* accessing registers - based on rtl8139 */
>>> +static void ivshmem_update_irq(IVShmemState *s, int val)
>>> +{
>>> +    int isr;
>>> +    isr = (s->intrstatus & s->intrmask) & 0xffffffff;
>>> +
>>> +    /* don't print ISR resets */
>>> +    if (isr) {
>>> +        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
>>> +           isr ? 1 : 0, s->intrstatus, s->intrmask);
>>> +    }
>>> +
>>> +    qemu_set_irq(s->dev.irq[0], (isr != 0));
>>> +}
>>> +
>>> +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
>>> +{
>>> +    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
>>> +
>>> +    s->intrmask = val;
>>> +
>>> +    ivshmem_update_irq(s, val);
>>> +}
>>> +
>>> +static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
>>> +{
>>> +    uint32_t ret = s->intrmask;
>>> +
>>> +    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
>>> +{
>>> +    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
>>> +
>>> +    s->intrstatus = val;
>>> +
>>> +    ivshmem_update_irq(s, val);
>>> +    return;
>>> +}
>>> +
>>> +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
>>> +{
>>> +    uint32_t ret = s->intrstatus;
>>> +
>>> +    /* reading ISR clears all interrupts */
>>> +    s->intrstatus = 0;
>>> +
>>> +    ivshmem_update_irq(s, 0);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
>>> +{
>>> +
>>> +    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
>>> +}
>>> +
>>> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
>>> +{
>>> +    IVShmemState *s = opaque;
>>> +
>>> +    u_int64_t write_one = 1;
>>
>> Please use uintNN_t instead of u_intNN_t.
>>
>>> +    u_int16_t dest = val >> 16;
>>> +    u_int16_t vector = val & 0xff;
>>> +
>>> +    addr &= 0xfc;
>>
>> I'd add a debug printf here, likewise for exit of ivshmem_io_readl().
>> When you do the merge (see below), the correct printf format for the
>> addresses will be TARGET_FMT_plx.
>>
>>> +
>>> +    switch (addr)
>>> +    {
>>> +        case IntrMask:
>>> +            ivshmem_IntrMask_write(s, val);
>>> +            break;
>>> +
>>> +        case IntrStatus:
>>> +            ivshmem_IntrStatus_write(s, val);
>>> +            break;
>>> +
>>> +        case Doorbell:
>>> +            /* check that dest VM ID is reasonable */
>>> +            if ((dest < 0) || (dest > s->max_peer)) {
>>> +                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
>>> +                break;
>>> +            }
>>> +
>>> +            /* check doorbell range */
>>> +            if ((vector >= 0) && (vector < s->peers[dest].nb_eventfds)) {
>>> +                IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n",
>>> +                                                    write_one, dest, vector);
>>
>> PRId64 for write_one, %ld is not enough on ILP32.
>>
>>> +                if (write(s->peers[dest].eventfds[vector],
>>> +                                                    &(write_one), 8) != 8) {
>>> +                    IVSHMEM_DPRINTF("error writing to eventfd\n");
>>> +                }
>>> +            }
>>> +            break;
>>> +        default:
>>> +            IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
>>> +    }
>>> +}
>>> +
>>> +static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
>>> +{
>>> +    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
>>> +}
>>> +
>>> +static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
>>> +{
>>> +
>>> +    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
>>> +    return 0;
>>> +}
>>> +
>>> +static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
>>> +{
>>> +
>>> +    IVShmemState *s = opaque;
>>> +    uint32_t ret;
>>> +
>>> +    switch (addr)
>>> +    {
>>> +        case IntrMask:
>>> +            ret = ivshmem_IntrMask_read(s);
>>> +            break;
>>> +
>>> +        case IntrStatus:
>>> +            ret = ivshmem_IntrStatus_read(s);
>>> +            break;
>>> +
>>> +        case IVPosition:
>>> +            /* return my VM ID if the memory is mapped */
>>> +            if (s->shm_fd > 0) {
>>> +                ret = s->vm_id;
>>> +            } else {
>>> +                ret = -1;
>>> +            }
>>> +            break;
>>> +
>>> +        default:
>>> +            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
>>> +            ret = 0;
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
>>> +{
>>> +    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void ivshmem_mmio_writeb(void *opaque,
>>> +                                target_phys_addr_t addr, uint32_t val)
>>> +{
>>> +    ivshmem_io_writeb(opaque, addr & 0xFF, val);
>>> +}
>>
>> This function and others below only performs a cast and useless
>> masking (the address passed is these days an offset from start of the
>> area). Please merge these to ivshmem_io_readl() etc.
>
> Ok, these are artifacts from basing my patch on rtl8139.c.  I'll remove them.
>
>>
>>> +
>>> +static void ivshmem_mmio_writew(void *opaque,
>>> +                                target_phys_addr_t addr, uint32_t val)
>>> +{
>>> +    ivshmem_io_writew(opaque, addr & 0xFF, val);
>>> +}
>>> +
>>> +static void ivshmem_mmio_writel(void *opaque,
>>> +                                target_phys_addr_t addr, uint32_t val)
>>> +{
>>> +    ivshmem_io_writel(opaque, addr & 0xFF, val);
>>> +}
>>> +
>>> +static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
>>> +{
>>> +    return ivshmem_io_readb(opaque, addr & 0xFF);
>>> +}
>>> +
>>> +static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
>>> +{
>>> +    uint32_t val = ivshmem_io_readw(opaque, addr & 0xFF);
>>> +    return val;
>>> +}
>>> +
>>> +static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
>>> +{
>>> +    uint32_t val = ivshmem_io_readl(opaque, addr & 0xFF);
>>> +    return val;
>>> +}
>>> +
>>> +static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
>>
>> Please add 'const'.
>>
>>> +    ivshmem_mmio_readb,
>>> +    ivshmem_mmio_readw,
>>> +    ivshmem_mmio_readl,
>>> +};
>>> +
>>> +static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
>>> +    ivshmem_mmio_writeb,
>>> +    ivshmem_mmio_writew,
>>> +    ivshmem_mmio_writel,
>>> +};
>>> +
>>> +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
>>> +{
>>> +    IVShmemState *s = opaque;
>>> +
>>> +    ivshmem_IntrStatus_write(s, *buf);
>>> +
>>> +    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
>>> +}
>>> +
>>> +static int ivshmem_can_receive(void * opaque)
>>> +{
>>> +    return 8;
>>> +}
>>> +
>>> +static void ivshmem_event(void *opaque, int event)
>>> +{
>>> +    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
>>> +}
>>> +
>>> +static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
>>> +
>>> +    EventfdEntry *entry = opaque;
>>> +    PCIDevice *pdev = entry->pdev;
>>> +
>>> +    IVSHMEM_DPRINTF("fake irqfd on vector %p %d\n", pdev, entry->vector);
>>> +    msix_notify(pdev, entry->vector);
>>> +}
>>> +
>>> +static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
>>> +                                                                    int vector)
>>> +{
>>> +    /* create a event character device based on the passed eventfd */
>>> +    IVShmemState *s = opaque;
>>> +    CharDriverState * chr;
>>> +
>>> +    chr = qemu_chr_open_eventfd(eventfd);
>>> +
>>> +    if (chr == NULL) {
>>> +        IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);
>>
>> This should not be a DPRINTF.
>>
>>> +        exit(-1);
>>> +    }
>>> +
>>> +    /* if MSI is supported we need multiple interrupts */
>>> +    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
>>> +        s->eventfd_table[vector].pdev = &s->dev;
>>> +        s->eventfd_table[vector].vector = vector;
>>> +
>>> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
>>> +                      ivshmem_event, &s->eventfd_table[vector]);
>>> +    } else {
>>> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
>>> +                      ivshmem_event, s);
>>> +    }
>>> +
>>> +    return chr;
>>> +
>>> +}
>>> +
>>> +static int check_shm_size(IVShmemState *s, int fd) {
>>> +    /* check that the guest isn't going to try and map more memory than the
>>> +     * the object has allocated return -1 to indicate error */
>>> +
>>> +    struct stat buf;
>>> +
>>> +    fstat(fd, &buf);
>>> +
>>> +    if (s->ivshmem_size > buf.st_size) {
>>> +        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
>>> +        fprintf(stderr, " than shared object size (%ld > %ld)\n",
>>> +                                          s->ivshmem_size, buf.st_size);
>>
>> Please use PRIx64 for s->ivshmem_size, this will cause a warning on ILP32.
>>
>>> +        return -1;
>>> +    } else {
>>> +        return 0;
>>> +    }
>>> +}
>>> +
>>> +/* create the shared memory BAR when we are not using the server, so we can
>>> + * create the BAR and map the memory immediately */
>>> +static void create_shared_memory_BAR(IVShmemState *s, int fd) {
>>> +
>>> +    void * ptr;
>>> +
>>> +    s->shm_fd = fd;
>>> +
>>> +    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
>>> +
>>> +    s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, ptr);
>>
>> qemu_ram_map() does not exist in HEAD.
>>
>
> Ok, so qemu_ram_map() and kvm_set_irq() are in the KVM HEAD.  I had my
> own version of both functions, but removed them when Marcelo's were
> merged into KVM.
>
>>> +
>>> +    /* region for shared memory */
>>> +    pci_register_bar(&s->dev, 2, s->ivshmem_size,
>>> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
>>> +}
>>> +
>>> +static void close_guest_eventfds(IVShmemState *s, int posn)
>>> +{
>>> +    int i, guest_curr_max;
>>> +
>>> +    guest_curr_max = s->peers[posn].nb_eventfds;
>>> +
>>> +    for (i = 0; i < guest_curr_max; i++)
>>> +        close(s->peers[posn].eventfds[i]);
>>
>> CODING_STYLE.
>>
>>> +
>>> +    qemu_free(s->peers[posn].eventfds);
>>> +    s->peers[posn].nb_eventfds = 0;
>>> +}
>>> +
>>> +static void setup_ioeventfds(IVShmemState *s) {
>>> +
>>> +    int i, j;
>>> +
>>> +    for (i = 0; i <= s->max_peer; i++) {
>>> +        for (j = 0; j < s->peers[i].nb_eventfds; j++) {
>>> +            kvm_set_ioeventfd_mmio_long(s->peers[i].eventfds[j],
>>> +                    s->mmio_addr + Doorbell, (i << 16) | j, 1);
>>> +        }
>>> +    }
>>> +
>>> +    /* setup irqfd for this VM's eventfds */
>>> +    for (i = 0; i < s->vectors; i++) {
>>> +        kvm_set_irqfd(s->dev.msix_irq_entries[i].gsi,
>>> +                        s->peers[s->vm_id].eventfds[i], 1);
>>
>> kvm_set_irqfd() does not exist in HEAD.
>>
>>> +    }
>>> +}
>>> +
>>> +
>>> +/* this function increase the dynamic storage need to store data about other
>>> + * guests */
>>> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
>>> +
>>> +    int j, old_nb_alloc;
>>> +
>>> +    old_nb_alloc = s->nb_peers;
>>> +
>>> +    while (new_min_size >= s->nb_peers)
>>> +        s->nb_peers = s->nb_peers * 2;
>>> +
>>> +    IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nb_peers);
>>> +    s->peers = qemu_realloc(s->peers, s->nb_peers * sizeof(Peer));
>>> +
>>> +    if (s->peers == NULL) {
>>> +        fprintf(stderr, "Allocation error - exiting\n");
>>> +        exit(1);
>>> +    }
>>
>> qemu_realloc will never return zero.
>>
>>> +
>>> +    /* zero out new pointers */
>>> +    for (j = old_nb_alloc; j < s->nb_peers; j++) {
>>> +        s->peers[j].eventfds = NULL;
>>> +        s->peers[j].nb_eventfds = 0;
>>> +    }
>>> +}
>>> +
>>> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
>>> +{
>>> +    IVShmemState *s = opaque;
>>> +    int incoming_fd, tmp_fd;
>>> +    int guest_curr_max;
>>> +    long incoming_posn;
>>> +
>>> +    memcpy(&incoming_posn, buf, sizeof(long));
>>> +    /* pick off s->server_chr->msgfd and store it, posn should accompany msg */
>>> +    tmp_fd = qemu_chr_get_msgfd(s->server_chr);
>>> +    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
>>> +
>>> +    /* make sure we have enough space for this guest */
>>> +    if (incoming_posn >= s->nb_peers) {
>>> +        increase_dynamic_storage(s, incoming_posn);
>>> +    }
>>> +
>>> +    if (tmp_fd == -1) {
>>> +        /* if posn is positive and unseen before then this is our posn*/
>>> +        if ((incoming_posn >= 0) && (s->peers[incoming_posn].eventfds == NULL)) {
>>> +            /* receive our posn */
>>> +            s->vm_id = incoming_posn;
>>> +            return;
>>> +        } else {
>>> +            /* otherwise an fd == -1 means an existing guest has gone away */
>>> +            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
>>> +            close_guest_eventfds(s, incoming_posn);
>>> +            return;
>>> +        }
>>> +    }
>>> +
>>> +    /* because of the implementation of get_msgfd, we need a dup */
>>> +    incoming_fd = dup(tmp_fd);
>>> +
>>> +    if (incoming_fd == -1) {
>>> +        fprintf(stderr, "could not allocate file descriptor %s\n",
>>> +                                                            strerror(errno));
>>> +        return;
>>> +    }
>>> +
>>> +    /* if the position is -1, then it's shared memory region fd */
>>> +    if (incoming_posn == -1) {
>>> +
>>> +        void * map_ptr;
>>> +
>>> +        s->max_peer = 0;
>>> +
>>> +        if (check_shm_size(s, incoming_fd) == -1) {
>>> +            exit(-1);
>>> +        }
>>> +
>>> +        /* mmap the region and map into the BAR2 */
>>> +        map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED,
>>> +                                                                incoming_fd, 0);
>>> +        s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, map_ptr);
>>> +
>>> +        IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = %u\n",
>>> +                        (uint32_t)s->shm_pci_addr, (uint32_t)s->ivshmem_offset,
>>> +                        (uint32_t)s->ivshmem_size);
>>> +
>>> +        if (s->shm_pci_addr > 0) {
>>> +            /* map memory into BAR2 */
>>> +            cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
>>> +                                                            s->ivshmem_offset);
>>> +            if (s->role && strncmp(s->role, "peer", 4) == 0) {
>>> +                IVSHMEM_DPRINTF("marking pages no migrate\n");
>>> +                cpu_mark_pages_no_migrate(s->ivshmem_offset, s->ivshmem_size);
>>> +            }
>>> +
>>> +        }
>>> +
>>> +        /* only store the fd if it is successfully mapped */
>>> +        s->shm_fd = incoming_fd;
>>> +
>>> +        return;
>>> +    }
>>> +
>>> +    /* each guest has an array of eventfds, and we keep track of how many
>>> +     * guests for each VM */
>>> +    guest_curr_max = s->peers[incoming_posn].nb_eventfds;
>>> +    if (guest_curr_max == 0) {
>>> +        /* one eventfd per MSI vector */
>>> +        s->peers[incoming_posn].eventfds = (int *) qemu_malloc(s->vectors *
>>> +                                                                sizeof(int));
>>
>> Useless cast in C.
>>
>>> +    }
>>> +
>>> +    /* this is an eventfd for a particular guest VM */
>>> +    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
>>> +                                                                incoming_fd);
>>> +    s->peers[incoming_posn].eventfds[guest_curr_max] = incoming_fd;
>>> +
>>> +    /* increment count for particular guest */
>>> +    s->peers[incoming_posn].nb_eventfds++;
>>> +
>>> +    /* keep track of the maximum VM ID */
>>> +    if (incoming_posn > s->max_peer) {
>>> +        s->max_peer = incoming_posn;
>>> +    }
>>> +
>>> +    if (incoming_posn == s->vm_id) {
>>> +        int vector = guest_curr_max;
>>
>> Why add a new variable?
>
> for clarity, so when looking at the code it's clear that the current
> maxium is a MSI vector.  Perhaps I'll rename guest_curr_max to achieve
> this.
>
>>
>>> +        if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>>> +            /* initialize char device for callback
>>> +             * if this is one of my eventfds */
>>> +            s->eventfd_chr[vector] = create_eventfd_chr_device(s,
>>> +                       s->peers[s->vm_id].eventfds[vector], vector);
>>> +        }
>>> +    }
>>> +
>>> +    return;
>>> +}
>>> +
>>> +static void ivshmem_reset(DeviceState *d)
>>> +{
>>> +    return;
>>
>> Nothing to do?
>
> Oversight, will fix.
>
>>
>>> +}
>>> +
>>> +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
>>> +                       pcibus_t addr, pcibus_t size, int type)
>>> +{
>>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
>>> +
>>> +    s->mmio_addr = addr;
>>> +    cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);
>>
>> The 0x400 should be #defined earlier. Why 0x400 since you are only
>> interested in the 0x100 first bytes?
>
> We bumped to 1MB for a previous version that had multiple doorbells.
> ioeventfd masks eliminated made multiple doorbells unnecessary.  I'll
> put it back to 0x100.
>
>>
>>> +
>>> +    /* ioeventfd and irqfd are enabled together,
>>> +     * so the flag IRQFD refers to both */
>>> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>>> +        setup_ioeventfds(s);
>>> +    }
>>> +}
>>> +
>>> +static uint64_t ivshmem_get_size(IVShmemState * s) {
>>> +
>>> +    uint64_t value;
>>> +    char *ptr;
>>> +
>>> +    value = strtoul(s->sizearg, &ptr, 10);
>>
>> I'd use strtoull() but the whole function should be suppressed, see below.
>>
>>> +    switch (*ptr) {
>>> +        case 0: case 'M': case 'm':
>>> +            value <<= 20;
>>> +            break;
>>> +        case 'G': case 'g':
>>> +            value <<= 30;
>>> +            break;
>>> +        default:
>>> +            fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
>>> +            exit(1);
>>> +    }
>>> +
>>> +    /* BARs must be a power of 2 */
>>> +    if (!is_power_of_two(value)) {
>>> +        fprintf(stderr, "ivshmem: size must be power of 2\n");
>>> +        exit(1);
>>> +    }
>>
>> Isn't the BAR check in pci.c enough?
>
> This would produce a clearer error that it is the ivshmem device that
> is the problem.
>
>>
>>> +
>>> +    return value;
>>> +
>>> +}
>>> +
>>> +static void ivshmem_setup_msi(IVShmemState * s) {
>>> +
>>> +    int i;
>>> +
>>> +    /* allocate the MSI-X vectors */
>>> +
>>> +    if (!msix_init(&s->dev, s->vectors, 1, 0)) {
>>> +        pci_register_bar(&s->dev, 1,
>>> +                         msix_bar_size(&s->dev),
>>> +                         PCI_BASE_ADDRESS_SPACE_MEMORY,
>>> +                         msix_mmio_map);
>>> +        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
>>> +    } else {
>>> +        IVSHMEM_DPRINTF("msix initialization failed\n");
>>
>> Is this fatal considering the msix_vector_use() below?
>
> Perhaps not, we could fall back to regular interrupts.  Could move the
> msix_vector_use() into the "then" clause.
>
>>
>>> +    }
>>> +
>>> +    /* 'activate' the vectors */
>>> +    for (i = 0; i < s->vectors; i++) {
>>> +        msix_vector_use(&s->dev, i);
>>> +    }
>>> +
>>> +    /* if IRQFDs are not supported, we'll have to trigger the interrupts
>>> +     * via Qemu char devices */
>>> +    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>>> +        /* for handling interrupts when IRQFD is not available */
>>> +        s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
>>> +    }
>>> +}
>>> +
>>> +static void ivshmem_save(QEMUFile* f, void *opaque)
>>> +{
>>> +    IVShmemState *proxy = opaque;
>>> +
>>> +    IVSHMEM_DPRINTF("ivshmem_save\n");
>>> +    pci_device_save(&proxy->dev, f);
>>> +
>>> +    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
>>> +        msix_save(&proxy->dev, f);
>>> +    } else {
>>> +        qemu_put_be32(f, proxy->intrstatus);
>>> +        qemu_put_be32(f, proxy->intrmask);
>>> +    }
>>> +
>>> +}
>>
>> There may be VMState magic to handle conditional structures (or just
>> make the structures unconditional), so VMState should be used instead.
>
> MSI-X requires the use of the old-style savevm/loadvm functions.
> Should VMState and the load/save be used together?

They do the same thing. Maybe MSI-X should be fixed then.

>>
>>> +
>>> +static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
>>> +{
>>> +    IVSHMEM_DPRINTF("ivshmem_load\n");
>>> +
>>> +    IVShmemState *proxy = opaque;
>>> +    int ret, i;
>>> +
>>
>> Missing version check.
>>
>>> +    ret = pci_device_load(&proxy->dev, f);
>>> +    if (ret) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
>>> +        msix_load(&proxy->dev, f);
>>> +        for (i = 0; i < proxy->vectors; i++) {
>>> +            msix_vector_use(&proxy->dev, i);
>>> +        }
>>> +    } else {
>>> +        proxy->intrstatus = qemu_get_be32(f);
>>> +        proxy->intrmask = qemu_get_be32(f);
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int pci_ivshmem_init(PCIDevice *dev)
>>> +{
>>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>>> +    uint8_t *pci_conf;
>>> +
>>> +    if (s->sizearg == NULL)
>>> +        s->ivshmem_size = 4 << 20; /* 4 MB default */
>>> +    else {
>>> +        s->ivshmem_size = ivshmem_get_size(s);
>>> +    }
>>> +
>>> +    register_savevm("ivshmem", 0, 0, ivshmem_save, ivshmem_load, dev);
>>> +
>>> +    /* IRQFD requires MSI */
>>> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD) &&
>>> +        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
>>> +        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
>>> +        exit(1);
>>> +    }
>>> +
>>> +    /* check that role is reasonable */
>>> +    if (s->role && !((strncmp(s->role, "peer", 5) == 0) ||
>>> +                        (strncmp(s->role, "master", 7) == 0))) {
>>> +        fprintf(stderr, "ivshmem: 'role' must be 'peer' or 'master'\n");
>>> +        exit(1);
>>> +    }
>>
>> I'd add a scalar flag in IVShmemState for role so that further strcmps
>> are avoided.
>>
>>> +
>>> +    pci_conf = s->dev.config;
>>> +    pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
>>> +    pci_conf[0x01] = 0x1a;
>>> +    pci_conf[0x02] = 0x10;
>>> +    pci_conf[0x03] = 0x11;
>>
>> Please add the DID to hw/pci_ids.h and use pci_config_set_xyz() here.
>>
>>> +    pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
>>> +    pci_conf[0x0a] = 0x00; /* RAM controller */
>>> +    pci_conf[0x0b] = 0x05;
>>> +    pci_conf[0x0e] = 0x00; /* header_type */
>>> +
>>> +    pci_conf[PCI_INTERRUPT_PIN] = 1;
>>> +
>>> +    s->shm_pci_addr = 0;
>>> +    s->ivshmem_offset = 0;
>>> +    s->shm_fd = 0;
>>> +
>>> +    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
>>> +                                    ivshmem_mmio_write, s);
>>> +    /* region for registers*/
>>> +    pci_register_bar(&s->dev, 0, 0x400,
>>> +                           PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
>>> +
>>> +    if ((s->server_chr != NULL) &&
>>> +                        (strncmp(s->server_chr->filename, "unix:", 5) == 0)) {
>>> +        /* if we get a UNIX socket as the parameter we will talk
>>> +         * to the ivshmem server to receive the memory region */
>>> +
>>> +        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
>>> +                                                    s->server_chr->filename);
>>> +
>>> +        if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
>>> +            ivshmem_setup_msi(s);
>>> +        }
>>> +
>>> +        /* we allocate enough space for 16 guests and grow as needed */
>>> +        s->nb_peers = 16;
>>> +        s->vm_id = -1;
>>> +
>>> +        /* allocate/initialize space for interrupt handling */
>>> +        s->peers = qemu_mallocz(s->nb_peers * sizeof(Peer));
>>> +
>>> +        pci_register_bar(&s->dev, 2, s->ivshmem_size,
>>> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
>>> +
>>> +        s->eventfd_chr = (CharDriverState **) qemu_mallocz(s->vectors *
>>> +                                                sizeof(CharDriverState *));
>>
>> Useless cast in C.
>>
>>> +
>>> +        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read,
>>> +                     ivshmem_event, s);
>>> +    } else {
>>> +        /* just map the file immediately, we're not using a server */
>>> +        int fd;
>>> +
>>> +        if (s->shmobj == NULL) {
>>> +            fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
>>> +        }
>>
>> I'd rather have separate 'chardev' and 'shm_file' parameters. Then
>> 'info qtree' could return more useful information about the chardev.
>
> can you elaborate?  the command-line currently must be one of the following
>
> server case:
> -ivshmem chardev=x,...
> -chardev id=x,...
>
> or
>
> non-server case:
> -ivshmem shm=<name>,...

Never mind, I was confused.

>
>>
>>> +
>>> +        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
>>> +
>>> +        /* try opening with O_EXCL and if it succeeds zero the memory
>>> +         * by truncating to 0 */
>>> +        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
>>> +                        S_IRWXU|S_IRWXG|S_IRWXO)) > 0) {
>>> +           /* truncate file to length PCI device's memory */
>>> +            if (ftruncate(fd, s->ivshmem_size) != 0) {
>>> +                fprintf(stderr, "kvm_ivshmem: could not truncate shared file\n");
>>
>> Why 'kvm_ivshmem'?
>
> old name, will remove.
>
>>
>>> +            }
>>> +
>>> +        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
>>> +                        S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
>>> +            fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
>>> +            exit(-1);
>>> +
>>> +        }
>>> +
>>> +        if (check_shm_size(s, fd) == -1) {
>>> +            exit(-1);
>>> +        }
>>> +
>>> +        create_shared_memory_BAR(s, fd);
>>> +
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int pci_ivshmem_uninit(PCIDevice *dev)
>>> +{
>>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>>> +
>>> +    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static PCIDeviceInfo ivshmem_info = {
>>> +    .qdev.name  = "ivshmem",
>>> +    .qdev.size  = sizeof(IVShmemState),
>>> +    .qdev.reset = ivshmem_reset,
>>> +    .init       = pci_ivshmem_init,
>>> +    .exit       = pci_ivshmem_uninit,
>>> +    .qdev.props = (Property[]) {
>>> +        DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
>>> +        DEFINE_PROP_STRING("size", IVShmemState, sizearg),
>>
>> This should be scalar type, not string.
>
> but it needs to handle the 'm' and 'g' suffixes for memory sizes.
>
>>
>>> +        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
>>> +        DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, false),
>>> +        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
>>> +        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
>>> +        DEFINE_PROP_STRING("role", IVShmemState, role),
>>> +        DEFINE_PROP_END_OF_LIST(),
>>> +    }
>>> +};
>>> +
>>> +static void ivshmem_register_devices(void)
>>> +{
>>> +    pci_qdev_register(&ivshmem_info);
>>> +}
>>> +
>>> +device_init(ivshmem_register_devices)
>>> diff --git a/qemu-char.c b/qemu-char.c
>>> index ac65a1c..b2e50d0 100644
>>> --- a/qemu-char.c
>>> +++ b/qemu-char.c
>>> @@ -2093,6 +2093,12 @@ static void tcp_chr_read(void *opaque)
>>>     }
>>>  }
>>>
>>> +CharDriverState *qemu_chr_open_eventfd(int eventfd){
>>> +
>>> +    return qemu_chr_open_fd(eventfd, eventfd);
>>> +
>>> +}
>>> +
>>>  static void tcp_chr_connect(void *opaque)
>>>  {
>>>     CharDriverState *chr = opaque;
>>> diff --git a/qemu-char.h b/qemu-char.h
>>> index e3a0783..6ea01ba 100644
>>> --- a/qemu-char.h
>>> +++ b/qemu-char.h
>>> @@ -94,6 +94,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
>>>  void qemu_chr_info(Monitor *mon, QObject **ret_data);
>>>  CharDriverState *qemu_chr_find(const char *name);
>>>
>>> +/* add an eventfd to the qemu devices that are polled */
>>> +CharDriverState *qemu_chr_open_eventfd(int eventfd);
>>
>> Maybe this should be removed and just open coded with qemu_chr_open_fd.
>
> I'm indifferent, it just looked a little funny passing the parameter twice.
>
>>
>>> +
>>>  extern int term_escape_char;
>>>
>>>  /* async I/O support */
>>> diff --git a/qemu-doc.texi b/qemu-doc.texi
>>> index 6647b7b..24f8748 100644
>>> --- a/qemu-doc.texi
>>> +++ b/qemu-doc.texi
>>> @@ -706,6 +706,49 @@ Using the @option{-net socket} option, it is possible to make VLANs
>>>  that span several QEMU instances. See @ref{sec_invocation} to have a
>>>  basic example.
>>>
>>> +@section Other Devices
>>> +
>>> +@subsection Inter-VM Shared Memory device
>>> +
>>> +With KVM enabled on a Linux host, a shared memory device is available.  Guests
>>> +map a POSIX shared memory region into the guest as a PCI device that enables
>>> +zero-copy communication to the application level of the guests.  The basic
>>> +syntax is:
>>> +
>>> +@example
>>> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>> +@end example
>>> +
>>> +If desired, interrupts can be sent between guest VMs accessing the same shared
>>> +memory region.  Interrupt support requires using a shared memory server and
>>> +using a chardev socket to connect to it.  The code for the shared memory server
>>> +is qemu.git/contrib/ivshmem-server.  An example syntax when using the shared
>>> +memory server is:
>>> +
>>> +@example
>>> +qemu -device ivshmem,size=<size in format accepted by -m>[,chardev=<id>]
>>> +                        [,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
>>> +qemu -chardev socket,path=<path>,id=<id>
>>> +@end example
>>> +
>>> +When using the server, the guest will be assigned a VM ID (>=0) that allows guests
>>> +using the same server to communicate via interrupts.  Guests can read their
>>> +VM ID from a device register (see example code).  Since receiving the shared
>>> +memory region from the server is asynchronous, there is a (small) chance the
>>> +guest may boot before the shared memory is attached.  To allow an application
>>> +to ensure shared memory is attached, the VM ID register will return -1 (an
>>> +invalid VM ID) until the memory is attached.  Once the shared memory is
>>> +attached, the VM ID will return the guest's valid VM ID.  With these semantics,
>>> +the guest application can check to ensure the shared memory is attached to the
>>> +guest before proceeding.
>>> +
>>> +The @option{role} argument can be set to either master or peer and will affect
>>> +how the shared memory is migrated.  With @option{role=master}, the guest will
>>> +copy the shared memory on migration to the destination host.  With
>>> +@option{role=peer}, the shared memory will not be copied on migration.  Only
>>> +one guest should be specified as
>>> +the master.
>>> +
>>>  @node direct_linux_boot
>>>  @section Direct Linux Boot
>>>
>>> --
>>> 1.6.3.2.198.g6096d
>>>
>>>
>>>
>>
>>
>
diff mbox

Patch

diff --git a/Makefile.target b/Makefile.target
index c4ba592..4888308 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -202,6 +202,9 @@  obj-$(CONFIG_USB_OHCI) += usb-ohci.o
 obj-y += rtl8139.o
 obj-y += e1000.o
 
+# Inter-VM PCI shared memory
+obj-y += ivshmem.o
+
 # Hardware support
 obj-i386-y += vga.o
 obj-i386-y += mc146818rtc.o i8259.o pc.o
diff --git a/hw/ivshmem.c b/hw/ivshmem.c
new file mode 100644
index 0000000..9057612
--- /dev/null
+++ b/hw/ivshmem.c
@@ -0,0 +1,852 @@ 
+/*
+ * Inter-VM Shared Memory PCI device.
+ *
+ * Author:
+ *      Cam Macdonell <cam@cs.ualberta.ca>
+ *
+ * Based On: cirrus_vga.c
+ *          Copyright (c) 2004 Fabrice Bellard
+ *          Copyright (c) 2004 Makoto Suzuki (suzu)
+ *
+ *      and rtl8139.c
+ *          Copyright (c) 2006 Igor Kovalenko
+ *
+ * This code is licensed under the GNU GPL v2.
+ */
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include "hw.h"
+#include "console.h"
+#include "pc.h"
+#include "pci.h"
+#include "sysemu.h"
+
+#include "msix.h"
+#include "qemu-kvm.h"
+#include "libkvm.h"
+
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+
+#define IVSHMEM_IRQFD   0
+#define IVSHMEM_MSI     1
+
+//#define DEBUG_IVSHMEM
+#ifdef DEBUG_IVSHMEM
+#define IVSHMEM_DPRINTF(fmt, args...)        \
+    do {printf("IVSHMEM: " fmt, ##args); } while (0)
+#else
+#define IVSHMEM_DPRINTF(fmt, args...)
+#endif
+
+typedef struct Peer {
+    int nb_eventfds;
+    int *eventfds;
+} Peer;
+
+typedef struct EventfdEntry {
+    PCIDevice *pdev;
+    int vector;
+} EventfdEntry;
+
+typedef struct IVShmemState {
+    PCIDevice dev;
+    uint32_t intrmask;
+    uint32_t intrstatus;
+    uint32_t doorbell;
+
+    CharDriverState ** eventfd_chr;
+    CharDriverState * server_chr;
+    int ivshmem_mmio_io_addr;
+
+    pcibus_t mmio_addr;
+    pcibus_t shm_pci_addr;
+    uint64_t ivshmem_offset;
+    uint64_t ivshmem_size; /* size of shared memory region */
+    int shm_fd; /* shared memory file descriptor */
+
+    Peer *peers;
+    int nb_peers; /* how many guests we have space for */
+    int max_peer; /* maximum numbered peer */
+
+    int vm_id;
+    uint32_t vectors;
+    uint32_t features;
+    EventfdEntry *eventfd_table;
+
+    char * shmobj;
+    char * sizearg;
+    char * role;
+} IVShmemState;
+
+/* registers for the Inter-VM shared memory device */
+enum ivshmem_registers {
+    IntrMask = 0,
+    IntrStatus = 4,
+    IVPosition = 8,
+    Doorbell = 12,
+};
+
+static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
+    return (ivs->features & (1 << feature));
+}
+
+static inline bool is_power_of_two(uint64_t x) {
+    return (x & (x - 1)) == 0;
+}
+
+static void ivshmem_map(PCIDevice *pci_dev, int region_num,
+                    pcibus_t addr, pcibus_t size, int type)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
+
+    s->shm_pci_addr = addr;
+
+    if (s->ivshmem_offset > 0) {
+        cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
+                                                            s->ivshmem_offset);
+        if (s->role && strncmp(s->role, "peer", 4) == 0) {
+            IVSHMEM_DPRINTF("marking pages no migrate\n");
+            cpu_mark_pages_no_migrate(s->ivshmem_offset, s->ivshmem_size);
+        }
+    }
+
+    IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = %u\n",
+                (uint32_t)addr, (uint32_t)s->ivshmem_offset, (uint32_t)size);
+
+}
+
+/* accessing registers - based on rtl8139 */
+static void ivshmem_update_irq(IVShmemState *s, int val)
+{
+    int isr;
+    isr = (s->intrstatus & s->intrmask) & 0xffffffff;
+
+    /* don't print ISR resets */
+    if (isr) {
+        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
+           isr ? 1 : 0, s->intrstatus, s->intrmask);
+    }
+
+    qemu_set_irq(s->dev.irq[0], (isr != 0));
+}
+
+static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
+
+    s->intrmask = val;
+
+    ivshmem_update_irq(s, val);
+}
+
+static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrmask;
+
+    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
+
+    return ret;
+}
+
+static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
+
+    s->intrstatus = val;
+
+    ivshmem_update_irq(s, val);
+    return;
+}
+
+static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrstatus;
+
+    /* reading ISR clears all interrupts */
+    s->intrstatus = 0;
+
+    ivshmem_update_irq(s, 0);
+
+    return ret;
+}
+
+static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
+{
+
+    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
+}
+
+static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVShmemState *s = opaque;
+
+    u_int64_t write_one = 1;
+    u_int16_t dest = val >> 16;
+    u_int16_t vector = val & 0xff;
+
+    addr &= 0xfc;
+
+    switch (addr)
+    {
+        case IntrMask:
+            ivshmem_IntrMask_write(s, val);
+            break;
+
+        case IntrStatus:
+            ivshmem_IntrStatus_write(s, val);
+            break;
+
+        case Doorbell:
+            /* check that dest VM ID is reasonable */
+            if ((dest < 0) || (dest > s->max_peer)) {
+                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
+                break;
+            }
+
+            /* check doorbell range */
+            if ((vector >= 0) && (vector < s->peers[dest].nb_eventfds)) {
+                IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n",
+                                                    write_one, dest, vector);
+                if (write(s->peers[dest].eventfds[vector],
+                                                    &(write_one), 8) != 8) {
+                    IVSHMEM_DPRINTF("error writing to eventfd\n");
+                }
+            }
+            break;
+        default:
+            IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
+    }
+}
+
+static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
+}
+
+static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
+{
+
+    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
+    return 0;
+}
+
+static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
+{
+
+    IVShmemState *s = opaque;
+    uint32_t ret;
+
+    switch (addr)
+    {
+        case IntrMask:
+            ret = ivshmem_IntrMask_read(s);
+            break;
+
+        case IntrStatus:
+            ret = ivshmem_IntrStatus_read(s);
+            break;
+
+        case IVPosition:
+            /* return my VM ID if the memory is mapped */
+            if (s->shm_fd > 0) {
+                ret = s->vm_id;
+            } else {
+                ret = -1;
+            }
+            break;
+
+        default:
+            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
+            ret = 0;
+    }
+
+    return ret;
+}
+
+static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
+{
+    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
+
+    return 0;
+}
+
+static void ivshmem_mmio_writeb(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writeb(opaque, addr & 0xFF, val);
+}
+
+static void ivshmem_mmio_writew(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writew(opaque, addr & 0xFF, val);
+}
+
+static void ivshmem_mmio_writel(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writel(opaque, addr & 0xFF, val);
+}
+
+static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    return ivshmem_io_readb(opaque, addr & 0xFF);
+}
+
+static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t val = ivshmem_io_readw(opaque, addr & 0xFF);
+    return val;
+}
+
+static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t val = ivshmem_io_readl(opaque, addr & 0xFF);
+    return val;
+}
+
+static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
+    ivshmem_mmio_readb,
+    ivshmem_mmio_readw,
+    ivshmem_mmio_readl,
+};
+
+static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
+    ivshmem_mmio_writeb,
+    ivshmem_mmio_writew,
+    ivshmem_mmio_writel,
+};
+
+static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
+{
+    IVShmemState *s = opaque;
+
+    ivshmem_IntrStatus_write(s, *buf);
+
+    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
+}
+
+static int ivshmem_can_receive(void * opaque)
+{
+    return 8;
+}
+
+static void ivshmem_event(void *opaque, int event)
+{
+    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
+}
+
+static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
+
+    EventfdEntry *entry = opaque;
+    PCIDevice *pdev = entry->pdev;
+
+    IVSHMEM_DPRINTF("fake irqfd on vector %p %d\n", pdev, entry->vector);
+    msix_notify(pdev, entry->vector);
+}
+
+static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
+                                                                    int vector)
+{
+    /* create a event character device based on the passed eventfd */
+    IVShmemState *s = opaque;
+    CharDriverState * chr;
+
+    chr = qemu_chr_open_eventfd(eventfd);
+
+    if (chr == NULL) {
+        IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);
+        exit(-1);
+    }
+
+    /* if MSI is supported we need multiple interrupts */
+    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
+        s->eventfd_table[vector].pdev = &s->dev;
+        s->eventfd_table[vector].vector = vector;
+
+        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
+                      ivshmem_event, &s->eventfd_table[vector]);
+    } else {
+        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
+                      ivshmem_event, s);
+    }
+
+    return chr;
+
+}
+
+static int check_shm_size(IVShmemState *s, int fd) {
+    /* check that the guest isn't going to try and map more memory than the
+     * the object has allocated return -1 to indicate error */
+
+    struct stat buf;
+
+    fstat(fd, &buf);
+
+    if (s->ivshmem_size > buf.st_size) {
+        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
+        fprintf(stderr, " than shared object size (%ld > %ld)\n",
+                                          s->ivshmem_size, buf.st_size);
+        return -1;
+    } else {
+        return 0;
+    }
+}
+
+/* create the shared memory BAR when we are not using the server, so we can
+ * create the BAR and map the memory immediately */
+static void create_shared_memory_BAR(IVShmemState *s, int fd) {
+
+    void * ptr;
+
+    s->shm_fd = fd;
+
+    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+
+    s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, ptr);
+
+    /* region for shared memory */
+    pci_register_bar(&s->dev, 2, s->ivshmem_size,
+                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
+}
+
+static void close_guest_eventfds(IVShmemState *s, int posn)
+{
+    int i, guest_curr_max;
+
+    guest_curr_max = s->peers[posn].nb_eventfds;
+
+    for (i = 0; i < guest_curr_max; i++)
+        close(s->peers[posn].eventfds[i]);
+
+    qemu_free(s->peers[posn].eventfds);
+    s->peers[posn].nb_eventfds = 0;
+}
+
+static void setup_ioeventfds(IVShmemState *s) {
+
+    int i, j;
+
+    for (i = 0; i <= s->max_peer; i++) {
+        for (j = 0; j < s->peers[i].nb_eventfds; j++) {
+            kvm_set_ioeventfd_mmio_long(s->peers[i].eventfds[j],
+                    s->mmio_addr + Doorbell, (i << 16) | j, 1);
+        }
+    }
+
+    /* setup irqfd for this VM's eventfds */
+    for (i = 0; i < s->vectors; i++) {
+        kvm_set_irqfd(s->dev.msix_irq_entries[i].gsi,
+                        s->peers[s->vm_id].eventfds[i], 1);
+    }
+}
+
+
+/* this function increase the dynamic storage need to store data about other
+ * guests */
+static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
+
+    int j, old_nb_alloc;
+
+    old_nb_alloc = s->nb_peers;
+
+    while (new_min_size >= s->nb_peers)
+        s->nb_peers = s->nb_peers * 2;
+
+    IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nb_peers);
+    s->peers = qemu_realloc(s->peers, s->nb_peers * sizeof(Peer));
+
+    if (s->peers == NULL) {
+        fprintf(stderr, "Allocation error - exiting\n");
+        exit(1);
+    }
+
+    /* zero out new pointers */
+    for (j = old_nb_alloc; j < s->nb_peers; j++) {
+        s->peers[j].eventfds = NULL;
+        s->peers[j].nb_eventfds = 0;
+    }
+}
+
+static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
+{
+    IVShmemState *s = opaque;
+    int incoming_fd, tmp_fd;
+    int guest_curr_max;
+    long incoming_posn;
+
+    memcpy(&incoming_posn, buf, sizeof(long));
+    /* pick off s->server_chr->msgfd and store it, posn should accompany msg */
+    tmp_fd = qemu_chr_get_msgfd(s->server_chr);
+    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
+
+    /* make sure we have enough space for this guest */
+    if (incoming_posn >= s->nb_peers) {
+        increase_dynamic_storage(s, incoming_posn);
+    }
+
+    if (tmp_fd == -1) {
+        /* if posn is positive and unseen before then this is our posn*/
+        if ((incoming_posn >= 0) && (s->peers[incoming_posn].eventfds == NULL)) {
+            /* receive our posn */
+            s->vm_id = incoming_posn;
+            return;
+        } else {
+            /* otherwise an fd == -1 means an existing guest has gone away */
+            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
+            close_guest_eventfds(s, incoming_posn);
+            return;
+        }
+    }
+
+    /* because of the implementation of get_msgfd, we need a dup */
+    incoming_fd = dup(tmp_fd);
+
+    if (incoming_fd == -1) {
+        fprintf(stderr, "could not allocate file descriptor %s\n",
+                                                            strerror(errno));
+        return;
+    }
+
+    /* if the position is -1, then it's shared memory region fd */
+    if (incoming_posn == -1) {
+
+        void * map_ptr;
+
+        s->max_peer = 0;
+
+        if (check_shm_size(s, incoming_fd) == -1) {
+            exit(-1);
+        }
+
+        /* mmap the region and map into the BAR2 */
+        map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED,
+                                                                incoming_fd, 0);
+        s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, map_ptr);
+
+        IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = %u\n",
+                        (uint32_t)s->shm_pci_addr, (uint32_t)s->ivshmem_offset,
+                        (uint32_t)s->ivshmem_size);
+
+        if (s->shm_pci_addr > 0) {
+            /* map memory into BAR2 */
+            cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
+                                                            s->ivshmem_offset);
+            if (s->role && strncmp(s->role, "peer", 4) == 0) {
+                IVSHMEM_DPRINTF("marking pages no migrate\n");
+                cpu_mark_pages_no_migrate(s->ivshmem_offset, s->ivshmem_size);
+            }
+
+        }
+
+        /* only store the fd if it is successfully mapped */
+        s->shm_fd = incoming_fd;
+
+        return;
+    }
+
+    /* each guest has an array of eventfds, and we keep track of how many
+     * guests for each VM */
+    guest_curr_max = s->peers[incoming_posn].nb_eventfds;
+    if (guest_curr_max == 0) {
+        /* one eventfd per MSI vector */
+        s->peers[incoming_posn].eventfds = (int *) qemu_malloc(s->vectors *
+                                                                sizeof(int));
+    }
+
+    /* this is an eventfd for a particular guest VM */
+    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
+                                                                incoming_fd);
+    s->peers[incoming_posn].eventfds[guest_curr_max] = incoming_fd;
+
+    /* increment count for particular guest */
+    s->peers[incoming_posn].nb_eventfds++;
+
+    /* keep track of the maximum VM ID */
+    if (incoming_posn > s->max_peer) {
+        s->max_peer = incoming_posn;
+    }
+
+    if (incoming_posn == s->vm_id) {
+        int vector = guest_curr_max;
+        if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
+            /* initialize char device for callback
+             * if this is one of my eventfds */
+            s->eventfd_chr[vector] = create_eventfd_chr_device(s,
+                       s->peers[s->vm_id].eventfds[vector], vector);
+        }
+    }
+
+    return;
+}
+
+static void ivshmem_reset(DeviceState *d)
+{
+    return;
+}
+
+static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
+                       pcibus_t addr, pcibus_t size, int type)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
+
+    s->mmio_addr = addr;
+    cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);
+
+    /* ioeventfd and irqfd are enabled together,
+     * so the flag IRQFD refers to both */
+    if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
+        setup_ioeventfds(s);
+    }
+}
+
+static uint64_t ivshmem_get_size(IVShmemState * s) {
+
+    uint64_t value;
+    char *ptr;
+
+    value = strtoul(s->sizearg, &ptr, 10);
+    switch (*ptr) {
+        case 0: case 'M': case 'm':
+            value <<= 20;
+            break;
+        case 'G': case 'g':
+            value <<= 30;
+            break;
+        default:
+            fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
+            exit(1);
+    }
+
+    /* BARs must be a power of 2 */
+    if (!is_power_of_two(value)) {
+        fprintf(stderr, "ivshmem: size must be power of 2\n");
+        exit(1);
+    }
+
+    return value;
+
+}
+
+static void ivshmem_setup_msi(IVShmemState * s) {
+
+    int i;
+
+    /* allocate the MSI-X vectors */
+
+    if (!msix_init(&s->dev, s->vectors, 1, 0)) {
+        pci_register_bar(&s->dev, 1,
+                         msix_bar_size(&s->dev),
+                         PCI_BASE_ADDRESS_SPACE_MEMORY,
+                         msix_mmio_map);
+        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
+    } else {
+        IVSHMEM_DPRINTF("msix initialization failed\n");
+    }
+
+    /* 'activate' the vectors */
+    for (i = 0; i < s->vectors; i++) {
+        msix_vector_use(&s->dev, i);
+    }
+
+    /* if IRQFDs are not supported, we'll have to trigger the interrupts
+     * via Qemu char devices */
+    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
+        /* for handling interrupts when IRQFD is not available */
+        s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
+    }
+}
+
+static void ivshmem_save(QEMUFile* f, void *opaque)
+{
+    IVShmemState *proxy = opaque;
+
+    IVSHMEM_DPRINTF("ivshmem_save\n");
+    pci_device_save(&proxy->dev, f);
+
+    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
+        msix_save(&proxy->dev, f);
+    } else {
+        qemu_put_be32(f, proxy->intrstatus);
+        qemu_put_be32(f, proxy->intrmask);
+    }
+
+}
+
+static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
+{
+    IVSHMEM_DPRINTF("ivshmem_load\n");
+
+    IVShmemState *proxy = opaque;
+    int ret, i;
+
+    ret = pci_device_load(&proxy->dev, f);
+    if (ret) {
+        return ret;
+    }
+
+    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
+        msix_load(&proxy->dev, f);
+        for (i = 0; i < proxy->vectors; i++) {
+            msix_vector_use(&proxy->dev, i);
+        }
+    } else {
+        proxy->intrstatus = qemu_get_be32(f);
+        proxy->intrmask = qemu_get_be32(f);
+    }
+
+    return 0;
+}
+
+static int pci_ivshmem_init(PCIDevice *dev)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+    uint8_t *pci_conf;
+
+    if (s->sizearg == NULL)
+        s->ivshmem_size = 4 << 20; /* 4 MB default */
+    else {
+        s->ivshmem_size = ivshmem_get_size(s);
+    }
+
+    register_savevm("ivshmem", 0, 0, ivshmem_save, ivshmem_load, dev);
+
+    /* IRQFD requires MSI */
+    if (ivshmem_has_feature(s, IVSHMEM_IRQFD) &&
+        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
+        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
+        exit(1);
+    }
+
+    /* check that role is reasonable */
+    if (s->role && !((strncmp(s->role, "peer", 5) == 0) ||
+                        (strncmp(s->role, "master", 7) == 0))) {
+        fprintf(stderr, "ivshmem: 'role' must be 'peer' or 'master'\n");
+        exit(1);
+    }
+
+    pci_conf = s->dev.config;
+    pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
+    pci_conf[0x01] = 0x1a;
+    pci_conf[0x02] = 0x10;
+    pci_conf[0x03] = 0x11;
+    pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
+    pci_conf[0x0a] = 0x00; /* RAM controller */
+    pci_conf[0x0b] = 0x05;
+    pci_conf[0x0e] = 0x00; /* header_type */
+
+    pci_conf[PCI_INTERRUPT_PIN] = 1;
+
+    s->shm_pci_addr = 0;
+    s->ivshmem_offset = 0;
+    s->shm_fd = 0;
+
+    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
+                                    ivshmem_mmio_write, s);
+    /* region for registers*/
+    pci_register_bar(&s->dev, 0, 0x400,
+                           PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
+
+    if ((s->server_chr != NULL) &&
+                        (strncmp(s->server_chr->filename, "unix:", 5) == 0)) {
+        /* if we get a UNIX socket as the parameter we will talk
+         * to the ivshmem server to receive the memory region */
+
+        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
+                                                    s->server_chr->filename);
+
+        if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
+            ivshmem_setup_msi(s);
+        }
+
+        /* we allocate enough space for 16 guests and grow as needed */
+        s->nb_peers = 16;
+        s->vm_id = -1;
+
+        /* allocate/initialize space for interrupt handling */
+        s->peers = qemu_mallocz(s->nb_peers * sizeof(Peer));
+
+        pci_register_bar(&s->dev, 2, s->ivshmem_size,
+                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
+
+        s->eventfd_chr = (CharDriverState **) qemu_mallocz(s->vectors *
+                                                sizeof(CharDriverState *));
+
+        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read,
+                     ivshmem_event, s);
+    } else {
+        /* just map the file immediately, we're not using a server */
+        int fd;
+
+        if (s->shmobj == NULL) {
+            fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
+        }
+
+        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
+
+        /* try opening with O_EXCL and if it succeeds zero the memory
+         * by truncating to 0 */
+        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
+                        S_IRWXU|S_IRWXG|S_IRWXO)) > 0) {
+           /* truncate file to length PCI device's memory */
+            if (ftruncate(fd, s->ivshmem_size) != 0) {
+                fprintf(stderr, "kvm_ivshmem: could not truncate shared file\n");
+            }
+
+        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
+                        S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
+            fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
+            exit(-1);
+
+        }
+
+        if (check_shm_size(s, fd) == -1) {
+            exit(-1);
+        }
+
+        create_shared_memory_BAR(s, fd);
+
+    }
+
+    return 0;
+}
+
+static int pci_ivshmem_uninit(PCIDevice *dev)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+
+    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
+
+    return 0;
+}
+
+static PCIDeviceInfo ivshmem_info = {
+    .qdev.name  = "ivshmem",
+    .qdev.size  = sizeof(IVShmemState),
+    .qdev.reset = ivshmem_reset,
+    .init       = pci_ivshmem_init,
+    .exit       = pci_ivshmem_uninit,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
+        DEFINE_PROP_STRING("size", IVShmemState, sizearg),
+        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
+        DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, false),
+        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
+        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
+        DEFINE_PROP_STRING("role", IVShmemState, role),
+        DEFINE_PROP_END_OF_LIST(),
+    }
+};
+
+static void ivshmem_register_devices(void)
+{
+    pci_qdev_register(&ivshmem_info);
+}
+
+device_init(ivshmem_register_devices)
diff --git a/qemu-char.c b/qemu-char.c
index ac65a1c..b2e50d0 100644
--- a/qemu-char.c
+++ b/qemu-char.c
@@ -2093,6 +2093,12 @@  static void tcp_chr_read(void *opaque)
     }
 }
 
+CharDriverState *qemu_chr_open_eventfd(int eventfd){
+
+    return qemu_chr_open_fd(eventfd, eventfd);
+
+}
+
 static void tcp_chr_connect(void *opaque)
 {
     CharDriverState *chr = opaque;
diff --git a/qemu-char.h b/qemu-char.h
index e3a0783..6ea01ba 100644
--- a/qemu-char.h
+++ b/qemu-char.h
@@ -94,6 +94,9 @@  void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
 void qemu_chr_info(Monitor *mon, QObject **ret_data);
 CharDriverState *qemu_chr_find(const char *name);
 
+/* add an eventfd to the qemu devices that are polled */
+CharDriverState *qemu_chr_open_eventfd(int eventfd);
+
 extern int term_escape_char;
 
 /* async I/O support */
diff --git a/qemu-doc.texi b/qemu-doc.texi
index 6647b7b..24f8748 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -706,6 +706,49 @@  Using the @option{-net socket} option, it is possible to make VLANs
 that span several QEMU instances. See @ref{sec_invocation} to have a
 basic example.
 
+@section Other Devices
+
+@subsection Inter-VM Shared Memory device
+
+With KVM enabled on a Linux host, a shared memory device is available.  Guests
+map a POSIX shared memory region into the guest as a PCI device that enables
+zero-copy communication to the application level of the guests.  The basic
+syntax is:
+
+@example
+qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
+@end example
+
+If desired, interrupts can be sent between guest VMs accessing the same shared
+memory region.  Interrupt support requires using a shared memory server and
+using a chardev socket to connect to it.  The code for the shared memory server
+is qemu.git/contrib/ivshmem-server.  An example syntax when using the shared
+memory server is:
+
+@example
+qemu -device ivshmem,size=<size in format accepted by -m>[,chardev=<id>]
+                        [,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
+qemu -chardev socket,path=<path>,id=<id>
+@end example
+
+When using the server, the guest will be assigned a VM ID (>=0) that allows guests
+using the same server to communicate via interrupts.  Guests can read their
+VM ID from a device register (see example code).  Since receiving the shared
+memory region from the server is asynchronous, there is a (small) chance the
+guest may boot before the shared memory is attached.  To allow an application
+to ensure shared memory is attached, the VM ID register will return -1 (an
+invalid VM ID) until the memory is attached.  Once the shared memory is
+attached, the VM ID will return the guest's valid VM ID.  With these semantics,
+the guest application can check to ensure the shared memory is attached to the
+guest before proceeding.
+
+The @option{role} argument can be set to either master or peer and will affect
+how the shared memory is migrated.  With @option{role=master}, the guest will
+copy the shared memory on migration to the destination host.  With
+@option{role=peer}, the shared memory will not be copied on migration.  Only
+one guest should be specified as
+the master.
+
 @node direct_linux_boot
 @section Direct Linux Boot