Patchwork [v5,4/5] Inter-VM shared memory PCI device

login
register
mail settings
Submitter Cam Macdonell
Date April 21, 2010, 5:53 p.m.
Message ID <1271872408-22842-5-git-send-email-cam@cs.ualberta.ca>
Download mbox | patch
Permalink /patch/50665/
State New
Headers show

Comments

Cam Macdonell - April 21, 2010, 5:53 p.m.
Support an inter-VM shared memory device that maps a shared-memory object as a
PCI device in the guest.  This patch also supports interrupts between guests by
communicating over a unix domain socket.  This patch applies to the qemu-kvm
repository.

    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]

Interrupts are supported between multiple VMs by using a shared memory server
and a chardev socket.

    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
                    [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
    -chardev socket,path=<path>,id=<id>

(shared memory server is qemu.git/contrib/ivshmem-server)

Sample programs and init scripts are in a git repo here:

    www.gitorious.org/nahanni
---
 Makefile.target |    3 +
 hw/ivshmem.c    |  727 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-char.c     |    6 +
 qemu-char.h     |    3 +
 qemu-doc.texi   |   25 ++
 5 files changed, 764 insertions(+), 0 deletions(-)
 create mode 100644 hw/ivshmem.c
Anthony Liguori - May 6, 2010, 5:32 p.m.
On 04/21/2010 12:53 PM, Cam Macdonell wrote:
> Support an inter-vm shared memory device that maps a shared-memory object as a
> PCI device in the guest.  This patch also supports interrupts between guest by
> communicating over a unix domain socket.  This patch applies to the qemu-kvm
> repository.
>
>      -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>
> Interrupts are supported between multiple VMs by using a shared memory server
> by using a chardev socket.
>
>      -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>                      [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
>      -chardev socket,path=<path>,id=<id>
>
> (shared memory server is qemu.git/contrib/ivshmem-server)
>
> Sample programs and init scripts are in a git repo here:
>
>      www.gitorious.org/nahanni
> ---
>   Makefile.target |    3 +
>   hw/ivshmem.c    |  727 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   qemu-char.c     |    6 +
>   qemu-char.h     |    3 +
>   qemu-doc.texi   |   25 ++
>   5 files changed, 764 insertions(+), 0 deletions(-)
>   create mode 100644 hw/ivshmem.c
>
> diff --git a/Makefile.target b/Makefile.target
> index 1ffd802..bc9a681 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -199,6 +199,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
>   obj-y += rtl8139.o
>   obj-y += e1000.o
>
> +# Inter-VM PCI shared memory
> +obj-y += ivshmem.o
> +
>   # Hardware support
>   obj-i386-y = pckbd.o dma.o
>   obj-i386-y += vga.o
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> new file mode 100644
> index 0000000..f8d8fdb
> --- /dev/null
> +++ b/hw/ivshmem.c
> @@ -0,0 +1,727 @@
> +/*
> + * Inter-VM Shared Memory PCI device.
> + *
> + * Author:
> + *      Cam Macdonell<cam@cs.ualberta.ca>
> + *
> + * Based On: cirrus_vga.c and rtl8139.c
> + *
> + * This code is licensed under the GNU GPL v2.
> + */
> +#include<sys/mman.h>
> +#include<sys/types.h>
> +#include<sys/socket.h>
> +#include<sys/io.h>
> +#include<sys/ioctl.h>
> +#include<sys/eventfd.h>
>    

This will break the Windows build, along with any non-Linux Unix or any Linux
old enough to not have eventfd support.

If it's based on cirrus_vga.c and rtl8139.c, then it ought to carry the 
respective copyrights, no?

Regards,

Anthony Liguori

> +#include "hw.h"
> +#include "console.h"
> +#include "pc.h"
> +#include "pci.h"
> +#include "sysemu.h"
> +
> +#include "msix.h"
> +#include "qemu-kvm.h"
> +#include "libkvm.h"
> +
> +#include<sys/eventfd.h>
> +#include<sys/mman.h>
> +#include<sys/socket.h>
> +#include<sys/ioctl.h>
> +
> +#define IVSHMEM_IRQFD   0
> +#define IVSHMEM_MSI     1
> +
> +#define DEBUG_IVSHMEM
> +#ifdef DEBUG_IVSHMEM
> +#define IVSHMEM_DPRINTF(fmt, args...)        \
> +    do {printf("IVSHMEM: " fmt, ##args); } while (0)
> +#else
> +#define IVSHMEM_DPRINTF(fmt, args...)
> +#endif
> +
> +typedef struct EventfdEntry {
> +    PCIDevice *pdev;
> +    int vector;
> +} EventfdEntry;
> +
> +typedef struct IVShmemState {
> +    PCIDevice dev;
> +    uint32_t intrmask;
> +    uint32_t intrstatus;
> +    uint32_t doorbell;
> +
> +    CharDriverState * chr;
> +    CharDriverState ** eventfd_chr;
> +    int ivshmem_mmio_io_addr;
> +
> +    pcibus_t mmio_addr;
> +    unsigned long ivshmem_offset;
> +    uint64_t ivshmem_size; /* size of shared memory region */
> +    int shm_fd; /* shared memory file descriptor */
> +
> +    int nr_allocated_vms;
> +    /* array of eventfds for each guest */
> +    int ** eventfds;
> +    /* keep track of # of eventfds for each guest*/
> +    int * eventfds_posn_count;
> +
> +    int nr_alloc_guests;
> +    int vm_id;
> +    int num_eventfds;
> +    uint32_t vectors;
> +    uint32_t features;
> +    EventfdEntry *eventfd_table;
> +
> +    char * shmobj;
> +    char * sizearg;
> +} IVShmemState;
> +
> +/* registers for the Inter-VM shared memory device */
> +enum ivshmem_registers {
> +    IntrMask = 0,
> +    IntrStatus = 4,
> +    IVPosition = 8,
> +    Doorbell = 12,
> +};
> +
> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
> +    return (ivs->features&  (1<<  feature));
> +}
> +
> +static inline int is_power_of_two(int x) {
> +    return (x&  (x-1)) == 0;
> +}
> +
> +static void ivshmem_map(PCIDevice *pci_dev, int region_num,
> +                    pcibus_t addr, pcibus_t size, int type)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> +    IVSHMEM_DPRINTF("addr = %u size = %u\n", (uint32_t)addr, (uint32_t)size);
> +    cpu_register_physical_memory(addr, s->ivshmem_size, s->ivshmem_offset);
> +
> +}
> +
> +/* accessing registers - based on rtl8139 */
> +static void ivshmem_update_irq(IVShmemState *s, int val)
> +{
> +    int isr;
> +    isr = (s->intrstatus&  s->intrmask)&  0xffffffff;
> +
> +    /* don't print ISR resets */
> +    if (isr) {
> +        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
> +           isr ? 1 : 0, s->intrstatus, s->intrmask);
> +    }
> +
> +    qemu_set_irq(s->dev.irq[0], (isr != 0));
> +}
> +
> +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
> +{
> +    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
> +
> +    s->intrmask = val;
> +
> +    ivshmem_update_irq(s, val);
> +}
> +
> +static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
> +{
> +    uint32_t ret = s->intrmask;
> +
> +    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
> +
> +    return ret;
> +}
> +
> +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
> +{
> +    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
> +
> +    s->intrstatus = val;
> +
> +    ivshmem_update_irq(s, val);
> +    return;
> +}
> +
> +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
> +{
> +    uint32_t ret = s->intrstatus;
> +
> +    /* reading ISR clears all interrupts */
> +    s->intrstatus = 0;
> +
> +    ivshmem_update_irq(s, 0);
> +
> +    return ret;
> +}
> +
> +static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
> +{
> +
> +    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
> +}
> +
> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
> +{
> +    IVShmemState *s = opaque;
> +
> +    u_int64_t write_one = 1;
> +    u_int16_t dest = val>>  16;
> +    u_int16_t vector = val&  0xff;
> +
> +    addr&= 0xfe;
> +
> +    switch (addr)
> +    {
> +        case IntrMask:
> +            ivshmem_IntrMask_write(s, val);
> +            break;
> +
> +        case IntrStatus:
> +            ivshmem_IntrStatus_write(s, val);
> +            break;
> +
> +        case Doorbell:
> +            /* check doorbell range */
> +            if ((vector>= 0)&&  (vector<  s->eventfds_posn_count[dest])) {
> +                IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n", write_one, dest, vector);
> +                if (write(s->eventfds[dest][vector],&(write_one), 8) != 8) {
> +                    IVSHMEM_DPRINTF("error writing to eventfd\n");
> +                }
> +            }
> +            break;
> +        default:
> +            IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
> +    }
> +}
> +
> +static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
> +{
> +    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
> +}
> +
> +static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
> +{
> +
> +    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
> +    return 0;
> +}
> +
> +static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
> +{
> +
> +    IVShmemState *s = opaque;
> +    uint32_t ret;
> +
> +    switch (addr)
> +    {
> +        case IntrMask:
> +            ret = ivshmem_IntrMask_read(s);
> +            break;
> +
> +        case IntrStatus:
> +            ret = ivshmem_IntrStatus_read(s);
> +            break;
> +
> +        case IVPosition:
> +            /* return my id in the ivshmem list */
> +            ret = s->vm_id;
> +            break;
> +
> +        default:
> +            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
> +            ret = 0;
> +    }
> +
> +    return ret;
> +
> +}
> +
> +static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
> +{
> +    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
> +
> +    return 0;
> +}
> +
> +static void ivshmem_mmio_writeb(void *opaque,
> +                                target_phys_addr_t addr, uint32_t val)
> +{
> +    ivshmem_io_writeb(opaque, addr&  0xFF, val);
> +}
> +
> +static void ivshmem_mmio_writew(void *opaque,
> +                                target_phys_addr_t addr, uint32_t val)
> +{
> +    ivshmem_io_writew(opaque, addr&  0xFF, val);
> +}
> +
> +static void ivshmem_mmio_writel(void *opaque,
> +                                target_phys_addr_t addr, uint32_t val)
> +{
> +    ivshmem_io_writel(opaque, addr&  0xFF, val);
> +}
> +
> +static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    return ivshmem_io_readb(opaque, addr&  0xFF);
> +}
> +
> +static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    uint32_t val = ivshmem_io_readw(opaque, addr&  0xFF);
> +    return val;
> +}
> +
> +static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    uint32_t val = ivshmem_io_readl(opaque, addr&  0xFF);
> +    return val;
> +}
> +
> +static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
> +    ivshmem_mmio_readb,
> +    ivshmem_mmio_readw,
> +    ivshmem_mmio_readl,
> +};
> +
> +static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
> +    ivshmem_mmio_writeb,
> +    ivshmem_mmio_writew,
> +    ivshmem_mmio_writel,
> +};
> +
> +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
> +{
> +    IVShmemState *s = opaque;
> +
> +    ivshmem_IntrStatus_write(s, *buf);
> +
> +    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
> +}
> +
> +static int ivshmem_can_receive(void * opaque)
> +{
> +    return 8;
> +}
> +
> +static void ivshmem_event(void *opaque, int event)
> +{
> +    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
> +}
> +
> +static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
> +
> +    EventfdEntry *entry = opaque;
> +    PCIDevice *pdev = entry->pdev;
> +
> +    IVSHMEM_DPRINTF("fake irqfd on vector %d\n", entry->vector);
> +    msix_notify(pdev, entry->vector);
> +}
> +
> +static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
> +                                                                    int vector)
> +{
> +    /* create a event character device based on the passed eventfd */
> +    IVShmemState *s = opaque;
> +    CharDriverState * chr;
> +
> +    chr = qemu_chr_open_eventfd(eventfd);
> +
> +    if (chr == NULL) {
> +        IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);
> +        exit(-1);
> +    }
> +
> +    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +        s->eventfd_table[vector].pdev =&s->dev;
> +        s->eventfd_table[vector].vector = vector;
> +
> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
> +                      ivshmem_event,&s->eventfd_table[vector]);
> +    } else {
> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
> +                      ivshmem_event, s);
> +    }
> +
> +    return chr;
> +
> +}
> +
> +static int check_shm_size(IVShmemState *s, int shmemfd) {
> +    /* check that the guest isn't going to try and map more memory than the
> +     * card server allocated return -1 to indicate error */
> +
> +    struct stat buf;
> +
> +    fstat(shmemfd,&buf);
> +
> +    if (s->ivshmem_size>  buf.st_size) {
> +        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
> +        fprintf(stderr, " than shared object size (%ld>  %ld)\n",
> +                                          s->ivshmem_size, buf.st_size);
> +        return -1;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static void create_shared_memory_BAR(IVShmemState *s, int fd) {
> +
> +    s->shm_fd = fd;
> +
> +    s->ivshmem_offset = qemu_ram_mmap(s->shm_fd, s->ivshmem_size,
> +             MAP_SHARED, 0);
> +
> +    /* region for shared memory */
> +    pci_register_bar(&s->dev, 2, s->ivshmem_size,
> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
> +}
> +
> +static void close_guest_eventfds(IVShmemState *s, int posn)
> +{
> +    int i, guest_curr_max;
> +
> +    guest_curr_max = s->eventfds_posn_count[posn];
> +
> +    for (i = 0; i<  guest_curr_max; i++)
> +        close(s->eventfds[posn][i]);
> +
> +    free(s->eventfds[posn]);
> +    s->eventfds_posn_count[posn] = 0;
> +}
> +
> +/* this function increase the dynamic storage need to store data about other
> + * guests */
> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
> +
> +    int j, old_nr_alloc;
> +
> +    old_nr_alloc = s->nr_alloc_guests;
> +
> +    while (s->nr_alloc_guests<  new_min_size)
> +        s->nr_alloc_guests = s->nr_alloc_guests * 2;
> +
> +    IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nr_alloc_guests);
> +    s->eventfds = qemu_realloc(s->eventfds, s->nr_alloc_guests *
> +                                                        sizeof(int *));
> +    s->eventfds_posn_count = qemu_realloc(s->eventfds_posn_count,
> +                                                    s->nr_alloc_guests *
> +                                                        sizeof(int));
> +    s->eventfd_table = qemu_realloc(s->eventfd_table, s->nr_alloc_guests *
> +                                                    sizeof(EventfdEntry));
> +
> +    if ((s->eventfds == NULL) || (s->eventfds_posn_count == NULL) ||
> +            (s->eventfd_table == NULL)) {
> +        fprintf(stderr, "Allocation error - exiting\n");
> +        exit(1);
> +    }
> +
> +    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +        s->eventfd_chr = (CharDriverState **)qemu_realloc(s->eventfd_chr,
> +                                    s->nr_alloc_guests * sizeof(void *));
> +        if (s->eventfd_chr == NULL) {
> +            fprintf(stderr, "Allocation error - exiting\n");
> +            exit(1);
> +        }
> +    }
> +
> +    /* zero out new pointers */
> +    for (j = old_nr_alloc; j<  s->nr_alloc_guests; j++) {
> +        s->eventfds[j] = NULL;
> +    }
> +}
> +
> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
> +{
> +    IVShmemState *s = opaque;
> +    int incoming_fd, tmp_fd;
> +    int guest_curr_max;
> +    long incoming_posn;
> +
> +    memcpy(&incoming_posn, buf, sizeof(long));
> +    /* pick off s->chr->msgfd and store it, posn should accompany msg */
> +    tmp_fd = qemu_chr_get_msgfd(s->chr);
> +    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
> +
> +    /* make sure we have enough space for this guest */
> +    if (incoming_posn>= s->nr_alloc_guests) {
> +        increase_dynamic_storage(s, incoming_posn);
> +    }
> +
> +    if (tmp_fd == -1) {
> +        /* if posn is positive and unseen before then this is our posn*/
> +        if ((incoming_posn>= 0)&&  (s->eventfds[incoming_posn] == NULL)) {
> +            /* receive our posn */
> +            s->vm_id = incoming_posn;
> +            return;
> +        } else {
> +            /* otherwise an fd == -1 means an existing guest has gone away */
> +            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
> +            close_guest_eventfds(s, incoming_posn);
> +            return;
> +        }
> +    }
> +
> +    /* because of the implementation of get_msgfd, we need a dup */
> +    incoming_fd = dup(tmp_fd);
> +
> +    /* if the position is -1, then it's shared memory region fd */
> +    if (incoming_posn == -1) {
> +
> +        s->num_eventfds = 0;
> +
> +        if (check_shm_size(s, incoming_fd) == -1) {
> +            exit(-1);
> +        }
> +
> +        /* creating a BAR in qemu_chr callback may be crazy */
> +        create_shared_memory_BAR(s, incoming_fd);
> +
> +       return;
> +    }
> +
> +    /* each guest has an array of eventfds, and we keep track of how many
> +     * guests for each VM */
> +    guest_curr_max = s->eventfds_posn_count[incoming_posn];
> +    if (guest_curr_max == 0) {
> +        /* one eventfd per MSI vector */
> +        s->eventfds[incoming_posn] = (int *) qemu_malloc(s->vectors *
> +                                                                sizeof(int));
> +    }
> +
> +    /* this is an eventfd for a particular guest VM */
> +    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
> +                                                                incoming_fd);
> +    s->eventfds[incoming_posn][guest_curr_max] = incoming_fd;
> +
> +    /* increment count for particular guest */
> +    s->eventfds_posn_count[incoming_posn]++;
> +
> +    /* ioeventfd and irqfd are enabled together,
> +     * so the flag IRQFD refers to both */
> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD)&&  guest_curr_max>= 0) {
> +        /* allocate ioeventfd for the new fd
> +         * received for guest @ incoming_posn */
> +        kvm_set_ioeventfd_mmio_long(incoming_fd, s->mmio_addr + Doorbell,
> +                                (incoming_posn<<  16) | guest_curr_max, 1);
> +    }
> +
> +    /* keep track of the maximum VM ID */
> +    if (incoming_posn>  s->num_eventfds) {
> +        s->num_eventfds = incoming_posn;
> +    }
> +
> +    if (incoming_posn == s->vm_id) {
> +        if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +            /* setup irqfd for this VM's eventfd */
> +            int vector = guest_curr_max;
> +            kvm_set_irqfd(s->eventfds[s->vm_id][guest_curr_max], vector,
> +                                        s->dev.msix_irq_entries[vector].gsi);
> +        } else {
> +            /* initialize char device for callback
> +             * if this is one of my eventfd */
> +            s->eventfd_chr[guest_curr_max] = create_eventfd_chr_device(s,
> +                s->eventfds[s->vm_id][guest_curr_max], guest_curr_max);
> +        }
> +    }
> +
> +    return;
> +}
> +
> +static void ivshmem_reset(DeviceState *d)
> +{
> +    return;
> +}
> +
> +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
> +                       pcibus_t addr, pcibus_t size, int type)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> +    s->mmio_addr = addr;
> +    cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);
> +
> +    /* now that our mmio region has been allocated, we can receive
> +     * the file descriptors */
> +    if (s->chr != NULL) {
> +        qemu_chr_add_handlers(s->chr, ivshmem_can_receive, ivshmem_read,
> +                     ivshmem_event, s);
> +    }
> +
> +}
> +
> +static uint64_t ivshmem_get_size(IVShmemState * s) {
> +
> +    uint64_t value;
> +    char *ptr;
> +
> +    value = strtoul(s->sizearg,&ptr, 10);
> +    switch (*ptr) {
> +        case 0: case 'M': case 'm':
> +            value<<= 20;
> +            break;
> +        case 'G': case 'g':
> +            value<<= 30;
> +            break;
> +        default:
> +            fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
> +            exit(1);
> +    }
> +
> +    /* BARs must be a power of 2 */
> +    if (!is_power_of_two(value)) {
> +        fprintf(stderr, "ivshmem: size must be power of 2\n");
> +        exit(1);
> +    }
> +
> +    return value;
> +
> +}
> +
> +static int pci_ivshmem_init(PCIDevice *dev)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +    uint8_t *pci_conf;
> +    int i;
> +
> +    if (s->sizearg == NULL)
> +        s->ivshmem_size = 4<<  20; /* 4 MB default */
> +    else {
> +        s->ivshmem_size = ivshmem_get_size(s);
> +    }
> +
> +    /* IRQFD requires MSI */
> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD)&&
> +        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
> +        exit(1);
> +    }
> +
> +    pci_conf = s->dev.config;
> +    pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
> +    pci_conf[0x01] = 0x1a;
> +    pci_conf[0x02] = 0x10;
> +    pci_conf[0x03] = 0x11;
> +    pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
> +    pci_conf[0x0a] = 0x00; /* RAM controller */
> +    pci_conf[0x0b] = 0x05;
> +    pci_conf[0x0e] = 0x00; /* header_type */
> +
> +    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
> +                                    ivshmem_mmio_write, s);
> +    /* region for registers*/
> +    pci_register_bar(&s->dev, 0, 0x400,
> +                           PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
> +
> +    /* allocate the MSI-X vectors */
> +    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +
> +        if (!msix_init(&s->dev, s->vectors, 1, 0)) {
> +            pci_register_bar(&s->dev, 1,
> +                             msix_bar_size(&s->dev),
> +                             PCI_BASE_ADDRESS_SPACE_MEMORY,
> +                             msix_mmio_map);
> +            IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
> +        } else {
> +            IVSHMEM_DPRINTF("msix initialization failed\n");
> +        }
> +
> +        /* 'activate' the vectors */
> +        for (i = 0; i<  s->vectors; i++) {
> +            msix_vector_use(&s->dev, i);
> +        }
> +    }
> +
> +    if ((s->chr != NULL)&&  (strncmp(s->chr->filename, "unix:", 5) == 0)) {
> +        /* if we get a UNIX socket as the parameter we will talk
> +         * to the ivshmem server later once the MMIO BAR is actually
> +         * allocated (see ivshmem_mmio_map) */
> +
> +        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
> +                                                            s->chr->filename);
> +
> +        /* we allocate enough space for 16 guests and grow as needed */
> +        s->nr_alloc_guests = 16;
> +        s->vm_id = -1;
> +
> +        /* allocate/initialize space for interrupt handling */
> +        s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
> +        s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
> +        s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests * sizeof(int));
> +
> +        pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support interrupts */
> +
> +        if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +            s->eventfd_chr = (CharDriverState **)qemu_malloc(s->nr_alloc_guests *
> +                                                            sizeof(void *));
> +        }
> +
> +    } else {
> +        /* just map the file immediately, we're not using a server */
> +        int fd;
> +
> +        if (s->shmobj == NULL) {
> +            fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
> +        }
> +
> +        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
> +
> +        /* try opening with O_EXCL and if it succeeds zero the memory
> +         * by truncating to 0 */
> +        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
> +                        S_IRWXU|S_IRWXG|S_IRWXO))>  0) {
> +           /* truncate file to length PCI device's memory */
> +            if (ftruncate(fd, s->ivshmem_size) != 0) {
> +                fprintf(stderr, "kvm_ivshmem: could not truncate shared file\n");
> +            }
> +
> +        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
> +                        S_IRWXU|S_IRWXG|S_IRWXO))<  0) {
> +            fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
> +            exit(-1);
> +        }
> +
> +        create_shared_memory_BAR(s, fd);
> +
> +    }
> +
> +
> +    return 0;
> +}
> +
> +static int pci_ivshmem_uninit(PCIDevice *dev)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +
> +    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
> +
> +    return 0;
> +}
> +
> +static PCIDeviceInfo ivshmem_info = {
> +    .qdev.name  = "ivshmem",
> +    .qdev.size  = sizeof(IVShmemState),
> +    .qdev.reset = ivshmem_reset,
> +    .init       = pci_ivshmem_init,
> +    .exit       = pci_ivshmem_uninit,
> +    .qdev.props = (Property[]) {
> +        DEFINE_PROP_CHR("chardev", IVShmemState, chr),
> +        DEFINE_PROP_STRING("size", IVShmemState, sizearg),
> +        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
> +        DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, false),
> +        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
> +        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
> +        DEFINE_PROP_END_OF_LIST(),
> +    }
> +};
> +
> +static void ivshmem_register_devices(void)
> +{
> +    pci_qdev_register(&ivshmem_info);
> +}
> +
> +device_init(ivshmem_register_devices)
> diff --git a/qemu-char.c b/qemu-char.c
> index 048da3f..41cb8c7 100644
> --- a/qemu-char.c
> +++ b/qemu-char.c
> @@ -2076,6 +2076,12 @@ static void tcp_chr_read(void *opaque)
>       }
>   }
>
> +CharDriverState *qemu_chr_open_eventfd(int eventfd){
> +
> +    return qemu_chr_open_fd(eventfd, eventfd);
> +
> +}
> +
>   static void tcp_chr_connect(void *opaque)
>   {
>       CharDriverState *chr = opaque;
> diff --git a/qemu-char.h b/qemu-char.h
> index 3a9427b..1571091 100644
> --- a/qemu-char.h
> +++ b/qemu-char.h
> @@ -93,6 +93,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
>   void qemu_chr_info(Monitor *mon, QObject **ret_data);
>   CharDriverState *qemu_chr_find(const char *name);
>
> +/* add an eventfd to the qemu devices that are polled */
> +CharDriverState *qemu_chr_open_eventfd(int eventfd);
> +
>   extern int term_escape_char;
>
>   /* async I/O support */
> diff --git a/qemu-doc.texi b/qemu-doc.texi
> index 6647b7b..2df4687 100644
> --- a/qemu-doc.texi
> +++ b/qemu-doc.texi
> @@ -706,6 +706,31 @@ Using the @option{-net socket} option, it is possible to make VLANs
>   that span several QEMU instances. See @ref{sec_invocation} to have a
>   basic example.
>
> +@section Other Devices
> +
> +@subsection Inter-VM Shared Memory device
> +
> +With KVM enabled on a Linux host, a shared memory device is available.  Guests
> +map a POSIX shared memory region into the guest as a PCI device that enables
> +zero-copy communication to the application level of the guests.  The basic
> +syntax is:
> +
> +@example
> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> +@end example
> +
> +If desired, interrupts can be sent between guest VMs accessing the same shared
> +memory region.  Interrupt support requires using a shared memory server and
> +using a chardev socket to connect to it.  The code for the shared memory server
> +is qemu.git/contrib/ivshmem-server.  An example syntax when using the shared
> +memory server is:
> +
> +@example
> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> +                        [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
> +qemu -chardev socket,path=<path>,id=<id>
> +@end example
> +
>   @node direct_linux_boot
>   @section Direct Linux Boot
>
>
Cam Macdonell - May 6, 2010, 5:59 p.m.
On Thu, May 6, 2010 at 11:32 AM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 04/21/2010 12:53 PM, Cam Macdonell wrote:
>>
>> Support an inter-vm shared memory device that maps a shared-memory object
>> as a
>> PCI device in the guest.  This patch also supports interrupts between
>> guest by
>> communicating over a unix domain socket.  This patch applies to the
>> qemu-kvm
>> repository.
>>
>>     -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>
>> Interrupts are supported between multiple VMs by using a shared memory
>> server
>> by using a chardev socket.
>>
>>     -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>                     [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
>>     -chardev socket,path=<path>,id=<id>
>>
>> (shared memory server is qemu.git/contrib/ivshmem-server)
>>
>> Sample programs and init scripts are in a git repo here:
>>
>>     www.gitorious.org/nahanni
>> ---
>>  Makefile.target |    3 +
>>  hw/ivshmem.c    |  727
>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  qemu-char.c     |    6 +
>>  qemu-char.h     |    3 +
>>  qemu-doc.texi   |   25 ++
>>  5 files changed, 764 insertions(+), 0 deletions(-)
>>  create mode 100644 hw/ivshmem.c
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index 1ffd802..bc9a681 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -199,6 +199,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
>>  obj-y += rtl8139.o
>>  obj-y += e1000.o
>>
>> +# Inter-VM PCI shared memory
>> +obj-y += ivshmem.o
>> +
>>  # Hardware support
>>  obj-i386-y = pckbd.o dma.o
>>  obj-i386-y += vga.o
>> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
>> new file mode 100644
>> index 0000000..f8d8fdb
>> --- /dev/null
>> +++ b/hw/ivshmem.c
>> @@ -0,0 +1,727 @@
>> +/*
>> + * Inter-VM Shared Memory PCI device.
>> + *
>> + * Author:
>> + *      Cam Macdonell<cam@cs.ualberta.ca>
>> + *
>> + * Based On: cirrus_vga.c and rtl8139.c
>> + *
>> + * This code is licensed under the GNU GPL v2.
>> + */
>> +#include<sys/mman.h>
>> +#include<sys/types.h>
>> +#include<sys/socket.h>
>> +#include<sys/io.h>
>> +#include<sys/ioctl.h>
>> +#include<sys/eventfd.h>
>>
>
> This will break the Windows along with any non-Linux unix or any Linux old
> enough to not have eventfd support.

I'll wrap it with

#ifdef CONFIG_EVENTFD

> If it's based on cirrus_vga.c and rtl8139.c, then it ought to carry the
> respective copyrights, no?

Sure, I can add those

Cam

>
> Regards,
>
> Anthony Liguori
>
>> +#include "hw.h"
>> +#include "console.h"
>> +#include "pc.h"
>> +#include "pci.h"
>> +#include "sysemu.h"
>> +
>> +#include "msix.h"
>> +#include "qemu-kvm.h"
>> +#include "libkvm.h"
>> +
>> +#include<sys/eventfd.h>
>> +#include<sys/mman.h>
>> +#include<sys/socket.h>
>> +#include<sys/ioctl.h>
>> +
>> +#define IVSHMEM_IRQFD   0
>> +#define IVSHMEM_MSI     1
>> +
>> +#define DEBUG_IVSHMEM
>> +#ifdef DEBUG_IVSHMEM
>> +#define IVSHMEM_DPRINTF(fmt, args...)        \
>> +    do {printf("IVSHMEM: " fmt, ##args); } while (0)
>> +#else
>> +#define IVSHMEM_DPRINTF(fmt, args...)
>> +#endif
>> +
>> +typedef struct EventfdEntry {
>> +    PCIDevice *pdev;
>> +    int vector;
>> +} EventfdEntry;
>> +
>> +typedef struct IVShmemState {
>> +    PCIDevice dev;
>> +    uint32_t intrmask;
>> +    uint32_t intrstatus;
>> +    uint32_t doorbell;
>> +
>> +    CharDriverState * chr;
>> +    CharDriverState ** eventfd_chr;
>> +    int ivshmem_mmio_io_addr;
>> +
>> +    pcibus_t mmio_addr;
>> +    unsigned long ivshmem_offset;
>> +    uint64_t ivshmem_size; /* size of shared memory region */
>> +    int shm_fd; /* shared memory file descriptor */
>> +
>> +    int nr_allocated_vms;
>> +    /* array of eventfds for each guest */
>> +    int ** eventfds;
>> +    /* keep track of # of eventfds for each guest*/
>> +    int * eventfds_posn_count;
>> +
>> +    int nr_alloc_guests;
>> +    int vm_id;
>> +    int num_eventfds;
>> +    uint32_t vectors;
>> +    uint32_t features;
>> +    EventfdEntry *eventfd_table;
>> +
>> +    char * shmobj;
>> +    char * sizearg;
>> +} IVShmemState;
>> +
>> +/* registers for the Inter-VM shared memory device */
>> +enum ivshmem_registers {
>> +    IntrMask = 0,
>> +    IntrStatus = 4,
>> +    IVPosition = 8,
>> +    Doorbell = 12,
>> +};
>> +
>> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int
>> feature) {
>> +    return (ivs->features&  (1<<  feature));
>> +}
>> +
>> +static inline int is_power_of_two(int x) {
>> +    return (x&  (x-1)) == 0;
>> +}
>> +
>> +static void ivshmem_map(PCIDevice *pci_dev, int region_num,
>> +                    pcibus_t addr, pcibus_t size, int type)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
>> +
>> +    IVSHMEM_DPRINTF("addr = %u size = %u\n", (uint32_t)addr,
>> (uint32_t)size);
>> +    cpu_register_physical_memory(addr, s->ivshmem_size,
>> s->ivshmem_offset);
>> +
>> +}
>> +
>> +/* accessing registers - based on rtl8139 */
>> +static void ivshmem_update_irq(IVShmemState *s, int val)
>> +{
>> +    int isr;
>> +    isr = (s->intrstatus&  s->intrmask)&  0xffffffff;
>> +
>> +    /* don't print ISR resets */
>> +    if (isr) {
>> +        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
>> +           isr ? 1 : 0, s->intrstatus, s->intrmask);
>> +    }
>> +
>> +    qemu_set_irq(s->dev.irq[0], (isr != 0));
>> +}
>> +
>> +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
>> +{
>> +    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
>> +
>> +    s->intrmask = val;
>> +
>> +    ivshmem_update_irq(s, val);
>> +}
>> +
>> +static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
>> +{
>> +    uint32_t ret = s->intrmask;
>> +
>> +    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
>> +{
>> +    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
>> +
>> +    s->intrstatus = val;
>> +
>> +    ivshmem_update_irq(s, val);
>> +    return;
>> +}
>> +
>> +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
>> +{
>> +    uint32_t ret = s->intrstatus;
>> +
>> +    /* reading ISR clears all interrupts */
>> +    s->intrstatus = 0;
>> +
>> +    ivshmem_update_irq(s, 0);
>> +
>> +    return ret;
>> +}
>> +
>> +static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
>> +{
>> +
>> +    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
>> +}
>> +
>> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
>> +{
>> +    IVShmemState *s = opaque;
>> +
>> +    u_int64_t write_one = 1;
>> +    u_int16_t dest = val>>  16;
>> +    u_int16_t vector = val&  0xff;
>> +
>> +    addr&= 0xfe;
>> +
>> +    switch (addr)
>> +    {
>> +        case IntrMask:
>> +            ivshmem_IntrMask_write(s, val);
>> +            break;
>> +
>> +        case IntrStatus:
>> +            ivshmem_IntrStatus_write(s, val);
>> +            break;
>> +
>> +        case Doorbell:
>> +            /* check doorbell range */
>> +            if ((vector>= 0)&&  (vector<  s->eventfds_posn_count[dest]))
>> {
>> +                IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n",
>> write_one, dest, vector);
>> +                if (write(s->eventfds[dest][vector],&(write_one), 8) !=
>> 8) {
>> +                    IVSHMEM_DPRINTF("error writing to eventfd\n");
>> +                }
>> +            }
>> +            break;
>> +        default:
>> +            IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
>> +    }
>> +}
>> +
>> +static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
>> +{
>> +    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
>> +}
>> +
>> +static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
>> +{
>> +
>> +    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
>> +    return 0;
>> +}
>> +
>> +static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
>> +{
>> +
>> +    IVShmemState *s = opaque;
>> +    uint32_t ret;
>> +
>> +    switch (addr)
>> +    {
>> +        case IntrMask:
>> +            ret = ivshmem_IntrMask_read(s);
>> +            break;
>> +
>> +        case IntrStatus:
>> +            ret = ivshmem_IntrStatus_read(s);
>> +            break;
>> +
>> +        case IVPosition:
>> +            /* return my id in the ivshmem list */
>> +            ret = s->vm_id;
>> +            break;
>> +
>> +        default:
>> +            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
>> +            ret = 0;
>> +    }
>> +
>> +    return ret;
>> +
>> +}
>> +
>> +static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
>> +{
>> +    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
>> +
>> +    return 0;
>> +}
>> +
>> +static void ivshmem_mmio_writeb(void *opaque,
>> +                                target_phys_addr_t addr, uint32_t val)
>> +{
>> +    ivshmem_io_writeb(opaque, addr&  0xFF, val);
>> +}
>> +
>> +static void ivshmem_mmio_writew(void *opaque,
>> +                                target_phys_addr_t addr, uint32_t val)
>> +{
>> +    ivshmem_io_writew(opaque, addr&  0xFF, val);
>> +}
>> +
>> +static void ivshmem_mmio_writel(void *opaque,
>> +                                target_phys_addr_t addr, uint32_t val)
>> +{
>> +    ivshmem_io_writel(opaque, addr&  0xFF, val);
>> +}
>> +
>> +static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
>> +{
>> +    return ivshmem_io_readb(opaque, addr&  0xFF);
>> +}
>> +
>> +static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
>> +{
>> +    uint32_t val = ivshmem_io_readw(opaque, addr&  0xFF);
>> +    return val;
>> +}
>> +
>> +static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
>> +{
>> +    uint32_t val = ivshmem_io_readl(opaque, addr&  0xFF);
>> +    return val;
>> +}
>> +
>> +static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
>> +    ivshmem_mmio_readb,
>> +    ivshmem_mmio_readw,
>> +    ivshmem_mmio_readl,
>> +};
>> +
>> +static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
>> +    ivshmem_mmio_writeb,
>> +    ivshmem_mmio_writew,
>> +    ivshmem_mmio_writel,
>> +};
>> +
>> +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
>> +{
>> +    IVShmemState *s = opaque;
>> +
>> +    ivshmem_IntrStatus_write(s, *buf);
>> +
>> +    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
>> +}
>> +
>> +static int ivshmem_can_receive(void * opaque)
>> +{
>> +    return 8;
>> +}
>> +
>> +static void ivshmem_event(void *opaque, int event)
>> +{
>> +    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
>> +}
>> +
>> +static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
>> +
>> +    EventfdEntry *entry = opaque;
>> +    PCIDevice *pdev = entry->pdev;
>> +
>> +    IVSHMEM_DPRINTF("fake irqfd on vector %d\n", entry->vector);
>> +    msix_notify(pdev, entry->vector);
>> +}
>> +
>> +static CharDriverState* create_eventfd_chr_device(void * opaque, int
>> eventfd,
>> +                                                                    int
>> vector)
>> +{
>> +    /* create a event character device based on the passed eventfd */
>> +    IVShmemState *s = opaque;
>> +    CharDriverState * chr;
>> +
>> +    chr = qemu_chr_open_eventfd(eventfd);
>> +
>> +    if (chr == NULL) {
>> +        IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n",
>> eventfd);
>> +        exit(-1);
>> +    }
>> +
>> +    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
>> +        s->eventfd_table[vector].pdev =&s->dev;
>> +        s->eventfd_table[vector].vector = vector;
>> +
>> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
>> +                      ivshmem_event,&s->eventfd_table[vector]);
>> +    } else {
>> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
>> +                      ivshmem_event, s);
>> +    }
>> +
>> +    return chr;
>> +
>> +}
>> +
>> +static int check_shm_size(IVShmemState *s, int shmemfd) {
>> +    /* check that the guest isn't going to try and map more memory than
>> the
>> +     * card server allocated; return -1 to indicate error */
>> +
>> +    struct stat buf;
>> +
>> +    fstat(shmemfd,&buf);
>> +
>> +    if (s->ivshmem_size>  buf.st_size) {
>> +        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
>> +        fprintf(stderr, " than shared object size (%ld>  %ld)\n",
>> +                                          s->ivshmem_size, buf.st_size);
>> +        return -1;
>> +    } else {
>> +        return 0;
>> +    }
>> +}
>> +
>> +static void create_shared_memory_BAR(IVShmemState *s, int fd) {
>> +
>> +    s->shm_fd = fd;
>> +
>> +    s->ivshmem_offset = qemu_ram_mmap(s->shm_fd, s->ivshmem_size,
>> +             MAP_SHARED, 0);
>> +
>> +    /* region for shared memory */
>> +    pci_register_bar(&s->dev, 2, s->ivshmem_size,
>> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY,
>> ivshmem_map);
>> +}
>> +
>> +static void close_guest_eventfds(IVShmemState *s, int posn)
>> +{
>> +    int i, guest_curr_max;
>> +
>> +    guest_curr_max = s->eventfds_posn_count[posn];
>> +
>> +    for (i = 0; i<  guest_curr_max; i++)
>> +        close(s->eventfds[posn][i]);
>> +
>> +    free(s->eventfds[posn]);
>> +    s->eventfds_posn_count[posn] = 0;
>> +}
>> +
>> +/* this function increases the dynamic storage needed to store data about
>> other
>> + * guests */
>> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
>> +
>> +    int j, old_nr_alloc;
>> +
>> +    old_nr_alloc = s->nr_alloc_guests;
>> +
>> +    while (s->nr_alloc_guests<  new_min_size)
>> +        s->nr_alloc_guests = s->nr_alloc_guests * 2;
>> +
>> +    IVSHMEM_DPRINTF("bumping storage to %d guests\n",
>> s->nr_alloc_guests);
>> +    s->eventfds = qemu_realloc(s->eventfds, s->nr_alloc_guests *
>> +                                                        sizeof(int *));
>> +    s->eventfds_posn_count = qemu_realloc(s->eventfds_posn_count,
>> +                                                    s->nr_alloc_guests *
>> +                                                        sizeof(int));
>> +    s->eventfd_table = qemu_realloc(s->eventfd_table, s->nr_alloc_guests
>> *
>> +
>>  sizeof(EventfdEntry));
>> +
>> +    if ((s->eventfds == NULL) || (s->eventfds_posn_count == NULL) ||
>> +            (s->eventfd_table == NULL)) {
>> +        fprintf(stderr, "Allocation error - exiting\n");
>> +        exit(1);
>> +    }
>> +
>> +    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>> +        s->eventfd_chr = (CharDriverState **)qemu_realloc(s->eventfd_chr,
>> +                                    s->nr_alloc_guests * sizeof(void *));
>> +        if (s->eventfd_chr == NULL) {
>> +            fprintf(stderr, "Allocation error - exiting\n");
>> +            exit(1);
>> +        }
>> +    }
>> +
>> +    /* zero out new pointers */
>> +    for (j = old_nr_alloc; j<  s->nr_alloc_guests; j++) {
>> +        s->eventfds[j] = NULL;
>> +    }
>> +}
>> +
>> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
>> +{
>> +    IVShmemState *s = opaque;
>> +    int incoming_fd, tmp_fd;
>> +    int guest_curr_max;
>> +    long incoming_posn;
>> +
>> +    memcpy(&incoming_posn, buf, sizeof(long));
>> +    /* pick off s->chr->msgfd and store it, posn should accompany msg */
>> +    tmp_fd = qemu_chr_get_msgfd(s->chr);
>> +    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
>> +
>> +    /* make sure we have enough space for this guest */
>> +    if (incoming_posn>= s->nr_alloc_guests) {
>> +        increase_dynamic_storage(s, incoming_posn);
>> +    }
>> +
>> +    if (tmp_fd == -1) {
>> +        /* if posn is positive and unseen before then this is our posn*/
>> +        if ((incoming_posn>= 0)&&  (s->eventfds[incoming_posn] == NULL))
>> {
>> +            /* receive our posn */
>> +            s->vm_id = incoming_posn;
>> +            return;
>> +        } else {
>> +            /* otherwise an fd == -1 means an existing guest has gone
>> away */
>> +            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
>> +            close_guest_eventfds(s, incoming_posn);
>> +            return;
>> +        }
>> +    }
>> +
>> +    /* because of the implementation of get_msgfd, we need a dup */
>> +    incoming_fd = dup(tmp_fd);
>> +
>> +    /* if the position is -1, then it's shared memory region fd */
>> +    if (incoming_posn == -1) {
>> +
>> +        s->num_eventfds = 0;
>> +
>> +        if (check_shm_size(s, incoming_fd) == -1) {
>> +            exit(-1);
>> +        }
>> +
>> +        /* creating a BAR in qemu_chr callback may be crazy */
>> +        create_shared_memory_BAR(s, incoming_fd);
>> +
>> +       return;
>> +    }
>> +
>> +    /* each guest has an array of eventfds, and we keep track of how many
>> +     * guests for each VM */
>> +    guest_curr_max = s->eventfds_posn_count[incoming_posn];
>> +    if (guest_curr_max == 0) {
>> +        /* one eventfd per MSI vector */
>> +        s->eventfds[incoming_posn] = (int *) qemu_malloc(s->vectors *
>> +
>>  sizeof(int));
>> +    }
>> +
>> +    /* this is an eventfd for a particular guest VM */
>> +    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn,
>> guest_curr_max,
>> +
>>  incoming_fd);
>> +    s->eventfds[incoming_posn][guest_curr_max] = incoming_fd;
>> +
>> +    /* increment count for particular guest */
>> +    s->eventfds_posn_count[incoming_posn]++;
>> +
>> +    /* ioeventfd and irqfd are enabled together,
>> +     * so the flag IRQFD refers to both */
>> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD)&&  guest_curr_max>= 0) {
>> +        /* allocate ioeventfd for the new fd
>> +         * received for guest @ incoming_posn */
>> +        kvm_set_ioeventfd_mmio_long(incoming_fd, s->mmio_addr + Doorbell,
>> +                                (incoming_posn<<  16) | guest_curr_max,
>> 1);
>> +    }
>> +
>> +    /* keep track of the maximum VM ID */
>> +    if (incoming_posn>  s->num_eventfds) {
>> +        s->num_eventfds = incoming_posn;
>> +    }
>> +
>> +    if (incoming_posn == s->vm_id) {
>> +        if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>> +            /* setup irqfd for this VM's eventfd */
>> +            int vector = guest_curr_max;
>> +            kvm_set_irqfd(s->eventfds[s->vm_id][guest_curr_max], vector,
>> +
>>  s->dev.msix_irq_entries[vector].gsi);
>> +        } else {
>> +            /* initialize char device for callback
>> +             * if this is one of my eventfd */
>> +            s->eventfd_chr[guest_curr_max] = create_eventfd_chr_device(s,
>> +                s->eventfds[s->vm_id][guest_curr_max], guest_curr_max);
>> +        }
>> +    }
>> +
>> +    return;
>> +}
>> +
>> +static void ivshmem_reset(DeviceState *d)
>> +{
>> +    return;
>> +}
>> +
>> +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
>> +                       pcibus_t addr, pcibus_t size, int type)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
>> +
>> +    s->mmio_addr = addr;
>> +    cpu_register_physical_memory(addr + 0, 0x400,
>> s->ivshmem_mmio_io_addr);
>> +
>> +    /* now that our mmio region has been allocated, we can receive
>> +     * the file descriptors */
>> +    if (s->chr != NULL) {
>> +        qemu_chr_add_handlers(s->chr, ivshmem_can_receive, ivshmem_read,
>> +                     ivshmem_event, s);
>> +    }
>> +
>> +}
>> +
>> +static uint64_t ivshmem_get_size(IVShmemState * s) {
>> +
>> +    uint64_t value;
>> +    char *ptr;
>> +
>> +    value = strtoul(s->sizearg,&ptr, 10);
>> +    switch (*ptr) {
>> +        case 0: case 'M': case 'm':
>> +            value<<= 20;
>> +            break;
>> +        case 'G': case 'g':
>> +            value<<= 30;
>> +            break;
>> +        default:
>> +            fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
>> +            exit(1);
>> +    }
>> +
>> +    /* BARs must be a power of 2 */
>> +    if (!is_power_of_two(value)) {
>> +        fprintf(stderr, "ivshmem: size must be power of 2\n");
>> +        exit(1);
>> +    }
>> +
>> +    return value;
>> +
>> +}
>> +
>> +static int pci_ivshmem_init(PCIDevice *dev)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>> +    uint8_t *pci_conf;
>> +    int i;
>> +
>> +    if (s->sizearg == NULL)
>> +        s->ivshmem_size = 4<<  20; /* 4 MB default */
>> +    else {
>> +        s->ivshmem_size = ivshmem_get_size(s);
>> +    }
>> +
>> +    /* IRQFD requires MSI */
>> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD)&&
>> +        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
>> +        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
>> +        exit(1);
>> +    }
>> +
>> +    pci_conf = s->dev.config;
>> +    pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
>> +    pci_conf[0x01] = 0x1a;
>> +    pci_conf[0x02] = 0x10;
>> +    pci_conf[0x03] = 0x11;
>> +    pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
>> +    pci_conf[0x0a] = 0x00; /* RAM controller */
>> +    pci_conf[0x0b] = 0x05;
>> +    pci_conf[0x0e] = 0x00; /* header_type */
>> +
>> +    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
>> +                                    ivshmem_mmio_write, s);
>> +    /* region for registers*/
>> +    pci_register_bar(&s->dev, 0, 0x400,
>> +                           PCI_BASE_ADDRESS_SPACE_MEMORY,
>> ivshmem_mmio_map);
>> +
>> +    /* allocate the MSI-X vectors */
>> +    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
>> +
>> +        if (!msix_init(&s->dev, s->vectors, 1, 0)) {
>> +            pci_register_bar(&s->dev, 1,
>> +                             msix_bar_size(&s->dev),
>> +                             PCI_BASE_ADDRESS_SPACE_MEMORY,
>> +                             msix_mmio_map);
>> +            IVSHMEM_DPRINTF("msix initialized (%d vectors)\n",
>> s->vectors);
>> +        } else {
>> +            IVSHMEM_DPRINTF("msix initialization failed\n");
>> +        }
>> +
>> +        /* 'activate' the vectors */
>> +        for (i = 0; i<  s->vectors; i++) {
>> +            msix_vector_use(&s->dev, i);
>> +        }
>> +    }
>> +
>> +    if ((s->chr != NULL)&&  (strncmp(s->chr->filename, "unix:", 5) == 0))
>> {
>> +        /* if we get a UNIX socket as the parameter we will talk
>> +         * to the ivshmem server later once the MMIO BAR is actually
>> +         * allocated (see ivshmem_mmio_map) */
>> +
>> +        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
>> +
>>  s->chr->filename);
>> +
>> +        /* we allocate enough space for 16 guests and grow as needed */
>> +        s->nr_alloc_guests = 16;
>> +        s->vm_id = -1;
>> +
>> +        /* allocate/initialize space for interrupt handling */
>> +        s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
>> +        s->eventfd_table = qemu_mallocz(s->vectors *
>> sizeof(EventfdEntry));
>> +        s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests *
>> sizeof(int));
>> +
>> +        pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support
>> interrupts */
>> +
>> +        if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>> +            s->eventfd_chr = (CharDriverState
>> **)qemu_malloc(s->nr_alloc_guests *
>> +                                                            sizeof(void
>> *));
>> +        }
>> +
>> +    } else {
>> +        /* just map the file immediately, we're not using a server */
>> +        int fd;
>> +
>> +        if (s->shmobj == NULL) {
>> +            fprintf(stderr, "Must specify 'chardev' or 'shm' to
>> ivshmem\n");
>> +        }
>> +
>> +        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
>> +
>> +        /* try opening with O_EXCL and if it succeeds zero the memory
>> +         * by truncating to 0 */
>> +        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
>> +                        S_IRWXU|S_IRWXG|S_IRWXO))>  0) {
>> +           /* truncate file to length PCI device's memory */
>> +            if (ftruncate(fd, s->ivshmem_size) != 0) {
>> +                fprintf(stderr, "kvm_ivshmem: could not truncate shared
>> file\n");
>> +            }
>> +
>> +        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
>> +                        S_IRWXU|S_IRWXG|S_IRWXO))<  0) {
>> +            fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
>> +            exit(-1);
>> +        }
>> +
>> +        create_shared_memory_BAR(s, fd);
>> +
>> +    }
>> +
>> +
>> +    return 0;
>> +}
>> +
>> +static int pci_ivshmem_uninit(PCIDevice *dev)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>> +
>> +    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
>> +
>> +    return 0;
>> +}
>> +
>> +static PCIDeviceInfo ivshmem_info = {
>> +    .qdev.name  = "ivshmem",
>> +    .qdev.size  = sizeof(IVShmemState),
>> +    .qdev.reset = ivshmem_reset,
>> +    .init       = pci_ivshmem_init,
>> +    .exit       = pci_ivshmem_uninit,
>> +    .qdev.props = (Property[]) {
>> +        DEFINE_PROP_CHR("chardev", IVShmemState, chr),
>> +        DEFINE_PROP_STRING("size", IVShmemState, sizearg),
>> +        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
>> +        DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD,
>> false),
>> +        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI,
>> true),
>> +        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
>> +        DEFINE_PROP_END_OF_LIST(),
>> +    }
>> +};
>> +
>> +static void ivshmem_register_devices(void)
>> +{
>> +    pci_qdev_register(&ivshmem_info);
>> +}
>> +
>> +device_init(ivshmem_register_devices)
>> diff --git a/qemu-char.c b/qemu-char.c
>> index 048da3f..41cb8c7 100644
>> --- a/qemu-char.c
>> +++ b/qemu-char.c
>> @@ -2076,6 +2076,12 @@ static void tcp_chr_read(void *opaque)
>>      }
>>  }
>>
>> +CharDriverState *qemu_chr_open_eventfd(int eventfd){
>> +
>> +    return qemu_chr_open_fd(eventfd, eventfd);
>> +
>> +}
>> +
>>  static void tcp_chr_connect(void *opaque)
>>  {
>>      CharDriverState *chr = opaque;
>> diff --git a/qemu-char.h b/qemu-char.h
>> index 3a9427b..1571091 100644
>> --- a/qemu-char.h
>> +++ b/qemu-char.h
>> @@ -93,6 +93,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject
>> *ret_data);
>>  void qemu_chr_info(Monitor *mon, QObject **ret_data);
>>  CharDriverState *qemu_chr_find(const char *name);
>>
>> +/* add an eventfd to the qemu devices that are polled */
>> +CharDriverState *qemu_chr_open_eventfd(int eventfd);
>> +
>>  extern int term_escape_char;
>>
>>  /* async I/O support */
>> diff --git a/qemu-doc.texi b/qemu-doc.texi
>> index 6647b7b..2df4687 100644
>> --- a/qemu-doc.texi
>> +++ b/qemu-doc.texi
>> @@ -706,6 +706,31 @@ Using the @option{-net socket} option, it is possible
>> to make VLANs
>>  that span several QEMU instances. See @ref{sec_invocation} to have a
>>  basic example.
>>
>> +@section Other Devices
>> +
>> +@subsection Inter-VM Shared Memory device
>> +
>> +With KVM enabled on a Linux host, a shared memory device is available.
>>  Guests
>> +map a POSIX shared memory region into the guest as a PCI device that
>> enables
>> +zero-copy communication to the application level of the guests.  The
>> basic
>> +syntax is:
>> +
>> +@example
>> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm
>> name>]
>> +@end example
>> +
>> +If desired, interrupts can be sent between guest VMs accessing the same
>> shared
>> +memory region.  Interrupt support requires using a shared memory server
>> and
>> +using a chardev socket to connect to it.  The code for the shared memory
>> server
>> +is qemu.git/contrib/ivshmem-server.  An example syntax when using the
>> shared
>> +memory server is:
>> +
>> +@example
>> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm
>> name>]
>> +                        [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
>> +qemu -chardev socket,path=<path>,id=<id>
>> +@end example
>> +
>>  @node direct_linux_boot
>>  @section Direct Linux Boot
>>
>>
>
>
Avi Kivity - May 10, 2010, 11:59 a.m.
On 04/21/2010 08:53 PM, Cam Macdonell wrote:
> Support an inter-vm shared memory device that maps a shared-memory object as a
> PCI device in the guest.  This patch also supports interrupts between guests by
> communicating over a unix domain socket.  This patch applies to the qemu-kvm
> repository.
>
>      -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>
> Interrupts are supported between multiple VMs by using a shared memory server
> by using a chardev socket.
>
>      -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>                      [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
>      -chardev socket,path=<path>,id=<id>
>
> (shared memory server is qemu.git/contrib/ivshmem-server)
>
> Sample programs and init scripts are in a git repo here:
>
>
> +typedef struct EventfdEntry {
> +    PCIDevice *pdev;
> +    int vector;
> +} EventfdEntry;
> +
> +typedef struct IVShmemState {
> +    PCIDevice dev;
> +    uint32_t intrmask;
> +    uint32_t intrstatus;
> +    uint32_t doorbell;
> +
> +    CharDriverState * chr;
> +    CharDriverState ** eventfd_chr;
> +    int ivshmem_mmio_io_addr;
> +
> +    pcibus_t mmio_addr;
> +    unsigned long ivshmem_offset;
> +    uint64_t ivshmem_size; /* size of shared memory region */
> +    int shm_fd; /* shared memory file descriptor */
> +
> +    int nr_allocated_vms;
> +    /* array of eventfds for each guest */
> +    int ** eventfds;
> +    /* keep track of # of eventfds for each guest*/
> +    int * eventfds_posn_count;
>    

More readable:

   typedef struct Peer {
       int nb_eventfds;
       int *eventfds;
   } Peer;
   int nb_peers;
   Peer *peers;

Does eventfd_chr need to be there as well?

> +
> +    int nr_alloc_guests;
> +    int vm_id;
> +    int num_eventfds;
> +    uint32_t vectors;
> +    uint32_t features;
> +    EventfdEntry *eventfd_table;
> +
> +    char * shmobj;
> +    char * sizearg;
>    

Does this need to be part of the state?

> +} IVShmemState;
> +
> +/* registers for the Inter-VM shared memory device */
> +enum ivshmem_registers {
> +    IntrMask = 0,
> +    IntrStatus = 4,
> +    IVPosition = 8,
> +    Doorbell = 12,
> +};
> +
> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
> +    return (ivs->features&  (1<<  feature));
> +}
> +
> +static inline int is_power_of_two(int x) {
> +    return (x&  (x-1)) == 0;
> +}
>    

argument needs to be uint64_t to avoid overflow with large BARs.  Return 
type can be bool.

> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
> +{
> +    IVShmemState *s = opaque;
> +
> +    u_int64_t write_one = 1;
> +    u_int16_t dest = val>>  16;
> +    u_int16_t vector = val&  0xff;
> +
> +    addr&= 0xfe;
>    

Why 0xfe?  Can understand 0xfc or 0xff.

> +
> +    switch (addr)
> +    {
> +        case IntrMask:
> +            ivshmem_IntrMask_write(s, val);
> +            break;
> +
> +        case IntrStatus:
> +            ivshmem_IntrStatus_write(s, val);
> +            break;
> +
> +        case Doorbell:
> +            /* check doorbell range */
> +            if ((vector>= 0)&&  (vector<  s->eventfds_posn_count[dest])) {
>    

What if dest is too big?  We overflow s->eventfds_posn_count.
> +
> +static void close_guest_eventfds(IVShmemState *s, int posn)
> +{
> +    int i, guest_curr_max;
> +
> +    guest_curr_max = s->eventfds_posn_count[posn];
> +
> +    for (i = 0; i<  guest_curr_max; i++)
> +        close(s->eventfds[posn][i]);
> +
> +    free(s->eventfds[posn]);
>    

qemu_free().

> +/* this function increase the dynamic storage need to store data about other
> + * guests */
> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
> +
> +    int j, old_nr_alloc;
> +
> +    old_nr_alloc = s->nr_alloc_guests;
> +
> +    while (s->nr_alloc_guests<  new_min_size)
> +        s->nr_alloc_guests = s->nr_alloc_guests * 2;
> +
> +    IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nr_alloc_guests);
> +    s->eventfds = qemu_realloc(s->eventfds, s->nr_alloc_guests *
> +                                                        sizeof(int *));
> +    s->eventfds_posn_count = qemu_realloc(s->eventfds_posn_count,
> +                                                    s->nr_alloc_guests *
> +                                                        sizeof(int));
> +    s->eventfd_table = qemu_realloc(s->eventfd_table, s->nr_alloc_guests *
> +                                                    sizeof(EventfdEntry));
> +
> +    if ((s->eventfds == NULL) || (s->eventfds_posn_count == NULL) ||
> +            (s->eventfd_table == NULL)) {
> +        fprintf(stderr, "Allocation error - exiting\n");
> +        exit(1);
> +    }
> +
> +    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +        s->eventfd_chr = (CharDriverState **)qemu_realloc(s->eventfd_chr,
> +                                    s->nr_alloc_guests * sizeof(void *));
> +        if (s->eventfd_chr == NULL) {
> +            fprintf(stderr, "Allocation error - exiting\n");
> +            exit(1);
> +        }
> +    }
> +
> +    /* zero out new pointers */
> +    for (j = old_nr_alloc; j<  s->nr_alloc_guests; j++) {
> +        s->eventfds[j] = NULL;
>    

eventfds_posn_count and eventfd_table want zeroing as well.

> +    }
> +}
> +
> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
> +{
> +    IVShmemState *s = opaque;
> +    int incoming_fd, tmp_fd;
> +    int guest_curr_max;
> +    long incoming_posn;
> +
> +    memcpy(&incoming_posn, buf, sizeof(long));
> +    /* pick off s->chr->msgfd and store it, posn should accompany msg */
> +    tmp_fd = qemu_chr_get_msgfd(s->chr);
> +    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
> +
> +    /* make sure we have enough space for this guest */
> +    if (incoming_posn>= s->nr_alloc_guests) {
> +        increase_dynamic_storage(s, incoming_posn);
> +    }
> +
> +    if (tmp_fd == -1) {
> +        /* if posn is positive and unseen before then this is our posn*/
> +        if ((incoming_posn>= 0)&&  (s->eventfds[incoming_posn] == NULL)) {
> +            /* receive our posn */
> +            s->vm_id = incoming_posn;
> +            return;
> +        } else {
> +            /* otherwise an fd == -1 means an existing guest has gone away */
> +            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
> +            close_guest_eventfds(s, incoming_posn);
> +            return;
> +        }
> +    }
> +
> +    /* because of the implementation of get_msgfd, we need a dup */
> +    incoming_fd = dup(tmp_fd);
>    

Error check.

> +
> +    /* if the position is -1, then it's shared memory region fd */
> +    if (incoming_posn == -1) {
> +
> +        s->num_eventfds = 0;
> +
> +        if (check_shm_size(s, incoming_fd) == -1) {
> +            exit(-1);
> +        }
> +
> +        /* creating a BAR in qemu_chr callback may be crazy */
> +        create_shared_memory_BAR(s, incoming_fd);
>    

It probably is... why can't you create it during initialization?


> +
> +       return;
> +    }
> +
> +    /* each guest has an array of eventfds, and we keep track of how many
> +     * guests for each VM */
> +    guest_curr_max = s->eventfds_posn_count[incoming_posn];
> +    if (guest_curr_max == 0) {
> +        /* one eventfd per MSI vector */
> +        s->eventfds[incoming_posn] = (int *) qemu_malloc(s->vectors *
> +                                                                sizeof(int));
> +    }
> +
> +    /* this is an eventfd for a particular guest VM */
> +    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
> +                                                                incoming_fd);
> +    s->eventfds[incoming_posn][guest_curr_max] = incoming_fd;
> +
> +    /* increment count for particular guest */
> +    s->eventfds_posn_count[incoming_posn]++;
>    

Not sure I follow exactly, but perhaps this needs to be

     s->eventfds_posn_count[incoming_posn] = guest_curr_max + 1;

Oh, it is.

> +
> +        /* allocate/initialize space for interrupt handling */
> +        s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
> +        s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
> +        s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests * sizeof(int));
> +
> +        pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support interrupts */
>    

This is done by the guest BIOS.
Cam Macdonell - May 10, 2010, 3:22 p.m.
On Mon, May 10, 2010 at 5:59 AM, Avi Kivity <avi@redhat.com> wrote:
> On 04/21/2010 08:53 PM, Cam Macdonell wrote:
>>
>> Support an inter-vm shared memory device that maps a shared-memory object
>> as a
>> PCI device in the guest.  This patch also supports interrupts between
>> guest by
>> communicating over a unix domain socket.  This patch applies to the
>> qemu-kvm
>> repository.
>>
>>     -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>
>> Interrupts are supported between multiple VMs by using a shared memory
>> server
>> by using a chardev socket.
>>
>>     -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>                     [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
>>     -chardev socket,path=<path>,id=<id>
>>
>> (shared memory server is qemu.git/contrib/ivshmem-server)
>>
>> Sample programs and init scripts are in a git repo here:
>>
>>
>> +typedef struct EventfdEntry {
>> +    PCIDevice *pdev;
>> +    int vector;
>> +} EventfdEntry;
>> +
>> +typedef struct IVShmemState {
>> +    PCIDevice dev;
>> +    uint32_t intrmask;
>> +    uint32_t intrstatus;
>> +    uint32_t doorbell;
>> +
>> +    CharDriverState * chr;
>> +    CharDriverState ** eventfd_chr;
>> +    int ivshmem_mmio_io_addr;
>> +
>> +    pcibus_t mmio_addr;
>> +    unsigned long ivshmem_offset;
>> +    uint64_t ivshmem_size; /* size of shared memory region */
>> +    int shm_fd; /* shared memory file descriptor */
>> +
>> +    int nr_allocated_vms;
>> +    /* array of eventfds for each guest */
>> +    int ** eventfds;
>> +    /* keep track of # of eventfds for each guest*/
>> +    int * eventfds_posn_count;
>>
>
> More readable:
>
>  typedef struct Peer {
>      int nb_eventfds;
>      int *eventfds;
>  } Peer;
>  int nb_peers;
>  Peer *peers;
>
> Does eventfd_chr need to be there as well?
>
>> +
>> +    int nr_alloc_guests;
>> +    int vm_id;
>> +    int num_eventfds;
>> +    uint32_t vectors;
>> +    uint32_t features;
>> +    EventfdEntry *eventfd_table;
>> +
>> +    char * shmobj;
>> +    char * sizearg;
>>
>
> Does this need to be part of the state?
>
>> +} IVShmemState;
>> +
>> +/* registers for the Inter-VM shared memory device */
>> +enum ivshmem_registers {
>> +    IntrMask = 0,
>> +    IntrStatus = 4,
>> +    IVPosition = 8,
>> +    Doorbell = 12,
>> +};
>> +
>> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int
>> feature) {
>> +    return (ivs->features&  (1<<  feature));
>> +}
>> +
>> +static inline int is_power_of_two(int x) {
>> +    return (x&  (x-1)) == 0;
>> +}
>>
>
> argument needs to be uint64_t to avoid overflow with large BARs.  Return
> type can be bool.
>
>> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
>> +{
>> +    IVShmemState *s = opaque;
>> +
>> +    u_int64_t write_one = 1;
>> +    u_int16_t dest = val>>  16;
>> +    u_int16_t vector = val&  0xff;
>> +
>> +    addr&= 0xfe;
>>
>
> Why 0xfe?  Can understand 0xfc or 0xff.
>
>> +
>> +    switch (addr)
>> +    {
>> +        case IntrMask:
>> +            ivshmem_IntrMask_write(s, val);
>> +            break;
>> +
>> +        case IntrStatus:
>> +            ivshmem_IntrStatus_write(s, val);
>> +            break;
>> +
>> +        case Doorbell:
>> +            /* check doorbell range */
>> +            if ((vector>= 0)&&  (vector<  s->eventfds_posn_count[dest]))
>> {
>>
>
> What if dest is too big?  We overflow s->eventfds_posn_count.
>>
>> +
>> +static void close_guest_eventfds(IVShmemState *s, int posn)
>> +{
>> +    int i, guest_curr_max;
>> +
>> +    guest_curr_max = s->eventfds_posn_count[posn];
>> +
>> +    for (i = 0; i<  guest_curr_max; i++)
>> +        close(s->eventfds[posn][i]);
>> +
>> +    free(s->eventfds[posn]);
>>
>
> qemu_free().
>
>> +/* this function increase the dynamic storage need to store data about
>> other
>> + * guests */
>> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
>> +
>> +    int j, old_nr_alloc;
>> +
>> +    old_nr_alloc = s->nr_alloc_guests;
>> +
>> +    while (s->nr_alloc_guests<  new_min_size)
>> +        s->nr_alloc_guests = s->nr_alloc_guests * 2;
>> +
>> +    IVSHMEM_DPRINTF("bumping storage to %d guests\n",
>> s->nr_alloc_guests);
>> +    s->eventfds = qemu_realloc(s->eventfds, s->nr_alloc_guests *
>> +                                                        sizeof(int *));
>> +    s->eventfds_posn_count = qemu_realloc(s->eventfds_posn_count,
>> +                                                    s->nr_alloc_guests *
>> +                                                        sizeof(int));
>> +    s->eventfd_table = qemu_realloc(s->eventfd_table, s->nr_alloc_guests
>> *
>> +
>>  sizeof(EventfdEntry));
>> +
>> +    if ((s->eventfds == NULL) || (s->eventfds_posn_count == NULL) ||
>> +            (s->eventfd_table == NULL)) {
>> +        fprintf(stderr, "Allocation error - exiting\n");
>> +        exit(1);
>> +    }
>> +
>> +    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
>> +        s->eventfd_chr = (CharDriverState **)qemu_realloc(s->eventfd_chr,
>> +                                    s->nr_alloc_guests * sizeof(void *));
>> +        if (s->eventfd_chr == NULL) {
>> +            fprintf(stderr, "Allocation error - exiting\n");
>> +            exit(1);
>> +        }
>> +    }
>> +
>> +    /* zero out new pointers */
>> +    for (j = old_nr_alloc; j<  s->nr_alloc_guests; j++) {
>> +        s->eventfds[j] = NULL;
>>
>
> eventfds_posn_count and eventfd_table want zeroing as well.
>
>> +    }
>> +}
>> +
>> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
>> +{
>> +    IVShmemState *s = opaque;
>> +    int incoming_fd, tmp_fd;
>> +    int guest_curr_max;
>> +    long incoming_posn;
>> +
>> +    memcpy(&incoming_posn, buf, sizeof(long));
>> +    /* pick off s->chr->msgfd and store it, posn should accompany msg */
>> +    tmp_fd = qemu_chr_get_msgfd(s->chr);
>> +    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
>> +
>> +    /* make sure we have enough space for this guest */
>> +    if (incoming_posn>= s->nr_alloc_guests) {
>> +        increase_dynamic_storage(s, incoming_posn);
>> +    }
>> +
>> +    if (tmp_fd == -1) {
>> +        /* if posn is positive and unseen before then this is our posn*/
>> +        if ((incoming_posn>= 0)&&  (s->eventfds[incoming_posn] == NULL))
>> {
>> +            /* receive our posn */
>> +            s->vm_id = incoming_posn;
>> +            return;
>> +        } else {
>> +            /* otherwise an fd == -1 means an existing guest has gone
>> away */
>> +            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
>> +            close_guest_eventfds(s, incoming_posn);
>> +            return;
>> +        }
>> +    }
>> +
>> +    /* because of the implementation of get_msgfd, we need a dup */
>> +    incoming_fd = dup(tmp_fd);
>>
>
> Error check.
>
>> +
>> +    /* if the position is -1, then it's shared memory region fd */
>> +    if (incoming_posn == -1) {
>> +
>> +        s->num_eventfds = 0;
>> +
>> +        if (check_shm_size(s, incoming_fd) == -1) {
>> +            exit(-1);
>> +        }
>> +
>> +        /* creating a BAR in qemu_chr callback may be crazy */
>> +        create_shared_memory_BAR(s, incoming_fd);
>>
>
> It probably is... why can't you create it during initialization?

This is for the shared memory server implementation, so the fd for the
shared memory has to be received (over the qemu char device) from the
server before the BAR can be created via qemu_ram_mmap() which adds
the necessary memory

Otherwise, if the BAR is allocated during initialization, I would have
to use MAP_FIXED to mmap the memory.  This is what I did before the
qemu_ram_mmap() function was added.

>
>
>> +
>> +       return;
>> +    }
>> +
>> +    /* each guest has an array of eventfds, and we keep track of how many
>> +     * guests for each VM */
>> +    guest_curr_max = s->eventfds_posn_count[incoming_posn];
>> +    if (guest_curr_max == 0) {
>> +        /* one eventfd per MSI vector */
>> +        s->eventfds[incoming_posn] = (int *) qemu_malloc(s->vectors *
>> +
>>  sizeof(int));
>> +    }
>> +
>> +    /* this is an eventfd for a particular guest VM */
>> +    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn,
>> guest_curr_max,
>> +
>>  incoming_fd);
>> +    s->eventfds[incoming_posn][guest_curr_max] = incoming_fd;
>> +
>> +    /* increment count for particular guest */
>> +    s->eventfds_posn_count[incoming_posn]++;
>>
>
> Not sure I follow exactly, but perhaps this needs to be
>
>    s->eventfds_posn_count[incoming_posn] = guest_curr_max + 1;
>
> Oh, it is.
>
>> +
>> +        /* allocate/initialize space for interrupt handling */
>> +        s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
>> +        s->eventfd_table = qemu_mallocz(s->vectors *
>> sizeof(EventfdEntry));
>> +        s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests *
>> sizeof(int));
>> +
>> +        pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support
>> interrupts */
>>
>
> This is done by the guest BIOS.
>
>
> --
> error compiling committee.c: too many arguments to function
>
>
Avi Kivity - May 10, 2010, 3:28 p.m.
On 05/10/2010 06:22 PM, Cam Macdonell wrote:
>
>>
>>> +
>>> +    /* if the position is -1, then it's shared memory region fd */
>>> +    if (incoming_posn == -1) {
>>> +
>>> +        s->num_eventfds = 0;
>>> +
>>> +        if (check_shm_size(s, incoming_fd) == -1) {
>>> +            exit(-1);
>>> +        }
>>> +
>>> +        /* creating a BAR in qemu_chr callback may be crazy */
>>> +        create_shared_memory_BAR(s, incoming_fd);
>>>
>>>        
>> It probably is... why can't you create it during initialization?
>>      
> This is for the shared memory server implementation, so the fd for the
> shared memory has to be received (over the qemu char device) from the
> server before the BAR can be created via qemu_ram_mmap() which adds
> the necessary memory
>
>    


We could do the handshake during initialization.  I'm worried that the 
device will appear without the BAR, and strange things will happen.  But 
the chardev API is probably not geared for passing data during init.

Anthony, any ideas?

> Otherwise, if the BAR is allocated during initialization, I would have
> to use MAP_FIXED to mmap the memory.  This is what I did before the
> qemu_ram_mmap() function was added.
>    

What would happen to any data written to the BAR before the 
handshake completed?  I think it would disappear.

So it's a good idea to make the initialization process atomic.
Anthony Liguori - May 10, 2010, 3:38 p.m.
On 05/10/2010 10:28 AM, Avi Kivity wrote:
> On 05/10/2010 06:22 PM, Cam Macdonell wrote:
>>
>>>
>>>> +
>>>> +    /* if the position is -1, then it's shared memory region fd */
>>>> +    if (incoming_posn == -1) {
>>>> +
>>>> +        s->num_eventfds = 0;
>>>> +
>>>> +        if (check_shm_size(s, incoming_fd) == -1) {
>>>> +            exit(-1);
>>>> +        }
>>>> +
>>>> +        /* creating a BAR in qemu_chr callback may be crazy */
>>>> +        create_shared_memory_BAR(s, incoming_fd);
>>>>
>>> It probably is... why can't you create it during initialization?
>> This is for the shared memory server implementation, so the fd for the
>> shared memory has to be received (over the qemu char device) from the
>> server before the BAR can be created via qemu_ram_mmap() which adds
>> the necessary memory
>>
>
>
> We could do the handshake during initialization.  I'm worried that the 
> device will appear without the BAR, and strange things will happen.  
> But the chardev API is probably not geared for passing data during init.
>
> Anthony, any ideas?

Why can't we create the BAR with just normal RAM and then change it to a 
mmap()'d fd after initialization?  This behavior would be 
important for live migration as it would let you quickly migrate 
preserving the memory contents without waiting for an external program 
to reconnect.

Regards,

Anthony Liguori

>> Otherwise, if the BAR is allocated during initialization, I would have
>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>> qemu_ram_mmap() function was added.
>
> What would happen to any data written to the BAR before the the 
> handshake completed?  I think it would disappear.

You don't have to do MAP_FIXED.  You can allocate a ram area and map 
that in when disconnected.  When you connect, you create another ram 
area and memcpy() the previous ram area to the new one.  You then map 
the second ram area in.

 From the guest's perspective, it's totally transparent.  For the 
backend, I'd suggest having an explicit "initialized" ack or something 
so that it knows that the data is now mapped to the guest.

If you're doing just a ring queue in shared memory, it should allow 
disconnect/reconnect during live migration asynchronously to the actual 
qemu live migration.

Regards,

Anthony Liguori

> So it's a good idea to make the initialization process atomic.
>
Cam Macdonell - May 10, 2010, 3:41 p.m.
On Mon, May 10, 2010 at 9:28 AM, Avi Kivity <avi@redhat.com> wrote:
> On 05/10/2010 06:22 PM, Cam Macdonell wrote:
>>
>>>
>>>> +
>>>> +    /* if the position is -1, then it's shared memory region fd */
>>>> +    if (incoming_posn == -1) {
>>>> +
>>>> +        s->num_eventfds = 0;
>>>> +
>>>> +        if (check_shm_size(s, incoming_fd) == -1) {
>>>> +            exit(-1);
>>>> +        }
>>>> +
>>>> +        /* creating a BAR in qemu_chr callback may be crazy */
>>>> +        create_shared_memory_BAR(s, incoming_fd);
>>>>
>>>>
>>>
>>> It probably is... why can't you create it during initialization?
>>>
>>
>> This is for the shared memory server implementation, so the fd for the
>> shared memory has to be received (over the qemu char device) from the
>> server before the BAR can be created via qemu_ram_mmap() which adds
>> the necessary memory
>>
>>
>
>
> We could do the handshake during initialization.  I'm worried that the
> device will appear without the BAR, and strange things will happen.  But the
> chardev API is probably not geared for passing data during init.

More specifically, the challenge I've found is that there is no
function to tell a chardev to block and wait for the initialization
data.

>
> Anthony, any ideas?
>
>> Otherwise, if the BAR is allocated during initialization, I would have
>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>> qemu_ram_mmap() function was added.
>>
>
> What would happen to any data written to the BAR before the the handshake
> completed?  I think it would disappear.

But, the BAR isn't there until the handshake is completed.  Only after
receiving the shared memory fd does my device call pci_register_bar()
in the callback function.  So there may be a case with BAR2 (the
shared memory BAR) missing during initialization.  FWIW, I haven't
encountered this.

>
> So it's a good idea to make the initialization process atomic.
>
> --
> error compiling committee.c: too many arguments to function
>
>
Cam Macdonell - May 10, 2010, 4:20 p.m.
On Mon, May 10, 2010 at 9:38 AM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 05/10/2010 10:28 AM, Avi Kivity wrote:
>>
>> On 05/10/2010 06:22 PM, Cam Macdonell wrote:
>>>
>>>>
>>>>> +
>>>>> +    /* if the position is -1, then it's shared memory region fd */
>>>>> +    if (incoming_posn == -1) {
>>>>> +
>>>>> +        s->num_eventfds = 0;
>>>>> +
>>>>> +        if (check_shm_size(s, incoming_fd) == -1) {
>>>>> +            exit(-1);
>>>>> +        }
>>>>> +
>>>>> +        /* creating a BAR in qemu_chr callback may be crazy */
>>>>> +        create_shared_memory_BAR(s, incoming_fd);
>>>>>
>>>> It probably is... why can't you create it during initialization?
>>>
>>> This is for the shared memory server implementation, so the fd for the
>>> shared memory has to be received (over the qemu char device) from the
>>> server before the BAR can be created via qemu_ram_mmap() which adds
>>> the necessary memory
>>>
>>
>>
>> We could do the handshake during initialization.  I'm worried that the
>> device will appear without the BAR, and strange things will happen.  But the
>> chardev API is probably not geared for passing data during init.
>>
>> Anthony, any ideas?
>
> Why can't we create the BAR with just normal RAM and then change it to a
> mmap()'d fd after initialization?  This will be behavior would be important
> for live migration as it would let you quickly migrate preserving the memory
> contents without waiting for an external program to reconnect.
>
> Regards,
>
> Anthony Lioguori
>
>>> Otherwise, if the BAR is allocated during initialization, I would have
>>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>>> qemu_ram_mmap() function was added.
>>
>> What would happen to any data written to the BAR before the the handshake
>> completed?  I think it would disappear.
>
> You don't have to do MAP_FIXED.  You can allocate a ram area and map that in
> when disconnected.  When you connect, you create another ram area and
> memcpy() the previous ram area to the new one.  You then map the second ram
> area in.

the memcpy() would overwrite the contents of the shared memory each
time a guest joins, which would be dangerous.

>
> From the guest's perspective, it's totally transparent.  For the backend,
> I'd suggest having an explicit "initialized" ack or something so that it
> knows that the data is now mapped to the guest.

Yes, I think the ack is the way to go, so the guest has to be aware of
it.  Would setting a flag in the driver-specific config space be an
acceptable ack that the shared region is now mapped?

Cam
Avi Kivity - May 10, 2010, 4:40 p.m.
On 05/10/2010 06:41 PM, Cam Macdonell wrote:
>
>> What would happen to any data written to the BAR before the the handshake
>> completed?  I think it would disappear.
>>      
> But, the BAR isn't there until the handshake is completed.  Only after
> receiving the shared memory fd does my device call pci_register_bar()
> in the callback function.  So there may be a case with BAR2 (the
> shared memory BAR) missing during initialization.  FWIW, I haven't
> encountered this.
>    

Well, that violates PCI.  You can't have a PCI device with no BAR, then 
have a BAR appear.  It may work since the BAR is registered a lot faster 
than the BIOS is able to peek at it, but it's a race nevertheless.
Cam Macdonell - May 10, 2010, 4:48 p.m.
On Mon, May 10, 2010 at 10:40 AM, Avi Kivity <avi@redhat.com> wrote:
> On 05/10/2010 06:41 PM, Cam Macdonell wrote:
>>
>>> What would happen to any data written to the BAR before the the handshake
>>> completed?  I think it would disappear.
>>>
>>
>> But, the BAR isn't there until the handshake is completed.  Only after
>> receiving the shared memory fd does my device call pci_register_bar()
>> in the callback function.  So there may be a case with BAR2 (the
>> shared memory BAR) missing during initialization.  FWIW, I haven't
>> encountered this.
>>
>
> Well, that violates PCI.  You can't have a PCI device with no BAR, then have
> a BAR appear.  It may work since the BAR is registered a lot faster than the
> BIOS is able to peek at it, but it's a race nevertheless.

Agreed.  I'll get Anthony's idea up and running.  It seems that is the
way forward.

Cam
Anthony Liguori - May 10, 2010, 4:52 p.m.
On 05/10/2010 11:20 AM, Cam Macdonell wrote:
> On Mon, May 10, 2010 at 9:38 AM, Anthony Liguori<anthony@codemonkey.ws>  wrote:
>    
>> On 05/10/2010 10:28 AM, Avi Kivity wrote:
>>      
>>> On 05/10/2010 06:22 PM, Cam Macdonell wrote:
>>>        
>>>>          
>>>>>            
>>>>>> +
>>>>>> +    /* if the position is -1, then it's shared memory region fd */
>>>>>> +    if (incoming_posn == -1) {
>>>>>> +
>>>>>> +        s->num_eventfds = 0;
>>>>>> +
>>>>>> +        if (check_shm_size(s, incoming_fd) == -1) {
>>>>>> +            exit(-1);
>>>>>> +        }
>>>>>> +
>>>>>> +        /* creating a BAR in qemu_chr callback may be crazy */
>>>>>> +        create_shared_memory_BAR(s, incoming_fd);
>>>>>>
>>>>>>              
>>>>> It probably is... why can't you create it during initialization?
>>>>>            
>>>> This is for the shared memory server implementation, so the fd for the
>>>> shared memory has to be received (over the qemu char device) from the
>>>> server before the BAR can be created via qemu_ram_mmap() which adds
>>>> the necessary memory
>>>>
>>>>          
>>>
>>> We could do the handshake during initialization.  I'm worried that the
>>> device will appear without the BAR, and strange things will happen.  But the
>>> chardev API is probably not geared for passing data during init.
>>>
>>> Anthony, any ideas?
>>>        
>> Why can't we create the BAR with just normal RAM and then change it to a
>> mmap()'d fd after initialization?  This will be behavior would be important
>> for live migration as it would let you quickly migrate preserving the memory
>> contents without waiting for an external program to reconnect.
>>
>> Regards,
>>
>> Anthony Lioguori
>>
>>      
>>>> Otherwise, if the BAR is allocated during initialization, I would have
>>>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>>>> qemu_ram_mmap() function was added.
>>>>          
>>> What would happen to any data written to the BAR before the the handshake
>>> completed?  I think it would disappear.
>>>        
>> You don't have to do MAP_FIXED.  You can allocate a ram area and map that in
>> when disconnected.  When you connect, you create another ram area and
>> memcpy() the previous ram area to the new one.  You then map the second ram
>> area in.
>>      
> the memcpy() would overwrite the contents of the shared memory each
> time a guest joins which would be dangerous.
>    

I think those are reasonable semantics and it is really the only way to get 
guest-transparent reconnect.  The latter is pretty critical for guest 
transparent live migration.

>>  From the guest's perspective, it's totally transparent.  For the backend,
>> I'd suggest having an explicit "initialized" ack or something so that it
>> knows that the data is now mapped to the guest.
>>      
> Yes, I think the ack is the way to go, so the guest has to be aware of
> it.  Would setting a flag in the driver-specific config space be an
> acceptable ack that the shared region is now mapped?
>    

You know it's mapped because it's mapped when the pci map function 
returns.  You don't need the guest to explicitly tell you.

Regards,

Anthony Liguori

> Cam
>
Avi Kivity - May 10, 2010, 4:59 p.m.
On 05/10/2010 06:38 PM, Anthony Liguori wrote:
>
>>> Otherwise, if the BAR is allocated during initialization, I would have
>>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>>> qemu_ram_mmap() function was added.
>>
>> What would happen to any data written to the BAR before the the 
>> handshake completed?  I think it would disappear.
>
> You don't have to do MAP_FIXED.  You can allocate a ram area and map 
> that in when disconnected.  When you connect, you create another ram 
> area and memcpy() the previous ram area to the new one.  You then map 
> the second ram area in.

But it's a shared memory area.  Other peers could have connected and 
written some data in.  The memcpy() would destroy their data.

>
> From the guest's perspective, it's totally transparent.  For the 
> backend, I'd suggest having an explicit "initialized" ack or something 
> so that it knows that the data is now mapped to the guest.

 From the peers' perspective, it's non-transparent :(

Also it doubles the transient memory requirement.

>
> If you're doing just a ring queue in shared memory, it should allow 
> disconnect/reconnect during live migration asynchronously to the 
> actual qemu live migration.
>

Live migration of guests using shared memory is interesting.  You'd need 
to freeze all peers on one node, disconnect, reconnect, and restart them 
on the other node.
Anthony Liguori - May 10, 2010, 5:25 p.m.
On 05/10/2010 11:59 AM, Avi Kivity wrote:
> On 05/10/2010 06:38 PM, Anthony Liguori wrote:
>>
>>>> Otherwise, if the BAR is allocated during initialization, I would have
>>>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>>>> qemu_ram_mmap() function was added.
>>>
>>> What would happen to any data written to the BAR before the the 
>>> handshake completed?  I think it would disappear.
>>
>> You don't have to do MAP_FIXED.  You can allocate a ram area and map 
>> that in when disconnected.  When you connect, you create another ram 
>> area and memcpy() the previous ram area to the new one.  You then map 
>> the second ram area in.
>
> But it's a shared memory area.  Other peers could have connected and 
> written some data in.  The memcpy() would destroy their data.

Why try to attempt to support multi-master shared memory?  What's the 
use-case?

Regards,

Anthony Liguori

>>
>> From the guest's perspective, it's totally transparent.  For the 
>> backend, I'd suggest having an explicit "initialized" ack or 
>> something so that it knows that the data is now mapped to the guest.
>
> From the peers' perspective, it's non-transparent :(
>
> Also it doubles the transient memory requirement.
>
>>
>> If you're doing just a ring queue in shared memory, it should allow 
>> disconnect/reconnect during live migration asynchronously to the 
>> actual qemu live migration.
>>
>
> Live migration of guests using shared memory is interesting.  You'd 
> need to freeze all peers on one node, disconnect, reconnect, and 
> restart them on the other node.
>
Cam Macdonell - May 10, 2010, 5:43 p.m.
On Mon, May 10, 2010 at 11:25 AM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 05/10/2010 11:59 AM, Avi Kivity wrote:
>>
>> On 05/10/2010 06:38 PM, Anthony Liguori wrote:
>>>
>>>>> Otherwise, if the BAR is allocated during initialization, I would have
>>>>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>>>>> qemu_ram_mmap() function was added.
>>>>
>>>> What would happen to any data written to the BAR before the the
>>>> handshake completed?  I think it would disappear.
>>>
>>> You don't have to do MAP_FIXED.  You can allocate a ram area and map that
>>> in when disconnected.  When you connect, you create another ram area and
>>> memcpy() the previous ram area to the new one.  You then map the second ram
>>> area in.
>>
>> But it's a shared memory area.  Other peers could have connected and
>> written some data in.  The memcpy() would destroy their data.
>
> Why try to attempt to support multi-master shared memory?  What's the
> use-case?

I don't see it as multi-master, but that the latest guest to join
shouldn't have their contents take precedence.  In developing this
patch, my motivation has been to let the guests decide.  If the memcpy
is always done, even when no data is written, a guest cannot join
without overwriting everything.

One use case we're looking at is having VMs using a map reduce
framework like Hadoop or Phoenix running in VMs.  However, if a
workqueue is stored or data transfer passes through shared memory, a
system can't scale up the number of workers because each new guest
will erase the shared memory (and the workqueue or in progress data
transfer).

In cases where the latest guest to join wants to clear the memory, it
can do so without the automatic memcpy.  The guest can do a memset
once it knows the memory is attached.  My opinion is to leave it to
the guests and the application that is using the shared memory to
decide what to do on guest joins.

Cam

>
> Regards,
>
> Anthony Liguori
>
>>>
>>> From the guest's perspective, it's totally transparent.  For the backend,
>>> I'd suggest having an explicit "initialized" ack or something so that it
>>> knows that the data is now mapped to the guest.
>>
>> From the peers' perspective, it's non-transparent :(
>>
>> Also it doubles the transient memory requirement.
>>
>>>
>>> If you're doing just a ring queue in shared memory, it should allow
>>> disconnect/reconnect during live migration asynchronously to the actual qemu
>>> live migration.
>>>
>>
>> Live migration of guests using shared memory is interesting.  You'd need
>> to freeze all peers on one node, disconnect, reconnect, and restart them on
>> the other node.
>>
>
>
Anthony Liguori - May 10, 2010, 5:52 p.m.
On 05/10/2010 12:43 PM, Cam Macdonell wrote:
> On Mon, May 10, 2010 at 11:25 AM, Anthony Liguori<anthony@codemonkey.ws>  wrote:
>    
>> On 05/10/2010 11:59 AM, Avi Kivity wrote:
>>      
>>> On 05/10/2010 06:38 PM, Anthony Liguori wrote:
>>>        
>>>>          
>>>>>> Otherwise, if the BAR is allocated during initialization, I would have
>>>>>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>>>>>> qemu_ram_mmap() function was added.
>>>>>>              
>>>>> What would happen to any data written to the BAR before the the
>>>>> handshake completed?  I think it would disappear.
>>>>>            
>>>> You don't have to do MAP_FIXED.  You can allocate a ram area and map that
>>>> in when disconnected.  When you connect, you create another ram area and
>>>> memcpy() the previous ram area to the new one.  You then map the second ram
>>>> area in.
>>>>          
>>> But it's a shared memory area.  Other peers could have connected and
>>> written some data in.  The memcpy() would destroy their data.
>>>        
>> Why try to attempt to support multi-master shared memory?  What's the
>> use-case?
>>      
> I don't see it as multi-master, but that the latest guest to join
> shouldn't have their contents take precedence.  In developing this
> patch, my motivation has been to let the guests decide.  If the memcpy
> is always done, even when no data is written, a guest cannot join
> without overwriting everything.
>
> One use case we're looking at is having VMs using a map reduce
> framework like Hadoop or Phoenix running in VMs.  However, if a
> workqueue is stored or data transfer passes through shared memory, a
> system can't scale up the number of workers because each new guest
> will erase the shared memory (and the workqueue or in progress data
> transfer).
>    

(Replying again to list)

What data structure would you use?  For a lockless ring queue, you can 
only support a single producer and consumer.  To achieve bidirectional 
communication in virtio, we always use two queues.

If you're adding additional queues to support other levels of 
communication, you can always use different areas of shared memory.

I guess this is the point behind the doorbell mechanism?

Regards,

Anthony Liguori

> In cases where the latest guest to join wants to clear the memory, it
> can do so without the automatic memcpy.  The guest can do a memset
> once it knows the memory is attached.  My opinion is to leave it to
> the guests and the application that is using the shared memory to
> decide what to do on guest joins.
>
> Cam
>
>    
>> Regards,
>>
>> Anthony Liguori
>>
>>      
>>>>  From the guest's perspective, it's totally transparent.  For the backend,
>>>> I'd suggest having an explicit "initialized" ack or something so that it
>>>> knows that the data is now mapped to the guest.
>>>>          
>>>  From the peers' perspective, it's non-transparent :(
>>>
>>> Also it doubles the transient memory requirement.
>>>
>>>        
>>>> If you're doing just a ring queue in shared memory, it should allow
>>>> disconnect/reconnect during live migration asynchronously to the actual qemu
>>>> live migration.
>>>>
>>>>          
>>> Live migration of guests using shared memory is interesting.  You'd need
>>> to freeze all peers on one node, disconnect, reconnect, and restart them on
>>> the other node.
>>>
>>>        
>>
>>      
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Cam Macdonell - May 10, 2010, 6:01 p.m.
On Mon, May 10, 2010 at 11:52 AM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 05/10/2010 12:43 PM, Cam Macdonell wrote:
>>
>> On Mon, May 10, 2010 at 11:25 AM, Anthony Liguori<anthony@codemonkey.ws>
>>  wrote:
>>
>>>
>>> On 05/10/2010 11:59 AM, Avi Kivity wrote:
>>>
>>>>
>>>> On 05/10/2010 06:38 PM, Anthony Liguori wrote:
>>>>
>>>>>
>>>>>
>>>>>>>
>>>>>>> Otherwise, if the BAR is allocated during initialization, I would
>>>>>>> have
>>>>>>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>>>>>>> qemu_ram_mmap() function was added.
>>>>>>>
>>>>>>
>>>>>> What would happen to any data written to the BAR before the the
>>>>>> handshake completed?  I think it would disappear.
>>>>>>
>>>>>
>>>>> You don't have to do MAP_FIXED.  You can allocate a ram area and map
>>>>> that
>>>>> in when disconnected.  When you connect, you create another ram area
>>>>> and
>>>>> memcpy() the previous ram area to the new one.  You then map the second
>>>>> ram
>>>>> area in.
>>>>>
>>>>
>>>> But it's a shared memory area.  Other peers could have connected and
>>>> written some data in.  The memcpy() would destroy their data.
>>>>
>>>
>>> Why try to attempt to support multi-master shared memory?  What's the
>>> use-case?
>>>
>>
>> I don't see it as multi-master, but that the latest guest to join
>> shouldn't have their contents take precedence.  In developing this
>> patch, my motivation has been to let the guests decide.  If the memcpy
>> is always done, even when no data is written, a guest cannot join
>> without overwriting everything.
>>
>> One use case we're looking at is having VMs using a map reduce
>> framework like Hadoop or Phoenix running in VMs.  However, if a
>> workqueue is stored or data transfer passes through shared memory, a
>> system can't scale up the number of workers because each new guest
>> will erase the shared memory (and the workqueue or in progress data
>> transfer).
>>
>
> (Replying again to list)

Sorry about that.

> What data structure would you use?  For a lockless ring queue, you can only
> support a single producer and consumer.  To achieve bidirectional
> communication in virtio, we always use two queues.

MCS locks can work with multiple producer/consumers, either with busy
waiting or using the doorbell mechanism.

>
> If you're adding additional queues to support other levels of communication,
> you can always use different areas of shared memory.

True, and my point is simply that the memcpy would wipe those all out.

>
> I guess this is the point behind the doorbell mechanism?

Yes.

>
> Regards,
>
> Anthony Liguori
>
>> In cases where the latest guest to join wants to clear the memory, it
>> can do so without the automatic memcpy.  The guest can do a memset
>> once it knows the memory is attached.  My opinion is to leave it to
>> the guests and the application that is using the shared memory to
>> decide what to do on guest joins.
>>
>> Cam
>>
>>
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>
>>>>>
>>>>>  From the guest's perspective, it's totally transparent.  For the
>>>>> backend,
>>>>> I'd suggest having an explicit "initialized" ack or something so that
>>>>> it
>>>>> knows that the data is now mapped to the guest.
>>>>>
>>>>
>>>>  From the peers' perspective, it's non-transparent :(
>>>>
>>>> Also it doubles the transient memory requirement.
>>>>
>>>>
>>>>>
>>>>> If you're doing just a ring queue in shared memory, it should allow
>>>>> disconnect/reconnect during live migration asynchronously to the actual
>>>>> qemu
>>>>> live migration.
>>>>>
>>>>>
>>>>
>>>> Live migration of guests using shared memory is interesting.  You'd need
>>>> to freeze all peers on one node, disconnect, reconnect, and restart them
>>>> on
>>>> the other node.
>>>>
>>>>
>>>
>>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>
>
Cam Macdonell - May 10, 2010, 11:17 p.m.
On Mon, May 10, 2010 at 5:59 AM, Avi Kivity <avi@redhat.com> wrote:
> On 04/21/2010 08:53 PM, Cam Macdonell wrote:
>>
>> Support an inter-vm shared memory device that maps a shared-memory object
>> as a
>> PCI device in the guest.  This patch also supports interrupts between
>> guest by
>> communicating over a unix domain socket.  This patch applies to the
>> qemu-kvm
>> repository.
>>
>>     -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>
>> Interrupts are supported between multiple VMs by using a shared memory
>> server
>> by using a chardev socket.
>>
>>     -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>                     [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
>>     -chardev socket,path=<path>,id=<id>
>>
>> (shared memory server is qemu.git/contrib/ivshmem-server)
>>
>> Sample programs and init scripts are in a git repo here:
>>
>>
>> +typedef struct EventfdEntry {
>> +    PCIDevice *pdev;
>> +    int vector;
>> +} EventfdEntry;
>> +
>> +typedef struct IVShmemState {
>> +    PCIDevice dev;
>> +    uint32_t intrmask;
>> +    uint32_t intrstatus;
>> +    uint32_t doorbell;
>> +
>> +    CharDriverState * chr;
>> +    CharDriverState ** eventfd_chr;
>> +    int ivshmem_mmio_io_addr;
>> +
>> +    pcibus_t mmio_addr;
>> +    unsigned long ivshmem_offset;
>> +    uint64_t ivshmem_size; /* size of shared memory region */
>> +    int shm_fd; /* shared memory file descriptor */
>> +
>> +    int nr_allocated_vms;
>> +    /* array of eventfds for each guest */
>> +    int ** eventfds;
>> +    /* keep track of # of eventfds for each guest*/
>> +    int * eventfds_posn_count;
>>
>
> More readable:
>
>  typedef struct Peer {
>      int nb_eventfds;
>      int *eventfds;
>  } Peer;
>  int nb_peers;
>  Peer *peers;
>
> Does eventfd_chr need to be there as well?

No it does not; eventfd_chr stores character devices for receiving
interrupts when irqfd is not available, so we only need them for this
guest, not for our peers.

I've switched over to this more readable naming you've suggested.

>
>> +
>> +    int nr_alloc_guests;
>> +    int vm_id;
>> +    int num_eventfds;
>> +    uint32_t vectors;
>> +    uint32_t features;
>> +    EventfdEntry *eventfd_table;
>> +
>> +    char * shmobj;
>> +    char * sizearg;
>>
>
> Does this need to be part of the state?

They are because they're passed in as qdev properties from the
command-line so I thought they needed to be in the state struct to be
assigned via DEFINE_PROP_...

>
>> +} IVShmemState;
>> +
>> +/* registers for the Inter-VM shared memory device */
>> +enum ivshmem_registers {
>> +    IntrMask = 0,
>> +    IntrStatus = 4,
>> +    IVPosition = 8,
>> +    Doorbell = 12,
>> +};
>> +
>> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int
>> feature) {
>> +    return (ivs->features&  (1<<  feature));
>> +}
>> +
>> +static inline int is_power_of_two(int x) {
>> +    return (x&  (x-1)) == 0;
>> +}
>>
>
> argument needs to be uint64_t to avoid overflow with large BARs.  Return
> type can be bool.
>
>> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
>> +{
>> +    IVShmemState *s = opaque;
>> +
>> +    u_int64_t write_one = 1;
>> +    u_int16_t dest = val>>  16;
>> +    u_int16_t vector = val&  0xff;
>> +
>> +    addr&= 0xfe;
>>
>
> Why 0xfe?  Can understand 0xfc or 0xff.

Forgot to change to 0xfc when registers went from 16 to 32-bits.

>
>> +
>> +    switch (addr)
>> +    {
>> +        case IntrMask:
>> +            ivshmem_IntrMask_write(s, val);
>> +            break;
>> +
>> +        case IntrStatus:
>> +            ivshmem_IntrStatus_write(s, val);
>> +            break;
>> +
>> +        case Doorbell:
>> +            /* check doorbell range */
>> +            if ((vector>= 0)&&  (vector<  s->eventfds_posn_count[dest]))
>> {
>>
>
> What if dest is too big?  We overflow s->eventfds_posn_count.

added a check for that.

Thanks,
Cam
Avi Kivity - May 11, 2010, 7:55 a.m.
On 05/10/2010 08:25 PM, Anthony Liguori wrote:
> On 05/10/2010 11:59 AM, Avi Kivity wrote:
>> On 05/10/2010 06:38 PM, Anthony Liguori wrote:
>>>
>>>>> Otherwise, if the BAR is allocated during initialization, I would 
>>>>> have
>>>>> to use MAP_FIXED to mmap the memory.  This is what I did before the
>>>>> qemu_ram_mmap() function was added.
>>>>
>>>> What would happen to any data written to the BAR before the the 
>>>> handshake completed?  I think it would disappear.
>>>
>>> You don't have to do MAP_FIXED.  You can allocate a ram area and map 
>>> that in when disconnected.  When you connect, you create another ram 
>>> area and memcpy() the previous ram area to the new one.  You then 
>>> map the second ram area in.
>>
>> But it's a shared memory area.  Other peers could have connected and 
>> written some data in.  The memcpy() would destroy their data.
>
> Why try to attempt to support multi-master shared memory?  What's the 
> use-case?

(presuming you mean multiple writers?)

This is a surprising take.  What's the use of a single master shared 
memory area?

Most uses of shared memory among processes or threads are multi-master.  
One use case can be a shared cache among the various guests.
Avi Kivity - May 11, 2010, 7:59 a.m.
On 05/10/2010 08:52 PM, Anthony Liguori wrote:
>>> Why try to attempt to support multi-master shared memory?  What's the
>>> use-case?
>> I don't see it as multi-master, but that the latest guest to join
>> shouldn't have their contents take precedence.  In developing this
>> patch, my motivation has been to let the guests decide.  If the memcpy
>> is always done, even when no data is written, a guest cannot join
>> without overwriting everything.
>>
>> One use case we're looking at is having VMs using a map reduce
>> framework like Hadoop or Phoenix running in VMs.  However, if a
>> workqueue is stored or data transfer passes through shared memory, a
>> system can't scale up the number of workers because each new guest
>> will erase the shared memory (and the workqueue or in progress data
>> transfer).
>
> (Replying again to list)
>
> What data structure would you use?  For a lockless ring queue, you can 
> only support a single producer and consumer.  To achieve bidirectional 
> communication in virtio, we always use two queues.

You don't have to use a lockless ring queue.  You can use locks 
(spinlocks without interrupt support, full mutexes with interrupts) and 
any data structure you like.  Say a hash table + LRU for a shared cache.

>
> If you're adding additional queues to support other levels of 
> communication, you can always use different areas of shared memory.

You'll need O(n^2) shared memory areas (n=peer count), and it is a lot 
less flexible than real shared memory.  Consider using threading where 
the only communication among threads is a pipe (erlang?)
Avi Kivity - May 11, 2010, 8:03 a.m.
On 05/11/2010 02:17 AM, Cam Macdonell wrote:
> On Mon, May 10, 2010 at 5:59 AM, Avi Kivity<avi@redhat.com>  wrote:
>    
>> On 04/21/2010 08:53 PM, Cam Macdonell wrote:
>>      
>>> Support an inter-vm shared memory device that maps a shared-memory object
>>> as a
>>> PCI device in the guest.  This patch also supports interrupts between
>>> guest by
>>> communicating over a unix domain socket.  This patch applies to the
>>> qemu-kvm
>>> repository.
>>>
>>>      -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>>
>>> Interrupts are supported between multiple VMs by using a shared memory
>>> server
>>> by using a chardev socket.
>>>
>>>      -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>>>                      [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
>>>      -chardev socket,path=<path>,id=<id>
>>>
>>> (shared memory server is qemu.git/contrib/ivshmem-server)
>>>
>>> Sample programs and init scripts are in a git repo here:
>>>
>>>
>>> +typedef struct EventfdEntry {
>>> +    PCIDevice *pdev;
>>> +    int vector;
>>> +} EventfdEntry;
>>> +
>>> +typedef struct IVShmemState {
>>> +    PCIDevice dev;
>>> +    uint32_t intrmask;
>>> +    uint32_t intrstatus;
>>> +    uint32_t doorbell;
>>> +
>>> +    CharDriverState * chr;
>>> +    CharDriverState ** eventfd_chr;
>>> +    int ivshmem_mmio_io_addr;
>>> +
>>> +    pcibus_t mmio_addr;
>>> +    unsigned long ivshmem_offset;
>>> +    uint64_t ivshmem_size; /* size of shared memory region */
>>> +    int shm_fd; /* shared memory file descriptor */
>>> +
>>> +    int nr_allocated_vms;
>>> +    /* array of eventfds for each guest */
>>> +    int ** eventfds;
>>> +    /* keep track of # of eventfds for each guest*/
>>> +    int * eventfds_posn_count;
>>>
>>>        
>> More readable:
>>
>>   typedef struct Peer {
>>       int nb_eventfds;
>>       int *eventfds;
>>   } Peer;
>>   int nb_peers;
>>   Peer *peers;
>>
>> Does eventfd_chr need to be there as well?
>>      
> No it does not, eventfd_chr store character devices for receiving
> interrupts when irqfd is not available, so we only them for this
> guest, not for our peers.
>    

Ok.

>> Does this need to be part of the state?
>>      
> They are because they're passed in as qdev properties from the
> command-line so I thought they needed to be in the state struct to be
> assigned via DEFINE_PROP_...
>    

Well I'm not q-ualified to comment on qdev, so I'm fine either way with 
this.
Anthony Liguori - May 11, 2010, 1:10 p.m.
On 05/11/2010 02:59 AM, Avi Kivity wrote:
>> (Replying again to list)
>>
>> What data structure would you use?  For a lockless ring queue, you 
>> can only support a single producer and consumer.  To achieve 
>> bidirectional communication in virtio, we always use two queues.
>
>
> You don't have to use a lockless ring queue.  You can use locks 
> (spinlocks without interrupt support, full mutexes with interrupts) 
> and any data structure you like.  Say a hash table + LRU for a shared 
> cache.

Yeah, the mailslot enables this.

I think the question boils down to whether we can support transparent 
peer connections and disconnections.  I think that's important in order 
to support transparent live migration.

If you have two peers that are disconnected and then connect to each 
other, there's simply no way to choose whose content gets preserved.  
It's necessary to designate one peer as a master in order to break the tie.

So this could simply involve an additional option to the shared memory 
driver: role=master|peer.  If role=master, when a new shared memory 
segment is mapped, the contents of the BAR ram is memcpy()'d to the 
shared memory segment.  In either case, the contents of the shared 
memory segment should be memcpy()'d to the BAR ram whenever the shared 
memory segment is disconnected.

I believe role=master should be the default because I think a relationship 
of master/slave is going to be much more common than peering.

>>
>> If you're adding additional queues to support other levels of 
>> communication, you can always use different areas of shared memory.
>
> You'll need O(n^2) shared memory areas (n=peer count), and it is a lot 
> less flexible that real shared memory.  Consider using threading where 
> the only communication among threads is a pipe (erlang?)

I can't think of a use of multiple peers via shared memory today with 
virtualization.  I know lots of master/slave uses of shared memory 
though.  I agree that it's useful to support from an academic 
perspective but I don't believe it's going to be the common use.

Regards,

Anthony Liguori
Avi Kivity - May 11, 2010, 2:03 p.m.
On 05/11/2010 04:10 PM, Anthony Liguori wrote:
> On 05/11/2010 02:59 AM, Avi Kivity wrote:
>>> (Replying again to list)
>>>
>>> What data structure would you use?  For a lockless ring queue, you 
>>> can only support a single producer and consumer.  To achieve 
>>> bidirectional communication in virtio, we always use two queues.
>>
>>
>> You don't have to use a lockless ring queue.  You can use locks 
>> (spinlocks without interrupt support, full mutexes with interrupts) 
>> and any data structure you like.  Say a hash table + LRU for a shared 
>> cache.
>
> Yeah, the mailslot enables this.
>
> I think the question boils down to whether we can support transparent 
> peer connections and disconnections.  I think that's important in 
> order to support transparent live migration.
>
> If you have two peers that are disconnected and then connect to each 
> other, there's simply no way to choose who's content gets preserved.  
> It's necessary to designate one peer as a master in order to break the 
> tie.

The master is the shared memory area.  It's a completely separate entity 
that is represented by the backing file (or shared memory server handing 
out the fd to mmap).  It can exists independently of any guest.

>
> So this could simply involve an additional option to the shared memory 
> driver: role=master|peer.  If role=master, when a new shared memory 
> segment is mapped, the contents of the BAR ram is memcpy()'d to the 
> shared memory segment.  In either case, the contents of the shared 
> memory segment should be memcpy()'d to the BAR ram whenever the shared 
> memory segment is disconnected.

I don't understand why we need separate BAR ram and shared memory.  Have 
just shared memory, exposed by the BAR when connected.  When the PCI 
card is disconnected from shared memory, the BAR should discard writes 
and return all 1s for reads.

Having a temporary RAM area while disconnected doesn't serve a purpose 
(since it exists only for a short while) and increases the RAM load.

> I believe role=master should be default because I think a relationship 
> of master/slave is going to be much more common than peering.

What if you have N guests?  What if the master disconnects?

>
>>>
>>> If you're adding additional queues to support other levels of 
>>> communication, you can always use different areas of shared memory.
>>
>> You'll need O(n^2) shared memory areas (n=peer count), and it is a 
>> lot less flexible that real shared memory.  Consider using threading 
>> where the only communication among threads is a pipe (erlang?)
>
> I can't think of a use of multiple peers via shared memory today with 
> virtualization.  I know lots of master/slave uses of shared memory 
> though.  I agree that it's useful to support from an academic 
> perspective but I don't believe it's going to be the common use.

Large shared cache.  That use case even survives live migration if you 
use lockless algorithms.
Cam Macdonell - May 11, 2010, 2:17 p.m.
On Tue, May 11, 2010 at 8:03 AM, Avi Kivity <avi@redhat.com> wrote:
> On 05/11/2010 04:10 PM, Anthony Liguori wrote:
>>
>> On 05/11/2010 02:59 AM, Avi Kivity wrote:
>>>>
>>>> (Replying again to list)
>>>>
>>>> What data structure would you use?  For a lockless ring queue, you can
>>>> only support a single producer and consumer.  To achieve bidirectional
>>>> communication in virtio, we always use two queues.
>>>
>>>
>>> You don't have to use a lockless ring queue.  You can use locks
>>> (spinlocks without interrupt support, full mutexes with interrupts) and any
>>> data structure you like.  Say a hash table + LRU for a shared cache.
>>
>> Yeah, the mailslot enables this.
>>
>> I think the question boils down to whether we can support transparent peer
>> connections and disconnections.  I think that's important in order to
>> support transparent live migration.
>>
>> If you have two peers that are disconnected and then connect to each
>> other, there's simply no way to choose who's content gets preserved.  It's
>> necessary to designate one peer as a master in order to break the tie.
>
> The master is the shared memory area.  It's a completely separate entity
> that is represented by the backing file (or shared memory server handing out
> the fd to mmap).  It can exists independently of any guest.

I think the master/peer idea would be necessary if we were sharing
guest memory (sharing guest A's memory with guest B).  Then if the
master (guest A) dies, perhaps something needs to happen to preserve
the memory contents.  But since we're sharing host memory, the
applications in the guests can race to determine the master by
grabbing a lock at offset 0 or by using lowest VM ID.

Looking at it another way, it is the applications using shared memory
that may or may not need a master, the Qemu processes don't need the
concept of a master since the memory belongs to the host.

>
>>
>> So this could simply involve an additional option to the shared memory
>> driver: role=master|peer.  If role=master, when a new shared memory segment
>> is mapped, the contents of the BAR ram is memcpy()'d to the shared memory
>> segment.  In either case, the contents of the shared memory segment should
>> be memcpy()'d to the BAR ram whenever the shared memory segment is
>> disconnected.
>
> I don't understand why we need separate BAR ram and shared memory.  Have
> just shared memory, exposed by the BAR when connected.  When the PCI card is
> disconnected from shared memory, the BAR should discard writes and return
> all 1s for reads.
>
> Having a temporary RAM area while disconnected doesn't serve a purpose
> (since it exists only for a short while) and increases the RAM load.

I agree with Avi here.  If a guest wants to view shared memory, then
it needs to stay connected.  I think being able to see the contents
(via the memcpy()) even though the guest is disconnected would be
confusing.

>
>> I believe role=master should be default because I think a relationship of
>> master/slave is going to be much more common than peering.
>
> What if you have N guests?  What if the master disconnects?
>
>>
>>>>
>>>> If you're adding additional queues to support other levels of
>>>> communication, you can always use different areas of shared memory.
>>>
>>> You'll need O(n^2) shared memory areas (n=peer count), and it is a lot
>>> less flexible that real shared memory.  Consider using threading where the
>>> only communication among threads is a pipe (erlang?)
>>
>> I can't think of a use of multiple peers via shared memory today with
>> virtualization.  I know lots of master/slave uses of shared memory though.
>>  I agree that it's useful to support from an academic perspective but I
>> don't believe it's going to be the common use.
>
> Large shared cache.  That use case even survives live migration if you use
> lockless algorithms.
>
> --
> error compiling committee.c: too many arguments to function
>
>
Avi Kivity - May 11, 2010, 2:53 p.m.
On 05/11/2010 05:17 PM, Cam Macdonell wrote:
>
>> The master is the shared memory area.  It's a completely separate entity
>> that is represented by the backing file (or shared memory server handing out
>> the fd to mmap).  It can exists independently of any guest.
>>      
> I think the master/peer idea would be necessary if we were sharing
> guest memory (sharing guest A's memory with guest B).  Then if the
> master (guest A) dies, perhaps something needs to happen to preserve
> the memory contents.

Definitely.  But we aren't...

>    But since we're sharing host memory, the
> applications in the guests can race to determine the master by
> grabbing a lock at offset 0 or by using lowest VM ID.
>
> Looking at it another way, it is the applications using shared memory
> that may or may not need a master, the Qemu processes don't need the
> concept of a master since the memory belongs to the host.
>    

Exactly.  Furthermore, even in a master/slave relationship, there will 
be different masters for different sub-areas, it would be a pity to 
expose all this in the hardware abstraction.  This way we have an 
external device, and PCI HBAs which connect to it - just like a 
multi-tailed SCSI disk.
Anthony Liguori - May 11, 2010, 3:51 p.m.
On 05/11/2010 09:53 AM, Avi Kivity wrote:
> On 05/11/2010 05:17 PM, Cam Macdonell wrote:
>>
>>> The master is the shared memory area.  It's a completely separate 
>>> entity
>>> that is represented by the backing file (or shared memory server 
>>> handing out
>>> the fd to mmap).  It can exists independently of any guest.
>> I think the master/peer idea would be necessary if we were sharing
>> guest memory (sharing guest A's memory with guest B).  Then if the
>> master (guest A) dies, perhaps something needs to happen to preserve
>> the memory contents.
>
> Definitely.  But we aren't...

Then transparent live migration is impossible.  IMHO, that's a 
fundamental mistake that we will regret down the road.

>>    But since we're sharing host memory, the
>> applications in the guests can race to determine the master by
>> grabbing a lock at offset 0 or by using lowest VM ID.
>>
>> Looking at it another way, it is the applications using shared memory
>> that may or may not need a master, the Qemu processes don't need the
>> concept of a master since the memory belongs to the host.
>
> Exactly.  Furthermore, even in a master/slave relationship, there will 
> be different masters for different sub-areas, it would be a pity to 
> expose all this in the hardware abstraction.  This way we have an 
> external device, and PCI HBAs which connect to it - just like a 
> multi-tailed SCSI disk.

To support transparent live migration, it's necessary to do two things:

1) Preserve the memory contents of the PCI BAR after disconnected from a 
shared memory segment
2) Synchronize any changes made to the PCI BAR with the shared memory 
segment upon reconnect/initial connection.

N.B. savevm/loadvm both constitute disconnect and reconnect events 
respectively.

Supporting (1) is easy since we just need to memcpy() the contents of 
the shared memory segment to a temporary RAM area upon disconnect.

Supporting (2) is easy when the shared memory segment is viewed as owned 
by the guest since it has the definitive copy of the data.  IMHO, this 
is what role=master means.  However, if we want to support a model where 
the guest does not have a definitive copy of the data, upon reconnect, 
we need to throw away the guest's changes and make the shared memory 
segment appear to simultaneously update to the guest.  This is what 
role=peer means.

For role=peer, it's necessary to signal to the guest when it's not 
connected.  This means prior to savevm it's necessary to indicate to the 
guest that it's been disconnected.

I think it's important that we build this mechanism in from the start 
because as I've stated in the past, I don't think role=peer is going to 
be the dominant use-case.  I actually don't think that shared memory 
between guests is all that interesting compared to shared memory to an 
external process on the host.

Regards,

Anthony Liguori
Cam Macdonell - May 11, 2010, 4:39 p.m.
On Tue, May 11, 2010 at 9:51 AM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 05/11/2010 09:53 AM, Avi Kivity wrote:
>>
>> On 05/11/2010 05:17 PM, Cam Macdonell wrote:
>>>
>>>> The master is the shared memory area.  It's a completely separate entity
>>>> that is represented by the backing file (or shared memory server handing
>>>> out
>>>> the fd to mmap).  It can exists independently of any guest.
>>>
>>> I think the master/peer idea would be necessary if we were sharing
>>> guest memory (sharing guest A's memory with guest B).  Then if the
>>> master (guest A) dies, perhaps something needs to happen to preserve
>>> the memory contents.
>>
>> Definitely.  But we aren't...
>
> Then transparent live migration is impossible.  IMHO, that's a fundamental
> mistake that we will regret down the road.
>
>>>   But since we're sharing host memory, the
>>> applications in the guests can race to determine the master by
>>> grabbing a lock at offset 0 or by using lowest VM ID.
>>>
>>> Looking at it another way, it is the applications using shared memory
>>> that may or may not need a master, the Qemu processes don't need the
>>> concept of a master since the memory belongs to the host.
>>
>> Exactly.  Furthermore, even in a master/slave relationship, there will be
>> different masters for different sub-areas, it would be a pity to expose all
>> this in the hardware abstraction.  This way we have an external device, and
>> PCI HBAs which connect to it - just like a multi-tailed SCSI disk.
>
> To support transparent live migration, it's necessary to do two things:
>
> 1) Preserve the memory contents of the PCI BAR after disconnected from a
> shared memory segment
> 2) Synchronize any changes made to the PCI BAR with the shared memory
> segment upon reconnect/initial connection.
>
> N.B. savevm/loadvm both constitute disconnect and reconnect events
> respectively.
>
> Supporting (1) is easy since we just need to memcpy() the contents of the
> shared memory segment to a temporary RAM area upon disconnect.
>
> Supporting (2) is easy when the shared memory segment is viewed as owned by
> the guest since it has the definitive copy of the data.  IMHO, this is what
> role=master means.  However, if we want to support a model where the guest
> does not have a definitive copy of the data, upon reconnect, we need to
> throw away the guest's changes and make the shared memory segment appear to
> simultaneously update to the guest.  This is what role=peer means.
>
> For role=peer, it's necessary to signal to the guest when it's not
> connected.  This means prior to savevm it's necessary to indicate to the
> guest that it's been disconnected.
>
> I think it's important that we build this mechanism in from the start
> because as I've stated in the past, I don't think role=peer is going to be
> the dominant use-case.  I actually don't think that shared memory between
> guests is all that interesting compared to shared memory to an external
> process on the host.
>

Most of the people I hear from who are using my patch are using a peer
model to share data between applications (simulations, JVMs, etc).
But guest-to-host applications work as well of course.

I think "transparent migration" can be achieved by making the
connected/disconnected state transparent to the application.

When using the shared memory server, the server has to be setup anyway
on the new host and copying the memory region could be part of that as
well if the application needs the contents preserved.  I don't think
it has to be handled by the savevm/loadvm operations.  There's little
difference between naming one VM the master or letting the shared
memory server act like a master.

I think abstractions on top of shared memory could handle
disconnection issues (sort of how TCP handles them for networks) if
the application needs it.  Again, my opinion is to leave it to the
application to decide what is necessary.

Cam
Anthony Liguori - May 11, 2010, 5:05 p.m.
On 05/11/2010 11:39 AM, Cam Macdonell wrote:
>
> Most of the people I hear from who are using my patch are using a peer
> model to share data between applications (simulations, JVMs, etc).
> But guest-to-host applications work as well of course.
>
> I think "transparent migration" can be achieved by making the
> connected/disconnected state transparent to the application.
>
> When using the shared memory server, the server has to be setup anyway
> on the new host and copying the memory region could be part of that as
> well if the application needs the contents preserved.  I don't think
> it has to be handled by the savevm/loadvm operations.  There's little
> difference between naming one VM the master or letting the shared
> memory server act like a master.
>    

Except that to make it work with the shared memory server, you need the 
server to participate in the live migration protocol which is something 
I'd prefer to avoid, as it introduces additional down time.

Regards,

Anthony Liguori

> I think abstractions on top of shared memory could handle
> disconnection issues (sort of how TCP handles them for networks) if
> the application needs it.  Again, my opinion is to leave it to the
> application to decide what it necessary.
>
> Cam
>
Cam Macdonell - May 11, 2010, 5:50 p.m.
On Tue, May 11, 2010 at 11:05 AM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 05/11/2010 11:39 AM, Cam Macdonell wrote:
>>
>> Most of the people I hear from who are using my patch are using a peer
>> model to share data between applications (simulations, JVMs, etc).
>> But guest-to-host applications work as well of course.
>>
>> I think "transparent migration" can be achieved by making the
>> connected/disconnected state transparent to the application.
>>
>> When using the shared memory server, the server has to be setup anyway
>> on the new host and copying the memory region could be part of that as
>> well if the application needs the contents preserved.  I don't think
>> it has to be handled by the savevm/loadvm operations.  There's little
>> difference between naming one VM the master or letting the shared
>> memory server act like a master.
>>
>
> Except that to make it work with the shared memory server, you need the
> server to participate in the live migration protocol which is something I'd
> prefer to avoid at it introduces additional down time.

Fair enough, then to move to a resolution on this can we either

not support migration at this point, which leaves us free to add it
later as migration use cases become better understood. (my preference)

OR

1 - not support migration when the server is used
2 - if role=master is specified in the non-server case, then that
guest will copy the memory with it.  If role=peer is specified, the
guest will use the shared memory object on the destination host as is
(possibly creating it or output an error if memory object doesn't
exist).

Cam
Avi Kivity - May 11, 2010, 6:09 p.m.
On 05/11/2010 06:51 PM, Anthony Liguori wrote:
> On 05/11/2010 09:53 AM, Avi Kivity wrote:
>> On 05/11/2010 05:17 PM, Cam Macdonell wrote:
>>>
>>>> The master is the shared memory area.  It's a completely separate 
>>>> entity
>>>> that is represented by the backing file (or shared memory server 
>>>> handing out
>>>> the fd to mmap).  It can exists independently of any guest.
>>> I think the master/peer idea would be necessary if we were sharing
>>> guest memory (sharing guest A's memory with guest B).  Then if the
>>> master (guest A) dies, perhaps something needs to happen to preserve
>>> the memory contents.
>>
>> Definitely.  But we aren't...
>
> Then transparent live migration is impossible.  IMHO, that's a 
> fundamental mistake that we will regret down the road.

I don't see why the two cases are any different.  In all cases, all 
guests have to be migrated simultaneously, or we have to support 
distributed shared memory (likely at the kernel level).  Who owns the 
memory makes no difference.

There is a two non-transparent variants:
- forcibly disconnect the migrating guest, and migrate it later
   - puts all the burden on the guest application
- ask the guest to detach from the memory device
   - host is at the mercy of the guest

Since the consumers of shared memory are academia, they'll probably 
implement DSM.

>
>>>    But since we're sharing host memory, the
>>> applications in the guests can race to determine the master by
>>> grabbing a lock at offset 0 or by using lowest VM ID.
>>>
>>> Looking at it another way, it is the applications using shared memory
>>> that may or may not need a master, the Qemu processes don't need the
>>> concept of a master since the memory belongs to the host.
>>
>> Exactly.  Furthermore, even in a master/slave relationship, there 
>> will be different masters for different sub-areas, it would be a pity 
>> to expose all this in the hardware abstraction.  This way we have an 
>> external device, and PCI HBAs which connect to it - just like a 
>> multi-tailed SCSI disk.
>
> To support transparent live migration, it's necessary to do two things:
>
> 1) Preserve the memory contents of the PCI BAR after disconnected from 
> a shared memory segment
> 2) Synchronize any changes made to the PCI BAR with the shared memory 
> segment upon reconnect/initial connection.

Disconnect/reconnect mean it's no longer transparent.

>
> N.B. savevm/loadvm both constitute disconnect and reconnect events 
> respectively.
>
> Supporting (1) is easy since we just need to memcpy() the contents of 
> the shared memory segment to a temporary RAM area upon disconnect.
>
> Supporting (2) is easy when the shared memory segment is viewed as 
> owned by the guest since it has the definitive copy of the data.  
> IMHO, this is what role=master means. 

There is no 'the guest', if the memory is to be shared there will be 
multiple guests (or multiple entities).

> However, if we want to support a model where the guest does not have a 
> definitive copy of the data, upon reconnect, we need to throw away the 
> guest's changes and make the shared memory segment appear to 
> simultaneously update to the guest.  This is what role=peer means.
>
> For role=peer, it's necessary to signal to the guest when it's not 
> connected.  This means prior to savevm it's necessary to indicate to 
> the guest that it's been disconnected.
>
> I think it's important that we build this mechanism in from the start 
> because as I've stated in the past, I don't think role=peer is going 
> to be the dominant use-case.  I actually don't think that shared 
> memory between guests is all that interesting compared to shared 
> memory to an external process on the host.

I'd like to avoid making the distinction.  Why limit at the outset?
Avi Kivity - May 11, 2010, 6:13 p.m.
On 05/11/2010 08:05 PM, Anthony Liguori wrote:
> On 05/11/2010 11:39 AM, Cam Macdonell wrote:
>>
>> Most of the people I hear from who are using my patch are using a peer
>> model to share data between applications (simulations, JVMs, etc).
>> But guest-to-host applications work as well of course.
>>
>> I think "transparent migration" can be achieved by making the
>> connected/disconnected state transparent to the application.
>>
>> When using the shared memory server, the server has to be setup anyway
>> on the new host and copying the memory region could be part of that as
>> well if the application needs the contents preserved.  I don't think
>> it has to be handled by the savevm/loadvm operations.  There's little
>> difference between naming one VM the master or letting the shared
>> memory server act like a master.
>
> Except that to make it work with the shared memory server, you need 
> the server to participate in the live migration protocol which is 
> something I'd prefer to avoid at it introduces additional down time.

We can tunnel its migration data through qemu.  Of course, gathering its 
dirty bitmap will be interesting.  DSM may be the way to go here (we can 
even live migrate qemu through DSM: share the guest address space and 
immediately start running on the destination node; the guest will fault 
its memory to the destination.  An advantage is that the cpu load 
is immediately transferred.)
Cam Macdonell - May 12, 2010, 3:32 p.m.
On Tue, May 11, 2010 at 12:13 PM, Avi Kivity <avi@redhat.com> wrote:
> On 05/11/2010 08:05 PM, Anthony Liguori wrote:
>>
>> On 05/11/2010 11:39 AM, Cam Macdonell wrote:
>>>
>>> Most of the people I hear from who are using my patch are using a peer
>>> model to share data between applications (simulations, JVMs, etc).
>>> But guest-to-host applications work as well of course.
>>>
>>> I think "transparent migration" can be achieved by making the
>>> connected/disconnected state transparent to the application.
>>>
>>> When using the shared memory server, the server has to be setup anyway
>>> on the new host and copying the memory region could be part of that as
>>> well if the application needs the contents preserved.  I don't think
>>> it has to be handled by the savevm/loadvm operations.  There's little
>>> difference between naming one VM the master or letting the shared
>>> memory server act like a master.
>>
>> Except that to make it work with the shared memory server, you need the
>> server to participate in the live migration protocol which is something I'd
>> prefer to avoid at it introduces additional down time.
>
> We can tunnel its migration data through qemu.  Of course, gathering its
> dirty bitmap will be interesting.  DSM may be the way to go here (we can
> even live migrate qemu through DSM: share the guest address space and
> immediately start running on the destination node; the guest will fault its
> memory to the destination.  An advantage is that that the cpu load is
> immediately transferred.
>

Given the potential need to develop DSM and migrating multiple VMs
simultaneously as well as few details to decide on, can the patch
series (with other review tweaks fixed) be accepted without migration
support?  I'll continue to work on it of course, but I think the patch
is useful to users without migration at the moment.

Cam
Avi Kivity - May 12, 2010, 3:48 p.m.
On 05/12/2010 06:32 PM, Cam Macdonell wrote:
>
>> We can tunnel its migration data through qemu.  Of course, gathering its
>> dirty bitmap will be interesting.  DSM may be the way to go here (we can
>> even live migrate qemu through DSM: share the guest address space and
>> immediately start running on the destination node; the guest will fault its
>> memory to the destination.  An advantage is that that the cpu load is
>> immediately transferred.
>>
>>      
> Given the potential need to develop DSM and migrating multiple VMs
> simultaneously as well as few details to decide on, can the patch
> series (with other review tweaks fixed) be accepted without migration
> support?

Definitely.  I don't expect DSM to materialize tomorrow (or ever).
Avi Kivity - May 12, 2010, 3:49 p.m.
On 05/10/2010 07:48 PM, Cam Macdonell wrote:
> On Mon, May 10, 2010 at 10:40 AM, Avi Kivity<avi@redhat.com>  wrote:
>    
>> On 05/10/2010 06:41 PM, Cam Macdonell wrote:
>>      
>>>        
>>>> What would happen to any data written to the BAR before the the handshake
>>>> completed?  I think it would disappear.
>>>>
>>>>          
>>> But, the BAR isn't there until the handshake is completed.  Only after
>>> receiving the shared memory fd does my device call pci_register_bar()
>>> in the callback function.  So there may be a case with BAR2 (the
>>> shared memory BAR) missing during initialization.  FWIW, I haven't
>>> encountered this.
>>>
>>>        
>> Well, that violates PCI.  You can't have a PCI device with no BAR, then have
>> a BAR appear.  It may work since the BAR is registered a lot faster than the
>> BIOS is able to peek at it, but it's a race nevertheless.
>>      
> Agreed.  I'll get Anthony's idea up and running.  It seems that is the
> way forward.
>    

What, with the separate allocation and memcpy?  Or another one?

Why can't we complete initialization before exposing the card and BAR?  
Seems to be the simplest solution.
Cam Macdonell - May 12, 2010, 4:14 p.m.
On Wed, May 12, 2010 at 9:49 AM, Avi Kivity <avi@redhat.com> wrote:
> On 05/10/2010 07:48 PM, Cam Macdonell wrote:
>>
>> On Mon, May 10, 2010 at 10:40 AM, Avi Kivity<avi@redhat.com>  wrote:
>>
>>>
>>> On 05/10/2010 06:41 PM, Cam Macdonell wrote:
>>>
>>>>
>>>>
>>>>>
>>>>> What would happen to any data written to the BAR before the the
>>>>> handshake
>>>>> completed?  I think it would disappear.
>>>>>
>>>>>
>>>>
>>>> But, the BAR isn't there until the handshake is completed.  Only after
>>>> receiving the shared memory fd does my device call pci_register_bar()
>>>> in the callback function.  So there may be a case with BAR2 (the
>>>> shared memory BAR) missing during initialization.  FWIW, I haven't
>>>> encountered this.
>>>>
>>>>
>>>
>>> Well, that violates PCI.  You can't have a PCI device with no BAR, then
>>> have
>>> a BAR appear.  It may work since the BAR is registered a lot faster than
>>> the
>>> BIOS is able to peek at it, but it's a race nevertheless.
>>>
>>
>> Agreed.  I'll get Anthony's idea up and running.  It seems that is the
>> way forward.
>>
>
> What, with the separate allocation and memcpy?  Or another one?

Mapping in the memory when it is received from the server.

>
> Why can't we complete initialization before exposing the card and BAR?
>  Seems to be the simplest solution.

Looking at it more closely, you're right, the fds for shared
memory/eventfds are received in a fraction of a second, so that's why
I haven't seen any problems since the memory is mapped before the BIOS
detects and configures the device.

We can't block on a qemu char device (in anyway I can see) so we have
to handle mapping the memory BAR in the callback function.  But, we
can make the semantics that the VM ID is not set until the memory is
mapped.  So if the VM ID is -1, then the memory has not been mapped
yet, reads/writes work but don't do anything useful.  So the user can
detect the mapping of the memory
and it does not invalidate PCI since the BAR is always present, but
just not mapped to the shared memory.

Cam
Avi Kivity - May 12, 2010, 4:45 p.m.
On 05/12/2010 07:14 PM, Cam Macdonell wrote:
>
>> Why can't we complete initialization before exposing the card and BAR?
>>   Seems to be the simplest solution.
>>      
> Looking at it more closely, you're right, the fds for shared
> memory/eventfds are received in a fraction of a second, so that's why
> I haven't seen any problems since the memory is mapped before the BIOS
> detects and configures the device.
>
> We can't block on a qemu char device (in anyway I can see) so we have
> to handle mapping the memory BAR in the callback function.  But, we
> can make the semantics that the VM ID is not set until the memory is
> mapped.  So if the VM ID is -1, then the memory has not been mapped
> yet, reads/writes work but don't do anything useful.  So the user can
> detect the mapping of the memory
> and it does not invalidate PCI since the BAR is always present, but
> just not mapped to the shared memory.
>    

I don't like this very much.  We expose an internal qemu implementation 
detail, the lack of ability to complete negotiation during init, and 
make the device more error prone to use.

However, it does make some sense if we regard the device as an HBA 
accessing an external memory, so it's not unreasonable.  But please be 
sure to document this.
Cam Macdonell - May 13, 2010, 9:10 p.m.
On Mon, May 10, 2010 at 5:59 AM, Avi Kivity <avi@redhat.com> wrote:
> On 04/21/2010 08:53 PM, Cam Macdonell wrote:

>> +
>> +        /* allocate/initialize space for interrupt handling */
>> +        s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
>> +        s->eventfd_table = qemu_mallocz(s->vectors *
>> sizeof(EventfdEntry));
>> +        s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests *
>> sizeof(int));
>> +
>> +        pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support
>> interrupts */
>>
>
> This is done by the guest BIOS.
>
>

If I remove that line, my driver crashes when it falls back to
pin-based interrupts (when MSI is turned off).  Is there something in
the device driver that I need to set in place of this?  A number of
other devices (mostly network cards) set the interrupt pin this way,
so I'm a little confused.
Avi Kivity - May 15, 2010, 6:05 a.m.
On 05/14/2010 12:10 AM, Cam Macdonell wrote:
> On Mon, May 10, 2010 at 5:59 AM, Avi Kivity<avi@redhat.com>  wrote:
>    
>> On 04/21/2010 08:53 PM, Cam Macdonell wrote:
>>      
>    
>>> +
>>> +        /* allocate/initialize space for interrupt handling */
>>> +        s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
>>> +        s->eventfd_table = qemu_mallocz(s->vectors *
>>> sizeof(EventfdEntry));
>>> +        s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests *
>>> sizeof(int));
>>> +
>>> +        pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support
>>> interrupts */
>>>
>>>        
>> This is done by the guest BIOS.
>>
>>
>>      
> If I remove that line, my driver crashes when it falls back to
> pin-based interrupts (when MSI is turned off).  Is there something in
> the device driver that I need to set in place of this?  A number of
> other devices (mostly network cards) set the interrupt pin this way,
> so I'm a little confused.
>    

Sorry, I confused this with PCI_INTERRUPT_LINE.

Note there is a helper to set it, pci_config_set_interrupt_pin().
Cam Macdonell - May 18, 2010, 4:58 p.m.
On Mon, May 10, 2010 at 10:52 AM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> Yes, I think the ack is the way to go, so the guest has to be aware of
>> it.  Would setting a flag in the driver-specific config space be an
>> acceptable ack that the shared region is now mapped?
>>
>
> You know it's mapped because it's mapped when the pci map function returns.
>  You don't need the guest to explicitly tell you.
>

I've been playing with migration.  It appears that the memory is
preserved on migration in the default case which makes sense as it is
part of the qemu memory allocation.  In my current implementation, I
"map" the shared memory in by calling cpu_register_physical_memory()
with the offset returned from qemu_ram_map().

My question is how to I unregister the physical memory so it is not
copied on migration (for the role=peer case).  There isn't a
cpu_unregister_physical_memory().

Cam
Avi Kivity - May 18, 2010, 5:27 p.m.
On 05/18/2010 07:58 PM, Cam Macdonell wrote:
>
> My question is how to I unregister the physical memory so it is not
> copied on migration (for the role=peer case).  There isn't a
> cpu_unregister_physical_memory().
>    

It doesn't need to be unregistered, simply marked not migratable.  
Perhaps a flags argument to c_r_p_m().

Patch

diff --git a/Makefile.target b/Makefile.target
index 1ffd802..bc9a681 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -199,6 +199,9 @@  obj-$(CONFIG_USB_OHCI) += usb-ohci.o
 obj-y += rtl8139.o
 obj-y += e1000.o
 
+# Inter-VM PCI shared memory
+obj-y += ivshmem.o
+
 # Hardware support
 obj-i386-y = pckbd.o dma.o
 obj-i386-y += vga.o
diff --git a/hw/ivshmem.c b/hw/ivshmem.c
new file mode 100644
index 0000000..f8d8fdb
--- /dev/null
+++ b/hw/ivshmem.c
@@ -0,0 +1,727 @@ 
+/*
+ * Inter-VM Shared Memory PCI device.
+ *
+ * Author:
+ *      Cam Macdonell <cam@cs.ualberta.ca>
+ *
+ * Based On: cirrus_vga.c and rtl8139.c
+ *
+ * This code is licensed under the GNU GPL v2.
+ */
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include "hw.h"
+#include "console.h"
+#include "pc.h"
+#include "pci.h"
+#include "sysemu.h"
+
+#include "msix.h"
+#include "qemu-kvm.h"
+#include "libkvm.h"
+
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+
+#define IVSHMEM_IRQFD   0
+#define IVSHMEM_MSI     1
+
+#define DEBUG_IVSHMEM
+#ifdef DEBUG_IVSHMEM
+#define IVSHMEM_DPRINTF(fmt, args...)        \
+    do {printf("IVSHMEM: " fmt, ##args); } while (0)
+#else
+#define IVSHMEM_DPRINTF(fmt, args...)
+#endif
+
+typedef struct EventfdEntry {
+    PCIDevice *pdev;
+    int vector;
+} EventfdEntry;
+
+typedef struct IVShmemState {
+    PCIDevice dev;
+    uint32_t intrmask;
+    uint32_t intrstatus;
+    uint32_t doorbell;
+
+    CharDriverState * chr;
+    CharDriverState ** eventfd_chr;
+    int ivshmem_mmio_io_addr;
+
+    pcibus_t mmio_addr;
+    unsigned long ivshmem_offset;
+    uint64_t ivshmem_size; /* size of shared memory region */
+    int shm_fd; /* shared memory file descriptor */
+
+    int nr_allocated_vms;
+    /* array of eventfds for each guest */
+    int ** eventfds;
+    /* keep track of the number of eventfds for each guest */
+    int * eventfds_posn_count;
+
+    int nr_alloc_guests;
+    int vm_id;
+    int num_eventfds;
+    uint32_t vectors;
+    uint32_t features;
+    EventfdEntry *eventfd_table;
+
+    char * shmobj;
+    char * sizearg;
+} IVShmemState;
+
+/* registers for the Inter-VM shared memory device */
+enum ivshmem_registers {
+    IntrMask = 0,
+    IntrStatus = 4,
+    IVPosition = 8,
+    Doorbell = 12,
+};
+
+static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
+    return (ivs->features & (1 << feature));
+}
+
+static inline int is_power_of_two(int x) {
+    return (x & (x-1)) == 0;
+}
+
+static void ivshmem_map(PCIDevice *pci_dev, int region_num,
+                    pcibus_t addr, pcibus_t size, int type)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
+
+    IVSHMEM_DPRINTF("addr = %u size = %u\n", (uint32_t)addr, (uint32_t)size);
+    cpu_register_physical_memory(addr, s->ivshmem_size, s->ivshmem_offset);
+
+}
+
+/* accessing registers - based on rtl8139 */
+static void ivshmem_update_irq(IVShmemState *s, int val)
+{
+    int isr;
+    isr = (s->intrstatus & s->intrmask) & 0xffffffff;
+
+    /* don't print ISR resets */
+    if (isr) {
+        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
+           isr ? 1 : 0, s->intrstatus, s->intrmask);
+    }
+
+    qemu_set_irq(s->dev.irq[0], (isr != 0));
+}
+
+static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
+
+    s->intrmask = val;
+
+    ivshmem_update_irq(s, val);
+}
+
+static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrmask;
+
+    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
+
+    return ret;
+}
+
+static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
+
+    s->intrstatus = val;
+
+    ivshmem_update_irq(s, val);
+    return;
+}
+
+static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrstatus;
+
+    /* reading ISR clears all interrupts */
+    s->intrstatus = 0;
+
+    ivshmem_update_irq(s, 0);
+
+    return ret;
+}
+
+static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
+{
+
+    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
+}
+
+static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVShmemState *s = opaque;
+
+    u_int64_t write_one = 1;
+    u_int16_t dest = val >> 16;
+    u_int16_t vector = val & 0xff;
+
+    addr &= 0xfe;
+
+    switch (addr)
+    {
+        case IntrMask:
+            ivshmem_IntrMask_write(s, val);
+            break;
+
+        case IntrStatus:
+            ivshmem_IntrStatus_write(s, val);
+            break;
+
+        case Doorbell:
+            /* check doorbell range */
+            if ((vector >= 0) && (vector < s->eventfds_posn_count[dest])) {
+                IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n", write_one, dest, vector);
+                if (write(s->eventfds[dest][vector], &(write_one), 8) != 8) {
+                    IVSHMEM_DPRINTF("error writing to eventfd\n");
+                }
+            }
+            break;
+        default:
+            IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
+    }
+}
+
+static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
+}
+
+static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
+{
+
+    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
+    return 0;
+}
+
+static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
+{
+
+    IVShmemState *s = opaque;
+    uint32_t ret;
+
+    switch (addr)
+    {
+        case IntrMask:
+            ret = ivshmem_IntrMask_read(s);
+            break;
+
+        case IntrStatus:
+            ret = ivshmem_IntrStatus_read(s);
+            break;
+
+        case IVPosition:
+            /* return my id in the ivshmem list */
+            ret = s->vm_id;
+            break;
+
+        default:
+            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
+            ret = 0;
+    }
+
+    return ret;
+
+}
+
+static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
+{
+    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
+
+    return 0;
+}
+
+static void ivshmem_mmio_writeb(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writeb(opaque, addr & 0xFF, val);
+}
+
+static void ivshmem_mmio_writew(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writew(opaque, addr & 0xFF, val);
+}
+
+static void ivshmem_mmio_writel(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writel(opaque, addr & 0xFF, val);
+}
+
+static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    return ivshmem_io_readb(opaque, addr & 0xFF);
+}
+
+static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t val = ivshmem_io_readw(opaque, addr & 0xFF);
+    return val;
+}
+
+static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t val = ivshmem_io_readl(opaque, addr & 0xFF);
+    return val;
+}
+
+static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
+    ivshmem_mmio_readb,
+    ivshmem_mmio_readw,
+    ivshmem_mmio_readl,
+};
+
+static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
+    ivshmem_mmio_writeb,
+    ivshmem_mmio_writew,
+    ivshmem_mmio_writel,
+};
+
+static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
+{
+    IVShmemState *s = opaque;
+
+    ivshmem_IntrStatus_write(s, *buf);
+
+    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
+}
+
+static int ivshmem_can_receive(void * opaque)
+{
+    return 8;
+}
+
+static void ivshmem_event(void *opaque, int event)
+{
+    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
+}
+
+static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
+
+    EventfdEntry *entry = opaque;
+    PCIDevice *pdev = entry->pdev;
+
+    IVSHMEM_DPRINTF("fake irqfd on vector %d\n", entry->vector);
+    msix_notify(pdev, entry->vector);
+}
+
+static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
+                                                                    int vector)
+{
+    /* create an event character device based on the passed eventfd */
+    IVShmemState *s = opaque;
+    CharDriverState * chr;
+
+    chr = qemu_chr_open_eventfd(eventfd);
+
+    if (chr == NULL) {
+        IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);
+        exit(-1);
+    }
+
+    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
+        s->eventfd_table[vector].pdev = &s->dev;
+        s->eventfd_table[vector].vector = vector;
+
+        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
+                      ivshmem_event, &s->eventfd_table[vector]);
+    } else {
+        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
+                      ivshmem_event, s);
+    }
+
+    return chr;
+
+}
+
+static int check_shm_size(IVShmemState *s, int shmemfd) {
+    /* check that the guest isn't going to try and map more memory than the
+     * card server allocated; return -1 to indicate error */
+
+    struct stat buf;
+
+    fstat(shmemfd, &buf);
+
+    if (s->ivshmem_size > buf.st_size) {
+        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
+        fprintf(stderr, " than shared object size (%ld > %ld)\n",
+                                          s->ivshmem_size, buf.st_size);
+        return -1;
+    } else {
+        return 0;
+    }
+}
+
+static void create_shared_memory_BAR(IVShmemState *s, int fd) {
+
+    s->shm_fd = fd;
+
+    s->ivshmem_offset = qemu_ram_mmap(s->shm_fd, s->ivshmem_size,
+             MAP_SHARED, 0);
+
+    /* region for shared memory */
+    pci_register_bar(&s->dev, 2, s->ivshmem_size,
+                                    PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
+}
+
+static void close_guest_eventfds(IVShmemState *s, int posn)
+{
+    int i, guest_curr_max;
+
+    guest_curr_max = s->eventfds_posn_count[posn];
+
+    for (i = 0; i < guest_curr_max; i++)
+        close(s->eventfds[posn][i]);
+
+    free(s->eventfds[posn]);
+    s->eventfds_posn_count[posn] = 0;
+}
+
+/* this function increases the dynamic storage needed to store data about
+ * other guests */
+static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
+
+    int j, old_nr_alloc;
+
+    old_nr_alloc = s->nr_alloc_guests;
+
+    while (s->nr_alloc_guests < new_min_size)
+        s->nr_alloc_guests = s->nr_alloc_guests * 2;
+
+    IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nr_alloc_guests);
+    s->eventfds = qemu_realloc(s->eventfds, s->nr_alloc_guests *
+                                                        sizeof(int *));
+    s->eventfds_posn_count = qemu_realloc(s->eventfds_posn_count,
+                                                    s->nr_alloc_guests *
+                                                        sizeof(int));
+    s->eventfd_table = qemu_realloc(s->eventfd_table, s->nr_alloc_guests *
+                                                    sizeof(EventfdEntry));
+
+    if ((s->eventfds == NULL) || (s->eventfds_posn_count == NULL) ||
+            (s->eventfd_table == NULL)) {
+        fprintf(stderr, "Allocation error - exiting\n");
+        exit(1);
+    }
+
+    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
+        s->eventfd_chr = (CharDriverState **)qemu_realloc(s->eventfd_chr,
+                                    s->nr_alloc_guests * sizeof(void *));
+        if (s->eventfd_chr == NULL) {
+            fprintf(stderr, "Allocation error - exiting\n");
+            exit(1);
+        }
+    }
+
+    /* zero out new pointers */
+    for (j = old_nr_alloc; j < s->nr_alloc_guests; j++) {
+        s->eventfds[j] = NULL;
+    }
+}
+
+static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
+{
+    IVShmemState *s = opaque;
+    int incoming_fd, tmp_fd;
+    int guest_curr_max;
+    long incoming_posn;
+
+    memcpy(&incoming_posn, buf, sizeof(long));
+    /* pick off s->chr->msgfd and store it, posn should accompany msg */
+    tmp_fd = qemu_chr_get_msgfd(s->chr);
+    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
+
+    /* make sure we have enough space for this guest */
+    if (incoming_posn >= s->nr_alloc_guests) {
+        increase_dynamic_storage(s, incoming_posn);
+    }
+
+    if (tmp_fd == -1) {
+        /* if posn is positive and unseen before, then this is our posn */
+        if ((incoming_posn >= 0) && (s->eventfds[incoming_posn] == NULL)) {
+            /* receive our posn */
+            s->vm_id = incoming_posn;
+            return;
+        } else {
+            /* otherwise an fd == -1 means an existing guest has gone away */
+            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
+            close_guest_eventfds(s, incoming_posn);
+            return;
+        }
+    }
+
+    /* because of the implementation of get_msgfd, we need a dup */
+    incoming_fd = dup(tmp_fd);
+
+    /* if the position is -1, then it's the shared memory region's fd */
+    if (incoming_posn == -1) {
+
+        s->num_eventfds = 0;
+
+        if (check_shm_size(s, incoming_fd) == -1) {
+            exit(-1);
+        }
+
+        /* creating a BAR in qemu_chr callback may be crazy */
+        create_shared_memory_BAR(s, incoming_fd);
+
+       return;
+    }
+
+    /* each guest has an array of eventfds, and we keep track of how many
+     * eventfds each guest has */
+    guest_curr_max = s->eventfds_posn_count[incoming_posn];
+    if (guest_curr_max == 0) {
+        /* one eventfd per MSI vector */
+        s->eventfds[incoming_posn] = (int *) qemu_malloc(s->vectors *
+                                                                sizeof(int));
+    }
+
+    /* this is an eventfd for a particular guest VM */
+    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
+                                                                incoming_fd);
+    s->eventfds[incoming_posn][guest_curr_max] = incoming_fd;
+
+    /* increment count for particular guest */
+    s->eventfds_posn_count[incoming_posn]++;
+
+    /* ioeventfd and irqfd are enabled together,
+     * so the flag IRQFD refers to both */
+    if (ivshmem_has_feature(s, IVSHMEM_IRQFD) && guest_curr_max >= 0) {
+        /* allocate ioeventfd for the new fd
+         * received for guest @ incoming_posn */
+        kvm_set_ioeventfd_mmio_long(incoming_fd, s->mmio_addr + Doorbell,
+                                (incoming_posn << 16) | guest_curr_max, 1);
+    }
+
+    /* keep track of the maximum VM ID */
+    if (incoming_posn > s->num_eventfds) {
+        s->num_eventfds = incoming_posn;
+    }
+
+    if (incoming_posn == s->vm_id) {
+        if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
+            /* setup irqfd for this VM's eventfd */
+            int vector = guest_curr_max;
+            kvm_set_irqfd(s->eventfds[s->vm_id][guest_curr_max], vector,
+                                        s->dev.msix_irq_entries[vector].gsi);
+        } else {
+            /* initialize char device for callback
+             * if this is one of my eventfd */
+            s->eventfd_chr[guest_curr_max] = create_eventfd_chr_device(s,
+                s->eventfds[s->vm_id][guest_curr_max], guest_curr_max);
+        }
+    }
+
+    return;
+}
+
+static void ivshmem_reset(DeviceState *d)
+{
+    return;
+}
+
+static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
+                       pcibus_t addr, pcibus_t size, int type)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
+
+    s->mmio_addr = addr;
+    cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);
+
+    /* now that our mmio region has been allocated, we can receive
+     * the file descriptors */
+    if (s->chr != NULL) {
+        qemu_chr_add_handlers(s->chr, ivshmem_can_receive, ivshmem_read,
+                     ivshmem_event, s);
+    }
+
+}
+
+static uint64_t ivshmem_get_size(IVShmemState * s) {
+
+    uint64_t value;
+    char *ptr;
+
+    value = strtoul(s->sizearg, &ptr, 10);
+    switch (*ptr) {
+        case 0: case 'M': case 'm':
+            value <<= 20;
+            break;
+        case 'G': case 'g':
+            value <<= 30;
+            break;
+        default:
+            fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
+            exit(1);
+    }
+
+    /* BARs must be a power of 2 */
+    if (!is_power_of_two(value)) {
+        fprintf(stderr, "ivshmem: size must be power of 2\n");
+        exit(1);
+    }
+
+    return value;
+
+}
+
+static int pci_ivshmem_init(PCIDevice *dev)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+    uint8_t *pci_conf;
+    int i;
+
+    if (s->sizearg == NULL)
+        s->ivshmem_size = 4 << 20; /* 4 MB default */
+    else {
+        s->ivshmem_size = ivshmem_get_size(s);
+    }
+
+    /* IRQFD requires MSI */
+    if (ivshmem_has_feature(s, IVSHMEM_IRQFD) &&
+        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
+        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
+        exit(1);
+    }
+
+    pci_conf = s->dev.config;
+    pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
+    pci_conf[0x01] = 0x1a;
+    pci_conf[0x02] = 0x10;
+    pci_conf[0x03] = 0x11;
+    pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
+    pci_conf[0x0a] = 0x00; /* RAM controller */
+    pci_conf[0x0b] = 0x05;
+    pci_conf[0x0e] = 0x00; /* header_type */
+
+    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
+                                    ivshmem_mmio_write, s);
+    /* region for registers */
+    pci_register_bar(&s->dev, 0, 0x400,
+                           PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
+
+    /* allocate the MSI-X vectors */
+    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
+
+        if (!msix_init(&s->dev, s->vectors, 1, 0)) {
+            pci_register_bar(&s->dev, 1,
+                             msix_bar_size(&s->dev),
+                             PCI_BASE_ADDRESS_SPACE_MEMORY,
+                             msix_mmio_map);
+            IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
+        } else {
+            IVSHMEM_DPRINTF("msix initialization failed\n");
+        }
+
+        /* 'activate' the vectors */
+        for (i = 0; i < s->vectors; i++) {
+            msix_vector_use(&s->dev, i);
+        }
+    }
+
+    if ((s->chr != NULL) && (strncmp(s->chr->filename, "unix:", 5) == 0)) {
+        /* if we get a UNIX socket as the parameter we will talk
+         * to the ivshmem server later once the MMIO BAR is actually
+         * allocated (see ivshmem_mmio_map) */
+
+        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
+                                                            s->chr->filename);
+
+        /* we allocate enough space for 16 guests and grow as needed */
+        s->nr_alloc_guests = 16;
+        s->vm_id = -1;
+
+        /* allocate/initialize space for interrupt handling */
+        s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
+        s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
+        s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests * sizeof(int));
+
+        pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support interrupts */
+
+        if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
+            s->eventfd_chr = (CharDriverState **)qemu_malloc(s->nr_alloc_guests *
+                                                            sizeof(void *));
+        }
+
+    } else {
+        /* just map the file immediately, we're not using a server */
+        int fd;
+
+        if (s->shmobj == NULL) {
+            fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
+        }
+
+        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
+
+        /* try opening with O_EXCL and if it succeeds zero the memory
+         * by truncating to 0 */
+        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
+                        S_IRWXU|S_IRWXG|S_IRWXO)) > 0) {
+           /* truncate the file to the length of the PCI device's memory */
+            if (ftruncate(fd, s->ivshmem_size) != 0) {
+                fprintf(stderr, "kvm_ivshmem: could not truncate shared file\n");
+            }
+
+        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
+                        S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
+            fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
+            exit(-1);
+        }
+
+        create_shared_memory_BAR(s, fd);
+
+    }
+
+
+    return 0;
+}
+
+static int pci_ivshmem_uninit(PCIDevice *dev)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+
+    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
+
+    return 0;
+}
+
+static PCIDeviceInfo ivshmem_info = {
+    .qdev.name  = "ivshmem",
+    .qdev.size  = sizeof(IVShmemState),
+    .qdev.reset = ivshmem_reset,
+    .init       = pci_ivshmem_init,
+    .exit       = pci_ivshmem_uninit,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_CHR("chardev", IVShmemState, chr),
+        DEFINE_PROP_STRING("size", IVShmemState, sizearg),
+        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
+        DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, false),
+        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
+        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
+        DEFINE_PROP_END_OF_LIST(),
+    }
+};
+
+static void ivshmem_register_devices(void)
+{
+    pci_qdev_register(&ivshmem_info);
+}
+
+device_init(ivshmem_register_devices)
diff --git a/qemu-char.c b/qemu-char.c
index 048da3f..41cb8c7 100644
--- a/qemu-char.c
+++ b/qemu-char.c
@@ -2076,6 +2076,12 @@  static void tcp_chr_read(void *opaque)
     }
 }
 
+CharDriverState *qemu_chr_open_eventfd(int eventfd){
+
+    return qemu_chr_open_fd(eventfd, eventfd);
+
+}
+
 static void tcp_chr_connect(void *opaque)
 {
     CharDriverState *chr = opaque;
diff --git a/qemu-char.h b/qemu-char.h
index 3a9427b..1571091 100644
--- a/qemu-char.h
+++ b/qemu-char.h
@@ -93,6 +93,9 @@  void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
 void qemu_chr_info(Monitor *mon, QObject **ret_data);
 CharDriverState *qemu_chr_find(const char *name);
 
+/* add an eventfd to the qemu devices that are polled */
+CharDriverState *qemu_chr_open_eventfd(int eventfd);
+
 extern int term_escape_char;
 
 /* async I/O support */
diff --git a/qemu-doc.texi b/qemu-doc.texi
index 6647b7b..2df4687 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -706,6 +706,31 @@  Using the @option{-net socket} option, it is possible to make VLANs
 that span several QEMU instances. See @ref{sec_invocation} to have a
 basic example.
 
+@section Other Devices
+
+@subsection Inter-VM Shared Memory device
+
+With KVM enabled on a Linux host, a shared memory device is available.  Guests
+map a POSIX shared memory region into the guest as a PCI device that enables
+zero-copy communication to the application level of the guests.  The basic
+syntax is:
+
+@example
+qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
+@end example
+
+If desired, interrupts can be sent between guest VMs accessing the same shared
+memory region.  Interrupt support requires using a shared memory server and
+using a chardev socket to connect to it.  The code for the shared memory server
+is qemu.git/contrib/ivshmem-server.  An example syntax when using the shared
+memory server is:
+
+@example
+qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
+                        [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
+qemu -chardev socket,path=<path>,id=<id>
+@end example
+
 @node direct_linux_boot
 @section Direct Linux Boot