Patchwork VirtIO Support for Memory Regions

login
register
mail settings
Submitter Cam Macdonell
Date Feb. 23, 2010, 8:52 p.m.
Message ID <20100223205225.14860.50990.stgit@st-brides.cs.ualberta.ca>
Download mbox | patch
Permalink /patch/46098/
State New
Headers show

Comments

Cam Macdonell - Feb. 23, 2010, 8:52 p.m.
Support for passing memory regions via VirtIO to remove need for PCI
support in the guest.

Adds new vectors to VirtIO config space to pass memory regions similar
to how virtqueues are passed.

A kernel patch is forthcoming that adds device_ops to access the memory regions.

I have used this mechanism to implement my host shared memory implementation
and modified Alex's Virtio FB to use it as well.

Comments are welcome,

Thanks,
Cam
---
 hw/virtio-pci.c |   61 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 hw/virtio.c     |   38 ++++++++++++++++++++++++++++++++++
 hw/virtio.h     |   22 ++++++++++++++++++++
 3 files changed, 117 insertions(+), 4 deletions(-)
Anthony Liguori - Feb. 23, 2010, 9:05 p.m.
Hi Cam,

On 02/23/2010 02:52 PM, Cam Macdonell wrote:
> Support for passing memory regions via VirtIO to remove need for PCI
> support in the guest.
>
> Adds new vectors to VirtIO config space to pass memory regions similar
> to how virtqueues are passed.
>
> Kernel patch is forthcoming that add device_ops to access the memory regions.
>
> I have used this mechanism to implement my host shared memory implementation
> and modified Alex's Virtio FB to use it as well.
>    

Virtio is really a DMA engine.  One of the nice things about its design 
is that you can do things like transparent bounce buffering if needed.  
Adding a mechanism like this breaks this abstraction and turns virtio into 
something that's more than I think it should be.

For guest shared memory, what makes sense to me is to make use of 
uio_pci within a guest to map bars in userspace.  The device can have 
the first bar map a small MMIO region that is tied to a ioeventfd with 
KVM.  With just qemu, it can be tied to a normal eventfd.  You can use 
irqfd with KVM to tie an eventfd to the PCI irq.  Likewise, you can use 
a normal eventfd to emulate that.  I'd suggest using a /dev/shm file to 
act as the shared memory segment.  You'll need a little magic within 
qemu_ram_alloc() to use this (maybe qemu_ram_alloc() should take a 
context parameter so this can be setup).

You'll need to setup the backend with the name of the shared memory 
segment and the two eventfds.  Also, a user needs to specify a pci 
vendor/device id.  It's probably worth having some kind of identifier 
tag in the PCI device's first MMIO region.

I think there are a couple of important characteristics of an implementation 
like this.  By using uio_pci, you can implement shared memory in 
userspace but you can also implement a proper kernel driver if you so 
desired for a specific device.

Regards,

Anthony Liguori

> Comments are welcome,
>
> Thanks,
> Cam
> ---
>   hw/virtio-pci.c |   61 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>   hw/virtio.c     |   38 ++++++++++++++++++++++++++++++++++
>   hw/virtio.h     |   22 ++++++++++++++++++++
>   3 files changed, 117 insertions(+), 4 deletions(-)
>
> diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
> index f3373ae..e71bf64 100644
> --- a/hw/virtio-pci.c
> +++ b/hw/virtio-pci.c
> @@ -55,13 +55,13 @@
>
>   /* MSI-X registers: only enabled if MSI-X is enabled. */
>   /* A 16-bit vector for configuration changes. */
> -#define VIRTIO_MSI_CONFIG_VECTOR        20
> +#define VIRTIO_MSI_CONFIG_VECTOR        32
>   /* A 16-bit vector for selected queue notifications. */
> -#define VIRTIO_MSI_QUEUE_VECTOR         22
> +#define VIRTIO_MSI_QUEUE_VECTOR         34
>
>   /* Config space size */
> -#define VIRTIO_PCI_CONFIG_NOMSI         20
> -#define VIRTIO_PCI_CONFIG_MSI           24
> +#define VIRTIO_PCI_CONFIG_NOMSI         32
> +#define VIRTIO_PCI_CONFIG_MSI           36
>   #define VIRTIO_PCI_REGION_SIZE(dev)     (msix_present(dev) ? \
>                                            VIRTIO_PCI_CONFIG_MSI : \
>                                            VIRTIO_PCI_CONFIG_NOMSI)
> @@ -72,6 +72,12 @@
>                                            VIRTIO_PCI_CONFIG_MSI : \
>                                            VIRTIO_PCI_CONFIG_NOMSI)
>
> +#define VIRTIO_PCI_MEM_SEL                  20
> +
> +#define VIRTIO_PCI_MEM_SIZE                 24
> +
> +#define VIRTIO_PCI_MEM_PFN                  28
> +
>   /* Virtio ABI version, if we increment this, we break the guest driver. */
>   #define VIRTIO_PCI_ABI_VERSION          0
>
> @@ -227,6 +233,11 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>               val = VIRTIO_NO_VECTOR;
>           virtio_queue_set_vector(vdev, vdev->queue_sel, val);
>           break;
> +    case VIRTIO_PCI_MEM_SEL:
> +        fprintf(stderr, "%s: VIRTIO_PCI_MEM_SEL %d\n", __func__, val);
> +        if (val<  VIRTIO_PCI_MEM_MAX)
> +             vdev->mem_sel = val;
> +        break;
>       default:
>           fprintf(stderr, "%s: unexpected address 0x%x value 0x%x\n",
>                   __func__, addr, val);
> @@ -271,6 +282,14 @@ static uint32_t virtio_ioport_read(VirtIOPCIProxy *proxy, uint32_t addr)
>       case VIRTIO_MSI_QUEUE_VECTOR:
>           ret = virtio_queue_vector(vdev, vdev->queue_sel);
>           break;
> +    case VIRTIO_PCI_MEM_SIZE:
> +        ret = virtio_mem_get_size(vdev, vdev->mem_sel);
> +        break;
> +    case VIRTIO_PCI_MEM_PFN:
> +        ret = virtio_mem_get_addr(vdev, vdev->mem_sel)
> +>>  VIRTIO_PCI_QUEUE_ADDR_SHIFT;
> +        break;
> +
>       default:
>           break;
>       }
> @@ -370,6 +389,30 @@ static void virtio_map(PCIDevice *pci_dev, int region_num,
>           vdev->get_config(vdev, vdev->config);
>   }
>
> +
> +static void virtio_setup_mem(PCIDevice *d, int region_num,
> +                       pcibus_t addr, pcibus_t size, int type)
> +{
> +    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, d);
> +    VirtIODevice *vdev;
> +
> +    printf("XXX setmem to %#x - %#x\n", (uint32_t)addr, (uint32_t)(addr + size));
> +    vdev = proxy->vdev;
> +
> +    cpu_register_physical_memory(addr, size, IO_MEM_UNASSIGNED);
> +    cpu_register_physical_memory(addr, size, vdev->vmem[region_num-2].ram_addr | IO_MEM_RAM);
> +
> +    /* store the address of the region and get a pointer for it
> +     * that can access the mem region */
> +    vdev->vmem[region_num-2].guest_phys_addr = (target_phys_addr_t)addr;
> +
> +    if (vdev->vmem[region_num-2].on_map) {
> +        vdev->vmem[region_num-2].on_map(vdev, addr, size);
> +    }
> +    return;
> +
> +}
> +
>   static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
>                                   uint32_t val, int len)
>   {
> @@ -406,6 +449,7 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
>   {
>       uint8_t *config;
>       uint32_t size;
> +    int i;
>
>       proxy->vdev = vdev;
>
> @@ -426,6 +470,15 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
>
>       config[0x3d] = 1;
>
> +    // only BARs that were added with virtio_add_memory will be allocated here
> +    for (i = 0; i<  VIRTIO_PCI_QUEUE_MAX; i++) {
> +        if (vdev->vmem[i].size>  0) {
> +            pci_register_bar(&proxy->pci_dev, 2 + i, vdev->vmem[i].size,
> +                PCI_BASE_ADDRESS_SPACE_MEMORY, virtio_setup_mem);
> +        }
> +    }
> +
> +
>       if (vdev->nvectors&&  !msix_init(&proxy->pci_dev, vdev->nvectors, 1, 0)) {
>           pci_register_bar(&proxy->pci_dev, 1,
>                            msix_bar_size(&proxy->pci_dev),
> diff --git a/hw/virtio.c b/hw/virtio.c
> index 7c020a3..1f7924e 100644
> --- a/hw/virtio.c
> +++ b/hw/virtio.c
> @@ -549,6 +549,16 @@ target_phys_addr_t virtio_queue_get_addr(VirtIODevice *vdev, int n)
>       return vdev->vq[n].pa;
>   }
>
> +target_phys_addr_t virtio_mem_get_addr(VirtIODevice *vdev, int n)
> +{
> +    return vdev->vmem[n].guest_phys_addr;
> +}
> +
> +int virtio_mem_get_size(VirtIODevice *vdev, int n)
> +{
> +    return vdev->vmem[n].size;
> +}
> +
>   int virtio_queue_get_num(VirtIODevice *vdev, int n)
>   {
>       return vdev->vq[n].vring.num;
> @@ -592,6 +602,33 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
>       return&vdev->vq[i];
>   }
>
> +VirtMem *virtio_add_memory(VirtIODevice *vdev, int mem_size,
> +                            void (*handle_access)(VirtIODevice *,
> +                                target_phys_addr_t, uint32_t))
> +{
> +    int i;
> +
> +    /* trace what virtio_fb_setmem does */
> +
> +    for (i = 0; i<  VIRTIO_PCI_MEM_MAX; i++) {
> +        if (vdev->vmem[i].size == 0) {
> +            /* XXX: check that memory is a power of 2 */
> +            vdev->vmem[i].size = mem_size * 1024 * 1024; /* presume size is in MBs*/
> +            vdev->vmem[i].ram_addr = qemu_ram_alloc(vdev->vmem[i].size);
> +            vdev->vmem[i].host_ptr = qemu_get_ram_ptr(vdev->vmem[i].ram_addr);
> +            vdev->vmem[i].on_map = handle_access;
> +            break;
> +        }
> +    }
> +
> +    return&vdev->vmem[i];
> +}
> +
> +void * virtio_get_memory_ptr(VirtMem * vmem)
> +{
> +    return vmem->host_ptr;
> +}
> +
>   void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
>   {
>       /* Always notify when queue is empty (when feature acknowledge) */
> @@ -714,6 +751,7 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
>       vdev->queue_sel = 0;
>       vdev->config_vector = VIRTIO_NO_VECTOR;
>       vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
> +    vdev->vmem = qemu_mallocz(sizeof(VirtMem) * VIRTIO_PCI_MEM_MAX);
>       for(i = 0; i<  VIRTIO_PCI_QUEUE_MAX; i++)
>           vdev->vq[i].vector = VIRTIO_NO_VECTOR;
>
> diff --git a/hw/virtio.h b/hw/virtio.h
> index 3baa2a3..02beb27 100644
> --- a/hw/virtio.h
> +++ b/hw/virtio.h
> @@ -82,6 +82,15 @@ typedef struct VirtQueueElement
>       struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
>   } VirtQueueElement;
>
> +typedef struct VirtMem
> +{
> +    target_phys_addr_t guest_phys_addr;
> +    ram_addr_t ram_addr;
> +    int size;
> +    uint8_t * host_ptr;
> +    void (*on_map)(VirtIODevice *vdev, target_phys_addr_t, uint32_t);
> +} VirtMem;
> +
>   typedef struct {
>       void (*notify)(void * opaque, uint16_t vector);
>       void (*save_config)(void * opaque, QEMUFile *f);
> @@ -93,6 +102,8 @@ typedef struct {
>
>   #define VIRTIO_PCI_QUEUE_MAX 64
>
> +#define VIRTIO_PCI_MEM_MAX 16
> +
>   #define VIRTIO_NO_VECTOR 0xffff
>
>   struct VirtIODevice
> @@ -101,6 +112,7 @@ struct VirtIODevice
>       uint8_t status;
>       uint8_t isr;
>       uint16_t queue_sel;
> +    uint16_t mem_sel;
>       uint32_t guest_features;
>       size_t config_len;
>       void *config;
> @@ -113,6 +125,7 @@ struct VirtIODevice
>       void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
>       void (*reset)(VirtIODevice *vdev);
>       VirtQueue *vq;
> +    VirtMem * vmem;
>       const VirtIOBindings *binding;
>       void *binding_opaque;
>       uint16_t device_id;
> @@ -122,6 +135,12 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
>                               void (*handle_output)(VirtIODevice *,
>                                                     VirtQueue *));
>
> +void * virtio_get_memory_ptr(VirtMem * vmem);
> +VirtMem *virtio_add_memory(VirtIODevice *vdev, int mem_size,
> +                            void (*handle_access)(VirtIODevice *,
> +                                                  target_phys_addr_t,
> +                                                  uint32_t));
> +
>   void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
>                       unsigned int len);
>   void virtqueue_flush(VirtQueue *vq, unsigned int count);
> @@ -169,6 +188,9 @@ void virtio_update_irq(VirtIODevice *vdev);
>   void virtio_bind_device(VirtIODevice *vdev, const VirtIOBindings *binding,
>                           void *opaque);
>
> +target_phys_addr_t virtio_mem_get_addr(VirtIODevice *vdev, int n);
> +int virtio_mem_get_size(VirtIODevice *vdev, int n);
> +
>   /* Base devices.  */
>   VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf);
>   VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf);
>
>
>
Anthony Liguori - Feb. 23, 2010, 9:09 p.m.
On 02/23/2010 03:05 PM, Anthony Liguori wrote:
> Hi Cam,
>
> On 02/23/2010 02:52 PM, Cam Macdonell wrote:
>> Support for passing memory regions via VirtIO to remove need for PCI
>> support in the guest.
>>
>> Adds new vectors to VirtIO config space to pass memory regions similar
>> to how virtqueues are passed.
>>
>> Kernel patch is forthcoming that add device_ops to access the memory 
>> regions.
>>
>> I have used this mechanism to implement my host shared memory 
>> implementation
>> and modified Alex's Virtio FB to use it as well.
>
> Virtio is really a DMA engine.  One of the nice things about it's 
> design is that you can do things like transparent bounce buffering if 
> needed.  Adding a mechanism like this breaks this abstract and turns 
> virtio into something that's more than I think it should be.

More specifically, virtio does not assume cache coherent shared memory 
today.  While we assume this in the virtio-pci vring implementation, 
that's just an implementation detail of one transport.  Adding generic 
shared memory to virtio presumes that one can have cache coherent shared 
memory in any virtio transport which is not a good assumption to make.

Regards,

Anthony Liguori
Cam Macdonell - Feb. 24, 2010, 6:20 a.m.
On Tue, Feb 23, 2010 at 2:05 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Hi Cam,
>
> On 02/23/2010 02:52 PM, Cam Macdonell wrote:
>>
>> Support for passing memory regions via VirtIO to remove need for PCI
>> support in the guest.
>>
>> Adds new vectors to VirtIO config space to pass memory regions similar
>> to how virtqueues are passed.
>>
>> Kernel patch is forthcoming that add device_ops to access the memory
>> regions.
>>
>> I have used this mechanism to implement my host shared memory
>> implementation
>> and modified Alex's Virtio FB to use it as well.
>>
>
> Virtio is really a DMA engine.  One of the nice things about it's design is
> that you can do things like transparent bounce buffering if needed.  Adding
> a mechanism like this breaks this abstract and turns virtio into something
> that's more than I think it should be.
>
> For guest shared memory, what makes sense to me is to make use of uio_pci
> within a guest to map bars in userspace.  The device can have the first bar
> map a small MMIO region that is tied to a ioeventfd with KVM.  With just
> qemu, it can be tied to a normal eventfd.  You can use irqfd with KVM to tie
> an eventfd to the PCI irq.  Likewise, you can use a normal eventfd to
> emulate that.  I'd suggest using a /dev/shm file to act as the shared memory
> segment.  You'll need a little magic within qemu_ram_alloc() to use this
> (maybe qemu_ram_alloc() should take a context parameter so this can be
> setup).

Part of my goal with this patch was to support systems without PCI.
My previous patch

http://www.mail-archive.com/kvm@vger.kernel.org/msg14284.html

is a PCI implementation that uses eventfds (no ioeventfds yet) to
support interrupts between guests and a /dev/shm file to share between
guests.  I'm not sure what you're suggesting the interrupts and
eventfds would be used for if the mapping is initiated by the guest in
that case.

To be clear, this patch is not for shared memory per se, but adds
memory regions that have multiple uses and that don't rely on PCI to
be accessed.  For example, I have modified Alex's virtio frame buffer
as a test of this patch for a use that doesn't share memory between
guests.  But my interest particularly is to share memory using VirtIO.

In the response to my initial PCI implementation, you had suggested
using VirtIO, but the discussion around virtio not being ideal for
importing memory (from another guest or the host) led to Christian's
suggestion of the new device_ops which I ran with (slowly).

>
> You'll need to setup the backend with the name of the shared memory segment
> and the two eventfds.  Also, a user needs to specify a pci vendor/device id.
>  It's probably worth having some kind of identifier tag in the PCI device's
> first MMIO region.
>
> I think there's a couple important characteristics of an implementation like
> this.  By using a uio_pci, you can implement shared memory in userspace but
> you can also implement a proper kernel driver if you so desired for a
> specific device.

Agreed, you had suggested uio_pci for my PCI driver and I'd be happy
to work on that if we can agree on what the underlying device looks
like whether it should be VirtIO or just PCI.

Thanks for the feedback,
Cam

>
> Regards,
>
> Anthony Liguori
>
>> Comments are welcome,
>>
>> Thanks,
>> Cam
>> ---
>>  hw/virtio-pci.c |   61
>> +++++++++++++++++++++++++++++++++++++++++++++++++++----
>>  hw/virtio.c     |   38 ++++++++++++++++++++++++++++++++++
>>  hw/virtio.h     |   22 ++++++++++++++++++++
>>  3 files changed, 117 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
>> index f3373ae..e71bf64 100644
>> --- a/hw/virtio-pci.c
>> +++ b/hw/virtio-pci.c
>> @@ -55,13 +55,13 @@
>>
>>  /* MSI-X registers: only enabled if MSI-X is enabled. */
>>  /* A 16-bit vector for configuration changes. */
>> -#define VIRTIO_MSI_CONFIG_VECTOR        20
>> +#define VIRTIO_MSI_CONFIG_VECTOR        32
>>  /* A 16-bit vector for selected queue notifications. */
>> -#define VIRTIO_MSI_QUEUE_VECTOR         22
>> +#define VIRTIO_MSI_QUEUE_VECTOR         34
>>
>>  /* Config space size */
>> -#define VIRTIO_PCI_CONFIG_NOMSI         20
>> -#define VIRTIO_PCI_CONFIG_MSI           24
>> +#define VIRTIO_PCI_CONFIG_NOMSI         32
>> +#define VIRTIO_PCI_CONFIG_MSI           36
>>  #define VIRTIO_PCI_REGION_SIZE(dev)     (msix_present(dev) ? \
>>                                           VIRTIO_PCI_CONFIG_MSI : \
>>                                           VIRTIO_PCI_CONFIG_NOMSI)
>> @@ -72,6 +72,12 @@
>>                                           VIRTIO_PCI_CONFIG_MSI : \
>>                                           VIRTIO_PCI_CONFIG_NOMSI)
>>
>> +#define VIRTIO_PCI_MEM_SEL                  20
>> +
>> +#define VIRTIO_PCI_MEM_SIZE                 24
>> +
>> +#define VIRTIO_PCI_MEM_PFN                  28
>> +
>>  /* Virtio ABI version, if we increment this, we break the guest driver.
>> */
>>  #define VIRTIO_PCI_ABI_VERSION          0
>>
>> @@ -227,6 +233,11 @@ static void virtio_ioport_write(void *opaque,
>> uint32_t addr, uint32_t val)
>>              val = VIRTIO_NO_VECTOR;
>>          virtio_queue_set_vector(vdev, vdev->queue_sel, val);
>>          break;
>> +    case VIRTIO_PCI_MEM_SEL:
>> +        fprintf(stderr, "%s: VIRTIO_PCI_MEM_SEL %d\n", __func__, val);
>> +        if (val<  VIRTIO_PCI_MEM_MAX)
>> +             vdev->mem_sel = val;
>> +        break;
>>      default:
>>          fprintf(stderr, "%s: unexpected address 0x%x value 0x%x\n",
>>                  __func__, addr, val);
>> @@ -271,6 +282,14 @@ static uint32_t virtio_ioport_read(VirtIOPCIProxy
>> *proxy, uint32_t addr)
>>      case VIRTIO_MSI_QUEUE_VECTOR:
>>          ret = virtio_queue_vector(vdev, vdev->queue_sel);
>>          break;
>> +    case VIRTIO_PCI_MEM_SIZE:
>> +        ret = virtio_mem_get_size(vdev, vdev->mem_sel);
>> +        break;
>> +    case VIRTIO_PCI_MEM_PFN:
>> +        ret = virtio_mem_get_addr(vdev, vdev->mem_sel)
>> +>>  VIRTIO_PCI_QUEUE_ADDR_SHIFT;
>> +        break;
>> +
>>      default:
>>          break;
>>      }
>> @@ -370,6 +389,30 @@ static void virtio_map(PCIDevice *pci_dev, int
>> region_num,
>>          vdev->get_config(vdev, vdev->config);
>>  }
>>
>> +
>> +static void virtio_setup_mem(PCIDevice *d, int region_num,
>> +                       pcibus_t addr, pcibus_t size, int type)
>> +{
>> +    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, d);
>> +    VirtIODevice *vdev;
>> +
>> +    printf("XXX setmem to %#x - %#x\n", (uint32_t)addr, (uint32_t)(addr +
>> size));
>> +    vdev = proxy->vdev;
>> +
>> +    cpu_register_physical_memory(addr, size, IO_MEM_UNASSIGNED);
>> +    cpu_register_physical_memory(addr, size,
>> vdev->vmem[region_num-2].ram_addr | IO_MEM_RAM);
>> +
>> +    /* store the address of the region and get a pointer for it
>> +     * that can access the mem region */
>> +    vdev->vmem[region_num-2].guest_phys_addr = (target_phys_addr_t)addr;
>> +
>> +    if (vdev->vmem[region_num-2].on_map) {
>> +        vdev->vmem[region_num-2].on_map(vdev, addr, size);
>> +    }
>> +    return;
>> +
>> +}
>> +
>>  static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
>>                                  uint32_t val, int len)
>>  {
>> @@ -406,6 +449,7 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy,
>> VirtIODevice *vdev,
>>  {
>>      uint8_t *config;
>>      uint32_t size;
>> +    int i;
>>
>>      proxy->vdev = vdev;
>>
>> @@ -426,6 +470,15 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy,
>> VirtIODevice *vdev,
>>
>>      config[0x3d] = 1;
>>
>> +    // only BARs that were added with virtio_add_memory will be allocated
>> here
>> +    for (i = 0; i<  VIRTIO_PCI_QUEUE_MAX; i++) {
>> +        if (vdev->vmem[i].size>  0) {
>> +            pci_register_bar(&proxy->pci_dev, 2 + i, vdev->vmem[i].size,
>> +                PCI_BASE_ADDRESS_SPACE_MEMORY, virtio_setup_mem);
>> +        }
>> +    }
>> +
>> +
>>      if (vdev->nvectors&&  !msix_init(&proxy->pci_dev, vdev->nvectors, 1,
>> 0)) {
>>          pci_register_bar(&proxy->pci_dev, 1,
>>                           msix_bar_size(&proxy->pci_dev),
>> diff --git a/hw/virtio.c b/hw/virtio.c
>> index 7c020a3..1f7924e 100644
>> --- a/hw/virtio.c
>> +++ b/hw/virtio.c
>> @@ -549,6 +549,16 @@ target_phys_addr_t virtio_queue_get_addr(VirtIODevice
>> *vdev, int n)
>>      return vdev->vq[n].pa;
>>  }
>>
>> +target_phys_addr_t virtio_mem_get_addr(VirtIODevice *vdev, int n)
>> +{
>> +    return vdev->vmem[n].guest_phys_addr;
>> +}
>> +
>> +int virtio_mem_get_size(VirtIODevice *vdev, int n)
>> +{
>> +    return vdev->vmem[n].size;
>> +}
>> +
>>  int virtio_queue_get_num(VirtIODevice *vdev, int n)
>>  {
>>      return vdev->vq[n].vring.num;
>> @@ -592,6 +602,33 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int
>> queue_size,
>>      return&vdev->vq[i];
>>  }
>>
>> +VirtMem *virtio_add_memory(VirtIODevice *vdev, int mem_size,
>> +                            void (*handle_access)(VirtIODevice *,
>> +                                target_phys_addr_t, uint32_t))
>> +{
>> +    int i;
>> +
>> +    /* trace what virtio_fb_setmem does */
>> +
>> +    for (i = 0; i<  VIRTIO_PCI_MEM_MAX; i++) {
>> +        if (vdev->vmem[i].size == 0) {
>> +            /* XXX: check that memory is a power of 2 */
>> +            vdev->vmem[i].size = mem_size * 1024 * 1024; /* presume size
>> is in MBs*/
>> +            vdev->vmem[i].ram_addr = qemu_ram_alloc(vdev->vmem[i].size);
>> +            vdev->vmem[i].host_ptr =
>> qemu_get_ram_ptr(vdev->vmem[i].ram_addr);
>> +            vdev->vmem[i].on_map = handle_access;
>> +            break;
>> +        }
>> +    }
>> +
>> +    return&vdev->vmem[i];
>> +}
>> +
>> +void * virtio_get_memory_ptr(VirtMem * vmem)
>> +{
>> +    return vmem->host_ptr;
>> +}
>> +
>>  void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
>>  {
>>      /* Always notify when queue is empty (when feature acknowledge) */
>> @@ -714,6 +751,7 @@ VirtIODevice *virtio_common_init(const char *name,
>> uint16_t device_id,
>>      vdev->queue_sel = 0;
>>      vdev->config_vector = VIRTIO_NO_VECTOR;
>>      vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
>> +    vdev->vmem = qemu_mallocz(sizeof(VirtMem) * VIRTIO_PCI_MEM_MAX);
>>      for(i = 0; i<  VIRTIO_PCI_QUEUE_MAX; i++)
>>          vdev->vq[i].vector = VIRTIO_NO_VECTOR;
>>
>> diff --git a/hw/virtio.h b/hw/virtio.h
>> index 3baa2a3..02beb27 100644
>> --- a/hw/virtio.h
>> +++ b/hw/virtio.h
>> @@ -82,6 +82,15 @@ typedef struct VirtQueueElement
>>      struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
>>  } VirtQueueElement;
>>
>> +typedef struct VirtMem
>> +{
>> +    target_phys_addr_t guest_phys_addr;
>> +    ram_addr_t ram_addr;
>> +    int size;
>> +    uint8_t * host_ptr;
>> +    void (*on_map)(VirtIODevice *vdev, target_phys_addr_t, uint32_t);
>> +} VirtMem;
>> +
>>  typedef struct {
>>      void (*notify)(void * opaque, uint16_t vector);
>>      void (*save_config)(void * opaque, QEMUFile *f);
>> @@ -93,6 +102,8 @@ typedef struct {
>>
>>  #define VIRTIO_PCI_QUEUE_MAX 64
>>
>> +#define VIRTIO_PCI_MEM_MAX 16
>> +
>>  #define VIRTIO_NO_VECTOR 0xffff
>>
>>  struct VirtIODevice
>> @@ -101,6 +112,7 @@ struct VirtIODevice
>>      uint8_t status;
>>      uint8_t isr;
>>      uint16_t queue_sel;
>> +    uint16_t mem_sel;
>>      uint32_t guest_features;
>>      size_t config_len;
>>      void *config;
>> @@ -113,6 +125,7 @@ struct VirtIODevice
>>      void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
>>      void (*reset)(VirtIODevice *vdev);
>>      VirtQueue *vq;
>> +    VirtMem * vmem;
>>      const VirtIOBindings *binding;
>>      void *binding_opaque;
>>      uint16_t device_id;
>> @@ -122,6 +135,12 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int
>> queue_size,
>>                              void (*handle_output)(VirtIODevice *,
>>                                                    VirtQueue *));
>>
>> +void * virtio_get_memory_ptr(VirtMem * vmem);
>> +VirtMem *virtio_add_memory(VirtIODevice *vdev, int mem_size,
>> +                            void (*handle_access)(VirtIODevice *,
>> +                                                  target_phys_addr_t,
>> +                                                  uint32_t));
>> +
>>  void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
>>                      unsigned int len);
>>  void virtqueue_flush(VirtQueue *vq, unsigned int count);
>> @@ -169,6 +188,9 @@ void virtio_update_irq(VirtIODevice *vdev);
>>  void virtio_bind_device(VirtIODevice *vdev, const VirtIOBindings
>> *binding,
>>                          void *opaque);
>>
>> +target_phys_addr_t virtio_mem_get_addr(VirtIODevice *vdev, int n);
>> +int virtio_mem_get_size(VirtIODevice *vdev, int n);
>> +
>>  /* Base devices.  */
>>  VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf);
>>  VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf);
>>
>>
>>
>
>
Avi Kivity - Feb. 24, 2010, 11:14 a.m.
On 02/24/2010 08:20 AM, Cam Macdonell wrote:
>
>> Virtio is really a DMA engine.  One of the nice things about it's design is
>> that you can do things like transparent bounce buffering if needed.  Adding
>> a mechanism like this breaks this abstract and turns virtio into something
>> that's more than I think it should be.
>>
>> For guest shared memory, what makes sense to me is to make use of uio_pci
>> within a guest to map bars in userspace.  The device can have the first bar
>> map a small MMIO region that is tied to a ioeventfd with KVM.  With just
>> qemu, it can be tied to a normal eventfd.  You can use irqfd with KVM to tie
>> an eventfd to the PCI irq.  Likewise, you can use a normal eventfd to
>> emulate that.  I'd suggest using a /dev/shm file to act as the shared memory
>> segment.  You'll need a little magic within qemu_ram_alloc() to use this
>> (maybe qemu_ram_alloc() should take a context parameter so this can be
>> setup).
>>      
> Part of my goal with this patch was to support systems without PCI.
> My previous patch
>    

Which?  The only one I know of is s390 (and it would be a pity not to 
have shared memory for that).
Anthony Liguori - Feb. 24, 2010, 3:01 p.m.
On 02/24/2010 05:14 AM, Avi Kivity wrote:
> On 02/24/2010 08:20 AM, Cam Macdonell wrote:
>>
>>> Virtio is really a DMA engine.  One of the nice things about it's 
>>> design is
>>> that you can do things like transparent bounce buffering if needed.  
>>> Adding
>>> a mechanism like this breaks this abstract and turns virtio into 
>>> something
>>> that's more than I think it should be.
>>>
>>> For guest shared memory, what makes sense to me is to make use of 
>>> uio_pci
>>> within a guest to map bars in userspace.  The device can have the 
>>> first bar
>>> map a small MMIO region that is tied to a ioeventfd with KVM.  With 
>>> just
>>> qemu, it can be tied to a normal eventfd.  You can use irqfd with 
>>> KVM to tie
>>> an eventfd to the PCI irq.  Likewise, you can use a normal eventfd to
>>> emulate that.  I'd suggest using a /dev/shm file to act as the 
>>> shared memory
>>> segment.  You'll need a little magic within qemu_ram_alloc() to use 
>>> this
>>> (maybe qemu_ram_alloc() should take a context parameter so this can be
>>> setup).
>> Part of my goal with this patch was to support systems without PCI.
>> My previous patch
>
> Which?  The only one I know of is s390 (and it would be a pity not to 
> have shared memory for that).

uio provides insulation against devices and busses.  When you use uio in 
userspace, you don't have to have direct knowledge of the fact that a 
device is or isn't a PCI device.

Regards,

Anthony Liguori
Anthony Liguori - Feb. 24, 2010, 3:13 p.m.
On 02/24/2010 12:20 AM, Cam Macdonell wrote:
> On Tue, Feb 23, 2010 at 2:05 PM, Anthony Liguori<anthony@codemonkey.ws>  wrote:
>    
>> Hi Cam,
>>
>> On 02/23/2010 02:52 PM, Cam Macdonell wrote:
>>      
>>> Support for passing memory regions via VirtIO to remove need for PCI
>>> support in the guest.
>>>
>>> Adds new vectors to VirtIO config space to pass memory regions similar
>>> to how virtqueues are passed.
>>>
>>> Kernel patch is forthcoming that add device_ops to access the memory
>>> regions.
>>>
>>> I have used this mechanism to implement my host shared memory
>>> implementation
>>> and modified Alex's Virtio FB to use it as well.
>>>
>>>        
>> Virtio is really a DMA engine.  One of the nice things about it's design is
>> that you can do things like transparent bounce buffering if needed.  Adding
>> a mechanism like this breaks this abstract and turns virtio into something
>> that's more than I think it should be.
>>
>> For guest shared memory, what makes sense to me is to make use of uio_pci
>> within a guest to map bars in userspace.  The device can have the first bar
>> map a small MMIO region that is tied to a ioeventfd with KVM.  With just
>> qemu, it can be tied to a normal eventfd.  You can use irqfd with KVM to tie
>> an eventfd to the PCI irq.  Likewise, you can use a normal eventfd to
>> emulate that.  I'd suggest using a /dev/shm file to act as the shared memory
>> segment.  You'll need a little magic within qemu_ram_alloc() to use this
>> (maybe qemu_ram_alloc() should take a context parameter so this can be
>> setup).
>>      
> Part of my goal with this patch was to support systems without PCI.
> My previous patch
>
> http://www.mail-archive.com/kvm@vger.kernel.org/msg14284.html
>
> is a PCI implementation that uses eventfds (no ioeventfds yet) to
> support interrupts between guests and a /dev/shm file to share between
> guests.  I'm not sure what you're suggesting the interrupts and
> eventfds would be used for if the mapping is initiated by the guest in
> the case.
>
> To be clear, this patch is not for shared memory per se, but adds
> memory regions that have multiple uses and that doesn't rely on PCI to
> access them.  For example, I have modified Alex's virtio frame buffer
> as a test of this patch of a use that doesn't share memory between
> guests.  But my interest particularly is to share memory using VirtIO.
>    

Shared memory doesn't fit into the way virtio is modelled.  virtio is 
very device centric and it's generally impossible to have cache coherent 
shared memory with most types of devices (certainly with all PCI devices).

> In the response to my initial PCI implementation, you had suggested
> using VirtIO,

I was wrong in making that suggestion :-)

Technically, I suggested that you send a shared memory buffer over the 
ring, not add a shared memory pool to virtio.  The problem with this 
suggestion is that it again breaks the DMA model.

>> You'll need to setup the backend with the name of the shared memory segment
>> and the two eventfds.  Also, a user needs to specify a pci vendor/device id.
>>   It's probably worth having some kind of identifier tag in the PCI device's
>> first MMIO region.
>>
>> I think there's a couple important characteristics of an implementation like
>> this.  By using a uio_pci, you can implement shared memory in userspace but
>> you can also implement a proper kernel driver if you so desired for a
>> specific device.
>>      
> Agreed, you had suggested uio_pci for my PCI driver and I'd be happy
> to work on that if we can agree on what the underlying device looks
> like whether it should be VirtIO or just PCI.
>    

I feel pretty strongly that we shouldn't add shared memory to virtio.  
I'm pretty sure Christian would too because it eliminates the 
possibility of implementing a copy-based transport for virtio 
(virtio-over-ethernet).

With respect to making a generic shared memory mechanism that works for 
non-PCI systems, I'm somewhat indifferent.  We could create the 
virtio-equivalent of a shared memory mechanism but IMHO, shared memory 
is an imperfect mechanism and building that level of infrastructure 
around it isn't worthwhile.

Guest shared memory means you're implementing a device driver outside of 
qemu or potentially in another guest.  It's interesting for a few 
one-off circumstances but in general, it means things like live 
migration are extremely difficult.

I still think the right thing to do for virtio-framebuffer is to copy 
the data into a shadow framebuffer and send rectangular updates over 
virtio.  I know Alex disagrees, but since you have to do this in the 
backend anyway, I don't see the point in having qemu do this work 
instead of having the guest do the work.

Regards,

Anthony Liguori

> Thanks for the feedback,
> Cam
>
>    
>> Regards,
>>
>> Anthony Liguori
>>
>>      
>>> Comments are welcome,
>>>
>>> Thanks,
>>> Cam
>>> ---
>>>   hw/virtio-pci.c |   61
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++----
>>>   hw/virtio.c     |   38 ++++++++++++++++++++++++++++++++++
>>>   hw/virtio.h     |   22 ++++++++++++++++++++
>>>   3 files changed, 117 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
>>> index f3373ae..e71bf64 100644
>>> --- a/hw/virtio-pci.c
>>> +++ b/hw/virtio-pci.c
>>> @@ -55,13 +55,13 @@
>>>
>>>   /* MSI-X registers: only enabled if MSI-X is enabled. */
>>>   /* A 16-bit vector for configuration changes. */
>>> -#define VIRTIO_MSI_CONFIG_VECTOR        20
>>> +#define VIRTIO_MSI_CONFIG_VECTOR        32
>>>   /* A 16-bit vector for selected queue notifications. */
>>> -#define VIRTIO_MSI_QUEUE_VECTOR         22
>>> +#define VIRTIO_MSI_QUEUE_VECTOR         34
>>>
>>>   /* Config space size */
>>> -#define VIRTIO_PCI_CONFIG_NOMSI         20
>>> -#define VIRTIO_PCI_CONFIG_MSI           24
>>> +#define VIRTIO_PCI_CONFIG_NOMSI         32
>>> +#define VIRTIO_PCI_CONFIG_MSI           36
>>>   #define VIRTIO_PCI_REGION_SIZE(dev)     (msix_present(dev) ? \
>>>                                            VIRTIO_PCI_CONFIG_MSI : \
>>>                                            VIRTIO_PCI_CONFIG_NOMSI)
>>> @@ -72,6 +72,12 @@
>>>                                            VIRTIO_PCI_CONFIG_MSI : \
>>>                                            VIRTIO_PCI_CONFIG_NOMSI)
>>>
>>> +#define VIRTIO_PCI_MEM_SEL                  20
>>> +
>>> +#define VIRTIO_PCI_MEM_SIZE                 24
>>> +
>>> +#define VIRTIO_PCI_MEM_PFN                  28
>>> +
>>>   /* Virtio ABI version, if we increment this, we break the guest driver.
>>> */
>>>   #define VIRTIO_PCI_ABI_VERSION          0
>>>
>>> @@ -227,6 +233,11 @@ static void virtio_ioport_write(void *opaque,
>>> uint32_t addr, uint32_t val)
>>>               val = VIRTIO_NO_VECTOR;
>>>           virtio_queue_set_vector(vdev, vdev->queue_sel, val);
>>>           break;
>>> +    case VIRTIO_PCI_MEM_SEL:
>>> +        fprintf(stderr, "%s: VIRTIO_PCI_MEM_SEL %d\n", __func__, val);
>>> +        if (val < VIRTIO_PCI_MEM_MAX)
>>> +             vdev->mem_sel = val;
>>> +        break;
>>>       default:
>>>           fprintf(stderr, "%s: unexpected address 0x%x value 0x%x\n",
>>>                   __func__, addr, val);
>>> @@ -271,6 +282,14 @@ static uint32_t virtio_ioport_read(VirtIOPCIProxy
>>> *proxy, uint32_t addr)
>>>       case VIRTIO_MSI_QUEUE_VECTOR:
>>>           ret = virtio_queue_vector(vdev, vdev->queue_sel);
>>>           break;
>>> +    case VIRTIO_PCI_MEM_SIZE:
>>> +        ret = virtio_mem_get_size(vdev, vdev->mem_sel);
>>> +        break;
>>> +    case VIRTIO_PCI_MEM_PFN:
>>> +        ret = virtio_mem_get_addr(vdev, vdev->mem_sel)
>>> +              >> VIRTIO_PCI_QUEUE_ADDR_SHIFT;
>>> +        break;
>>> +
>>>       default:
>>>           break;
>>>       }
>>> @@ -370,6 +389,30 @@ static void virtio_map(PCIDevice *pci_dev, int
>>> region_num,
>>>           vdev->get_config(vdev, vdev->config);
>>>   }
>>>
>>> +
>>> +static void virtio_setup_mem(PCIDevice *d, int region_num,
>>> +                       pcibus_t addr, pcibus_t size, int type)
>>> +{
>>> +    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, d);
>>> +    VirtIODevice *vdev;
>>> +
>>> +    printf("XXX setmem to %#x - %#x\n", (uint32_t)addr, (uint32_t)(addr +
>>> size));
>>> +    vdev = proxy->vdev;
>>> +
>>> +    cpu_register_physical_memory(addr, size, IO_MEM_UNASSIGNED);
>>> +    cpu_register_physical_memory(addr, size,
>>> vdev->vmem[region_num-2].ram_addr | IO_MEM_RAM);
>>> +
>>> +    /* store the address of the region and get a pointer for it
>>> +     * that can access the mem region */
>>> +    vdev->vmem[region_num-2].guest_phys_addr = (target_phys_addr_t)addr;
>>> +
>>> +    if (vdev->vmem[region_num-2].on_map) {
>>> +        vdev->vmem[region_num-2].on_map(vdev, addr, size);
>>> +    }
>>> +    return;
>>> +
>>> +}
>>> +
>>>   static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
>>>                                   uint32_t val, int len)
>>>   {
>>> @@ -406,6 +449,7 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy,
>>> VirtIODevice *vdev,
>>>   {
>>>       uint8_t *config;
>>>       uint32_t size;
>>> +    int i;
>>>
>>>       proxy->vdev = vdev;
>>>
>>> @@ -426,6 +470,15 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy,
>>> VirtIODevice *vdev,
>>>
>>>       config[0x3d] = 1;
>>>
>>> +    // only BARs that were added with virtio_add_memory will be allocated
>>> here
>>> +    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
>>> +        if (vdev->vmem[i].size > 0) {
>>> +            pci_register_bar(&proxy->pci_dev, 2 + i, vdev->vmem[i].size,
>>> +                PCI_BASE_ADDRESS_SPACE_MEMORY, virtio_setup_mem);
>>> +        }
>>> +    }
>>> +
>>> +
>>>       if (vdev->nvectors && !msix_init(&proxy->pci_dev, vdev->nvectors, 1,
>>> 0)) {
>>>           pci_register_bar(&proxy->pci_dev, 1,
>>>                            msix_bar_size(&proxy->pci_dev),
>>> diff --git a/hw/virtio.c b/hw/virtio.c
>>> index 7c020a3..1f7924e 100644
>>> --- a/hw/virtio.c
>>> +++ b/hw/virtio.c
>>> @@ -549,6 +549,16 @@ target_phys_addr_t virtio_queue_get_addr(VirtIODevice
>>> *vdev, int n)
>>>       return vdev->vq[n].pa;
>>>   }
>>>
>>> +target_phys_addr_t virtio_mem_get_addr(VirtIODevice *vdev, int n)
>>> +{
>>> +    return vdev->vmem[n].guest_phys_addr;
>>> +}
>>> +
>>> +int virtio_mem_get_size(VirtIODevice *vdev, int n)
>>> +{
>>> +    return vdev->vmem[n].size;
>>> +}
>>> +
>>>   int virtio_queue_get_num(VirtIODevice *vdev, int n)
>>>   {
>>>       return vdev->vq[n].vring.num;
>>> @@ -592,6 +602,33 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int
>>> queue_size,
>>>       return &vdev->vq[i];
>>>   }
>>>
>>> +VirtMem *virtio_add_memory(VirtIODevice *vdev, int mem_size,
>>> +                            void (*handle_access)(VirtIODevice *,
>>> +                                target_phys_addr_t, uint32_t))
>>> +{
>>> +    int i;
>>> +
>>> +    /* trace what virtio_fb_setmem does */
>>> +
>>> +    for (i = 0; i < VIRTIO_PCI_MEM_MAX; i++) {
>>> +        if (vdev->vmem[i].size == 0) {
>>> +            /* XXX: check that memory is a power of 2 */
>>> +            vdev->vmem[i].size = mem_size * 1024 * 1024; /* presume size
>>> is in MBs*/
>>> +            vdev->vmem[i].ram_addr = qemu_ram_alloc(vdev->vmem[i].size);
>>> +            vdev->vmem[i].host_ptr =
>>> qemu_get_ram_ptr(vdev->vmem[i].ram_addr);
>>> +            vdev->vmem[i].on_map = handle_access;
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    return &vdev->vmem[i];
>>> +}
>>> +
>>> +void * virtio_get_memory_ptr(VirtMem * vmem)
>>> +{
>>> +    return vmem->host_ptr;
>>> +}
>>> +
>>>   void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
>>>   {
>>>       /* Always notify when queue is empty (when feature acknowledge) */
>>> @@ -714,6 +751,7 @@ VirtIODevice *virtio_common_init(const char *name,
>>> uint16_t device_id,
>>>       vdev->queue_sel = 0;
>>>       vdev->config_vector = VIRTIO_NO_VECTOR;
>>>       vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
>>> +    vdev->vmem = qemu_mallocz(sizeof(VirtMem) * VIRTIO_PCI_MEM_MAX);
>>>       for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++)
>>>           vdev->vq[i].vector = VIRTIO_NO_VECTOR;
>>>
>>> diff --git a/hw/virtio.h b/hw/virtio.h
>>> index 3baa2a3..02beb27 100644
>>> --- a/hw/virtio.h
>>> +++ b/hw/virtio.h
>>> @@ -82,6 +82,15 @@ typedef struct VirtQueueElement
>>>       struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
>>>   } VirtQueueElement;
>>>
>>> +typedef struct VirtMem
>>> +{
>>> +    target_phys_addr_t guest_phys_addr;
>>> +    ram_addr_t ram_addr;
>>> +    int size;
>>> +    uint8_t * host_ptr;
>>> +    void (*on_map)(VirtIODevice *vdev, target_phys_addr_t, uint32_t);
>>> +} VirtMem;
>>> +
>>>   typedef struct {
>>>       void (*notify)(void * opaque, uint16_t vector);
>>>       void (*save_config)(void * opaque, QEMUFile *f);
>>> @@ -93,6 +102,8 @@ typedef struct {
>>>
>>>   #define VIRTIO_PCI_QUEUE_MAX 64
>>>
>>> +#define VIRTIO_PCI_MEM_MAX 16
>>> +
>>>   #define VIRTIO_NO_VECTOR 0xffff
>>>
>>>   struct VirtIODevice
>>> @@ -101,6 +112,7 @@ struct VirtIODevice
>>>       uint8_t status;
>>>       uint8_t isr;
>>>       uint16_t queue_sel;
>>> +    uint16_t mem_sel;
>>>       uint32_t guest_features;
>>>       size_t config_len;
>>>       void *config;
>>> @@ -113,6 +125,7 @@ struct VirtIODevice
>>>       void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
>>>       void (*reset)(VirtIODevice *vdev);
>>>       VirtQueue *vq;
>>> +    VirtMem * vmem;
>>>       const VirtIOBindings *binding;
>>>       void *binding_opaque;
>>>       uint16_t device_id;
>>> @@ -122,6 +135,12 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int
>>> queue_size,
>>>                               void (*handle_output)(VirtIODevice *,
>>>                                                     VirtQueue *));
>>>
>>> +void * virtio_get_memory_ptr(VirtMem * vmem);
>>> +VirtMem *virtio_add_memory(VirtIODevice *vdev, int mem_size,
>>> +                            void (*handle_access)(VirtIODevice *,
>>> +                                                  target_phys_addr_t,
>>> +                                                  uint32_t));
>>> +
>>>   void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
>>>                       unsigned int len);
>>>   void virtqueue_flush(VirtQueue *vq, unsigned int count);
>>> @@ -169,6 +188,9 @@ void virtio_update_irq(VirtIODevice *vdev);
>>>   void virtio_bind_device(VirtIODevice *vdev, const VirtIOBindings
>>> *binding,
>>>                           void *opaque);
>>>
>>> +target_phys_addr_t virtio_mem_get_addr(VirtIODevice *vdev, int n);
>>> +int virtio_mem_get_size(VirtIODevice *vdev, int n);
>>> +
>>>   /* Base devices.  */
>>>   VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf);
>>>   VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf);
>>>
>>>
>>>
>>>        
>>
>>
Anthony Liguori - Feb. 24, 2010, 3:41 p.m.
On 02/24/2010 09:13 AM, Anthony Liguori wrote:
>> Agreed, you had suggested uio_pci for my PCI driver and I'd be happy
>> to work on that if we can agree on what the underlying device looks
>> like whether it should be VirtIO or just PCI.
>
> I feel pretty strongly that we shouldn't add shared memory to virtio.  
> I'm pretty sure Christian would too because it eliminates the 
> possibility of implementing a copy-based transport for virtio 
> (virtio-over-ethernet).

Actually, ignore my crazy justification based on virtio-over-ethernet :-)

Certain hypervisors (like Xen and PHYP) have very limited/non-existent 
support for shared memory.  In the case of Xen, there's a very small 
pool that can be used for DMA buffers.  In the case of PHYP, you have 
hardware accelerated copying for all I/O.

Adding shared memory to virtio means that virtio could not be supported 
(or at least, all virtio devices) on these hypervisors.

Regards,

Anthony Liguori

Patch

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index f3373ae..e71bf64 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -55,13 +55,13 @@ 
 
 /* MSI-X registers: only enabled if MSI-X is enabled. */
 /* A 16-bit vector for configuration changes. */
-#define VIRTIO_MSI_CONFIG_VECTOR        20
+#define VIRTIO_MSI_CONFIG_VECTOR        32
 /* A 16-bit vector for selected queue notifications. */
-#define VIRTIO_MSI_QUEUE_VECTOR         22
+#define VIRTIO_MSI_QUEUE_VECTOR         34
 
 /* Config space size */
-#define VIRTIO_PCI_CONFIG_NOMSI         20
-#define VIRTIO_PCI_CONFIG_MSI           24
+#define VIRTIO_PCI_CONFIG_NOMSI         32
+#define VIRTIO_PCI_CONFIG_MSI           36
 #define VIRTIO_PCI_REGION_SIZE(dev)     (msix_present(dev) ? \
                                          VIRTIO_PCI_CONFIG_MSI : \
                                          VIRTIO_PCI_CONFIG_NOMSI)
@@ -72,6 +72,12 @@ 
                                          VIRTIO_PCI_CONFIG_MSI : \
                                          VIRTIO_PCI_CONFIG_NOMSI)
 
+#define VIRTIO_PCI_MEM_SEL                  20
+
+#define VIRTIO_PCI_MEM_SIZE                 24
+
+#define VIRTIO_PCI_MEM_PFN                  28
+
 /* Virtio ABI version, if we increment this, we break the guest driver. */
 #define VIRTIO_PCI_ABI_VERSION          0
 
@@ -227,6 +233,11 @@  static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
             val = VIRTIO_NO_VECTOR;
         virtio_queue_set_vector(vdev, vdev->queue_sel, val);
         break;
+    case VIRTIO_PCI_MEM_SEL:
+        fprintf(stderr, "%s: VIRTIO_PCI_MEM_SEL %d\n", __func__, val);
+        if (val < VIRTIO_PCI_MEM_MAX)
+             vdev->mem_sel = val;
+        break;
     default:
         fprintf(stderr, "%s: unexpected address 0x%x value 0x%x\n",
                 __func__, addr, val);
@@ -271,6 +282,14 @@  static uint32_t virtio_ioport_read(VirtIOPCIProxy *proxy, uint32_t addr)
     case VIRTIO_MSI_QUEUE_VECTOR:
         ret = virtio_queue_vector(vdev, vdev->queue_sel);
         break;
+    case VIRTIO_PCI_MEM_SIZE:
+        ret = virtio_mem_get_size(vdev, vdev->mem_sel);
+        break;
+    case VIRTIO_PCI_MEM_PFN:
+        ret = virtio_mem_get_addr(vdev, vdev->mem_sel)
+              >> VIRTIO_PCI_QUEUE_ADDR_SHIFT;
+        break;
+
     default:
         break;
     }
@@ -370,6 +389,30 @@  static void virtio_map(PCIDevice *pci_dev, int region_num,
         vdev->get_config(vdev, vdev->config);
 }
 
+
+static void virtio_setup_mem(PCIDevice *d, int region_num,
+                       pcibus_t addr, pcibus_t size, int type)
+{
+    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, d);
+    VirtIODevice *vdev;
+
+    printf("XXX setmem to %#x - %#x\n", (uint32_t)addr, (uint32_t)(addr + size));
+    vdev = proxy->vdev;
+
+    cpu_register_physical_memory(addr, size, IO_MEM_UNASSIGNED);
+    cpu_register_physical_memory(addr, size, vdev->vmem[region_num-2].ram_addr | IO_MEM_RAM);
+
+    /* store the address of the region and get a pointer for it
+     * that can access the mem region */
+    vdev->vmem[region_num-2].guest_phys_addr = (target_phys_addr_t)addr;
+
+    if (vdev->vmem[region_num-2].on_map) {
+        vdev->vmem[region_num-2].on_map(vdev, addr, size);
+    }
+    return;
+
+}
+
 static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
                                 uint32_t val, int len)
 {
@@ -406,6 +449,7 @@  static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
 {
     uint8_t *config;
     uint32_t size;
+    int i;
 
     proxy->vdev = vdev;
 
@@ -426,6 +470,15 @@  static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
 
     config[0x3d] = 1;
 
+    // only BARs that were added with virtio_add_memory will be allocated here
+    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+        if (vdev->vmem[i].size > 0) {
+            pci_register_bar(&proxy->pci_dev, 2 + i, vdev->vmem[i].size,
+                PCI_BASE_ADDRESS_SPACE_MEMORY, virtio_setup_mem);
+        }
+    }
+
+
     if (vdev->nvectors && !msix_init(&proxy->pci_dev, vdev->nvectors, 1, 0)) {
         pci_register_bar(&proxy->pci_dev, 1,
                          msix_bar_size(&proxy->pci_dev),
diff --git a/hw/virtio.c b/hw/virtio.c
index 7c020a3..1f7924e 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -549,6 +549,16 @@  target_phys_addr_t virtio_queue_get_addr(VirtIODevice *vdev, int n)
     return vdev->vq[n].pa;
 }
 
+target_phys_addr_t virtio_mem_get_addr(VirtIODevice *vdev, int n)
+{
+    return vdev->vmem[n].guest_phys_addr;
+}
+
+int virtio_mem_get_size(VirtIODevice *vdev, int n)
+{
+    return vdev->vmem[n].size;
+}
+
 int virtio_queue_get_num(VirtIODevice *vdev, int n)
 {
     return vdev->vq[n].vring.num;
@@ -592,6 +602,33 @@  VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
     return &vdev->vq[i];
 }
 
+VirtMem *virtio_add_memory(VirtIODevice *vdev, int mem_size,
+                            void (*handle_access)(VirtIODevice *,
+                                target_phys_addr_t, uint32_t))
+{
+    int i;
+
+    /* trace what virtio_fb_setmem does */
+
+    for (i = 0; i < VIRTIO_PCI_MEM_MAX; i++) {
+        if (vdev->vmem[i].size == 0) {
+            /* XXX: check that memory is a power of 2 */
+            vdev->vmem[i].size = mem_size * 1024 * 1024; /* presume size is in MBs*/
+            vdev->vmem[i].ram_addr = qemu_ram_alloc(vdev->vmem[i].size);
+            vdev->vmem[i].host_ptr = qemu_get_ram_ptr(vdev->vmem[i].ram_addr);
+            vdev->vmem[i].on_map = handle_access;
+            break;
+        }
+    }
+
+    return &vdev->vmem[i];
+}
+
+void * virtio_get_memory_ptr(VirtMem * vmem)
+{
+    return vmem->host_ptr;
+}
+
 void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
     /* Always notify when queue is empty (when feature acknowledge) */
@@ -714,6 +751,7 @@  VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
     vdev->queue_sel = 0;
     vdev->config_vector = VIRTIO_NO_VECTOR;
     vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
+    vdev->vmem = qemu_mallocz(sizeof(VirtMem) * VIRTIO_PCI_MEM_MAX);
     for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++)
         vdev->vq[i].vector = VIRTIO_NO_VECTOR;
 
diff --git a/hw/virtio.h b/hw/virtio.h
index 3baa2a3..02beb27 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -82,6 +82,15 @@  typedef struct VirtQueueElement
     struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
 } VirtQueueElement;
 
+typedef struct VirtMem
+{
+    target_phys_addr_t guest_phys_addr;
+    ram_addr_t ram_addr;
+    int size;
+    uint8_t * host_ptr;
+    void (*on_map)(VirtIODevice *vdev, target_phys_addr_t, uint32_t);
+} VirtMem;
+
 typedef struct {
     void (*notify)(void * opaque, uint16_t vector);
     void (*save_config)(void * opaque, QEMUFile *f);
@@ -93,6 +102,8 @@  typedef struct {
 
 #define VIRTIO_PCI_QUEUE_MAX 64
 
+#define VIRTIO_PCI_MEM_MAX 16
+
 #define VIRTIO_NO_VECTOR 0xffff
 
 struct VirtIODevice
@@ -101,6 +112,7 @@  struct VirtIODevice
     uint8_t status;
     uint8_t isr;
     uint16_t queue_sel;
+    uint16_t mem_sel;
     uint32_t guest_features;
     size_t config_len;
     void *config;
@@ -113,6 +125,7 @@  struct VirtIODevice
     void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
     void (*reset)(VirtIODevice *vdev);
     VirtQueue *vq;
+    VirtMem * vmem;
     const VirtIOBindings *binding;
     void *binding_opaque;
     uint16_t device_id;
@@ -122,6 +135,12 @@  VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
                             void (*handle_output)(VirtIODevice *,
                                                   VirtQueue *));
 
+void * virtio_get_memory_ptr(VirtMem * vmem);
+VirtMem *virtio_add_memory(VirtIODevice *vdev, int mem_size,
+                            void (*handle_access)(VirtIODevice *,
+                                                  target_phys_addr_t,
+                                                  uint32_t));
+
 void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
                     unsigned int len);
 void virtqueue_flush(VirtQueue *vq, unsigned int count);
@@ -169,6 +188,9 @@  void virtio_update_irq(VirtIODevice *vdev);
 void virtio_bind_device(VirtIODevice *vdev, const VirtIOBindings *binding,
                         void *opaque);
 
+target_phys_addr_t virtio_mem_get_addr(VirtIODevice *vdev, int n);
+int virtio_mem_get_size(VirtIODevice *vdev, int n);
+
 /* Base devices.  */
 VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf);
 VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf);