Patchwork [RFC] virtio: Map virtqueue rings instead of referencing guest phys addrs

login
register
mail settings
Submitter Alex Williamson
Date Sept. 17, 2010, 5:30 p.m.
Message ID <20100917172902.11978.80677.stgit@s20.home>
Download mbox | patch
Permalink /patch/65096/
State New
Headers show

Comments

Alex Williamson - Sept. 17, 2010, 5:30 p.m.
Nearly any operation on virtqueues require multiple reads/writes
to virtqueue ring descriptors using guest physical addresses
(ld*/st*_phys).  These are expensive and result in
phys_page_find_alloc() and qemu_get_ram_ptr() showing up at the top
of profiles run under virtio net/block workloads.

We can avoid much of this by mapping the ring descriptors using
cpu_physical_memory_map() and referencing virtual addresses.  This
does a good job at dropping these high hitters down in the profile,
and generally shows a minor increase in block and net throughput
and latency.

I'm posting this as an RFC because this isn't nearly the slam dunk
performance improvement that the virtio-net TX bottom half handler
had.  On average, I see just shy of a 7% improvement in overall
network performance.  I'm having a hard time pinning down any
numbers for block since the gains typically seems quite a bit
smaller than my standard deviation.  I'm getting tired of testing
it, so I'll toss it out and see if anyone thinks this is a good
idea.  I have tested this with local migrations and haven't seen
any issues.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 Makefile.objs   |    3 -
 Makefile.target |    5 +
 hw/virtio.c     |  230 +++++++++++++++++++++++++++++++++++++++----------------
 3 files changed, 164 insertions(+), 74 deletions(-)
Anthony Liguori - Sept. 17, 2010, 6:04 p.m.
On 09/17/2010 12:30 PM, Alex Williamson wrote:
> Nearly any operation on virtqueues require multiple reads/writes
> to virtqueue ring descriptors using guest physical addresses
> (ld*/st*_phys).  These are expensive and result in
> phys_page_find_alloc() and qemu_get_ram_ptr() showing up at the top
> of profiles run under virtio net/block workloads.
>
> We can avoid much of this by mapping the ring descriptors using
> cpu_physical_memory_map() and referencing virtual addresses.  This
> does a good job at dropping these high hitters down in the profile,
> and generally shows a minor increase in block and net throughput
> and latency.
>
> I'm posting this as an RFC because this isn't nearly the slam dunk
> performance improvement that the virtio-net TX bottom half handler
> had.  On average, I see just shy of a 7% improvement in overall
> network performance.  I'm having a hard time pinning down any
> numbers for block since the gains typically seems quite a bit
> smaller than my standard deviation.  I'm getting tired of testing
> it, so I'll toss it out and see if anyone thinks this is a good
> idea.  I have tested this with local migrations and haven't seen
> any issues.
>
> Signed-off-by: Alex Williamson<alex.williamson@redhat.com>
> ---
>
>   Makefile.objs   |    3 -
>   Makefile.target |    5 +
>   hw/virtio.c     |  230 +++++++++++++++++++++++++++++++++++++++----------------
>   3 files changed, 164 insertions(+), 74 deletions(-)
>
> diff --git a/Makefile.objs b/Makefile.objs
> index 4a1eaa1..f362b8f 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -145,7 +145,6 @@ user-obj-y += cutils.o cache-utils.o
>
>   hw-obj-y =
>   hw-obj-y += vl.o loader.o
> -hw-obj-y += virtio.o virtio-console.o
>   hw-obj-y += fw_cfg.o pci.o pci_host.o pcie_host.o
>   hw-obj-y += watchdog.o
>   hw-obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
> @@ -244,8 +243,6 @@ sound-obj-$(CONFIG_CS4231A) += cs4231a.o
>   adlib.o fmopl.o: QEMU_CFLAGS += -DBUILD_Y8950=0
>   hw-obj-$(CONFIG_SOUND) += $(sound-obj-y)
>
> -hw-obj-$(CONFIG_VIRTFS) += virtio-9p-debug.o virtio-9p-local.o
> -
>   ######################################################################
>   # libdis
>   # NOTE: the disassembler code is only needed for debugging
> diff --git a/Makefile.target b/Makefile.target
> index 18826bb..4e0cf8d 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -165,11 +165,12 @@ ifdef CONFIG_SOFTMMU
>   obj-y = arch_init.o cpus.o monitor.o machine.o gdbstub.o balloon.o
>   # virtio has to be here due to weird dependency between PCI and virtio-net.
>   # need to fix this properly
> -obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-serial-bus.o
> +obj-y += virtio.o virtio-console.o virtio-blk.o virtio-balloon.o
> +obj-y += virtio-net.o virtio-serial-bus.o
>   obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
>   obj-y += vhost_net.o
>   obj-$(CONFIG_VHOST_NET) += vhost.o
> -obj-$(CONFIG_VIRTFS) += virtio-9p.o
> +obj-$(CONFIG_VIRTFS) += virtio-9p.o virtio-9p-debug.o virtio-9p-local.o
>   obj-y += rwhandler.o
>   obj-$(CONFIG_KVM) += kvm.o kvm-all.o
>   obj-$(CONFIG_NO_KVM) += kvm-stub.o
> diff --git a/hw/virtio.c b/hw/virtio.c
> index 85312b3..0dcf375 100644
> --- a/hw/virtio.c
> +++ b/hw/virtio.c
> @@ -15,6 +15,8 @@
>
>   #include "virtio.h"
>   #include "sysemu.h"
> +#include "cpu.h"
> +#include "exec-all.h"
>
>   /* The alignment to use between consumer and producer parts of vring.
>    * x86 pagesize again. */
> @@ -61,8 +63,11 @@ typedef struct VRing
>   {
>       unsigned int num;
>       target_phys_addr_t desc;
> +    void *desc_va;
>       target_phys_addr_t avail;
> +    void *avail_va;
>       target_phys_addr_t used;
> +    void *used_va;
>   } VRing;
>
>   struct VirtQueue
> @@ -78,107 +83,145 @@ struct VirtQueue
>       EventNotifier host_notifier;
>   };
>
> +static void *virtqueue_map(target_phys_addr_t addr,
> +                           target_phys_addr_t len, int write)
> +{
> +    void *map;
> +    target_phys_addr_t mapped = len;
> +
> +    map = cpu_physical_memory_map(addr,&mapped, write);
> +    if (!map) {
> +        return NULL;
> +    }
> +    if (mapped != len) {
> +        cpu_physical_memory_unmap(map, mapped, write, 0);
> +        return NULL;
> +    }
> +    return map;
> +}
>    

You can't hold a reference from memory_map() once you fall back to the 
main loop.

Regards,

Anthony Liguori

> +
>   /* virt queue functions */
>   static void virtqueue_init(VirtQueue *vq)
>   {
> -    target_phys_addr_t pa = vq->pa;
> +    target_phys_addr_t size, pa = vq->pa;
>
>       vq->vring.desc = pa;
>       vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
>       vq->vring.used = vring_align(vq->vring.avail +
>                                    offsetof(VRingAvail, ring[vq->vring.num]),
>                                    VIRTIO_PCI_VRING_ALIGN);
> +
> +    size = vq->vring.num * sizeof(VRingDesc);
> +    if (!(vq->vring.desc_va = virtqueue_map(vq->vring.desc, size, 0))) {
> +        fprintf(stderr, "Failed to map vring.desc\n");
> +        exit(1); /* Ugh, wishing for -ENOMEM */
> +    }
> +    size = offsetof(VRingAvail, ring[vq->vring.num]);
> +    if (!(vq->vring.avail_va = virtqueue_map(vq->vring.avail, size, 0))) {
> +        fprintf(stderr, "Failed to map vring.avail\n");
> +        exit(1); /* Ugh, wishing for -ENOMEM */
> +    }
> +    size = offsetof(VRingUsed, ring[vq->vring.num]);
> +    if (!(vq->vring.used_va = virtqueue_map(vq->vring.used, size, 1))) {
> +        fprintf(stderr, "Failed to map vring.used\n");
> +        exit(1); /* Ugh, wishing for -ENOMEM */
> +    }
>   }
>
> -static inline uint64_t vring_desc_addr(target_phys_addr_t desc_pa, int i)
> +static inline void sync_dirty_bytes(void *ptr, int bytes)
>   {
> -    target_phys_addr_t pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
> -    return ldq_phys(pa);
> +    if (cpu_physical_memory_get_dirty_tracking()) {
> +        ram_addr_t addr = qemu_ram_addr_from_host(ptr);
> +
> +        if (!cpu_physical_memory_is_dirty(addr)) {
> +            tb_invalidate_phys_page_range(addr, addr + bytes, 0);
> +            cpu_physical_memory_set_dirty_flags(addr,
> +                                                (0xff&  ~CODE_DIRTY_FLAG));
> +        }
> +    }
>   }
>
> -static inline uint32_t vring_desc_len(target_phys_addr_t desc_pa, int i)
> +static inline uint64_t vring_desc_addr(void *desc_va, int i)
>   {
> -    target_phys_addr_t pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
> -    return ldl_phys(pa);
> +    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
> +    return ldq_p(ptr);
>   }
>
> -static inline uint16_t vring_desc_flags(target_phys_addr_t desc_pa, int i)
> +static inline uint32_t vring_desc_len(void *desc_va, int i)
>   {
> -    target_phys_addr_t pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
> -    return lduw_phys(pa);
> +    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
> +    return ldl_p(ptr);
>   }
>
> -static inline uint16_t vring_desc_next(target_phys_addr_t desc_pa, int i)
> +static inline uint16_t vring_desc_flags(void *desc_va, int i)
>   {
> -    target_phys_addr_t pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
> -    return lduw_phys(pa);
> +    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
> +    return lduw_p(ptr);
> +}
> +
> +static inline uint16_t vring_desc_next(void *desc_va, int i)
> +{
> +    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
> +    return lduw_p(ptr);
>   }
>
>   static inline uint16_t vring_avail_flags(VirtQueue *vq)
>   {
> -    target_phys_addr_t pa;
> -    pa = vq->vring.avail + offsetof(VRingAvail, flags);
> -    return lduw_phys(pa);
> +    void *ptr = vq->vring.avail_va + offsetof(VRingAvail, flags);
> +    return lduw_p(ptr);
>   }
>
>   static inline uint16_t vring_avail_idx(VirtQueue *vq)
>   {
> -    target_phys_addr_t pa;
> -    pa = vq->vring.avail + offsetof(VRingAvail, idx);
> -    return lduw_phys(pa);
> +    void *ptr = vq->vring.avail_va + offsetof(VRingAvail, idx);
> +    return lduw_p(ptr);
>   }
>
>   static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
>   {
> -    target_phys_addr_t pa;
> -    pa = vq->vring.avail + offsetof(VRingAvail, ring[i]);
> -    return lduw_phys(pa);
> +    void *ptr = vq->vring.avail_va + offsetof(VRingAvail, ring[i]);
> +    return lduw_p(ptr);
>   }
>
>   static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
>   {
> -    target_phys_addr_t pa;
> -    pa = vq->vring.used + offsetof(VRingUsed, ring[i].id);
> -    stl_phys(pa, val);
> +    void *ptr = vq->vring.used_va + offsetof(VRingUsed, ring[i].id);
> +    stl_p(ptr, val);
> +    sync_dirty_bytes(ptr, 4);
>   }
>
>   static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
>   {
> -    target_phys_addr_t pa;
> -    pa = vq->vring.used + offsetof(VRingUsed, ring[i].len);
> -    stl_phys(pa, val);
> +    void *ptr = vq->vring.used_va + offsetof(VRingUsed, ring[i].len);
> +    stl_p(ptr, val);
> +    sync_dirty_bytes(ptr, 4);
>   }
>
>   static uint16_t vring_used_idx(VirtQueue *vq)
>   {
> -    target_phys_addr_t pa;
> -    pa = vq->vring.used + offsetof(VRingUsed, idx);
> -    return lduw_phys(pa);
> +    void *ptr = vq->vring.used_va + offsetof(VRingUsed, idx);
> +    return lduw_p(ptr);
>   }
>
>   static inline void vring_used_idx_increment(VirtQueue *vq, uint16_t val)
>   {
> -    target_phys_addr_t pa;
> -    pa = vq->vring.used + offsetof(VRingUsed, idx);
> -    stw_phys(pa, vring_used_idx(vq) + val);
> +    void *ptr = vq->vring.used_va + offsetof(VRingUsed, idx);
> +    stw_p(ptr, vring_used_idx(vq) + val);
> +    sync_dirty_bytes(ptr, 2);
>   }
>
>   static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
>   {
> -    target_phys_addr_t pa;
> -    pa = vq->vring.used + offsetof(VRingUsed, flags);
> -    stw_phys(pa, lduw_phys(pa) | mask);
> +    void *ptr = vq->vring.used_va + offsetof(VRingUsed, flags);
> +    stw_p(ptr, lduw_p(ptr) | mask);
> +    sync_dirty_bytes(ptr, 2);
>   }
>
>   static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
>   {
> -    target_phys_addr_t pa;
> -    pa = vq->vring.used + offsetof(VRingUsed, flags);
> -    stw_phys(pa, lduw_phys(pa)&  ~mask);
> +    void *ptr = vq->vring.used_va + offsetof(VRingUsed, flags);
> +    stw_p(ptr, lduw_p(ptr)&  ~mask);
> +    sync_dirty_bytes(ptr, 2);
>   }
>
>   void virtio_queue_set_notification(VirtQueue *vq, int enable)
> @@ -274,17 +317,17 @@ static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx)
>       return head;
>   }
>
> -static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa,
> +static unsigned virtqueue_next_desc(void *desc_va,
>                                       unsigned int i, unsigned int max)
>   {
>       unsigned int next;
>
>       /* If this descriptor says it doesn't chain, we're done. */
> -    if (!(vring_desc_flags(desc_pa, i)&  VRING_DESC_F_NEXT))
> +    if (!(vring_desc_flags(desc_va, i)&  VRING_DESC_F_NEXT))
>           return max;
>
>       /* Check they're not leading us off end of descriptors. */
> -    next = vring_desc_next(desc_pa, i);
> +    next = vring_desc_next(desc_va, i);
>       /* Make sure compiler knows to grab that: we don't want it changing! */
>       wmb();
>
> @@ -306,16 +349,18 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
>       total_bufs = in_total = out_total = 0;
>       while (virtqueue_num_heads(vq, idx)) {
>           unsigned int max, num_bufs, indirect = 0;
> -        target_phys_addr_t desc_pa;
> +        target_phys_addr_t len = 0;
> +        void *desc_va;
>           int i;
>
>           max = vq->vring.num;
>           num_bufs = total_bufs;
>           i = virtqueue_get_head(vq, idx++);
> -        desc_pa = vq->vring.desc;
> +        desc_va = vq->vring.desc_va;
>
> -        if (vring_desc_flags(desc_pa, i)&  VRING_DESC_F_INDIRECT) {
> -            if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) {
> +        if (vring_desc_flags(desc_va, i)&  VRING_DESC_F_INDIRECT) {
> +            len = vring_desc_len(desc_va, i);
> +            if (len % sizeof(VRingDesc)) {
>                   fprintf(stderr, "Invalid size for indirect buffer table\n");
>                   exit(1);
>               }
> @@ -328,9 +373,13 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
>
>               /* loop over the indirect descriptor table */
>               indirect = 1;
> -            max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc);
> +            max = len / sizeof(VRingDesc);
>               num_bufs = i = 0;
> -            desc_pa = vring_desc_addr(desc_pa, i);
> +            desc_va = virtqueue_map(vring_desc_addr(desc_va, i), len, 0);
> +            if (!desc_va) {
> +                fprintf(stderr, "Failed to map indirect virtqueue\n");
> +                exit(1);
> +            }
>           }
>
>           do {
> @@ -340,21 +389,31 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
>                   exit(1);
>               }
>
> -            if (vring_desc_flags(desc_pa, i)&  VRING_DESC_F_WRITE) {
> +            if (vring_desc_flags(desc_va, i)&  VRING_DESC_F_WRITE) {
>                   if (in_bytes>  0&&
> -                    (in_total += vring_desc_len(desc_pa, i))>= in_bytes)
> +                    (in_total += vring_desc_len(desc_va, i))>= in_bytes) {
> +                    if (indirect) {
> +                        cpu_physical_memory_unmap(desc_va, len, 0, 0);
> +                    }
>                       return 1;
> +                }
>               } else {
>                   if (out_bytes>  0&&
> -                    (out_total += vring_desc_len(desc_pa, i))>= out_bytes)
> +                    (out_total += vring_desc_len(desc_va, i))>= out_bytes) {
> +                    if (indirect) {
> +                        cpu_physical_memory_unmap(desc_va, len, 0, 0);
> +                    }
>                       return 1;
> +                }
>               }
> -        } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max);
> +        } while ((i = virtqueue_next_desc(desc_va, i, max)) != max);
>
> -        if (!indirect)
> +        if (!indirect) {
>               total_bufs = num_bufs;
> -        else
> +        } else {
> +            cpu_physical_memory_unmap(desc_va, len, 0, 0);
>               total_bufs++;
> +        }
>       }
>
>       return 0;
> @@ -379,7 +438,9 @@ void virtqueue_map_sg(struct iovec *sg, target_phys_addr_t *addr,
>   int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
>   {
>       unsigned int i, head, max;
> -    target_phys_addr_t desc_pa = vq->vring.desc;
> +    void *desc_va = vq->vring.desc_va;
> +    target_phys_addr_t len = 0;
> +    int indirect = 0;
>
>       if (!virtqueue_num_heads(vq, vq->last_avail_idx))
>           return 0;
> @@ -391,15 +452,21 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
>
>       i = head = virtqueue_get_head(vq, vq->last_avail_idx++);
>
> -    if (vring_desc_flags(desc_pa, i)&  VRING_DESC_F_INDIRECT) {
> -        if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) {
> +    if (vring_desc_flags(desc_va, i)&  VRING_DESC_F_INDIRECT) {
> +        len = vring_desc_len(desc_va, i);
> +        if (len % sizeof(VRingDesc)) {
>               fprintf(stderr, "Invalid size for indirect buffer table\n");
>               exit(1);
>           }
>
>           /* loop over the indirect descriptor table */
> -        max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc);
> -        desc_pa = vring_desc_addr(desc_pa, i);
> +        max = len / sizeof(VRingDesc);
> +        desc_va = virtqueue_map(vring_desc_addr(desc_va, i), len, 0);
> +        if (!desc_va) {
> +            fprintf(stderr, "Failed to map indirect virtqueue\n");
> +            exit(1);
> +        }
> +        indirect = 1;
>           i = 0;
>       }
>
> @@ -407,22 +474,26 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
>       do {
>           struct iovec *sg;
>
> -        if (vring_desc_flags(desc_pa, i)&  VRING_DESC_F_WRITE) {
> -            elem->in_addr[elem->in_num] = vring_desc_addr(desc_pa, i);
> +        if (vring_desc_flags(desc_va, i)&  VRING_DESC_F_WRITE) {
> +            elem->in_addr[elem->in_num] = vring_desc_addr(desc_va, i);
>               sg =&elem->in_sg[elem->in_num++];
>           } else {
> -            elem->out_addr[elem->out_num] = vring_desc_addr(desc_pa, i);
> +            elem->out_addr[elem->out_num] = vring_desc_addr(desc_va, i);
>               sg =&elem->out_sg[elem->out_num++];
>           }
>
> -        sg->iov_len = vring_desc_len(desc_pa, i);
> +        sg->iov_len = vring_desc_len(desc_va, i);
>
>           /* If we've got too many, that implies a descriptor loop. */
>           if ((elem->in_num + elem->out_num)>  max) {
>               fprintf(stderr, "Looped descriptor");
>               exit(1);
>           }
> -    } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max);
> +    } while ((i = virtqueue_next_desc(desc_va, i, max)) != max);
> +
> +    if (indirect) {
> +        cpu_physical_memory_unmap(desc_va, len, 0, 0);
> +    }
>
>       /* Now map what we have collected */
>       virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1);
> @@ -467,6 +538,27 @@ void virtio_reset(void *opaque)
>           vdev->vq[i].vring.desc = 0;
>           vdev->vq[i].vring.avail = 0;
>           vdev->vq[i].vring.used = 0;
> +        if (vdev->vq[i].vring.desc_va) {
> +            cpu_physical_memory_unmap(vdev->vq[i].vring.desc_va,
> +                                      vdev->vq[i].vring.num *
> +                                      sizeof(VRingDesc), 0, 0);
> +            vdev->vq[i].vring.desc_va = NULL;
> +        }
> +        if (vdev->vq[i].vring.avail_va) {
> +            cpu_physical_memory_unmap(vdev->vq[i].vring.avail_va,
> +                                      offsetof(VRingAvail,
> +                                               ring[vdev->vq[i].vring.num]),
> +                                      0, 0);
> +            vdev->vq[i].vring.avail_va = NULL;
> +        }
> +        if (vdev->vq[i].vring.used_va) {
> +            cpu_physical_memory_unmap(vdev->vq[i].vring.used_va,
> +                                      offsetof(VRingUsed,
> +                                               ring[vdev->vq[i].vring.num]), 1,
> +                                      offsetof(VRingUsed,
> +                                               ring[vdev->vq[i].vring.num]));
> +            vdev->vq[i].vring.used_va = NULL;
> +        }
>           vdev->vq[i].last_avail_idx = 0;
>           vdev->vq[i].pa = 0;
>           vdev->vq[i].vector = VIRTIO_NO_VECTOR;
>
>
>

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 4a1eaa1..f362b8f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -145,7 +145,6 @@  user-obj-y += cutils.o cache-utils.o
 
 hw-obj-y =
 hw-obj-y += vl.o loader.o
-hw-obj-y += virtio.o virtio-console.o
 hw-obj-y += fw_cfg.o pci.o pci_host.o pcie_host.o
 hw-obj-y += watchdog.o
 hw-obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
@@ -244,8 +243,6 @@  sound-obj-$(CONFIG_CS4231A) += cs4231a.o
 adlib.o fmopl.o: QEMU_CFLAGS += -DBUILD_Y8950=0
 hw-obj-$(CONFIG_SOUND) += $(sound-obj-y)
 
-hw-obj-$(CONFIG_VIRTFS) += virtio-9p-debug.o virtio-9p-local.o
-
 ######################################################################
 # libdis
 # NOTE: the disassembler code is only needed for debugging
diff --git a/Makefile.target b/Makefile.target
index 18826bb..4e0cf8d 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -165,11 +165,12 @@  ifdef CONFIG_SOFTMMU
 obj-y = arch_init.o cpus.o monitor.o machine.o gdbstub.o balloon.o
 # virtio has to be here due to weird dependency between PCI and virtio-net.
 # need to fix this properly
-obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-serial-bus.o
+obj-y += virtio.o virtio-console.o virtio-blk.o virtio-balloon.o
+obj-y += virtio-net.o virtio-serial-bus.o
 obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
 obj-y += vhost_net.o
 obj-$(CONFIG_VHOST_NET) += vhost.o
-obj-$(CONFIG_VIRTFS) += virtio-9p.o
+obj-$(CONFIG_VIRTFS) += virtio-9p.o virtio-9p-debug.o virtio-9p-local.o
 obj-y += rwhandler.o
 obj-$(CONFIG_KVM) += kvm.o kvm-all.o
 obj-$(CONFIG_NO_KVM) += kvm-stub.o
diff --git a/hw/virtio.c b/hw/virtio.c
index 85312b3..0dcf375 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -15,6 +15,8 @@ 
 
 #include "virtio.h"
 #include "sysemu.h"
+#include "cpu.h"
+#include "exec-all.h"
 
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again. */
@@ -61,8 +63,11 @@  typedef struct VRing
 {
     unsigned int num;
     target_phys_addr_t desc;
+    void *desc_va;
     target_phys_addr_t avail;
+    void *avail_va;
     target_phys_addr_t used;
+    void *used_va;
 } VRing;
 
 struct VirtQueue
@@ -78,107 +83,145 @@  struct VirtQueue
     EventNotifier host_notifier;
 };
 
+static void *virtqueue_map(target_phys_addr_t addr,
+                           target_phys_addr_t len, int write)
+{
+    void *map;
+    target_phys_addr_t mapped = len;
+
+    map = cpu_physical_memory_map(addr, &mapped, write);
+    if (!map) {
+        return NULL;
+    }
+    if (mapped != len) {
+        cpu_physical_memory_unmap(map, mapped, write, 0);
+        return NULL;
+    }
+    return map;
+}
+
 /* virt queue functions */
 static void virtqueue_init(VirtQueue *vq)
 {
-    target_phys_addr_t pa = vq->pa;
+    target_phys_addr_t size, pa = vq->pa;
 
     vq->vring.desc = pa;
     vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
     vq->vring.used = vring_align(vq->vring.avail +
                                  offsetof(VRingAvail, ring[vq->vring.num]),
                                  VIRTIO_PCI_VRING_ALIGN);
+
+    size = vq->vring.num * sizeof(VRingDesc);
+    if (!(vq->vring.desc_va = virtqueue_map(vq->vring.desc, size, 0))) {
+        fprintf(stderr, "Failed to map vring.desc\n");
+        exit(1); /* Ugh, wishing for -ENOMEM */
+    }
+    size = offsetof(VRingAvail, ring[vq->vring.num]);
+    if (!(vq->vring.avail_va = virtqueue_map(vq->vring.avail, size, 0))) {
+        fprintf(stderr, "Failed to map vring.avail\n");
+        exit(1); /* Ugh, wishing for -ENOMEM */
+    }
+    size = offsetof(VRingUsed, ring[vq->vring.num]);
+    if (!(vq->vring.used_va = virtqueue_map(vq->vring.used, size, 1))) {
+        fprintf(stderr, "Failed to map vring.used\n");
+        exit(1); /* Ugh, wishing for -ENOMEM */
+    }
 }
 
-static inline uint64_t vring_desc_addr(target_phys_addr_t desc_pa, int i)
+static inline void sync_dirty_bytes(void *ptr, int bytes)
 {
-    target_phys_addr_t pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
-    return ldq_phys(pa);
+    if (cpu_physical_memory_get_dirty_tracking()) {
+        ram_addr_t addr = qemu_ram_addr_from_host(ptr);
+
+        if (!cpu_physical_memory_is_dirty(addr)) {
+            tb_invalidate_phys_page_range(addr, addr + bytes, 0);
+            cpu_physical_memory_set_dirty_flags(addr,
+                                                (0xff & ~CODE_DIRTY_FLAG));
+        }
+    }
 }
 
-static inline uint32_t vring_desc_len(target_phys_addr_t desc_pa, int i)
+static inline uint64_t vring_desc_addr(void *desc_va, int i)
 {
-    target_phys_addr_t pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
-    return ldl_phys(pa);
+    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
+    return ldq_p(ptr);
 }
 
-static inline uint16_t vring_desc_flags(target_phys_addr_t desc_pa, int i)
+static inline uint32_t vring_desc_len(void *desc_va, int i)
 {
-    target_phys_addr_t pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
-    return lduw_phys(pa);
+    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
+    return ldl_p(ptr);
 }
 
-static inline uint16_t vring_desc_next(target_phys_addr_t desc_pa, int i)
+static inline uint16_t vring_desc_flags(void *desc_va, int i)
 {
-    target_phys_addr_t pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
-    return lduw_phys(pa);
+    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
+    return lduw_p(ptr);
+}
+
+static inline uint16_t vring_desc_next(void *desc_va, int i)
+{
+    void *ptr = desc_va + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
+    return lduw_p(ptr);
 }
 
 static inline uint16_t vring_avail_flags(VirtQueue *vq)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.avail + offsetof(VRingAvail, flags);
-    return lduw_phys(pa);
+    void *ptr = vq->vring.avail_va + offsetof(VRingAvail, flags);
+    return lduw_p(ptr);
 }
 
 static inline uint16_t vring_avail_idx(VirtQueue *vq)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.avail + offsetof(VRingAvail, idx);
-    return lduw_phys(pa);
+    void *ptr = vq->vring.avail_va + offsetof(VRingAvail, idx);
+    return lduw_p(ptr);
 }
 
 static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.avail + offsetof(VRingAvail, ring[i]);
-    return lduw_phys(pa);
+    void *ptr = vq->vring.avail_va + offsetof(VRingAvail, ring[i]);
+    return lduw_p(ptr);
 }
 
 static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, ring[i].id);
-    stl_phys(pa, val);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, ring[i].id);
+    stl_p(ptr, val);
+    sync_dirty_bytes(ptr, 4);
 }
 
 static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, ring[i].len);
-    stl_phys(pa, val);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, ring[i].len);
+    stl_p(ptr, val);
+    sync_dirty_bytes(ptr, 4);
 }
 
 static uint16_t vring_used_idx(VirtQueue *vq)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, idx);
-    return lduw_phys(pa);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, idx);
+    return lduw_p(ptr);
 }
 
 static inline void vring_used_idx_increment(VirtQueue *vq, uint16_t val)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, idx);
-    stw_phys(pa, vring_used_idx(vq) + val);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, idx);
+    stw_p(ptr, vring_used_idx(vq) + val);
+    sync_dirty_bytes(ptr, 2);
 }
 
 static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, flags);
-    stw_phys(pa, lduw_phys(pa) | mask);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, flags);
+    stw_p(ptr, lduw_p(ptr) | mask);
+    sync_dirty_bytes(ptr, 2);
 }
 
 static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
 {
-    target_phys_addr_t pa;
-    pa = vq->vring.used + offsetof(VRingUsed, flags);
-    stw_phys(pa, lduw_phys(pa) & ~mask);
+    void *ptr = vq->vring.used_va + offsetof(VRingUsed, flags);
+    stw_p(ptr, lduw_p(ptr) & ~mask);
+    sync_dirty_bytes(ptr, 2);
 }
 
 void virtio_queue_set_notification(VirtQueue *vq, int enable)
@@ -274,17 +317,17 @@  static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx)
     return head;
 }
 
-static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa,
+static unsigned virtqueue_next_desc(void *desc_va,
                                     unsigned int i, unsigned int max)
 {
     unsigned int next;
 
     /* If this descriptor says it doesn't chain, we're done. */
-    if (!(vring_desc_flags(desc_pa, i) & VRING_DESC_F_NEXT))
+    if (!(vring_desc_flags(desc_va, i) & VRING_DESC_F_NEXT))
         return max;
 
     /* Check they're not leading us off end of descriptors. */
-    next = vring_desc_next(desc_pa, i);
+    next = vring_desc_next(desc_va, i);
     /* Make sure compiler knows to grab that: we don't want it changing! */
     wmb();
 
@@ -306,16 +349,18 @@  int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
     total_bufs = in_total = out_total = 0;
     while (virtqueue_num_heads(vq, idx)) {
         unsigned int max, num_bufs, indirect = 0;
-        target_phys_addr_t desc_pa;
+        target_phys_addr_t len = 0;
+        void *desc_va;
         int i;
 
         max = vq->vring.num;
         num_bufs = total_bufs;
         i = virtqueue_get_head(vq, idx++);
-        desc_pa = vq->vring.desc;
+        desc_va = vq->vring.desc_va;
 
-        if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_INDIRECT) {
-            if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) {
+        if (vring_desc_flags(desc_va, i) & VRING_DESC_F_INDIRECT) {
+            len = vring_desc_len(desc_va, i);
+            if (len % sizeof(VRingDesc)) {
                 fprintf(stderr, "Invalid size for indirect buffer table\n");
                 exit(1);
             }
@@ -328,9 +373,13 @@  int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
 
             /* loop over the indirect descriptor table */
             indirect = 1;
-            max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc);
+            max = len / sizeof(VRingDesc);
             num_bufs = i = 0;
-            desc_pa = vring_desc_addr(desc_pa, i);
+            desc_va = virtqueue_map(vring_desc_addr(desc_va, i), len, 0);
+            if (!desc_va) {
+                fprintf(stderr, "Failed to map indirect virtqueue\n");
+                exit(1);
+            }
         }
 
         do {
@@ -340,21 +389,31 @@  int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
                 exit(1);
             }
 
-            if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) {
+            if (vring_desc_flags(desc_va, i) & VRING_DESC_F_WRITE) {
                 if (in_bytes > 0 &&
-                    (in_total += vring_desc_len(desc_pa, i)) >= in_bytes)
+                    (in_total += vring_desc_len(desc_va, i)) >= in_bytes) {
+                    if (indirect) {
+                        cpu_physical_memory_unmap(desc_va, len, 0, 0);
+                    }
                     return 1;
+                }
             } else {
                 if (out_bytes > 0 &&
-                    (out_total += vring_desc_len(desc_pa, i)) >= out_bytes)
+                    (out_total += vring_desc_len(desc_va, i)) >= out_bytes) {
+                    if (indirect) {
+                        cpu_physical_memory_unmap(desc_va, len, 0, 0);
+                    }
                     return 1;
+                }
             }
-        } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max);
+        } while ((i = virtqueue_next_desc(desc_va, i, max)) != max);
 
-        if (!indirect)
+        if (!indirect) {
             total_bufs = num_bufs;
-        else
+        } else {
+            cpu_physical_memory_unmap(desc_va, len, 0, 0);
             total_bufs++;
+        }
     }
 
     return 0;
@@ -379,7 +438,9 @@  void virtqueue_map_sg(struct iovec *sg, target_phys_addr_t *addr,
 int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
 {
     unsigned int i, head, max;
-    target_phys_addr_t desc_pa = vq->vring.desc;
+    void *desc_va = vq->vring.desc_va;
+    target_phys_addr_t len = 0;
+    int indirect = 0;
 
     if (!virtqueue_num_heads(vq, vq->last_avail_idx))
         return 0;
@@ -391,15 +452,21 @@  int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
 
     i = head = virtqueue_get_head(vq, vq->last_avail_idx++);
 
-    if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_INDIRECT) {
-        if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) {
+    if (vring_desc_flags(desc_va, i) & VRING_DESC_F_INDIRECT) {
+        len = vring_desc_len(desc_va, i);
+        if (len % sizeof(VRingDesc)) {
             fprintf(stderr, "Invalid size for indirect buffer table\n");
             exit(1);
         }
 
         /* loop over the indirect descriptor table */
-        max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc);
-        desc_pa = vring_desc_addr(desc_pa, i);
+        max = len / sizeof(VRingDesc);
+        desc_va = virtqueue_map(vring_desc_addr(desc_va, i), len, 0);
+        if (!desc_va) {
+            fprintf(stderr, "Failed to map indirect virtqueue\n");
+            exit(1);
+        }
+        indirect = 1;
         i = 0;
     }
 
@@ -407,22 +474,26 @@  int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
     do {
         struct iovec *sg;
 
-        if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) {
-            elem->in_addr[elem->in_num] = vring_desc_addr(desc_pa, i);
+        if (vring_desc_flags(desc_va, i) & VRING_DESC_F_WRITE) {
+            elem->in_addr[elem->in_num] = vring_desc_addr(desc_va, i);
             sg = &elem->in_sg[elem->in_num++];
         } else {
-            elem->out_addr[elem->out_num] = vring_desc_addr(desc_pa, i);
+            elem->out_addr[elem->out_num] = vring_desc_addr(desc_va, i);
             sg = &elem->out_sg[elem->out_num++];
         }
 
-        sg->iov_len = vring_desc_len(desc_pa, i);
+        sg->iov_len = vring_desc_len(desc_va, i);
 
         /* If we've got too many, that implies a descriptor loop. */
         if ((elem->in_num + elem->out_num) > max) {
             fprintf(stderr, "Looped descriptor");
             exit(1);
         }
-    } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max);
+    } while ((i = virtqueue_next_desc(desc_va, i, max)) != max);
+
+    if (indirect) {
+        cpu_physical_memory_unmap(desc_va, len, 0, 0);
+    }
 
     /* Now map what we have collected */
     virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1);
@@ -467,6 +538,27 @@  void virtio_reset(void *opaque)
         vdev->vq[i].vring.desc = 0;
         vdev->vq[i].vring.avail = 0;
         vdev->vq[i].vring.used = 0;
+        if (vdev->vq[i].vring.desc_va) {
+            cpu_physical_memory_unmap(vdev->vq[i].vring.desc_va,
+                                      vdev->vq[i].vring.num *
+                                      sizeof(VRingDesc), 0, 0);
+            vdev->vq[i].vring.desc_va = NULL;
+        }
+        if (vdev->vq[i].vring.avail_va) {
+            cpu_physical_memory_unmap(vdev->vq[i].vring.avail_va,
+                                      offsetof(VRingAvail,
+                                               ring[vdev->vq[i].vring.num]),
+                                      0, 0);
+            vdev->vq[i].vring.avail_va = NULL;
+        }
+        if (vdev->vq[i].vring.used_va) {
+            cpu_physical_memory_unmap(vdev->vq[i].vring.used_va,
+                                      offsetof(VRingUsed,
+                                               ring[vdev->vq[i].vring.num]), 1,
+                                      offsetof(VRingUsed,
+                                               ring[vdev->vq[i].vring.num]));
+            vdev->vq[i].vring.used_va = NULL;
+        }
         vdev->vq[i].last_avail_idx = 0;
         vdev->vq[i].pa = 0;
         vdev->vq[i].vector = VIRTIO_NO_VECTOR;