diff mbox

[PATCHv2,09/12] vhost: vhost net support

Message ID af686d9630f18829e4e2ba1333a198efea1126b6.1267122331.git.mst@redhat.com
State New
Headers show

Commit Message

Michael S. Tsirkin Feb. 25, 2010, 6:28 p.m. UTC
This adds vhost net device support in qemu. Will be tied to tap device
and virtio by following patches.  Raw backend is currently missing,
will be worked on/submitted separately.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 Makefile.target |    2 +
 configure       |   21 ++
 hw/vhost.c      |  631 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vhost.h      |   44 ++++
 hw/vhost_net.c  |  177 ++++++++++++++++
 hw/vhost_net.h  |   20 ++
 6 files changed, 895 insertions(+), 0 deletions(-)
 create mode 100644 hw/vhost.c
 create mode 100644 hw/vhost.h
 create mode 100644 hw/vhost_net.c
 create mode 100644 hw/vhost_net.h

Comments

Juan Quintela Feb. 25, 2010, 7:04 p.m. UTC | #1
"Michael S. Tsirkin" <mst@redhat.com> wrote:
> This adds vhost net device support in qemu. Will be tied to tap device
> and virtio by following patches.  Raw backend is currently missing,
> will be worked on/submitted separately.
>

+obj-y += vhost_net.o
+obj-$(CONFIG_VHOST_NET) += vhost.o

Why is vhost_net.o configured unconditionally?

> --- a/configure
> +++ b/configure
> @@ -1498,6 +1498,23 @@ EOF
>  fi
>
This misses vhost_net var definition at the start of the file and
--enable-vhost/--disable-vhost options.


>  ##########################################
> +# test for vhost net
> +
> +if test "$kvm" != "no"; then
> +	cat > $TMPC <<EOF
> +#include <linux/vhost.h>
> +int main(void) { return 0; }
> +EOF
> +	if compile_prog "$kvm_cflags" "" ; then
> +	vhost_net=yes
> +	else
> +	vhost_net=no
> +	fi

Indent please.

> +else
> +	vhost_net=no
> +fi
> +
> +##########################################
>  # pthread probe
>  PTHREADLIBS_LIST="-lpthread -lpthreadGC2"
>  
> @@ -1968,6 +1985,7 @@ echo "fdt support       $fdt"
>  echo "preadv support    $preadv"
>  echo "fdatasync         $fdatasync"
>  echo "uuid support      $uuid"
> +echo "vhost-net support $vhost_net"

Otherwise this could not be there.

>  if test $sdl_too_old = "yes"; then
>  echo "-> Your SDL version is too old - please upgrade to have SDL support"
> @@ -2492,6 +2510,9 @@ case "$target_arch2" in
>        if test "$kvm_para" = "yes"; then
>          echo "CONFIG_KVM_PARA=y" >> $config_target_mak
>        fi
> +      if test $vhost_net = "yes" ; then
> +        echo "CONFIG_VHOST_NET=y" >> $config_target_mak
> +      fi
>      fi
>  esac
>  echo "TARGET_PHYS_ADDR_BITS=$target_phys_bits" >> $config_target_mak

> +    for (;from < to; ++from) {
> +        vhost_log_chunk_t log;

.....

> +                ffsll(log) : ffs(log))) {

  If you define vhost_log_chunk_t, you can also define vhost_log_ffs() and
  you are done without this if.

Later, Juan.
Anthony Liguori Feb. 25, 2010, 7:44 p.m. UTC | #2
On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote:
> This adds vhost net device support in qemu. Will be tied to tap device
> and virtio by following patches.  Raw backend is currently missing,
> will be worked on/submitted separately.
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> ---
>   Makefile.target |    2 +
>   configure       |   21 ++
>   hw/vhost.c      |  631 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   hw/vhost.h      |   44 ++++
>   hw/vhost_net.c  |  177 ++++++++++++++++
>   hw/vhost_net.h  |   20 ++
>   6 files changed, 895 insertions(+), 0 deletions(-)
>   create mode 100644 hw/vhost.c
>   create mode 100644 hw/vhost.h
>   create mode 100644 hw/vhost_net.c
>   create mode 100644 hw/vhost_net.h
>
> diff --git a/Makefile.target b/Makefile.target
> index c1580e9..9b4fd84 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -174,6 +174,8 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o
>   # need to fix this properly
>   obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-pci.o virtio-serial-bus.o
>   obj-y += notifier.o
> +obj-y += vhost_net.o
> +obj-$(CONFIG_VHOST_NET) += vhost.o
>   obj-y += rwhandler.o
>   obj-$(CONFIG_KVM) += kvm.o kvm-all.o
>   obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
> diff --git a/configure b/configure
> index 8eb5f5b..5eccc7c 100755
> --- a/configure
> +++ b/configure
> @@ -1498,6 +1498,23 @@ EOF
>   fi
>
>   ##########################################
> +# test for vhost net
> +
> +if test "$kvm" != "no"; then
> +	cat>  $TMPC<<EOF
> +#include<linux/vhost.h>
> +int main(void) { return 0; }
> +EOF
> +	if compile_prog "$kvm_cflags" "" ; then
> +	vhost_net=yes
> +	else
> +	vhost_net=no
> +	fi
> +else
> +	vhost_net=no
> +fi
> +
> +##########################################
>   # pthread probe
>   PTHREADLIBS_LIST="-lpthread -lpthreadGC2"
>
> @@ -1968,6 +1985,7 @@ echo "fdt support       $fdt"
>   echo "preadv support    $preadv"
>   echo "fdatasync         $fdatasync"
>   echo "uuid support      $uuid"
> +echo "vhost-net support $vhost_net"
>
>   if test $sdl_too_old = "yes"; then
>   echo "->  Your SDL version is too old - please upgrade to have SDL support"
> @@ -2492,6 +2510,9 @@ case "$target_arch2" in
>         if test "$kvm_para" = "yes"; then
>           echo "CONFIG_KVM_PARA=y">>  $config_target_mak
>         fi
> +      if test $vhost_net = "yes" ; then
> +        echo "CONFIG_VHOST_NET=y">>  $config_target_mak
> +      fi
>       fi
>   esac
>   echo "TARGET_PHYS_ADDR_BITS=$target_phys_bits">>  $config_target_mak
> diff --git a/hw/vhost.c b/hw/vhost.c
> new file mode 100644
> index 0000000..4d5ea02
> --- /dev/null
> +++ b/hw/vhost.c
>    

Needs copyright/license.

> @@ -0,0 +1,631 @@
> +#include "linux/vhost.h"
> +#include<sys/ioctl.h>
> +#include<sys/eventfd.h>
> +#include "vhost.h"
> +#include "hw/hw.h"
> +/* For range_get_last */
> +#include "pci.h"
> +
> +static void vhost_dev_sync_region(struct vhost_dev *dev,
> +                                  uint64_t mfirst, uint64_t mlast,
> +                                  uint64_t rfirst, uint64_t rlast)
> +{
> +    uint64_t start = MAX(mfirst, rfirst);
> +    uint64_t end = MIN(mlast, rlast);
> +    vhost_log_chunk_t *from = dev->log + start / VHOST_LOG_CHUNK;
> +    vhost_log_chunk_t *to = dev->log + end / VHOST_LOG_CHUNK + 1;
> +    uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
> +
> +    assert(end / VHOST_LOG_CHUNK<  dev->log_size);
> +    assert(start / VHOST_LOG_CHUNK<  dev->log_size);
> +    if (end<  start) {
> +        return;
> +    }
> +    for (;from<  to; ++from) {
> +        vhost_log_chunk_t log;
> +        int bit;
> +        /* We first check with non-atomic: much cheaper,
> +         * and we expect non-dirty to be the common case. */
> +        if (!*from) {
> +            continue;
> +        }
> +        /* Data must be read atomically. We don't really
> +         * need the barrier semantics of __sync
> +         * builtins, but it's easier to use them than
> +         * roll our own. */
> +        log = __sync_fetch_and_and(from, 0);
>    

Is this too non-portable for us?

Technically speaking, it would be better to use qemu address types 
instead of uint64_t.

> +        while ((bit = sizeof(log)>  sizeof(int) ?
> +                ffsll(log) : ffs(log))) {
> +            bit -= 1;
> +            cpu_physical_memory_set_dirty(addr + bit * VHOST_LOG_PAGE);
> +            log&= ~(0x1ull<<  bit);
> +        }
> +        addr += VHOST_LOG_CHUNK;
> +    }
> +}
> +
> +static int vhost_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
> +                                          target_phys_addr_t start_addr,
> +                                          target_phys_addr_t end_addr)
>    

The 'struct' shouldn't be needed...

> +{
> +    struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
> +    int i;
> +    if (!dev->log_enabled || !dev->started) {
> +        return 0;
> +    }
> +    for (i = 0; i<  dev->mem->nregions; ++i) {
> +        struct vhost_memory_region *reg = dev->mem->regions + i;
> +        vhost_dev_sync_region(dev, start_addr, end_addr,
> +                              reg->guest_phys_addr,
> +                              range_get_last(reg->guest_phys_addr,
> +                                             reg->memory_size));
> +    }
> +    for (i = 0; i<  dev->nvqs; ++i) {
> +        struct vhost_virtqueue *vq = dev->vqs + i;
> +        unsigned size = offsetof(struct vring_used, ring) +
> +            sizeof(struct vring_used_elem) * vq->num;
> +        vhost_dev_sync_region(dev, start_addr, end_addr, vq->used_phys,
> +                              range_get_last(vq->used_phys, size));
> +    }
> +    return 0;
> +}
> +
> +/* Assign/unassign. Keep an unsorted array of non-overlapping
> + * memory regions in dev->mem. */
> +static void vhost_dev_unassign_memory(struct vhost_dev *dev,
> +                                      uint64_t start_addr,
> +                                      uint64_t size)
> +{
> +    int from, to, n = dev->mem->nregions;
> +    /* Track overlapping/split regions for sanity checking. */
> +    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
> +
> +    for (from = 0, to = 0; from<  n; ++from, ++to) {
> +        struct vhost_memory_region *reg = dev->mem->regions + to;
> +        uint64_t reglast;
> +        uint64_t memlast;
> +        uint64_t change;
> +
> +        /* clone old region */
> +        if (to != from) {
> +            memcpy(reg, dev->mem->regions + from, sizeof *reg);
> +        }
> +
> +        /* No overlap is simple */
> +        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
> +                            start_addr, size)) {
> +            continue;
> +        }
> +
> +        /* Split only happens if supplied region
> +         * is in the middle of an existing one. Thus it can not
> +         * overlap with any other existing region. */
> +        assert(!split);
> +
> +        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
> +        memlast = range_get_last(start_addr, size);
> +
> +        /* Remove whole region */
> +        if (start_addr<= reg->guest_phys_addr&&  memlast>= reglast) {
> +            --dev->mem->nregions;
> +            --to;
> +            assert(to>= 0);
> +            ++overlap_middle;
> +            continue;
> +        }
> +
> +        /* Shrink region */
> +        if (memlast>= reglast) {
> +            reg->memory_size = start_addr - reg->guest_phys_addr;
> +            assert(reg->memory_size);
> +            assert(!overlap_end);
> +            ++overlap_end;
> +            continue;
> +        }
> +
> +        /* Shift region */
> +        if (start_addr<= reg->guest_phys_addr) {
> +            change = memlast + 1 - reg->guest_phys_addr;
> +            reg->memory_size -= change;
> +            reg->guest_phys_addr += change;
> +            reg->userspace_addr += change;
> +            assert(reg->memory_size);
> +            assert(!overlap_start);
> +            ++overlap_start;
> +            continue;
> +        }
> +
> +        /* This only happens if supplied region
> +         * is in the middle of an existing one. Thus it can not
> +         * overlap with any other existing region. */
> +        assert(!overlap_start);
> +        assert(!overlap_end);
> +        assert(!overlap_middle);
> +        /* Split region: shrink first part, shift second part. */
> +        memcpy(dev->mem->regions + n, reg, sizeof *reg);
> +        reg->memory_size = start_addr - reg->guest_phys_addr;
> +        assert(reg->memory_size);
> +        change = memlast + 1 - reg->guest_phys_addr;
> +        reg = dev->mem->regions + n;
> +        reg->memory_size -= change;
> +        assert(reg->memory_size);
> +        reg->guest_phys_addr += change;
> +        reg->userspace_addr += change;
> +        /* Never add more than 1 region */
> +        assert(dev->mem->nregions == n);
> +        ++dev->mem->nregions;
> +        ++split;
> +    }
> +}
>    

This code is basically replicating the code in kvm-all with respect to 
translating qemu ram registrations into a list of non-overlapping 
slots.  We should commonize the code and perhaps even change the 
notification API to deal with non-overlapping slots since that's what 
users seem to want.
> +
> +/* Called after unassign, so no regions overlap the given range. */
> +static void vhost_dev_assign_memory(struct vhost_dev *dev,
> +                                    uint64_t start_addr,
> +                                    uint64_t size,
> +                                    uint64_t uaddr)
> +{
> +    int from, to;
> +    struct vhost_memory_region *merged = NULL;
> +    for (from = 0, to = 0; from<  dev->mem->nregions; ++from, ++to) {
> +        struct vhost_memory_region *reg = dev->mem->regions + to;
> +        uint64_t prlast, urlast;
> +        uint64_t pmlast, umlast;
> +        uint64_t s, e, u;
> +
> +        /* clone old region */
> +        if (to != from) {
> +            memcpy(reg, dev->mem->regions + from, sizeof *reg);
> +        }
> +        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
> +        pmlast = range_get_last(start_addr, size);
> +        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
> +        umlast = range_get_last(uaddr, size);
> +
> +        /* check for overlapping regions: should never happen. */
> +        assert(prlast<  start_addr || pmlast<  reg->guest_phys_addr);
> +        /* Not an adjacent or overlapping region - do not merge. */
> +        if ((prlast + 1 != start_addr || urlast + 1 != uaddr)&&
> +            (pmlast + 1 != reg->guest_phys_addr ||
> +             umlast + 1 != reg->userspace_addr)) {
> +            continue;
> +        }
> +
> +        if (merged) {
> +            --to;
> +            assert(to>= 0);
> +        } else {
> +            merged = reg;
> +        }
> +        u = MIN(uaddr, reg->userspace_addr);
> +        s = MIN(start_addr, reg->guest_phys_addr);
> +        e = MAX(pmlast, prlast);
> +        uaddr = merged->userspace_addr = u;
> +        start_addr = merged->guest_phys_addr = s;
> +        size = merged->memory_size = e - s + 1;
> +        assert(merged->memory_size);
> +    }
> +
> +    if (!merged) {
> +        struct vhost_memory_region *reg = dev->mem->regions + to;
> +        memset(reg, 0, sizeof *reg);
> +        reg->memory_size = size;
> +        assert(reg->memory_size);
> +        reg->guest_phys_addr = start_addr;
> +        reg->userspace_addr = uaddr;
> +        ++to;
> +    }
> +    assert(to<= dev->mem->nregions + 1);
> +    dev->mem->nregions = to;
> +}
>    

See above.  Unifying the two bits of code is important IMHO because we 
had an unending supply of bugs with the code in kvm-all it seems.

> +static uint64_t vhost_get_log_size(struct vhost_dev *dev)
> +{
> +    uint64_t log_size = 0;
> +    int i;
> +    for (i = 0; i<  dev->mem->nregions; ++i) {
> +        struct vhost_memory_region *reg = dev->mem->regions + i;
> +        uint64_t last = range_get_last(reg->guest_phys_addr,
> +                                       reg->memory_size);
> +        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
> +    }
> +    for (i = 0; i<  dev->nvqs; ++i) {
> +        struct vhost_virtqueue *vq = dev->vqs + i;
> +        uint64_t last = vq->used_phys +
> +            offsetof(struct vring_used, ring) +
> +            sizeof(struct vring_used_elem) * vq->num - 1;
> +        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
> +    }
> +    return log_size;
> +}
> +
> +static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size)
> +{
> +    vhost_log_chunk_t *log;
> +    uint64_t log_base;
> +    int r;
> +    if (size) {
> +        log = qemu_mallocz(size * sizeof *log);
> +    } else {
> +        log = NULL;
> +    }
> +    log_base = (uint64_t)(unsigned long)log;
> +    r = ioctl(dev->control, VHOST_SET_LOG_BASE,&log_base);
> +    assert(r>= 0);
> +    vhost_client_sync_dirty_bitmap(&dev->client, 0,
> +                                   (target_phys_addr_t)~0x0ull);
> +    if (dev->log) {
> +        qemu_free(dev->log);
> +    }
> +    dev->log = log;
> +    dev->log_size = size;
> +}
> +
> +static void vhost_client_set_memory(CPUPhysMemoryClient *client,
> +                                    target_phys_addr_t start_addr,
> +                                    ram_addr_t size,
> +                                    ram_addr_t phys_offset)
> +{
> +    struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
> +    ram_addr_t flags = phys_offset&  ~TARGET_PAGE_MASK;
> +    int s = offsetof(struct vhost_memory, regions) +
> +        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
> +    uint64_t log_size;
> +    int r;
> +    dev->mem = qemu_realloc(dev->mem, s);
> +
> +    assert(size);
> +
> +    vhost_dev_unassign_memory(dev, start_addr, size);
> +    if (flags == IO_MEM_RAM) {
> +        /* Add given mapping, merging adjacent regions if any */
> +        vhost_dev_assign_memory(dev, start_addr, size,
> +                                (uintptr_t)qemu_get_ram_ptr(phys_offset));
> +    } else {
> +        /* Remove old mapping for this memory, if any. */
> +        vhost_dev_unassign_memory(dev, start_addr, size);
> +    }
> +
> +    if (!dev->started) {
> +        return;
> +    }
> +    if (!dev->log_enabled) {
> +        r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
> +        assert(r>= 0);
> +        return;
> +    }
> +    log_size = vhost_get_log_size(dev);
> +    /* We allocate an extra 4K bytes to log,
> +     * to reduce the * number of reallocations. */
> +#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
> +    /* To log more, must increase log size before table update. */
> +    if (dev->log_size<  log_size) {
> +        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
> +    }
> +    r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
> +    assert(r>= 0);
> +    /* To log less, can only decrease log size after table update. */
> +    if (dev->log_size>  log_size + VHOST_LOG_BUFFER) {
> +        vhost_dev_log_resize(dev, log_size);
> +    }
> +}
> +
> +static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
> +                                    struct vhost_virtqueue *vq,
> +                                    unsigned idx, bool enable_log)
> +{
> +    struct vhost_vring_addr addr = {
> +        .index = idx,
> +        .desc_user_addr = (u_int64_t)(unsigned long)vq->desc,
> +        .avail_user_addr = (u_int64_t)(unsigned long)vq->avail,
> +        .used_user_addr = (u_int64_t)(unsigned long)vq->used,
> +        .log_guest_addr = vq->used_phys,
> +        .flags = enable_log ? (1<<  VHOST_VRING_F_LOG) : 0,
> +    };
> +    int r = ioctl(dev->control, VHOST_SET_VRING_ADDR,&addr);
> +    if (r<  0) {
> +        return -errno;
> +    }
> +    return 0;
> +}
> +
> +static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
> +{
> +    uint64_t features = dev->acked_features;
> +    int r;
> +    if (enable_log) {
> +        features |= 0x1<<  VHOST_F_LOG_ALL;
> +    }
> +    r = ioctl(dev->control, VHOST_SET_FEATURES,&features);
> +    return r<  0 ? -errno : 0;
> +}
> +
> +static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
> +{
> +    int r, t, i;
> +    r = vhost_dev_set_features(dev, enable_log);
> +    if (r<  0)
> +        goto err_features;
>    

Coding Style is off with single line ifs.

> +    for (i = 0; i<  dev->nvqs; ++i) {
>    

C++ habits die hard :-)

> +        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
> +                                     enable_log);
> +        if (r<  0)
> +            goto err_vq;
> +    }
> +    return 0;
> +err_vq:
> +    for (; i>= 0; --i) {
> +        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
> +                                     dev->log_enabled);
> +        assert(t>= 0);
> +    }
> +    t = vhost_dev_set_features(dev, dev->log_enabled);
> +    assert(t>= 0);
> +err_features:
> +    return r;
> +}
> +
> +static int vhost_client_migration_log(struct CPUPhysMemoryClient *client,
> +                                      int enable)
> +{
> +    struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
> +    int r;
> +    if (!!enable == dev->log_enabled) {
> +        return 0;
> +    }
> +    if (!dev->started) {
> +        dev->log_enabled = enable;
> +        return 0;
> +    }
> +    if (!enable) {
> +        r = vhost_dev_set_log(dev, false);
> +        if (r<  0) {
> +            return r;
> +        }
> +        if (dev->log) {
> +            qemu_free(dev->log);
> +        }
> +        dev->log = NULL;
> +        dev->log_size = 0;
> +    } else {
> +        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
> +        r = vhost_dev_set_log(dev, true);
> +        if (r<  0) {
> +            return r;
> +        }
> +    }
> +    dev->log_enabled = enable;
> +    return 0;
> +}
> +
> +static int vhost_virtqueue_init(struct vhost_dev *dev,
> +                                struct VirtIODevice *vdev,
> +                                struct vhost_virtqueue *vq,
> +                                unsigned idx)
> +{
> +    target_phys_addr_t s, l, a;
> +    int r;
> +    struct vhost_vring_file file = {
> +        .index = idx,
> +    };
> +    struct vhost_vring_state state = {
> +        .index = idx,
> +    };
> +    struct VirtQueue *q = virtio_queue(vdev, idx);
> +
> +    vq->num = state.num = virtio_queue_get_num(vdev, idx);
> +    r = ioctl(dev->control, VHOST_SET_VRING_NUM,&state);
> +    if (r) {
> +        return -errno;
> +    }
> +
> +    state.num = virtio_queue_last_avail_idx(vdev, idx);
> +    r = ioctl(dev->control, VHOST_SET_VRING_BASE,&state);
> +    if (r) {
> +        return -errno;
> +    }
> +
> +    s = l = sizeof(struct vring_desc) * vq->num;
> +    a = virtio_queue_get_desc(vdev, idx);
> +    vq->desc = cpu_physical_memory_map(a,&l, 0);
> +    if (!vq->desc || l != s) {
> +        r = -ENOMEM;
> +        goto fail_alloc;
> +    }
> +    s = l = offsetof(struct vring_avail, ring) +
> +        sizeof(u_int64_t) * vq->num;
> +    a = virtio_queue_get_avail(vdev, idx);
> +    vq->avail = cpu_physical_memory_map(a,&l, 0);
> +    if (!vq->avail || l != s) {
> +        r = -ENOMEM;
> +        goto fail_alloc;
> +    }
>    

You don't unmap avail/desc on failure.  map() may fail because the ring 
crosses MMIO memory and you run out of a bounce buffer.

IMHO, it would be better to attempt to map the full ring at once and 
then if that doesn't succeed, bail out.  You can still pass individual 
pointers via vhost ioctls but within qemu, it's much easier to deal with 
the whole ring at a time.

> +    s = l = offsetof(struct vring_used, ring) +
> +        sizeof(struct vring_used_elem) * vq->num;
>    

This is unfortunate.  We redefine this structures in qemu to avoid 
depending on Linux headers.  But you're using the linux versions instead 
of the qemu versions.  Is it really necessary for vhost.h to include 
virtio.h?

> +    vq->used_phys = a = virtio_queue_get_used(vdev, idx);
> +    vq->used = cpu_physical_memory_map(a,&l, 1);
> +    if (!vq->used || l != s) {
> +        r = -ENOMEM;
> +        goto fail_alloc;
> +    }
> +
> +    r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
> +    if (r<  0) {
> +        r = -errno;
> +        goto fail_alloc;
> +    }
> +    if (!vdev->binding->guest_notifier || !vdev->binding->host_notifier) {
> +        fprintf(stderr, "binding does not support irqfd/queuefd\n");
> +        r = -ENOSYS;
> +        goto fail_alloc;
> +    }
> +    r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, true);
> +    if (r<  0) {
> +        fprintf(stderr, "Error binding guest notifier: %d\n", -r);
> +        goto fail_guest_notifier;
> +    }
> +
> +    r = vdev->binding->host_notifier(vdev->binding_opaque, idx, true);
> +    if (r<  0) {
> +        fprintf(stderr, "Error binding host notifier: %d\n", -r);
> +        goto fail_host_notifier;
> +    }
> +
> +    file.fd = event_notifier_get_fd(virtio_queue_host_notifier(q));
> +    r = ioctl(dev->control, VHOST_SET_VRING_KICK,&file);
> +    if (r) {
> +        goto fail_kick;
> +    }
> +
> +    file.fd = event_notifier_get_fd(virtio_queue_guest_notifier(q));
> +    r = ioctl(dev->control, VHOST_SET_VRING_CALL,&file);
> +    if (r) {
> +        goto fail_call;
> +    }
>    

This function would be a bit more reasonable if it were split into 
sections FWIW.

> +    return 0;
> +
> +fail_call:
> +fail_kick:
> +    vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
> +fail_host_notifier:
> +    vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
> +fail_guest_notifier:
> +fail_alloc:
> +    return r;
> +}
> +
> +static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
> +                                    struct VirtIODevice *vdev,
> +                                    struct vhost_virtqueue *vq,
> +                                    unsigned idx)
> +{
> +    struct vhost_vring_state state = {
> +        .index = idx,
> +    };
> +    int r;
> +    r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
> +    if (r<  0) {
> +        fprintf(stderr, "vhost VQ %d guest cleanup failed: %d\n", idx, r);
> +        fflush(stderr);
> +    }
> +    assert (r>= 0);
> +
> +    r = vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
> +    if (r<  0) {
> +        fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r);
> +        fflush(stderr);
> +    }
> +    assert (r>= 0);
> +    r = ioctl(dev->control, VHOST_GET_VRING_BASE,&state);
> +    if (r<  0) {
> +        fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
> +        fflush(stderr);
> +    }
> +    virtio_queue_set_last_avail_idx(vdev, idx, state.num);
> +    assert (r>= 0);
>    

You never unmap() the mapped memory and you're cheating by assuming that 
the virtio rings have a constant mapping for the life time of a guest.  
That's not technically true.  My concern is that since a guest can 
trigger remappings (by adjusting PCI mappings) badness can ensue.
> diff --git a/hw/vhost_net.c b/hw/vhost_net.c
> new file mode 100644
> index 0000000..06b7648
>    

Need copyright/license.

> --- /dev/null
> +++ b/hw/vhost_net.c
> @@ -0,0 +1,177 @@
> +#include "net.h"
> +#include "net/tap.h"
> +
> +#include "virtio-net.h"
> +#include "vhost_net.h"
> +
> +#include "config.h"
> +
> +#ifdef CONFIG_VHOST_NET
> +#include<sys/eventfd.h>
> +#include<sys/socket.h>
> +#include<linux/kvm.h>
> +#include<fcntl.h>
> +#include<sys/ioctl.h>
> +#include<linux/virtio_ring.h>
> +#include<netpacket/packet.h>
> +#include<net/ethernet.h>
> +#include<net/if.h>
> +#include<netinet/in.h>
> +
> +#include<stdio.h>
> +
> +#include "vhost.h"
> +
> +struct vhost_net {
>    


VHostNetState.

> +    struct vhost_dev dev;
> +    struct vhost_virtqueue vqs[2];
> +    int backend;
> +    VLANClientState *vc;
> +};
> +
> +unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
> +{
> +    /* Clear features not supported by host kernel. */
> +    if (!(net->dev.features&  (1<<  VIRTIO_F_NOTIFY_ON_EMPTY)))
> +        features&= ~(1<<  VIRTIO_F_NOTIFY_ON_EMPTY);
> +    if (!(net->dev.features&  (1<<  VIRTIO_RING_F_INDIRECT_DESC)))
> +        features&= ~(1<<  VIRTIO_RING_F_INDIRECT_DESC);
> +    if (!(net->dev.features&  (1<<  VIRTIO_NET_F_MRG_RXBUF)))
> +        features&= ~(1<<  VIRTIO_NET_F_MRG_RXBUF);
> +    return features;
> +}
> +
> +void vhost_net_ack_features(struct vhost_net *net, unsigned features)
> +{
> +    net->dev.acked_features = net->dev.backend_features;
> +    if (features&  (1<<  VIRTIO_F_NOTIFY_ON_EMPTY))
> +        net->dev.acked_features |= (1<<  VIRTIO_F_NOTIFY_ON_EMPTY);
> +    if (features&  (1<<  VIRTIO_RING_F_INDIRECT_DESC))
> +        net->dev.acked_features |= (1<<  VIRTIO_RING_F_INDIRECT_DESC);
> +}
> +
> +static int vhost_net_get_fd(VLANClientState *backend)
> +{
> +    switch (backend->info->type) {
> +    case NET_CLIENT_TYPE_TAP:
> +        return tap_get_fd(backend);
> +    default:
> +        fprintf(stderr, "vhost-net requires tap backend\n");
> +        return -EBADFD;
> +    }
> +}
> +
> +struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
> +{
> +    int r;
> +    struct vhost_net *net = qemu_malloc(sizeof *net);
> +    if (!backend) {
> +        fprintf(stderr, "vhost-net requires backend to be setup\n");
> +        goto fail;
> +    }
> +    r = vhost_net_get_fd(backend);
> +    if (r<  0)
> +        goto fail;
> +    net->vc = backend;
> +    net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 :
> +        (1<<  VHOST_NET_F_VIRTIO_NET_HDR);
> +    net->backend = r;
> +
> +    r = vhost_dev_init(&net->dev, devfd);
> +    if (r<  0)
> +        goto fail;
> +    if (~net->dev.features&  net->dev.backend_features) {
> +        fprintf(stderr, "vhost lacks feature mask %llu for backend\n",
> +                ~net->dev.features&  net->dev.backend_features);
> +        vhost_dev_cleanup(&net->dev);
> +        goto fail;
> +    }
> +
> +    /* Set sane init value. Override when guest acks. */
> +    vhost_net_ack_features(net, 0);
> +    return net;
> +fail:
> +    qemu_free(net);
> +    return NULL;
> +}
> +
> +int vhost_net_start(struct vhost_net *net,
> +                    VirtIODevice *dev)
> +{
> +    struct vhost_vring_file file = { };
> +    int r;
> +
> +    net->dev.nvqs = 2;
> +    net->dev.vqs = net->vqs;
> +    r = vhost_dev_start(&net->dev, dev);
> +    if (r<  0)
> +        return r;
> +
> +    net->vc->info->poll(net->vc, false);
> +    qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
> +    file.fd = net->backend;
> +    for (file.index = 0; file.index<  net->dev.nvqs; ++file.index) {
> +        r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND,&file);
> +        if (r<  0) {
> +            r = -errno;
> +            goto fail;
> +        }
> +    }
> +    return 0;
> +fail:
> +    file.fd = -1;
> +    while (--file.index>= 0) {
> +        int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND,&file);
> +        assert(r>= 0);
> +    }
> +    net->vc->info->poll(net->vc, true);
> +    vhost_dev_stop(&net->dev, dev);
> +    return r;
> +}
> +
> +void vhost_net_stop(struct vhost_net *net,
> +                    VirtIODevice *dev)
> +{
> +    struct vhost_vring_file file = { .fd = -1 };
> +
> +    for (file.index = 0; file.index<  net->dev.nvqs; ++file.index) {
> +        int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND,&file);
> +        assert(r>= 0);
> +    }
> +    net->vc->info->poll(net->vc, true);
> +    vhost_dev_stop(&net->dev, dev);
> +}
> +
> +void vhost_net_cleanup(struct vhost_net *net)
> +{
> +    vhost_dev_cleanup(&net->dev);
> +    qemu_free(net);
> +}
> +#else
>    

If you're going this way, I'd suggest making static inlines in the 
header file instead of polluting the C file.  It's more common to search 
within a C file and having two declarations can get annoying.

Regards,

Anthony Liguori

> +struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
> +{
> +	return NULL;
> +}
> +
> +int vhost_net_start(struct vhost_net *net,
> +		    VirtIODevice *dev)
> +{
> +	return -ENOSYS;
> +}
> +void vhost_net_stop(struct vhost_net *net,
> +		    VirtIODevice *dev)
> +{
> +}
> +
> +void vhost_net_cleanup(struct vhost_net *net)
> +{
> +}
> +
> +unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
> +{
> +	return features;
> +}
> +void vhost_net_ack_features(struct vhost_net *net, unsigned features)
> +{
> +}
> +#endif
> diff --git a/hw/vhost_net.h b/hw/vhost_net.h
> new file mode 100644
> index 0000000..2a10210
> --- /dev/null
> +++ b/hw/vhost_net.h
> @@ -0,0 +1,20 @@
> +#ifndef VHOST_NET_H
> +#define VHOST_NET_H
> +
> +#include "net.h"
> +
> +struct vhost_net;
> +
> +struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd);
> +
> +int vhost_net_start(struct vhost_net *net,
> +                    VirtIODevice *dev);
> +void vhost_net_stop(struct vhost_net *net,
> +                    VirtIODevice *dev);
> +
> +void vhost_net_cleanup(struct vhost_net *net);
> +
> +unsigned vhost_net_get_features(struct vhost_net *net, unsigned features);
> +void vhost_net_ack_features(struct vhost_net *net, unsigned features);
> +
> +#endif
>
Michael S. Tsirkin Feb. 26, 2010, 2:32 p.m. UTC | #3
On Thu, Feb 25, 2010 at 08:04:21PM +0100, Juan Quintela wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > This adds vhost net device support in qemu. Will be tied to tap device
> > and virtio by following patches.  Raw backend is currently missing,
> > will be worked on/submitted separately.
> >
> 
> +obj-y += vhost_net.o
> +obj-$(CONFIG_VHOST_NET) += vhost.o
> 
> Why is vhost_net.o configured unconditionally?


It includes stubs that return error. This way callers do not
need to include any ifdefs.

> > --- a/configure
> > +++ b/configure
> > @@ -1498,6 +1498,23 @@ EOF
> >  fi
> >
> This misses vhost_net var definition at the start of the file

Seems to work without. Why is it needed?

> and
> --enable-vhost/--disable-vhost options.
> 

I don't really see why we need --enable-vhost/--disable-vhost.
Runtime flag is enough.

> >  ##########################################
> > +# test for vhost net
> > +
> > +if test "$kvm" != "no"; then
> > +	cat > $TMPC <<EOF
> > +#include <linux/vhost.h>
> > +int main(void) { return 0; }
> > +EOF
> > +	if compile_prog "$kvm_cflags" "" ; then
> > +	vhost_net=yes
> > +	else
> > +	vhost_net=no
> > +	fi
> 
> Indent please.
> 
> > +else
> > +	vhost_net=no
> > +fi
> > +
> > +##########################################
> >  # pthread probe
> >  PTHREADLIBS_LIST="-lpthread -lpthreadGC2"
> >  
> > @@ -1968,6 +1985,7 @@ echo "fdt support       $fdt"
> >  echo "preadv support    $preadv"
> >  echo "fdatasync         $fdatasync"
> >  echo "uuid support      $uuid"
> > +echo "vhost-net support $vhost_net"
> 
> Otherwise this could not be there.


What do you mean? vhost_net always gets a value, so it is safe to use
here.

> >  if test $sdl_too_old = "yes"; then
> >  echo "-> Your SDL version is too old - please upgrade to have SDL support"
> > @@ -2492,6 +2510,9 @@ case "$target_arch2" in
> >        if test "$kvm_para" = "yes"; then
> >          echo "CONFIG_KVM_PARA=y" >> $config_target_mak
> >        fi
> > +      if test $vhost_net = "yes" ; then
> > +        echo "CONFIG_VHOST_NET=y" >> $config_target_mak
> > +      fi
> >      fi
> >  esac
> >  echo "TARGET_PHYS_ADDR_BITS=$target_phys_bits" >> $config_target_mak
> 
> > +    for (;from < to; ++from) {
> > +        vhost_log_chunk_t log;
> 
> .....
> 
> > +                ffsll(log) : ffs(log))) {
> 
>   if you defines vhost_log_chuck_t, you also define vhost_log_ffs() and
>   you are done without this if.
> 
> Later, Juan.
Anthony Liguori Feb. 26, 2010, 2:38 p.m. UTC | #4
On 02/26/2010 08:32 AM, Michael S. Tsirkin wrote:
>> and
>> --enable-vhost/--disable-vhost options.
>>
>>      
> I don't really see why we need --enable-vhost/--disable-vhost.
> Runtime flag is enough.
>    

So that packagers can disable features at build time that they don't 
want to support.

Regards,

Anthony Liguori
Michael S. Tsirkin Feb. 26, 2010, 2:49 p.m. UTC | #5
On Thu, Feb 25, 2010 at 01:44:34PM -0600, Anthony Liguori wrote:
> On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote:
>> This adds vhost net device support in qemu. Will be tied to tap device
>> and virtio by following patches.  Raw backend is currently missing,
>> will be worked on/submitted separately.
>>
>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>> ---
>>   Makefile.target |    2 +
>>   configure       |   21 ++
>>   hw/vhost.c      |  631 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   hw/vhost.h      |   44 ++++
>>   hw/vhost_net.c  |  177 ++++++++++++++++
>>   hw/vhost_net.h  |   20 ++
>>   6 files changed, 895 insertions(+), 0 deletions(-)
>>   create mode 100644 hw/vhost.c
>>   create mode 100644 hw/vhost.h
>>   create mode 100644 hw/vhost_net.c
>>   create mode 100644 hw/vhost_net.h
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index c1580e9..9b4fd84 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -174,6 +174,8 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o
>>   # need to fix this properly
>>   obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-pci.o virtio-serial-bus.o
>>   obj-y += notifier.o
>> +obj-y += vhost_net.o
>> +obj-$(CONFIG_VHOST_NET) += vhost.o
>>   obj-y += rwhandler.o
>>   obj-$(CONFIG_KVM) += kvm.o kvm-all.o
>>   obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
>> diff --git a/configure b/configure
>> index 8eb5f5b..5eccc7c 100755
>> --- a/configure
>> +++ b/configure
>> @@ -1498,6 +1498,23 @@ EOF
>>   fi
>>
>>   ##########################################
>> +# test for vhost net
>> +
>> +if test "$kvm" != "no"; then
>> +	cat>  $TMPC<<EOF
>> +#include<linux/vhost.h>
>> +int main(void) { return 0; }
>> +EOF
>> +	if compile_prog "$kvm_cflags" "" ; then
>> +	vhost_net=yes
>> +	else
>> +	vhost_net=no
>> +	fi
>> +else
>> +	vhost_net=no
>> +fi
>> +
>> +##########################################
>>   # pthread probe
>>   PTHREADLIBS_LIST="-lpthread -lpthreadGC2"
>>
>> @@ -1968,6 +1985,7 @@ echo "fdt support       $fdt"
>>   echo "preadv support    $preadv"
>>   echo "fdatasync         $fdatasync"
>>   echo "uuid support      $uuid"
>> +echo "vhost-net support $vhost_net"
>>
>>   if test $sdl_too_old = "yes"; then
>>   echo "->  Your SDL version is too old - please upgrade to have SDL support"
>> @@ -2492,6 +2510,9 @@ case "$target_arch2" in
>>         if test "$kvm_para" = "yes"; then
>>           echo "CONFIG_KVM_PARA=y">>  $config_target_mak
>>         fi
>> +      if test $vhost_net = "yes" ; then
>> +        echo "CONFIG_VHOST_NET=y">>  $config_target_mak
>> +      fi
>>       fi
>>   esac
>>   echo "TARGET_PHYS_ADDR_BITS=$target_phys_bits">>  $config_target_mak
>> diff --git a/hw/vhost.c b/hw/vhost.c
>> new file mode 100644
>> index 0000000..4d5ea02
>> --- /dev/null
>> +++ b/hw/vhost.c
>>    
>
> Needs copyright/license.
>
>> @@ -0,0 +1,631 @@
>> +#include "linux/vhost.h"
>> +#include<sys/ioctl.h>
>> +#include<sys/eventfd.h>
>> +#include "vhost.h"
>> +#include "hw/hw.h"
>> +/* For range_get_last */
>> +#include "pci.h"
>> +
>> +static void vhost_dev_sync_region(struct vhost_dev *dev,
>> +                                  uint64_t mfirst, uint64_t mlast,
>> +                                  uint64_t rfirst, uint64_t rlast)
>> +{
>> +    uint64_t start = MAX(mfirst, rfirst);
>> +    uint64_t end = MIN(mlast, rlast);
>> +    vhost_log_chunk_t *from = dev->log + start / VHOST_LOG_CHUNK;
>> +    vhost_log_chunk_t *to = dev->log + end / VHOST_LOG_CHUNK + 1;
>> +    uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
>> +
>> +    assert(end / VHOST_LOG_CHUNK<  dev->log_size);
>> +    assert(start / VHOST_LOG_CHUNK<  dev->log_size);
>> +    if (end<  start) {
>> +        return;
>> +    }
>> +    for (;from<  to; ++from) {
>> +        vhost_log_chunk_t log;
>> +        int bit;
>> +        /* We first check with non-atomic: much cheaper,
>> +         * and we expect non-dirty to be the common case. */
>> +        if (!*from) {
>> +            continue;
>> +        }
>> +        /* Data must be read atomically. We don't really
>> +         * need the barrier semantics of __sync
>> +         * builtins, but it's easier to use them than
>> +         * roll our own. */
>> +        log = __sync_fetch_and_and(from, 0);
>>    
>
> Is this too non-portable for us?
>
> Technically speaking, it would be better to use qemu address types  
> instead of uint64_t.
>
>> +        while ((bit = sizeof(log)>  sizeof(int) ?
>> +                ffsll(log) : ffs(log))) {
>> +            bit -= 1;
>> +            cpu_physical_memory_set_dirty(addr + bit * VHOST_LOG_PAGE);
>> +            log&= ~(0x1ull<<  bit);
>> +        }
>> +        addr += VHOST_LOG_CHUNK;
>> +    }
>> +}
>> +
>> +static int vhost_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
>> +                                          target_phys_addr_t start_addr,
>> +                                          target_phys_addr_t end_addr)
>>    
>
> The 'struct' shouldn't be needed...
>
>> +{
>> +    struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
>> +    int i;
>> +    if (!dev->log_enabled || !dev->started) {
>> +        return 0;
>> +    }
>> +    for (i = 0; i<  dev->mem->nregions; ++i) {
>> +        struct vhost_memory_region *reg = dev->mem->regions + i;
>> +        vhost_dev_sync_region(dev, start_addr, end_addr,
>> +                              reg->guest_phys_addr,
>> +                              range_get_last(reg->guest_phys_addr,
>> +                                             reg->memory_size));
>> +    }
>> +    for (i = 0; i<  dev->nvqs; ++i) {
>> +        struct vhost_virtqueue *vq = dev->vqs + i;
>> +        unsigned size = offsetof(struct vring_used, ring) +
>> +            sizeof(struct vring_used_elem) * vq->num;
>> +        vhost_dev_sync_region(dev, start_addr, end_addr, vq->used_phys,
>> +                              range_get_last(vq->used_phys, size));
>> +    }
>> +    return 0;
>> +}
>> +
>> +/* Assign/unassign. Keep an unsorted array of non-overlapping
>> + * memory regions in dev->mem. */
>> +static void vhost_dev_unassign_memory(struct vhost_dev *dev,
>> +                                      uint64_t start_addr,
>> +                                      uint64_t size)
>> +{
>> +    int from, to, n = dev->mem->nregions;
>> +    /* Track overlapping/split regions for sanity checking. */
>> +    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
>> +
>> +    for (from = 0, to = 0; from<  n; ++from, ++to) {
>> +        struct vhost_memory_region *reg = dev->mem->regions + to;
>> +        uint64_t reglast;
>> +        uint64_t memlast;
>> +        uint64_t change;
>> +
>> +        /* clone old region */
>> +        if (to != from) {
>> +            memcpy(reg, dev->mem->regions + from, sizeof *reg);
>> +        }
>> +
>> +        /* No overlap is simple */
>> +        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
>> +                            start_addr, size)) {
>> +            continue;
>> +        }
>> +
>> +        /* Split only happens if supplied region
>> +         * is in the middle of an existing one. Thus it can not
>> +         * overlap with any other existing region. */
>> +        assert(!split);
>> +
>> +        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
>> +        memlast = range_get_last(start_addr, size);
>> +
>> +        /* Remove whole region */
>> +        if (start_addr<= reg->guest_phys_addr&&  memlast>= reglast) {
>> +            --dev->mem->nregions;
>> +            --to;
>> +            assert(to>= 0);
>> +            ++overlap_middle;
>> +            continue;
>> +        }
>> +
>> +        /* Shrink region */
>> +        if (memlast>= reglast) {
>> +            reg->memory_size = start_addr - reg->guest_phys_addr;
>> +            assert(reg->memory_size);
>> +            assert(!overlap_end);
>> +            ++overlap_end;
>> +            continue;
>> +        }
>> +
>> +        /* Shift region */
>> +        if (start_addr<= reg->guest_phys_addr) {
>> +            change = memlast + 1 - reg->guest_phys_addr;
>> +            reg->memory_size -= change;
>> +            reg->guest_phys_addr += change;
>> +            reg->userspace_addr += change;
>> +            assert(reg->memory_size);
>> +            assert(!overlap_start);
>> +            ++overlap_start;
>> +            continue;
>> +        }
>> +
>> +        /* This only happens if supplied region
>> +         * is in the middle of an existing one. Thus it can not
>> +         * overlap with any other existing region. */
>> +        assert(!overlap_start);
>> +        assert(!overlap_end);
>> +        assert(!overlap_middle);
>> +        /* Split region: shrink first part, shift second part. */
>> +        memcpy(dev->mem->regions + n, reg, sizeof *reg);
>> +        reg->memory_size = start_addr - reg->guest_phys_addr;
>> +        assert(reg->memory_size);
>> +        change = memlast + 1 - reg->guest_phys_addr;
>> +        reg = dev->mem->regions + n;
>> +        reg->memory_size -= change;
>> +        assert(reg->memory_size);
>> +        reg->guest_phys_addr += change;
>> +        reg->userspace_addr += change;
>> +        /* Never add more than 1 region */
>> +        assert(dev->mem->nregions == n);
>> +        ++dev->mem->nregions;
>> +        ++split;
>> +    }
>> +}
>>    
>
> This code is basically replicating the code in kvm-all with respect to  
> translating qemu ram registrations into a list of non-overlapping slots.  
> We should commonize the code and perhaps even change the notification API 
> to deal with non-overlapping slots since that's what users seem to want.

KVM code needs all kind of work-arounds for KVM specific issues.
It also assumes that KVM is registered at startup, so it
does not try to optimize finding slots.

I propose merging this as is, and then someone who has an idea
how to do this better can come and unify the code.

>> +
>> +/* Called after unassign, so no regions overlap the given range. */
>> +static void vhost_dev_assign_memory(struct vhost_dev *dev,
>> +                                    uint64_t start_addr,
>> +                                    uint64_t size,
>> +                                    uint64_t uaddr)
>> +{
>> +    int from, to;
>> +    struct vhost_memory_region *merged = NULL;
>> +    for (from = 0, to = 0; from<  dev->mem->nregions; ++from, ++to) {
>> +        struct vhost_memory_region *reg = dev->mem->regions + to;
>> +        uint64_t prlast, urlast;
>> +        uint64_t pmlast, umlast;
>> +        uint64_t s, e, u;
>> +
>> +        /* clone old region */
>> +        if (to != from) {
>> +            memcpy(reg, dev->mem->regions + from, sizeof *reg);
>> +        }
>> +        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
>> +        pmlast = range_get_last(start_addr, size);
>> +        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
>> +        umlast = range_get_last(uaddr, size);
>> +
>> +        /* check for overlapping regions: should never happen. */
>> +        assert(prlast<  start_addr || pmlast<  reg->guest_phys_addr);
>> +        /* Not an adjacent or overlapping region - do not merge. */
>> +        if ((prlast + 1 != start_addr || urlast + 1 != uaddr)&&
>> +            (pmlast + 1 != reg->guest_phys_addr ||
>> +             umlast + 1 != reg->userspace_addr)) {
>> +            continue;
>> +        }
>> +
>> +        if (merged) {
>> +            --to;
>> +            assert(to>= 0);
>> +        } else {
>> +            merged = reg;
>> +        }
>> +        u = MIN(uaddr, reg->userspace_addr);
>> +        s = MIN(start_addr, reg->guest_phys_addr);
>> +        e = MAX(pmlast, prlast);
>> +        uaddr = merged->userspace_addr = u;
>> +        start_addr = merged->guest_phys_addr = s;
>> +        size = merged->memory_size = e - s + 1;
>> +        assert(merged->memory_size);
>> +    }
>> +
>> +    if (!merged) {
>> +        struct vhost_memory_region *reg = dev->mem->regions + to;
>> +        memset(reg, 0, sizeof *reg);
>> +        reg->memory_size = size;
>> +        assert(reg->memory_size);
>> +        reg->guest_phys_addr = start_addr;
>> +        reg->userspace_addr = uaddr;
>> +        ++to;
>> +    }
>> +    assert(to<= dev->mem->nregions + 1);
>> +    dev->mem->nregions = to;
>> +}
>>    
>
> See above.  Unifying the two bits of code is important IMHO because we  
> had an unending supply of bugs with the code in kvm-all it seems.

Mine has no bugs, let's switch to it!

Seriously, need to tread very carefully here.
This is why I say: merge it, then look at how to reuse code.

>> +static uint64_t vhost_get_log_size(struct vhost_dev *dev)
>> +{
>> +    uint64_t log_size = 0;
>> +    int i;
>> +    for (i = 0; i<  dev->mem->nregions; ++i) {
>> +        struct vhost_memory_region *reg = dev->mem->regions + i;
>> +        uint64_t last = range_get_last(reg->guest_phys_addr,
>> +                                       reg->memory_size);
>> +        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
>> +    }
>> +    for (i = 0; i<  dev->nvqs; ++i) {
>> +        struct vhost_virtqueue *vq = dev->vqs + i;
>> +        uint64_t last = vq->used_phys +
>> +            offsetof(struct vring_used, ring) +
>> +            sizeof(struct vring_used_elem) * vq->num - 1;
>> +        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
>> +    }
>> +    return log_size;
>> +}
>> +
>> +static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size)
>> +{
>> +    vhost_log_chunk_t *log;
>> +    uint64_t log_base;
>> +    int r;
>> +    if (size) {
>> +        log = qemu_mallocz(size * sizeof *log);
>> +    } else {
>> +        log = NULL;
>> +    }
>> +    log_base = (uint64_t)(unsigned long)log;
>> +    r = ioctl(dev->control, VHOST_SET_LOG_BASE,&log_base);
>> +    assert(r>= 0);
>> +    vhost_client_sync_dirty_bitmap(&dev->client, 0,
>> +                                   (target_phys_addr_t)~0x0ull);
>> +    if (dev->log) {
>> +        qemu_free(dev->log);
>> +    }
>> +    dev->log = log;
>> +    dev->log_size = size;
>> +}
>> +
>> +static void vhost_client_set_memory(CPUPhysMemoryClient *client,
>> +                                    target_phys_addr_t start_addr,
>> +                                    ram_addr_t size,
>> +                                    ram_addr_t phys_offset)
>> +{
>> +    struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
>> +    ram_addr_t flags = phys_offset&  ~TARGET_PAGE_MASK;
>> +    int s = offsetof(struct vhost_memory, regions) +
>> +        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
>> +    uint64_t log_size;
>> +    int r;
>> +    dev->mem = qemu_realloc(dev->mem, s);
>> +
>> +    assert(size);
>> +
>> +    vhost_dev_unassign_memory(dev, start_addr, size);
>> +    if (flags == IO_MEM_RAM) {
>> +        /* Add given mapping, merging adjacent regions if any */
>> +        vhost_dev_assign_memory(dev, start_addr, size,
>> +                                (uintptr_t)qemu_get_ram_ptr(phys_offset));
>> +    } else {
>> +        /* Remove old mapping for this memory, if any. */
>> +        vhost_dev_unassign_memory(dev, start_addr, size);
>> +    }
>> +
>> +    if (!dev->started) {
>> +        return;
>> +    }
>> +    if (!dev->log_enabled) {
>> +        r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
>> +        assert(r>= 0);
>> +        return;
>> +    }
>> +    log_size = vhost_get_log_size(dev);
>> +    /* We allocate an extra 4K bytes to log,
>> +     * to reduce the * number of reallocations. */
>> +#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
>> +    /* To log more, must increase log size before table update. */
>> +    if (dev->log_size<  log_size) {
>> +        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
>> +    }
>> +    r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
>> +    assert(r>= 0);
>> +    /* To log less, can only decrease log size after table update. */
>> +    if (dev->log_size>  log_size + VHOST_LOG_BUFFER) {
>> +        vhost_dev_log_resize(dev, log_size);
>> +    }
>> +}
>> +
>> +static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
>> +                                    struct vhost_virtqueue *vq,
>> +                                    unsigned idx, bool enable_log)
>> +{
>> +    struct vhost_vring_addr addr = {
>> +        .index = idx,
>> +        .desc_user_addr = (u_int64_t)(unsigned long)vq->desc,
>> +        .avail_user_addr = (u_int64_t)(unsigned long)vq->avail,
>> +        .used_user_addr = (u_int64_t)(unsigned long)vq->used,
>> +        .log_guest_addr = vq->used_phys,
>> +        .flags = enable_log ? (1<<  VHOST_VRING_F_LOG) : 0,
>> +    };
>> +    int r = ioctl(dev->control, VHOST_SET_VRING_ADDR,&addr);
>> +    if (r<  0) {
>> +        return -errno;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
>> +{
>> +    uint64_t features = dev->acked_features;
>> +    int r;
>> +    if (enable_log) {
>> +        features |= 0x1<<  VHOST_F_LOG_ALL;
>> +    }
>> +    r = ioctl(dev->control, VHOST_SET_FEATURES,&features);
>> +    return r<  0 ? -errno : 0;
>> +}
>> +
>> +static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
>> +{
>> +    int r, t, i;
>> +    r = vhost_dev_set_features(dev, enable_log);
>> +    if (r<  0)
>> +        goto err_features;
>>    
>
> Coding Style is off with single line ifs.
>
>> +    for (i = 0; i<  dev->nvqs; ++i) {
>>    
>
> C++ habits die hard :-)


What's that about?

>> +        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
>> +                                     enable_log);
>> +        if (r<  0)
>> +            goto err_vq;
>> +    }
>> +    return 0;
>> +err_vq:
>> +    for (; i>= 0; --i) {
>> +        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
>> +                                     dev->log_enabled);
>> +        assert(t>= 0);
>> +    }
>> +    t = vhost_dev_set_features(dev, dev->log_enabled);
>> +    assert(t>= 0);
>> +err_features:
>> +    return r;
>> +}
>> +
>> +static int vhost_client_migration_log(struct CPUPhysMemoryClient *client,
>> +                                      int enable)
>> +{
>> +    struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
>> +    int r;
>> +    if (!!enable == dev->log_enabled) {
>> +        return 0;
>> +    }
>> +    if (!dev->started) {
>> +        dev->log_enabled = enable;
>> +        return 0;
>> +    }
>> +    if (!enable) {
>> +        r = vhost_dev_set_log(dev, false);
>> +        if (r<  0) {
>> +            return r;
>> +        }
>> +        if (dev->log) {
>> +            qemu_free(dev->log);
>> +        }
>> +        dev->log = NULL;
>> +        dev->log_size = 0;
>> +    } else {
>> +        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
>> +        r = vhost_dev_set_log(dev, true);
>> +        if (r<  0) {
>> +            return r;
>> +        }
>> +    }
>> +    dev->log_enabled = enable;
>> +    return 0;
>> +}
>> +
>> +static int vhost_virtqueue_init(struct vhost_dev *dev,
>> +                                struct VirtIODevice *vdev,
>> +                                struct vhost_virtqueue *vq,
>> +                                unsigned idx)
>> +{
>> +    target_phys_addr_t s, l, a;
>> +    int r;
>> +    struct vhost_vring_file file = {
>> +        .index = idx,
>> +    };
>> +    struct vhost_vring_state state = {
>> +        .index = idx,
>> +    };
>> +    struct VirtQueue *q = virtio_queue(vdev, idx);
>> +
>> +    vq->num = state.num = virtio_queue_get_num(vdev, idx);
>> +    r = ioctl(dev->control, VHOST_SET_VRING_NUM,&state);
>> +    if (r) {
>> +        return -errno;
>> +    }
>> +
>> +    state.num = virtio_queue_last_avail_idx(vdev, idx);
>> +    r = ioctl(dev->control, VHOST_SET_VRING_BASE,&state);
>> +    if (r) {
>> +        return -errno;
>> +    }
>> +
>> +    s = l = sizeof(struct vring_desc) * vq->num;
>> +    a = virtio_queue_get_desc(vdev, idx);
>> +    vq->desc = cpu_physical_memory_map(a,&l, 0);
>> +    if (!vq->desc || l != s) {
>> +        r = -ENOMEM;
>> +        goto fail_alloc;
>> +    }
>> +    s = l = offsetof(struct vring_avail, ring) +
>> +        sizeof(u_int64_t) * vq->num;
>> +    a = virtio_queue_get_avail(vdev, idx);
>> +    vq->avail = cpu_physical_memory_map(a,&l, 0);
>> +    if (!vq->avail || l != s) {
>> +        r = -ENOMEM;
>> +        goto fail_alloc;
>> +    }
>>    
>
> You don't unmap avail/desc on failure.  map() may fail because the ring  
> cross MMIO memory and you run out of a bounce buffer.
>
> IMHO, it would be better to attempt to map the full ring at once and  
> then if that doesn't succeed, bail out.  You can still pass individual  
> pointers via vhost ioctls but within qemu, it's much easier to deal with  
> the whole ring at a time.

I prefer to keep as much logic about ring layout as possible
in virtio.c

>> +    s = l = offsetof(struct vring_used, ring) +
>> +        sizeof(struct vring_used_elem) * vq->num;
>>    
>
> This is unfortunate.  We redefine this structures in qemu to avoid  
> depending on Linux headers.

And we should for e.g. windows portability.

>  But you're using the linux versions instead  
> of the qemu versions.  Is it really necessary for vhost.h to include  
> virtio.h?

Yes. And anyway, vhost does not exist on non-linux systems so there
is no issue IMO.

>> +    vq->used_phys = a = virtio_queue_get_used(vdev, idx);
>> +    vq->used = cpu_physical_memory_map(a,&l, 1);
>> +    if (!vq->used || l != s) {
>> +        r = -ENOMEM;
>> +        goto fail_alloc;
>> +    }
>> +
>> +    r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
>> +    if (r<  0) {
>> +        r = -errno;
>> +        goto fail_alloc;
>> +    }
>> +    if (!vdev->binding->guest_notifier || !vdev->binding->host_notifier) {
>> +        fprintf(stderr, "binding does not support irqfd/queuefd\n");
>> +        r = -ENOSYS;
>> +        goto fail_alloc;
>> +    }
>> +    r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, true);
>> +    if (r<  0) {
>> +        fprintf(stderr, "Error binding guest notifier: %d\n", -r);
>> +        goto fail_guest_notifier;
>> +    }
>> +
>> +    r = vdev->binding->host_notifier(vdev->binding_opaque, idx, true);
>> +    if (r<  0) {
>> +        fprintf(stderr, "Error binding host notifier: %d\n", -r);
>> +        goto fail_host_notifier;
>> +    }
>> +
>> +    file.fd = event_notifier_get_fd(virtio_queue_host_notifier(q));
>> +    r = ioctl(dev->control, VHOST_SET_VRING_KICK,&file);
>> +    if (r) {
>> +        goto fail_kick;
>> +    }
>> +
>> +    file.fd = event_notifier_get_fd(virtio_queue_guest_notifier(q));
>> +    r = ioctl(dev->control, VHOST_SET_VRING_CALL,&file);
>> +    if (r) {
>> +        goto fail_call;
>> +    }
>>    
>
> This function would be a bit more reasonable if it were split into  
> sections FWIW.

Not sure what do you mean here.

>> +    return 0;
>> +
>> +fail_call:
>> +fail_kick:
>> +    vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
>> +fail_host_notifier:
>> +    vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
>> +fail_guest_notifier:
>> +fail_alloc:
>> +    return r;
>> +}
>> +
>> +static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
>> +                                    struct VirtIODevice *vdev,
>> +                                    struct vhost_virtqueue *vq,
>> +                                    unsigned idx)
>> +{
>> +    struct vhost_vring_state state = {
>> +        .index = idx,
>> +    };
>> +    int r;
>> +    r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
>> +    if (r<  0) {
>> +        fprintf(stderr, "vhost VQ %d guest cleanup failed: %d\n", idx, r);
>> +        fflush(stderr);
>> +    }
>> +    assert (r>= 0);
>> +
>> +    r = vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
>> +    if (r<  0) {
>> +        fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r);
>> +        fflush(stderr);
>> +    }
>> +    assert (r>= 0);
>> +    r = ioctl(dev->control, VHOST_GET_VRING_BASE,&state);
>> +    if (r<  0) {
>> +        fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
>> +        fflush(stderr);
>> +    }
>> +    virtio_queue_set_last_avail_idx(vdev, idx, state.num);
>> +    assert (r>= 0);
>>    
>
> You never unmap() the mapped memory and you're cheating by assuming that  
> the virtio rings have a constant mapping for the life time of a guest.   
> That's not technically true.  My concern is that since a guest can  
> trigger remappings (by adjusting PCI mappings) badness can ensue.

I do not know how this can happen. What do PCI mappings have to do with this?
Please explain. If it can, vhost will need notification to update.

>> diff --git a/hw/vhost_net.c b/hw/vhost_net.c
>> new file mode 100644
>> index 0000000..06b7648
>>    
>
> Need copyright/license.
>
>> --- /dev/null
>> +++ b/hw/vhost_net.c
>> @@ -0,0 +1,177 @@
>> +#include "net.h"
>> +#include "net/tap.h"
>> +
>> +#include "virtio-net.h"
>> +#include "vhost_net.h"
>> +
>> +#include "config.h"
>> +
>> +#ifdef CONFIG_VHOST_NET
>> +#include<sys/eventfd.h>
>> +#include<sys/socket.h>
>> +#include<linux/kvm.h>
>> +#include<fcntl.h>
>> +#include<sys/ioctl.h>
>> +#include<linux/virtio_ring.h>
>> +#include<netpacket/packet.h>
>> +#include<net/ethernet.h>
>> +#include<net/if.h>
>> +#include<netinet/in.h>
>> +
>> +#include<stdio.h>
>> +
>> +#include "vhost.h"
>> +
>> +struct vhost_net {
>>    
>
>
> VHostNetState.
>
>> +    struct vhost_dev dev;
>> +    struct vhost_virtqueue vqs[2];
>> +    int backend;
>> +    VLANClientState *vc;
>> +};
>> +
>> +unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
>> +{
>> +    /* Clear features not supported by host kernel. */
>> +    if (!(net->dev.features&  (1<<  VIRTIO_F_NOTIFY_ON_EMPTY)))
>> +        features&= ~(1<<  VIRTIO_F_NOTIFY_ON_EMPTY);
>> +    if (!(net->dev.features&  (1<<  VIRTIO_RING_F_INDIRECT_DESC)))
>> +        features&= ~(1<<  VIRTIO_RING_F_INDIRECT_DESC);
>> +    if (!(net->dev.features&  (1<<  VIRTIO_NET_F_MRG_RXBUF)))
>> +        features&= ~(1<<  VIRTIO_NET_F_MRG_RXBUF);
>> +    return features;
>> +}
>> +
>> +void vhost_net_ack_features(struct vhost_net *net, unsigned features)
>> +{
>> +    net->dev.acked_features = net->dev.backend_features;
>> +    if (features&  (1<<  VIRTIO_F_NOTIFY_ON_EMPTY))
>> +        net->dev.acked_features |= (1<<  VIRTIO_F_NOTIFY_ON_EMPTY);
>> +    if (features&  (1<<  VIRTIO_RING_F_INDIRECT_DESC))
>> +        net->dev.acked_features |= (1<<  VIRTIO_RING_F_INDIRECT_DESC);
>> +}
>> +
>> +static int vhost_net_get_fd(VLANClientState *backend)
>> +{
>> +    switch (backend->info->type) {
>> +    case NET_CLIENT_TYPE_TAP:
>> +        return tap_get_fd(backend);
>> +    default:
>> +        fprintf(stderr, "vhost-net requires tap backend\n");
>> +        return -EBADFD;
>> +    }
>> +}
>> +
>> +struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
>> +{
>> +    int r;
>> +    struct vhost_net *net = qemu_malloc(sizeof *net);
>> +    if (!backend) {
>> +        fprintf(stderr, "vhost-net requires backend to be setup\n");
>> +        goto fail;
>> +    }
>> +    r = vhost_net_get_fd(backend);
>> +    if (r<  0)
>> +        goto fail;
>> +    net->vc = backend;
>> +    net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 :
>> +        (1<<  VHOST_NET_F_VIRTIO_NET_HDR);
>> +    net->backend = r;
>> +
>> +    r = vhost_dev_init(&net->dev, devfd);
>> +    if (r<  0)
>> +        goto fail;
>> +    if (~net->dev.features&  net->dev.backend_features) {
>> +        fprintf(stderr, "vhost lacks feature mask %llu for backend\n",
>> +                ~net->dev.features&  net->dev.backend_features);
>> +        vhost_dev_cleanup(&net->dev);
>> +        goto fail;
>> +    }
>> +
>> +    /* Set sane init value. Override when guest acks. */
>> +    vhost_net_ack_features(net, 0);
>> +    return net;
>> +fail:
>> +    qemu_free(net);
>> +    return NULL;
>> +}
>> +
>> +int vhost_net_start(struct vhost_net *net,
>> +                    VirtIODevice *dev)
>> +{
>> +    struct vhost_vring_file file = { };
>> +    int r;
>> +
>> +    net->dev.nvqs = 2;
>> +    net->dev.vqs = net->vqs;
>> +    r = vhost_dev_start(&net->dev, dev);
>> +    if (r<  0)
>> +        return r;
>> +
>> +    net->vc->info->poll(net->vc, false);
>> +    qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
>> +    file.fd = net->backend;
>> +    for (file.index = 0; file.index<  net->dev.nvqs; ++file.index) {
>> +        r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND,&file);
>> +        if (r<  0) {
>> +            r = -errno;
>> +            goto fail;
>> +        }
>> +    }
>> +    return 0;
>> +fail:
>> +    file.fd = -1;
>> +    while (--file.index>= 0) {
>> +        int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND,&file);
>> +        assert(r>= 0);
>> +    }
>> +    net->vc->info->poll(net->vc, true);
>> +    vhost_dev_stop(&net->dev, dev);
>> +    return r;
>> +}
>> +
>> +void vhost_net_stop(struct vhost_net *net,
>> +                    VirtIODevice *dev)
>> +{
>> +    struct vhost_vring_file file = { .fd = -1 };
>> +
>> +    for (file.index = 0; file.index<  net->dev.nvqs; ++file.index) {
>> +        int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND,&file);
>> +        assert(r>= 0);
>> +    }
>> +    net->vc->info->poll(net->vc, true);
>> +    vhost_dev_stop(&net->dev, dev);
>> +}
>> +
>> +void vhost_net_cleanup(struct vhost_net *net)
>> +{
>> +    vhost_dev_cleanup(&net->dev);
>> +    qemu_free(net);
>> +}
>> +#else
>>    
>
> If you're going this way, I'd suggest making static inlines in the  
> header file instead of polluting the C file.  It's more common to search  
> within a C file and having two declarations can get annoying.
>
> Regards,
>
> Anthony Liguori

The issue with inline is that this means that virtio net will depend on
target (need to be recompiled).  As it is, a single object can link with
vhost and non-vhost versions.

>> +struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
>> +{
>> +	return NULL;
>> +}
>> +
>> +int vhost_net_start(struct vhost_net *net,
>> +		    VirtIODevice *dev)
>> +{
>> +	return -ENOSYS;
>> +}
>> +void vhost_net_stop(struct vhost_net *net,
>> +		    VirtIODevice *dev)
>> +{
>> +}
>> +
>> +void vhost_net_cleanup(struct vhost_net *net)
>> +{
>> +}
>> +
>> +unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
>> +{
>> +	return features;
>> +}
>> +void vhost_net_ack_features(struct vhost_net *net, unsigned features)
>> +{
>> +}
>> +#endif
>> diff --git a/hw/vhost_net.h b/hw/vhost_net.h
>> new file mode 100644
>> index 0000000..2a10210
>> --- /dev/null
>> +++ b/hw/vhost_net.h
>> @@ -0,0 +1,20 @@
>> +#ifndef VHOST_NET_H
>> +#define VHOST_NET_H
>> +
>> +#include "net.h"
>> +
>> +struct vhost_net;
>> +
>> +struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd);
>> +
>> +int vhost_net_start(struct vhost_net *net,
>> +                    VirtIODevice *dev);
>> +void vhost_net_stop(struct vhost_net *net,
>> +                    VirtIODevice *dev);
>> +
>> +void vhost_net_cleanup(struct vhost_net *net);
>> +
>> +unsigned vhost_net_get_features(struct vhost_net *net, unsigned features);
>> +void vhost_net_ack_features(struct vhost_net *net, unsigned features);
>> +
>> +#endif
>>
Michael S. Tsirkin Feb. 26, 2010, 2:54 p.m. UTC | #6
On Fri, Feb 26, 2010 at 08:38:27AM -0600, Anthony Liguori wrote:
> On 02/26/2010 08:32 AM, Michael S. Tsirkin wrote:
>>> and
>>> --enable-vhost/--disable-vhost options.
>>>
>>>      
>> I don't really see why we need --enable-vhost/--disable-vhost.
>> Runtime flag is enough.
>>    
>
> So that packagers can disable features at build time that they don't  
> want to support.
>
> Regards,
>
> Anthony Liguori

Fair enough.
Anthony Liguori Feb. 26, 2010, 3:18 p.m. UTC | #7
On 02/26/2010 08:49 AM, Michael S. Tsirkin wrote:
>
> KVM code needs all kind of work-arounds for KVM specific issues.
> It also assumes that KVM is registered at startup, so it
> does not try to optimize finding slots.
>    

No, the slot mapping changes dynamically, so KVM certainly needs to 
optimize this.

But the point is, why can't we keep a central list of slots somewhere 
that KVM and vhost-net can both use?  I'm not saying we use a common 
function to do this work, I'm saying qemu should maintain a proper slot 
list than anyone can access.

> I propose merging this as is, and then someone who has an idea
> how to do this better can come and unify the code.
>    

Like I said, this has been a huge source of very subtle bugs in the 
past.  I'm open to hearing what other people think, but I'm concerned 
that if we merge this code, we'll end up facing some nasty bugs that 
could easily be eliminated by just using the code in kvm-all that has 
already been tested rather extensively.

There really aren't that many work-arounds in the code BTW.  The work 
arounds just result in a couple of extra slots too so they shouldn't be 
a burden to vhost.

> Mine has no bugs, let's switch to it!
>
> Seriously, need to tread very carefully here.
> This is why I say: merge it, then look at how to reuse code.
>    

Once it's merged, there's no incentive to look at reusing code.  Again, 
I don't think this is a huge burden to vhost.  The two bits of code 
literally do exactly the same thing.  They just use different data 
structures that ultimately contain the same values.

>> C++ habits die hard :-)
>>      
>
> What's that about?
>    

'++i' is an odd thing to do in C in a for() loop.  We're not explicit 
about it in Coding Style but the vast majority of code just does 'i++'.

>>> +    vq->desc = cpu_physical_memory_map(a,&l, 0);
>>> +    if (!vq->desc || l != s) {
>>> +        r = -ENOMEM;
>>> +        goto fail_alloc;
>>> +    }
>>> +    s = l = offsetof(struct vring_avail, ring) +
>>> +        sizeof(u_int64_t) * vq->num;
>>> +    a = virtio_queue_get_avail(vdev, idx);
>>> +    vq->avail = cpu_physical_memory_map(a,&l, 0);
>>> +    if (!vq->avail || l != s) {
>>> +        r = -ENOMEM;
>>> +        goto fail_alloc;
>>> +    }
>>>
>>>        
>> You don't unmap avail/desc on failure.  map() may fail because the ring
>> cross MMIO memory and you run out of a bounce buffer.
>>
>> IMHO, it would be better to attempt to map the full ring at once and
>> then if that doesn't succeed, bail out.  You can still pass individual
>> pointers via vhost ioctls but within qemu, it's much easier to deal with
>> the whole ring at a time.
>>      
> + a = virtio_queue_get_desc(vdev, idx);
> I prefer to keep as much logic about ring layout as possible
> in virtio.c
>    

Well, the downside is that you need to deal with the error path and 
cleanup paths and it becomes more complicated.

>>> +    s = l = offsetof(struct vring_used, ring) +
>>> +        sizeof(struct vring_used_elem) * vq->num;
>>>
>>>        
>> This is unfortunate.  We redefine this structures in qemu to avoid
>> depending on Linux headers.
>>      
> And we should for e.g. windows portability.
>
>    
>>   But you're using the linux versions instead
>> of the qemu versions.  Is it really necessary for vhost.h to include
>> virtio.h?
>>      
> Yes. And anyway, vhost does not exist on non-linux systems so there
> is no issue IMO.
>    

Yeah, like I said, it's unfortunate because it means a reader of vhost and 
a reader of virtio.c are likely to get confused.  I'm not saying there's 
an easy solution, it's just unfortunate.

>>> +    vq->used_phys = a = virtio_queue_get_used(vdev, idx);
>>> +    vq->used = cpu_physical_memory_map(a,&l, 1);
>>> +    if (!vq->used || l != s) {
>>> +        r = -ENOMEM;
>>> +        goto fail_alloc;
>>> +    }
>>> +
>>> +    r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
>>> +    if (r<   0) {
>>> +        r = -errno;
>>> +        goto fail_alloc;
>>> +    }
>>> +    if (!vdev->binding->guest_notifier || !vdev->binding->host_notifier) {
>>> +        fprintf(stderr, "binding does not support irqfd/queuefd\n");
>>> +        r = -ENOSYS;
>>> +        goto fail_alloc;
>>> +    }
>>> +    r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, true);
>>> +    if (r<   0) {
>>> +        fprintf(stderr, "Error binding guest notifier: %d\n", -r);
>>> +        goto fail_guest_notifier;
>>> +    }
>>> +
>>> +    r = vdev->binding->host_notifier(vdev->binding_opaque, idx, true);
>>> +    if (r<   0) {
>>> +        fprintf(stderr, "Error binding host notifier: %d\n", -r);
>>> +        goto fail_host_notifier;
>>> +    }
>>> +
>>> +    file.fd = event_notifier_get_fd(virtio_queue_host_notifier(q));
>>> +    r = ioctl(dev->control, VHOST_SET_VRING_KICK,&file);
>>> +    if (r) {
>>> +        goto fail_kick;
>>> +    }
>>> +
>>> +    file.fd = event_notifier_get_fd(virtio_queue_guest_notifier(q));
>>> +    r = ioctl(dev->control, VHOST_SET_VRING_CALL,&file);
>>> +    if (r) {
>>> +        goto fail_call;
>>> +    }
>>>
>>>        
>> This function would be a bit more reasonable if it were split into
>> sections FWIW.
>>      
> Not sure what you mean here.
>    

Just a suggestion.  For instance, moving the setting up of the notifiers 
to a separate function would help with readability IMHO.

>
>> You never unmap() the mapped memory and you're cheating by assuming that
>> the virtio rings have a constant mapping for the life time of a guest.
>> That's not technically true.  My concern is that since a guest can
>> trigger remappings (by adjusting PCI mappings) badness can ensue.
>>      
> I do not know how this can happen. What do PCI mappings have to do with this?
> Please explain. If it can, vhost will need notification to update.
>    

If a guest modifies the bar for an MMIO region such that it happens to 
exist in RAM, while this is a bad thing for the guest to do, I don't 
think we do anything to stop it.  When the region gets remapped, the 
result will be that the mapping will change.

Within qemu, because we carry the qemu_mutex, we know that the mappings 
are fixed as long as we're in qemu.  We're very careful to assume that 
we don't rely on a mapping past when we drop the qemu_mutex.

With vhost, you register a slot table and update it whenever mappings 
change.  I think that's good enough for dealing with ram addresses.  But 
you pass the virtual address for the rings and assume those mappings 
never change.

I'm pretty sure a guest can cause those to change and I'm not 100% sure, 
but I think it's a potential source of exploits if you assume a 
mapping.  In the very least, a guest can trick vhost into writing to ram 
that it wouldn't normally write to.

>> If you're going this way, I'd suggest making static inlines in the
>> header file instead of polluting the C file.  It's more common to search
>> within a C file and having two declarations can get annoying.
>>
>> Regards,
>>
>> Anthony Liguori
>>      
> The issue with inline is that this means that virtio net will depend on
> target (need to be recompiled).  As it is, a single object can link with
> vhost and non-vhost versions.
>    

Fair enough.

Regards,

Anthony Liguori
Michael S. Tsirkin Feb. 27, 2010, 7:38 p.m. UTC | #8
On Fri, Feb 26, 2010 at 09:18:03AM -0600, Anthony Liguori wrote:
> On 02/26/2010 08:49 AM, Michael S. Tsirkin wrote:
>>
>> KVM code needs all kind of work-arounds for KVM specific issues.
>> It also assumes that KVM is registered at startup, so it
>> does not try to optimize finding slots.
>>    
>
> No, the slot mapping changes dynamically so KVM certainly needs to  
> optimize this.

Maybe, but it does not, KVM algorithms are n^2 or worse.

> But the point is, why can't we keep a central list of slots somewhere  
> that KVM and vhost-net can both use?  I'm not saying we use a common  
> function to do this work, I'm saying qemu should maintain a proper slot  
> list than anyone can access.
>
>> I propose merging this as is, and then someone who has an idea
>> how to do this better can come and unify the code.
>>    
>
> Like I said, this has been a huge source of very subtle bugs in the  
> past.  I'm open to hearing what other people think, but I'm concerned  
> that if we merge this code, we'll end up facing some nasty bugs that  
> could easily be eliminated by just using the code in kvm-all that has  
> already been tested rather extensively.
>
> There really aren't that many work-arounds in the code BTW.  The work  
> arounds just result in a couple of extra slots too so they shouldn't be  
> a burden to vhost.
>
>> Mine has no bugs, let's switch to it!
>>
>> Seriously, need to tread very carefully here.
>> This is why I say: merge it, then look at how to reuse code.
>>    
>
> Once it's merged, there's no incentive to look at reusing code.
> Again, I don't think this is a huge burden to vhost.  The two bits of code  
> literally do exactly the same thing.  They just use different data  
> structures that ultimately contain the same values.

Not exactly. For example kvm track ROM and video ram addresses.

>>> C++ habits die hard :-)
>>>      
>>
>> What's that about?
>>    
>
> '++i' is an odd thing to do in C in a for() loop.  We're not explicit
> about it in Coding Style but the vast majority of code just does
> 'i++'.

Ugh. Do we really need to specify every little thing?

>>>> +    vq->desc = cpu_physical_memory_map(a,&l, 0);
>>>> +    if (!vq->desc || l != s) {
>>>> +        r = -ENOMEM;
>>>> +        goto fail_alloc;
>>>> +    }
>>>> +    s = l = offsetof(struct vring_avail, ring) +
>>>> +        sizeof(u_int64_t) * vq->num;
>>>> +    a = virtio_queue_get_avail(vdev, idx);
>>>> +    vq->avail = cpu_physical_memory_map(a,&l, 0);
>>>> +    if (!vq->avail || l != s) {
>>>> +        r = -ENOMEM;
>>>> +        goto fail_alloc;
>>>> +    }
>>>>
>>>>        
>>> You don't unmap avail/desc on failure.  map() may fail because the ring
>>> cross MMIO memory and you run out of a bounce buffer.
>>>
>>> IMHO, it would be better to attempt to map the full ring at once and
>>> then if that doesn't succeed, bail out.  You can still pass individual
>>> pointers via vhost ioctls but within qemu, it's much easier to deal with
>>> the whole ring at a time.
>>>      
>> + a = virtio_queue_get_desc(vdev, idx);
>> I prefer to keep as much logic about ring layout as possible
>> in virtio.c
>>    
>
> Well, the downside is that you need to deal with the error path and  
> cleanup paths and it becomes more complicated.
>
>>>> +    s = l = offsetof(struct vring_used, ring) +
>>>> +        sizeof(struct vring_used_elem) * vq->num;
>>>>
>>>>        
>>> This is unfortunate.  We redefine this structures in qemu to avoid
>>> depending on Linux headers.
>>>      
>> And we should for e.g. windows portability.
>>
>>    
>>>   But you're using the linux versions instead
>>> of the qemu versions.  Is it really necessary for vhost.h to include
>>> virtio.h?
>>>      
>> Yes. And anyway, vhost does not exist on non-linux systems so there
>> is no issue IMO.
>>    
>
> Yeah, like I said, it's unfortunate because it means a read of vhost and  
> a reader of virtio.c is likely to get confused.  I'm not saying there's  
> an easy solution, it's just unfortunate.
>
>>>> +    vq->used_phys = a = virtio_queue_get_used(vdev, idx);
>>>> +    vq->used = cpu_physical_memory_map(a,&l, 1);
>>>> +    if (!vq->used || l != s) {
>>>> +        r = -ENOMEM;
>>>> +        goto fail_alloc;
>>>> +    }
>>>> +
>>>> +    r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
>>>> +    if (r<   0) {
>>>> +        r = -errno;
>>>> +        goto fail_alloc;
>>>> +    }
>>>> +    if (!vdev->binding->guest_notifier || !vdev->binding->host_notifier) {
>>>> +        fprintf(stderr, "binding does not support irqfd/queuefd\n");
>>>> +        r = -ENOSYS;
>>>> +        goto fail_alloc;
>>>> +    }
>>>> +    r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, true);
>>>> +    if (r<   0) {
>>>> +        fprintf(stderr, "Error binding guest notifier: %d\n", -r);
>>>> +        goto fail_guest_notifier;
>>>> +    }
>>>> +
>>>> +    r = vdev->binding->host_notifier(vdev->binding_opaque, idx, true);
>>>> +    if (r<   0) {
>>>> +        fprintf(stderr, "Error binding host notifier: %d\n", -r);
>>>> +        goto fail_host_notifier;
>>>> +    }
>>>> +
>>>> +    file.fd = event_notifier_get_fd(virtio_queue_host_notifier(q));
>>>> +    r = ioctl(dev->control, VHOST_SET_VRING_KICK,&file);
>>>> +    if (r) {
>>>> +        goto fail_kick;
>>>> +    }
>>>> +
>>>> +    file.fd = event_notifier_get_fd(virtio_queue_guest_notifier(q));
>>>> +    r = ioctl(dev->control, VHOST_SET_VRING_CALL,&file);
>>>> +    if (r) {
>>>> +        goto fail_call;
>>>> +    }
>>>>
>>>>        
>>> This function would be a bit more reasonable if it were split into
>>> sections FWIW.
>>>      
>> Not sure what you mean here.
>>    
>
> Just a suggestion.  For instance, moving the setting up of the notifiers  
> to a separate function would help with readability IMHO.


Hmm. I'll look into it.
I actually think that for functions that just do a list of things
unconditionally, without branches or loops, or with just error handling
as here, it is perfectly fine for them to be of any length.

>>
>>> You never unmap() the mapped memory and you're cheating by assuming that
>>> the virtio rings have a constant mapping for the life time of a guest.
>>> That's not technically true.  My concern is that since a guest can
>>> trigger remappings (by adjusting PCI mappings) badness can ensue.
>>>      
>> I do not know how this can happen. What do PCI mappings have to do with this?
>> Please explain. If it can, vhost will need notification to update.
>>    
>
> If a guest modifies the bar for an MMIO region such that it happens to  
> exist in RAM, while this is a bad thing for the guest to do, I don't  
> think we do anything to stop it.  When the region gets remapped, the  
> result will be that the mapping will change.

So IMO this is the bug. If there's a BAR that matches RAM
physical address, it should never get mapped. Any idea how
to check this?

> Within qemu, because we carry the qemu_mutex, we know that the mappings  
> are fixed as long as we're in qemu.  We're very careful to assume that  
> we don't rely on a mapping past when we drop the qemu_mutex.
>
> With vhost, you register a slot table and update it whenever mappings  
> change.  I think that's good enough for dealing with ram addresses.  But  
> you pass the virtual address for the rings and assume those mappings  
> never change.

So, the issue IMO is that an MMIO address gets passed instead of RAM.
There's no reason to put virtio rings not in RAM, we just need to
verify this.

> 
> I'm pretty sure a guest can cause those to change and I'm not 100% sure,  
> but I think it's a potential source of exploits if you assume a mapping.  
> In the very least, a guest can trick vhost into writing to ram that it 
> wouldn't normally write to.

This seems harmless. guest can write anywhere in ram, anyway.

>>> If you're going this way, I'd suggest making static inlines in the
>>> header file instead of polluting the C file.  It's more common to search
>>> within a C file and having two declarations can get annoying.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>      
>> The issue with inline is that this means that virtio net will depend on
>> target (need to be recompiled).  As it is, a single object can link with
>> vhost and non-vhost versions.
>>    
>
> Fair enough.
>
> Regards,
>
> Anthony Liguori
Paul Brook Feb. 28, 2010, 1:59 a.m. UTC | #9
> > I'm pretty sure a guest can cause those to change and I'm not 100%
> > sure,   but I think it's a potential source of exploits if you assume a
> > mapping. In the very least, a guest can trick vhost into writing to ram
> > that it wouldn't normally write to.
> 
> This seems harmless. guest can write anywhere in ram, anyway.

Surely writing to the wrong address is always a fatal flaw.  There certainly 
exist machines that can change physical RAM mapping.  While I wouldn't expect 
this to happen during normal operation, it could occur between a (virtio-
aware) bootloader/BIOS and real kernel.

Paul
Michael S. Tsirkin Feb. 28, 2010, 10:15 a.m. UTC | #10
On Sun, Feb 28, 2010 at 01:59:27AM +0000, Paul Brook wrote:
> > > I'm pretty sure a guest can cause those to change and I'm not 100%
> > > sure,   but I think it's a potential source of exploits if you assume a
> > > mapping. In the very least, a guest can trick vhost into writing to ram
> > > that it wouldn't normally write to.
> > 
> > This seems harmless. guest can write anywhere in ram, anyway.
> 
> Surely writing to the wrong address is always a fatal flaw.

If guest does an illegal operation, it can corrupt its own memory.
This is the case with physical devices as well.

>  There certainly 
> exist machines that can change physical RAM mapping.

I am talking about mapping between phy RAM offset and qemu virt address.
When can it change without RAM in question going away?

> While I wouldn't expect 
> this to happen during normal operation, it could occur between a (virtio-
> aware) bootloader/BIOS and real kernel.
> 
> Paul

Should not matter for vhost, it is only active if driver is active ...
Paul Brook Feb. 28, 2010, 12:45 p.m. UTC | #11
> >  There certainly
> > exist machines that can change physical RAM mapping.
> 
> I am talking about mapping between phy RAM offset and qemu virt address.
> When can it change without RAM in question going away?

RAM offset or guest physical address? The two are very different.
Some machines have chip selects that allow given physical address range to be 
mapped to different banks of ram.

Paul
Michael S. Tsirkin Feb. 28, 2010, 2:44 p.m. UTC | #12
On Sun, Feb 28, 2010 at 12:45:07PM +0000, Paul Brook wrote:
> > >  There certainly
> > > exist machines that can change physical RAM mapping.
> > 
> > I am talking about mapping between phy RAM offset and qemu virt address.
> > When can it change without RAM in question going away?
> 
> RAM offset or guest physical address? The two are very different.

RAM offset. virtio only cares about where the rings are.

> Some machines have chip selects that allow given physical address range to be 
> mapped to different banks of ram.
> 
> Paul

So guest can cause vhost to write to a wrong place in RAM, but it can
just pass a wrong address directly.  As long as vhost does not access a
non-RAM address, we are definitely fine.
Paul Brook Feb. 28, 2010, 3:23 p.m. UTC | #13
> So guest can cause vhost to write to a wrong place in RAM, but it can
> just pass a wrong address directly.  

That's not the point. Obviously any DMA capable device can be used to 
compromise a system. However if a device writes to address B after being told 
to write to address A, then you have a completely broken system.

> As long as vhost does not access a
> non-RAM address, we are definitely fine.

Why does it matter what it's changed to? The virtio DMA addresses guest 
physical addresses. If guest physical address mappings change then the virtio 
device must respect those changes. The extreme case is a system with an IOMMU 
(not currently implemented in QEMU). In that case it's likely that physical-
RAM mappings will change frequently.

Paul
Michael S. Tsirkin Feb. 28, 2010, 3:37 p.m. UTC | #14
On Sun, Feb 28, 2010 at 03:23:06PM +0000, Paul Brook wrote:
> > So guest can cause vhost to write to a wrong place in RAM, but it can
> > just pass a wrong address directly.  
> 
> That's not the point. Obviously any DMA capable device can be used to 
> compromise a system. However if a device writes to address B after being told 
> to write to address A, then you have a completely broken system.

Yes, but I do not see how this can happen with vhost backed to virtio.

> > As long as vhost does not access a
> > non-RAM address, we are definitely fine.
> 
> Why does it matter what it's changed to? The virtio DMA addresses guest 
> physical addresses. If guest physical address mappings change then the virtio 
> device must respect those changes. The extreme case is a system with an IOMMU 
> (not currently implemented in QEMU). In that case it's likely that physical-
> RAM mappings will change frequently.
> 
> Paul

Yes, but this is already supported. The one thing that my patches assume
does not change while the device is active is the physical-to-qemu-virtual
mapping for the virtio ring.

Since virtio device is allowed to access the ring at any time,
such changes would only be legal when the device is not active IMO,
and my code translates physical to virtual when device is
made active.

So I do not see a bug.
Anthony Liguori Feb. 28, 2010, 4:02 p.m. UTC | #15
On 02/27/2010 01:38 PM, Michael S. Tsirkin wrote:
> On Fri, Feb 26, 2010 at 09:18:03AM -0600, Anthony Liguori wrote:
>    
>> On 02/26/2010 08:49 AM, Michael S. Tsirkin wrote:
>>      
>>> KVM code needs all kind of work-arounds for KVM specific issues.
>>> It also assumes that KVM is registered at startup, so it
>>> does not try to optimize finding slots.
>>>
>>>        
>> No, the slot mapping changes dynamically so KVM certainly needs to
>> optimize this.
>>      
> Maybe, but it does not, KVM algorithms are n^2 or worse.
>    

But n is small and the mappings don't change frequently.

More importantly, they change at the exact same times for vhost as they 
do for kvm.  So even if vhost has an O(n) algorithm, the KVM code gets 
executed either immediately before or immediately after the vhost code 
so your optimizations are lost in KVM's O(n^2) algorithm.

>>> Mine has no bugs, let's switch to it!
>>>
>>> Seriously, need to tread very carefully here.
>>> This is why I say: merge it, then look at how to reuse code.
>>>
>>>        
>> Once it's merged, there's no incentive to look at reusing code.
>> Again, I don't think this is a huge burden to vhost.  The two bits of code
>> literally do exactly the same thing.  They just use different data
>> structures that ultimately contain the same values.
>>      
> Not exactly. For example kvm track ROM and video ram addresses.
>    

KVM treats ROM and RAM the same (it even maps ROM as RAM).  There is no 
special handling for video ram addresses.

There is some magic in the VGA code to switch the VGA LFB from mmio to 
ram when possible but that happens at a higher layer.

>> '++i' is an odd thing to do in C in a for() loop.  We're not explicit
>> about it in Coding Style but the vast majority of code just does
>> 'i++'.
>>      
> Ugh. Do we really need to specify every little thing?
>    

I don't care that much about coding style.  I don't care if there are 
curly brackets on single line ifs.

However, it's been made very clear to me that most other people do and 
that it's something that's important to enforce.

> Hmm. I'll look into it.
> I actually think that for functions that just do a list of things
> unconditionally, without branches or loops, or with just error handling
> as here, it is perfectly fine for them to be of any length.
>    

Like I said, just a suggestion.

>>>        
>>>> You never unmap() the mapped memory and you're cheating by assuming that
>>>> the virtio rings have a constant mapping for the life time of a guest.
>>>> That's not technically true.  My concern is that since a guest can
>>>> trigger remappings (by adjusting PCI mappings) badness can ensue.
>>>>
>>>>          
>>> I do not know how this can happen. What do PCI mappings have to do with this?
>>> Please explain. If it can, vhost will need notification to update.
>>>
>>>        
>> If a guest modifies the bar for an MMIO region such that it happens to
>> exist in RAM, while this is a bad thing for the guest to do, I don't
>> think we do anything to stop it.  When the region gets remapped, the
>> result will be that the mapping will change.
>>      
> So IMO this is the bug. If there's a BAR that matches RAM
> physical address, it should never get mapped. Any idea how
> to check this?
>    

We could check it when the BAR is mapped in the PCI layer.  I'm 
suspicious there are other ways a guest can enforce/determine mappings 
though.

Generally speaking, I think it's necessary to assume that a guest can 
manipulate memory mappings.  If we can prove that a guest cannot, it 
would definitely simplify the code a lot.  I'd love to make the same 
assumptions in virtio userspace before it's actually a big source of 
overhead.

I'm pretty sure though that we have to let a guest control mappings though.

>> Within qemu, because we carry the qemu_mutex, we know that the mappings
>> are fixed as long as we're in qemu.  We're very careful to assume that
>> we don't rely on a mapping past when we drop the qemu_mutex.
>>
>> With vhost, you register a slot table and update it whenever mappings
>> change.  I think that's good enough for dealing with ram addresses.  But
>> you pass the virtual address for the rings and assume those mappings
>> never change.
>>      
> So, the issue IMO is that an MMIO address gets passed instead of RAM.
> There's no reason to put virtio rings not in RAM, we just need to
> verify this.
>    

Yes, but we don't always map PCI IO regions as MMIO or PIO.  In 
particular, for VGA devices (particularly VMware VGA), we map certain IO 
regions as RAM because that's how the device is designed.  Likewise, if 
we do shared memory PCI devices using IO regions as the ram contents, we 
would be mapping those as ram too.

So just checking to see if the virtio ring area is RAM or not is not 
enough.  A guest may do something that causes a virtio ring to still be 
ram, but a different ram address.  Now the vhost code is writing to RAM 
that it thinks is physical address X but is really guest physical address Y.

This is not something that a guest can use to break into qemu, but it is 
an emulation bug and depending on the guest OS, it may be possible to 
use it to do a privilege escalation within the guest.

I think the only way to handle this is to explicitly check for changes 
in the physical addresses the rings are mapped at and do the appropriate 
ioctls to vhost to let it know if the ring's address has changed.

>> I'm pretty sure a guest can cause those to change and I'm not 100% sure,
>> but I think it's a potential source of exploits if you assume a mapping.
>> In the very least, a guest can trick vhost into writing to ram that it
>> wouldn't normally write to.
>>      
> This seems harmless. guest can write anywhere in ram, anyway.
>    

Not all guest code is created equal and if we're writing to the wrong 
guest ram location, it can potentially circumvent the guest's security 
architecture.

Regards,

Anthony Liguori
diff mbox

Patch

diff --git a/Makefile.target b/Makefile.target
index c1580e9..9b4fd84 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -174,6 +174,8 @@  obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o
 # need to fix this properly
 obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-pci.o virtio-serial-bus.o
 obj-y += notifier.o
+obj-y += vhost_net.o
+obj-$(CONFIG_VHOST_NET) += vhost.o
 obj-y += rwhandler.o
 obj-$(CONFIG_KVM) += kvm.o kvm-all.o
 obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
diff --git a/configure b/configure
index 8eb5f5b..5eccc7c 100755
--- a/configure
+++ b/configure
@@ -1498,6 +1498,23 @@  EOF
 fi
 
 ##########################################
+# test for vhost net
+
+if test "$kvm" != "no"; then
+	cat > $TMPC <<EOF
+#include <linux/vhost.h>
+int main(void) { return 0; }
+EOF
+	if compile_prog "$kvm_cflags" "" ; then
+	vhost_net=yes
+	else
+	vhost_net=no
+	fi
+else
+	vhost_net=no
+fi
+
+##########################################
 # pthread probe
 PTHREADLIBS_LIST="-lpthread -lpthreadGC2"
 
@@ -1968,6 +1985,7 @@  echo "fdt support       $fdt"
 echo "preadv support    $preadv"
 echo "fdatasync         $fdatasync"
 echo "uuid support      $uuid"
+echo "vhost-net support $vhost_net"
 
 if test $sdl_too_old = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -2492,6 +2510,9 @@  case "$target_arch2" in
       if test "$kvm_para" = "yes"; then
         echo "CONFIG_KVM_PARA=y" >> $config_target_mak
       fi
+      if test $vhost_net = "yes" ; then
+        echo "CONFIG_VHOST_NET=y" >> $config_target_mak
+      fi
     fi
 esac
 echo "TARGET_PHYS_ADDR_BITS=$target_phys_bits" >> $config_target_mak
diff --git a/hw/vhost.c b/hw/vhost.c
new file mode 100644
index 0000000..4d5ea02
--- /dev/null
+++ b/hw/vhost.c
@@ -0,0 +1,631 @@ 
+#include "linux/vhost.h"
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include "vhost.h"
+#include "hw/hw.h"
+/* For range_get_last */
+#include "pci.h"
+
+/* Fold dirty bits from the vhost kernel log into qemu's dirty bitmap
+ * for the intersection of [mfirst, mlast] (the range being synced)
+ * with [rfirst, rlast] (a RAM region or a used ring).  Each log chunk
+ * is fetched-and-cleared atomically so bits set concurrently by the
+ * kernel are not lost. */
+static void vhost_dev_sync_region(struct vhost_dev *dev,
+                                  uint64_t mfirst, uint64_t mlast,
+                                  uint64_t rfirst, uint64_t rlast)
+{
+    uint64_t start = MAX(mfirst, rfirst);
+    uint64_t end = MIN(mlast, rlast);
+    vhost_log_chunk_t *from;
+    vhost_log_chunk_t *to;
+    uint64_t addr;
+
+    /* Bail out on an empty intersection BEFORE deriving chunk indices:
+     * when the ranges do not overlap, end < start and the assertions
+     * below would be evaluated on meaningless values. */
+    if (end < start) {
+        return;
+    }
+    from = dev->log + start / VHOST_LOG_CHUNK;
+    to = dev->log + end / VHOST_LOG_CHUNK + 1;
+    addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
+
+    assert(end / VHOST_LOG_CHUNK < dev->log_size);
+    assert(start / VHOST_LOG_CHUNK < dev->log_size);
+    for (;from < to; ++from) {
+        vhost_log_chunk_t log;
+        int bit;
+        /* We first check with non-atomic: much cheaper,
+         * and we expect non-dirty to be the common case. */
+        if (!*from) {
+            continue;
+        }
+        /* Data must be read atomically. We don't really
+         * need the barrier semantics of __sync
+         * builtins, but it's easier to use them than
+         * roll our own. */
+        log = __sync_fetch_and_and(from, 0);
+        /* Each set bit marks one VHOST_LOG_PAGE-sized page dirty. */
+        while ((bit = sizeof(log) > sizeof(int) ?
+                ffsll(log) : ffs(log))) {
+            bit -= 1;
+            cpu_physical_memory_set_dirty(addr + bit * VHOST_LOG_PAGE);
+            log &= ~(0x1ull << bit);
+        }
+        addr += VHOST_LOG_CHUNK;
+    }
+}
+
+/* CPUPhysMemoryClient callback: sync the kernel's dirty log into
+ * qemu's dirty bitmap for [start_addr, end_addr].  Both guest RAM
+ * regions and each vring's used ring are covered, since the kernel
+ * logs used-ring writes against vq->used_phys. */
+static int vhost_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
+                                          target_phys_addr_t start_addr,
+                                          target_phys_addr_t end_addr)
+{
+    struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
+    int i;
+    /* There is no log to read unless the device is running with
+     * migration logging enabled. */
+    if (!dev->log_enabled || !dev->started) {
+        return 0;
+    }
+    for (i = 0; i < dev->mem->nregions; ++i) {
+        struct vhost_memory_region *reg = dev->mem->regions + i;
+        vhost_dev_sync_region(dev, start_addr, end_addr,
+                              reg->guest_phys_addr,
+                              range_get_last(reg->guest_phys_addr,
+                                             reg->memory_size));
+    }
+    for (i = 0; i < dev->nvqs; ++i) {
+        struct vhost_virtqueue *vq = dev->vqs + i;
+        unsigned size = offsetof(struct vring_used, ring) +
+            sizeof(struct vring_used_elem) * vq->num;
+        vhost_dev_sync_region(dev, start_addr, end_addr, vq->used_phys,
+                              range_get_last(vq->used_phys, size));
+    }
+    return 0;
+}
+
+/* Assign/unassign. Keep an unsorted array of non-overlapping
+ * memory regions in dev->mem.
+ *
+ * Remove [start_addr, start_addr + size) from the region table,
+ * compacting in place: 'from' scans the old entries, 'to' writes the
+ * surviving ones.  A removal that lands strictly inside one region
+ * splits it in two; the table must therefore have room for one spare
+ * entry (the caller reallocs with nregions + 1 slots). */
+static void vhost_dev_unassign_memory(struct vhost_dev *dev,
+                                      uint64_t start_addr,
+                                      uint64_t size)
+{
+    int from, to, n = dev->mem->nregions;
+    /* Track overlapping/split regions for sanity checking. */
+    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
+
+    for (from = 0, to = 0; from < n; ++from, ++to) {
+        struct vhost_memory_region *reg = dev->mem->regions + to;
+        uint64_t reglast;
+        uint64_t memlast;
+        uint64_t change;
+
+        /* clone old region */
+        if (to != from) {
+            memcpy(reg, dev->mem->regions + from, sizeof *reg);
+        }
+
+        /* No overlap is simple */
+        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
+                            start_addr, size)) {
+            continue;
+        }
+
+        /* Split only happens if supplied region
+         * is in the middle of an existing one. Thus it can not
+         * overlap with any other existing region. */
+        assert(!split);
+
+        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
+        memlast = range_get_last(start_addr, size);
+
+        /* Remove whole region */
+        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
+            --dev->mem->nregions;
+            --to;
+            assert(to >= 0);
+            ++overlap_middle;
+            continue;
+        }
+
+        /* Shrink region: removal covers the tail of this region. */
+        if (memlast >= reglast) {
+            reg->memory_size = start_addr - reg->guest_phys_addr;
+            assert(reg->memory_size);
+            assert(!overlap_end);
+            ++overlap_end;
+            continue;
+        }
+
+        /* Shift region: removal covers the head of this region. */
+        if (start_addr <= reg->guest_phys_addr) {
+            change = memlast + 1 - reg->guest_phys_addr;
+            reg->memory_size -= change;
+            reg->guest_phys_addr += change;
+            reg->userspace_addr += change;
+            assert(reg->memory_size);
+            assert(!overlap_start);
+            ++overlap_start;
+            continue;
+        }
+
+        /* This only happens if supplied region
+         * is in the middle of an existing one. Thus it can not
+         * overlap with any other existing region. */
+        assert(!overlap_start);
+        assert(!overlap_end);
+        assert(!overlap_middle);
+        /* Split region: shrink first part, shift second part. */
+        memcpy(dev->mem->regions + n, reg, sizeof *reg);
+        reg->memory_size = start_addr - reg->guest_phys_addr;
+        assert(reg->memory_size);
+        change = memlast + 1 - reg->guest_phys_addr;
+        /* The second half goes into the spare slot at index n. */
+        reg = dev->mem->regions + n;
+        reg->memory_size -= change;
+        assert(reg->memory_size);
+        reg->guest_phys_addr += change;
+        reg->userspace_addr += change;
+        /* Never add more than 1 region */
+        assert(dev->mem->nregions == n);
+        ++dev->mem->nregions;
+        ++split;
+    }
+}
+
+/* Called after unassign, so no regions overlap the given range.
+ *
+ * Insert [start_addr, start_addr + size) -> uaddr into the table,
+ * merging with any region that is adjacent in BOTH guest-physical and
+ * userspace address space (both conditions are required for the merged
+ * mapping to stay linear).  Compacts in place like unassign. */
+static void vhost_dev_assign_memory(struct vhost_dev *dev,
+                                    uint64_t start_addr,
+                                    uint64_t size,
+                                    uint64_t uaddr)
+{
+    int from, to;
+    struct vhost_memory_region *merged = NULL;
+    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
+        struct vhost_memory_region *reg = dev->mem->regions + to;
+        uint64_t prlast, urlast;
+        uint64_t pmlast, umlast;
+        uint64_t s, e, u;
+
+        /* clone old region */
+        if (to != from) {
+            memcpy(reg, dev->mem->regions + from, sizeof *reg);
+        }
+        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
+        pmlast = range_get_last(start_addr, size);
+        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
+        umlast = range_get_last(uaddr, size);
+
+        /* check for overlapping regions: should never happen. */
+        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
+        /* Not an adjacent or overlapping region - do not merge. */
+        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
+            (pmlast + 1 != reg->guest_phys_addr ||
+             umlast + 1 != reg->userspace_addr)) {
+            continue;
+        }
+
+        /* A second adjacent region collapses into the first merge
+         * result; drop the slot we just cloned into. */
+        if (merged) {
+            --to;
+            assert(to >= 0);
+        } else {
+            merged = reg;
+        }
+        /* Grow the merged region, and keep start_addr/size/uaddr in
+         * step so later iterations test against the merged extent. */
+        u = MIN(uaddr, reg->userspace_addr);
+        s = MIN(start_addr, reg->guest_phys_addr);
+        e = MAX(pmlast, prlast);
+        uaddr = merged->userspace_addr = u;
+        start_addr = merged->guest_phys_addr = s;
+        size = merged->memory_size = e - s + 1;
+        assert(merged->memory_size);
+    }
+
+    /* Nothing to merge with: append a fresh entry. */
+    if (!merged) {
+        struct vhost_memory_region *reg = dev->mem->regions + to;
+        memset(reg, 0, sizeof *reg);
+        reg->memory_size = size;
+        assert(reg->memory_size);
+        reg->guest_phys_addr = start_addr;
+        reg->userspace_addr = uaddr;
+        ++to;
+    }
+    assert(to <= dev->mem->nregions + 1);
+    dev->mem->nregions = to;
+}
+
+/* Return the number of vhost_log_chunk_t entries needed for the dirty
+ * log to cover the highest loggable address: the end of every RAM
+ * region plus the end of every used ring. */
+static uint64_t vhost_get_log_size(struct vhost_dev *dev)
+{
+    uint64_t log_size = 0;
+    int i;
+    for (i = 0; i < dev->mem->nregions; ++i) {
+        struct vhost_memory_region *reg = dev->mem->regions + i;
+        uint64_t last = range_get_last(reg->guest_phys_addr,
+                                       reg->memory_size);
+        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
+    }
+    for (i = 0; i < dev->nvqs; ++i) {
+        struct vhost_virtqueue *vq = dev->vqs + i;
+        /* Last byte of the used ring, which the kernel logs at
+         * vq->used_phys. */
+        uint64_t last = vq->used_phys +
+            offsetof(struct vring_used, ring) +
+            sizeof(struct vring_used_elem) * vq->num - 1;
+        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
+    }
+    return log_size;
+}
+
+/* Switch the kernel to a new (zeroed) dirty log of 'size' chunks,
+ * then drain and free the old one.  Order matters: the kernel flips
+ * to the new log in SET_LOG_BASE, after which the old buffer is
+ * stable and can be synced into qemu's bitmap and freed. */
+static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size)
+{
+    vhost_log_chunk_t *log;
+    uint64_t log_base;
+    int r;
+    if (size) {
+        log = qemu_mallocz(size * sizeof *log);
+    } else {
+        log = NULL;
+    }
+    log_base = (uint64_t)(unsigned long)log;
+    /* NOTE(review): failure here aborts qemu via assert; confirm that
+     * is acceptable on the migration path. */
+    r = ioctl(dev->control, VHOST_SET_LOG_BASE, &log_base);
+    assert(r >= 0);
+    /* Flush whatever the kernel logged into the old buffer. */
+    vhost_client_sync_dirty_bitmap(&dev->client, 0,
+                                   (target_phys_addr_t)~0x0ull);
+    if (dev->log) {
+        qemu_free(dev->log);
+    }
+    dev->log = log;
+    dev->log_size = size;
+}
+
+/* CPUPhysMemoryClient callback: guest memory layout changed.  Update
+ * the region table, and if the device is running push the new table
+ * to the kernel, growing the dirty log first if needed. */
+static void vhost_client_set_memory(CPUPhysMemoryClient *client,
+                                    target_phys_addr_t start_addr,
+                                    ram_addr_t size,
+                                    ram_addr_t phys_offset)
+{
+    struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
+    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
+    /* Room for one extra entry: unassign may split a region in two. */
+    int s = offsetof(struct vhost_memory, regions) +
+        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
+    uint64_t log_size;
+    int r;
+    dev->mem = qemu_realloc(dev->mem, s);
+
+    assert(size);
+
+    vhost_dev_unassign_memory(dev, start_addr, size);
+    if (flags == IO_MEM_RAM) {
+        /* Add given mapping, merging adjacent regions if any */
+        vhost_dev_assign_memory(dev, start_addr, size,
+                                (uintptr_t)qemu_get_ram_ptr(phys_offset));
+    } else {
+        /* Remove old mapping for this memory, if any. */
+        /* NOTE(review): the range was already unassigned above; this
+         * second call looks redundant — confirm and drop. */
+        vhost_dev_unassign_memory(dev, start_addr, size);
+    }
+
+    /* Table is pushed to the kernel on start; nothing more to do. */
+    if (!dev->started) {
+        return;
+    }
+    if (!dev->log_enabled) {
+        r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
+        assert(r >= 0);
+        return;
+    }
+    log_size = vhost_get_log_size(dev);
+    /* We allocate an extra 4K bytes to log,
+     * to reduce the * number of reallocations. */
+#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
+    /* To log more, must increase log size before table update. */
+    if (dev->log_size < log_size) {
+        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
+    }
+    r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
+    assert(r >= 0);
+    /* To log less, can only decrease log size after table update. */
+    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
+        vhost_dev_log_resize(dev, log_size);
+    }
+}
+
+/* Tell the kernel where ring 'idx' lives: userspace addresses of the
+ * three vring parts, the guest-physical address used for dirty
+ * logging of the used ring, and whether logging is on.
+ * Returns 0 or -errno. */
+static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
+                                    struct vhost_virtqueue *vq,
+                                    unsigned idx, bool enable_log)
+{
+    struct vhost_vring_addr addr = {
+        .index = idx,
+        .desc_user_addr = (u_int64_t)(unsigned long)vq->desc,
+        .avail_user_addr = (u_int64_t)(unsigned long)vq->avail,
+        .used_user_addr = (u_int64_t)(unsigned long)vq->used,
+        .log_guest_addr = vq->used_phys,
+        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
+    };
+    int r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr);
+    if (r < 0) {
+        return -errno;
+    }
+    return 0;
+}
+
+/* Push the guest-acked feature set to the kernel, optionally forcing
+ * VHOST_F_LOG_ALL on for migration dirty logging.
+ * Returns 0 or -errno. */
+static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
+{
+    uint64_t features = dev->acked_features;
+    int r;
+    if (enable_log) {
+        features |= 0x1 << VHOST_F_LOG_ALL;
+    }
+    r = ioctl(dev->control, VHOST_SET_FEATURES, &features);
+    return r < 0 ? -errno : 0;
+}
+
+/* Enable or disable dirty logging on the whole device: features
+ * first, then every ring's address flags.  On failure, roll every
+ * ring and the feature set back to the current dev->log_enabled
+ * state so the device is left consistent.  Returns 0 or -errno. */
+static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
+{
+    int r, t, i;
+    r = vhost_dev_set_features(dev, enable_log);
+    if (r < 0)
+        goto err_features;
+    for (i = 0; i < dev->nvqs; ++i) {
+        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
+                                     enable_log);
+        if (r < 0)
+            goto err_vq;
+    }
+    return 0;
+err_vq:
+    /* Restore previous log setting on rings 0..i (inclusive). */
+    for (; i >= 0; --i) {
+        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
+                                     dev->log_enabled);
+        assert(t >= 0);
+    }
+    t = vhost_dev_set_features(dev, dev->log_enabled);
+    assert(t >= 0);
+err_features:
+    return r;
+}
+
+/* CPUPhysMemoryClient callback: migration dirty logging toggled.
+ * If the device is not running, just remember the setting (start will
+ * apply it); otherwise allocate/free the log and reprogram the
+ * kernel.  Returns 0 or a negative error. */
+static int vhost_client_migration_log(struct CPUPhysMemoryClient *client,
+                                      int enable)
+{
+    struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
+    int r;
+    if (!!enable == dev->log_enabled) {
+        return 0;
+    }
+    if (!dev->started) {
+        dev->log_enabled = enable;
+        return 0;
+    }
+    if (!enable) {
+        /* Turn logging off in the kernel before dropping the buffer. */
+        r = vhost_dev_set_log(dev, false);
+        if (r < 0) {
+            return r;
+        }
+        if (dev->log) {
+            qemu_free(dev->log);
+        }
+        dev->log = NULL;
+        dev->log_size = 0;
+    } else {
+        /* Allocate and install the log before asking the kernel to
+         * start writing to it. */
+        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
+        r = vhost_dev_set_log(dev, true);
+        if (r < 0) {
+            return r;
+        }
+    }
+    dev->log_enabled = enable;
+    return 0;
+}
+
+/* Hand ring 'idx' over to the kernel: program size, base index and
+ * addresses, then wire the guest (irqfd) and host (ioeventfd)
+ * notifiers to the kernel's call/kick eventfds.
+ * Returns 0 or a negative error.
+ *
+ * NOTE(review): the error paths never cpu_physical_memory_unmap()
+ * the desc/avail/used mappings taken below — confirm and fix in a
+ * follow-up. */
+static int vhost_virtqueue_init(struct vhost_dev *dev,
+                                struct VirtIODevice *vdev,
+                                struct vhost_virtqueue *vq,
+                                unsigned idx)
+{
+    target_phys_addr_t s, l, a;
+    int r;
+    struct vhost_vring_file file = {
+        .index = idx,
+    };
+    struct vhost_vring_state state = {
+        .index = idx,
+    };
+    struct VirtQueue *q = virtio_queue(vdev, idx);
+
+    vq->num = state.num = virtio_queue_get_num(vdev, idx);
+    r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state);
+    if (r) {
+        return -errno;
+    }
+
+    /* Resume processing where the virtio device left off. */
+    state.num = virtio_queue_last_avail_idx(vdev, idx);
+    r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state);
+    if (r) {
+        return -errno;
+    }
+
+    /* Map the three vring parts; each map must cover the full
+     * requested length (l == s) to be usable. */
+    s = l = sizeof(struct vring_desc) * vq->num;
+    a = virtio_queue_get_desc(vdev, idx);
+    vq->desc = cpu_physical_memory_map(a, &l, 0);
+    if (!vq->desc || l != s) {
+        r = -ENOMEM;
+        goto fail_alloc;
+    }
+    s = l = offsetof(struct vring_avail, ring) +
+        sizeof(u_int64_t) * vq->num;
+    a = virtio_queue_get_avail(vdev, idx);
+    vq->avail = cpu_physical_memory_map(a, &l, 0);
+    if (!vq->avail || l != s) {
+        r = -ENOMEM;
+        goto fail_alloc;
+    }
+    s = l = offsetof(struct vring_used, ring) +
+        sizeof(struct vring_used_elem) * vq->num;
+    /* Remember the guest-physical used address for dirty logging. */
+    vq->used_phys = a = virtio_queue_get_used(vdev, idx);
+    vq->used = cpu_physical_memory_map(a, &l, 1);
+    if (!vq->used || l != s) {
+        r = -ENOMEM;
+        goto fail_alloc;
+    }
+
+    r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
+    if (r < 0) {
+        r = -errno;
+        goto fail_alloc;
+    }
+    if (!vdev->binding->guest_notifier || !vdev->binding->host_notifier) {
+        fprintf(stderr, "binding does not support irqfd/queuefd\n");
+        r = -ENOSYS;
+        goto fail_alloc;
+    }
+    r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, true);
+    if (r < 0) {
+        fprintf(stderr, "Error binding guest notifier: %d\n", -r);
+        goto fail_guest_notifier;
+    }
+
+    r = vdev->binding->host_notifier(vdev->binding_opaque, idx, true);
+    if (r < 0) {
+        fprintf(stderr, "Error binding host notifier: %d\n", -r);
+        goto fail_host_notifier;
+    }
+
+    /* Kernel polls this fd for guest kicks... */
+    file.fd = event_notifier_get_fd(virtio_queue_host_notifier(q));
+    r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
+    if (r) {
+        goto fail_kick;
+    }
+
+    /* ...and signals this one to interrupt the guest. */
+    file.fd = event_notifier_get_fd(virtio_queue_guest_notifier(q));
+    r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
+    if (r) {
+        goto fail_call;
+    }
+
+    return 0;
+
+fail_call:
+fail_kick:
+    vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
+fail_host_notifier:
+    vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
+fail_guest_notifier:
+fail_alloc:
+    return r;
+}
+
+/* Take ring 'idx' back from the kernel: detach the notifiers and
+ * read the ring base back so virtio can resume where vhost stopped. */
+static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
+                                    struct VirtIODevice *vdev,
+                                    struct vhost_virtqueue *vq,
+                                    unsigned idx)
+{
+    struct vhost_vring_state state = {
+        .index = idx,
+    };
+    int r;
+    r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
+    if (r < 0) {
+        fprintf(stderr, "vhost VQ %d guest cleanup failed: %d\n", idx, r);
+        fflush(stderr);
+    }
+    assert (r >= 0);
+
+    r = vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
+    if (r < 0) {
+        fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r);
+        fflush(stderr);
+    }
+    assert (r >= 0);
+    r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state);
+    if (r < 0) {
+        fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
+        fflush(stderr);
+    }
+    /* NOTE(review): state.num is consumed here even when the ioctl
+     * above failed (the assert only fires afterwards) — confirm the
+     * ordering is intended. */
+    virtio_queue_set_last_avail_idx(vdev, idx, state.num);
+    assert (r >= 0);
+}
+
+/* One-time device setup: obtain the vhost control fd (devfd >= 0
+ * means the caller supplies it, otherwise open /dev/vhost-net),
+ * claim ownership, read supported features and register as a memory
+ * client so we track the guest memory layout.
+ * Returns 0 or -errno.
+ *
+ * NOTE(review): on failure this closes hdev->control even when the
+ * fd was supplied by the caller via devfd — confirm ownership
+ * semantics. */
+int vhost_dev_init(struct vhost_dev *hdev, int devfd)
+{
+    uint64_t features;
+    int r;
+    if (devfd >= 0) {
+        hdev->control = devfd;
+    } else {
+        hdev->control = open("/dev/vhost-net", O_RDWR);
+        if (hdev->control < 0)
+            return -errno;
+    }
+    r = ioctl(hdev->control, VHOST_SET_OWNER, NULL);
+    if (r < 0)
+        goto fail;
+
+    r = ioctl(hdev->control, VHOST_GET_FEATURES, &features);
+    if (r < 0)
+        goto fail;
+    hdev->features = features;
+
+    hdev->client.set_memory = vhost_client_set_memory;
+    hdev->client.sync_dirty_bitmap = vhost_client_sync_dirty_bitmap;
+    hdev->client.migration_log = vhost_client_migration_log;
+    /* Empty region table; grown by vhost_client_set_memory(). */
+    hdev->mem = qemu_mallocz(offsetof(struct vhost_memory, regions));
+    hdev->log = NULL;
+    hdev->log_size = 0;
+    hdev->log_enabled = false;
+    hdev->started = false;
+    cpu_register_phys_memory_client(&hdev->client);
+    return 0;
+fail:
+    r = -errno;
+    close(hdev->control);
+    return r;
+}
+
+/* Release what vhost_dev_init() acquired.  The device should be
+ * stopped first.  NOTE(review): hdev->log is not freed here (stop and
+ * the migration-log path free it) — confirm there is no leak when
+ * cleanup runs with logging still active. */
+void vhost_dev_cleanup(struct vhost_dev *hdev)
+{
+    cpu_unregister_phys_memory_client(&hdev->client);
+    qemu_free(hdev->mem);
+    close(hdev->control);
+}
+
+/* Start the device: push features and the memory table, set up the
+ * dirty log if migration logging is already on, then hand every ring
+ * to the kernel.  On ring failure, rings initialized so far are torn
+ * down again.  Returns 0 or a negative error.
+ *
+ * NOTE(review): on the failure paths the log buffer allocated below
+ * is not freed — confirm / fix. */
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+    int i, r;
+
+    r = vhost_dev_set_features(hdev, hdev->log_enabled);
+    if (r < 0)
+        goto fail;
+    r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, hdev->mem);
+    if (r < 0) {
+        r = -errno;
+        goto fail;
+    }
+    if (hdev->log_enabled) {
+        hdev->log_size = vhost_get_log_size(hdev);
+        hdev->log = hdev->log_size ?
+            qemu_mallocz(hdev->log_size * sizeof *hdev->log) : NULL;
+        r = ioctl(hdev->control, VHOST_SET_LOG_BASE,
+                  (uint64_t)(unsigned long)hdev->log);
+        if (r < 0) {
+            r = -errno;
+            goto fail;
+        }
+    }
+
+    for (i = 0; i < hdev->nvqs; ++i) {
+        r = vhost_virtqueue_init(hdev,
+                                 vdev,
+                                 hdev->vqs + i,
+                                 i);
+        if (r < 0)
+            goto fail_vq;
+    }
+    hdev->started = true;
+
+    return 0;
+fail_vq:
+    /* Unwind the rings that were successfully initialized. */
+    while (--i >= 0) {
+        vhost_virtqueue_cleanup(hdev,
+                                vdev,
+                                hdev->vqs + i,
+                                i);
+    }
+fail:
+    return r;
+}
+
+/* Stop a started device: take every ring back from the kernel, drain
+ * the remaining dirty bits into qemu's bitmap, and drop the log. */
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+    int i;
+    for (i = 0; i < hdev->nvqs; ++i) {
+        vhost_virtqueue_cleanup(hdev,
+                                vdev,
+                                hdev->vqs + i,
+                                i);
+    }
+    /* Flush dirty bits the kernel logged before the rings stopped. */
+    vhost_client_sync_dirty_bitmap(&hdev->client, 0,
+                                   (target_phys_addr_t)~0x0ull);
+    hdev->started = false;
+    qemu_free(hdev->log);
+    /* Clear the stale pointer: vhost_client_migration_log() tests
+     * dev->log and would otherwise qemu_free() it a second time. */
+    hdev->log = NULL;
+    hdev->log_size = 0;
+}
diff --git a/hw/vhost.h b/hw/vhost.h
new file mode 100644
index 0000000..8f3e9ce
--- /dev/null
+++ b/hw/vhost.h
@@ -0,0 +1,44 @@ 
+#ifndef VHOST_H
+#define VHOST_H
+
+#include "hw/hw.h"
+#include "hw/virtio.h"
+
+/* Generic structures common for any vhost based device. */
+struct vhost_virtqueue {
+    int kick;                     /* eventfd the guest kicks */
+    int call;                     /* eventfd the kernel signals */
+    void *desc;                   /* mapped descriptor table */
+    void *avail;                  /* mapped avail ring */
+    void *used;                   /* mapped used ring */
+    int num;                      /* ring size (entries) */
+    unsigned long long used_phys; /* guest-physical used addr, for logging */
+};
+
+/* Dirty log layout: one bit per VHOST_LOG_PAGE, packed into
+ * word-sized chunks shared with the kernel. */
+typedef unsigned long vhost_log_chunk_t;
+#define VHOST_LOG_PAGE 0x1000
+#define VHOST_LOG_BITS (8 * sizeof(vhost_log_chunk_t))
+#define VHOST_LOG_CHUNK (VHOST_LOG_PAGE * VHOST_LOG_BITS)
+
+struct vhost_memory;
+/* Common state of one vhost device instance. */
+struct vhost_dev {
+    CPUPhysMemoryClient client;        /* memory-layout/dirty-log hooks */
+    int control;                       /* /dev/vhost-* control fd */
+    struct vhost_memory *mem;          /* region table pushed to the kernel */
+    struct vhost_virtqueue *vqs;       /* rings owned by this device */
+    int nvqs;
+    unsigned long long features;       /* supported by the kernel */
+    unsigned long long acked_features; /* acked by the guest */
+    unsigned long long backend_features; /* required by the backend */
+    bool started;
+    bool log_enabled;                  /* migration dirty logging on */
+    vhost_log_chunk_t *log;            /* dirty log shared with the kernel */
+    unsigned long long log_size;       /* log length, in chunks */
+};
+
+int vhost_dev_init(struct vhost_dev *hdev, int devfd);
+void vhost_dev_cleanup(struct vhost_dev *hdev);
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
+
+#endif
diff --git a/hw/vhost_net.c b/hw/vhost_net.c
new file mode 100644
index 0000000..06b7648
--- /dev/null
+++ b/hw/vhost_net.c
@@ -0,0 +1,177 @@ 
+#include "net.h"
+#include "net/tap.h"
+
+#include "virtio-net.h"
+#include "vhost_net.h"
+
+#include "config.h"
+
+#ifdef CONFIG_VHOST_NET
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <linux/kvm.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/virtio_ring.h>
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+
+#include <stdio.h>
+
+#include "vhost.h"
+
+struct vhost_net {
+    struct vhost_dev dev;
+    struct vhost_virtqueue vqs[2];
+    int backend;
+    VLANClientState *vc;
+};
+
+/* Mask out guest-visible feature bits the vhost kernel backend does
+ * not support, so we never offer a feature vhost cannot handle. */
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
+{
+    /* Clear features not supported by host kernel. */
+    if (!(net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)))
+        features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY);
+    if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC)))
+        features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
+    if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
+        features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
+    return features;
+}
+
+/* Record the feature bits the guest acked, on top of whatever the
+ * backend itself requires.  NOTE(review): VIRTIO_NET_F_MRG_RXBUF is
+ * filtered in vhost_net_get_features() but never acked here —
+ * confirm that is intentional. */
+void vhost_net_ack_features(struct vhost_net *net, unsigned features)
+{
+    net->dev.acked_features = net->dev.backend_features;
+    if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))
+        net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY);
+    if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC))
+        net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
+}
+
+/* Return the backend's fd for VHOST_NET_SET_BACKEND, or -EBADFD if
+ * the backend is not a tap device (the only supported type here). */
+static int vhost_net_get_fd(VLANClientState *backend)
+{
+    switch (backend->info->type) {
+    case NET_CLIENT_TYPE_TAP:
+        return tap_get_fd(backend);
+    default:
+        fprintf(stderr, "vhost-net requires tap backend\n");
+        return -EBADFD;
+    }
+}
+
+/* Create a vhost-net instance bound to a tap backend.  devfd < 0
+ * opens /dev/vhost-net internally.  Returns NULL on failure. */
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+{
+    int r;
+    struct vhost_net *net = qemu_malloc(sizeof *net);
+    if (!backend) {
+        fprintf(stderr, "vhost-net requires backend to be setup\n");
+        goto fail;
+    }
+    r = vhost_net_get_fd(backend);
+    if (r < 0)
+        goto fail;
+    net->vc = backend;
+    /* If tap can't supply a virtio-net header, vhost must prepend one
+     * itself: that is a mandatory backend feature. */
+    net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 :
+        (1 << VHOST_NET_F_VIRTIO_NET_HDR);
+    net->backend = r;
+
+    r = vhost_dev_init(&net->dev, devfd);
+    if (r < 0)
+        goto fail;
+    /* Any backend-required feature the kernel lacks is fatal. */
+    if (~net->dev.features & net->dev.backend_features) {
+        fprintf(stderr, "vhost lacks feature mask %llu for backend\n",
+                ~net->dev.features & net->dev.backend_features);
+        vhost_dev_cleanup(&net->dev);
+        goto fail;
+    }
+
+    /* Set sane init value. Override when guest acks. */
+    vhost_net_ack_features(net, 0);
+    return net;
+fail:
+    qemu_free(net);
+    return NULL;
+}
+
+/* Start vhost processing for the given virtio device: start the
+ * generic vhost device, stop qemu's own polling of the tap fd, and
+ * attach the tap fd to both rings (rx and tx) in the kernel.
+ * On failure everything is rolled back.  Returns 0 or a negative
+ * error. */
+int vhost_net_start(struct vhost_net *net,
+                    VirtIODevice *dev)
+{
+    struct vhost_vring_file file = { };
+    int r;
+
+    net->dev.nvqs = 2;
+    net->dev.vqs = net->vqs;
+    r = vhost_dev_start(&net->dev, dev);
+    if (r < 0)
+        return r;
+
+    /* The kernel now owns the tap fd; stop qemu-side polling. */
+    net->vc->info->poll(net->vc, false);
+    qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
+    file.fd = net->backend;
+    for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
+        r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
+        if (r < 0) {
+            r = -errno;
+            goto fail;
+        }
+    }
+    return 0;
+fail:
+    /* Detach rings already attached (fd = -1), resume qemu polling,
+     * and stop the device again.  The inner r intentionally shadows
+     * the saved error code. */
+    file.fd = -1;
+    while (--file.index >= 0) {
+        int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
+        assert(r >= 0);
+    }
+    net->vc->info->poll(net->vc, true);
+    vhost_dev_stop(&net->dev, dev);
+    return r;
+}
+
+/* Stop vhost processing: detach the tap fd from both rings, give the
+ * fd back to qemu's poll loop, and stop the generic vhost device. */
+void vhost_net_stop(struct vhost_net *net,
+                    VirtIODevice *dev)
+{
+    struct vhost_vring_file file = { .fd = -1 };
+
+    for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
+        int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
+        assert(r >= 0);
+    }
+    net->vc->info->poll(net->vc, true);
+    vhost_dev_stop(&net->dev, dev);
+}
+
+/* Destroy the instance; must not be running. */
+void vhost_net_cleanup(struct vhost_net *net)
+{
+    vhost_dev_cleanup(&net->dev);
+    qemu_free(net);
+}
+#else
+/* Stubs used when built without CONFIG_VHOST_NET: vhost is reported
+ * unavailable and feature bits pass through untouched.  (Indentation
+ * normalized to the file's 4-space style.) */
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+{
+    return NULL;
+}
+
+int vhost_net_start(struct vhost_net *net,
+                    VirtIODevice *dev)
+{
+    return -ENOSYS;
+}
+void vhost_net_stop(struct vhost_net *net,
+                    VirtIODevice *dev)
+{
+}
+
+void vhost_net_cleanup(struct vhost_net *net)
+{
+}
+
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
+{
+    return features;
+}
+void vhost_net_ack_features(struct vhost_net *net, unsigned features)
+{
+}
+#endif
diff --git a/hw/vhost_net.h b/hw/vhost_net.h
new file mode 100644
index 0000000..2a10210
--- /dev/null
+++ b/hw/vhost_net.h
@@ -0,0 +1,20 @@ 
+#ifndef VHOST_NET_H
+#define VHOST_NET_H
+
+#include "net.h"
+
+struct vhost_net;
+
+/* Create a vhost-net device for a tap backend.  devfd < 0 opens
+ * /dev/vhost-net internally; otherwise devfd is used as the vhost
+ * control fd.  Returns NULL on failure (always NULL without
+ * CONFIG_VHOST_NET). */
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd);
+
+/* Start/stop kernel-side packet processing for a virtio-net device. */
+int vhost_net_start(struct vhost_net *net,
+                    VirtIODevice *dev);
+void vhost_net_stop(struct vhost_net *net,
+                    VirtIODevice *dev);
+
+/* Destroy the instance; must be stopped first. */
+void vhost_net_cleanup(struct vhost_net *net);
+
+/* Feature negotiation helpers (mask unsupported bits / record acks). */
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features);
+void vhost_net_ack_features(struct vhost_net *net, unsigned features);
+
+#endif