Patchwork [PATCH-RFC,13/13] virtio-net: connect to vhost net backend

login
register
mail settings
Submitter Michael S. Tsirkin
Date Jan. 11, 2010, 5:23 p.m.
Message ID <20100111172326.GB12084@redhat.com>
Download mbox | patch
Permalink /patch/42645/
State New
Headers show

Comments

Michael S. Tsirkin - Jan. 11, 2010, 5:23 p.m.
start/stop backend on driver start/stop

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
 1 files changed, 40 insertions(+), 0 deletions(-)
Anthony Liguori - Jan. 25, 2010, 7:58 p.m.
On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
> start/stop backend on driver start/stop
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> ---
>   hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>   1 files changed, 40 insertions(+), 0 deletions(-)
>
> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
> index c2a389f..99169e1 100644
> --- a/hw/virtio-net.c
> +++ b/hw/virtio-net.c
> @@ -17,6 +17,7 @@
>   #include "net/tap.h"
>   #include "qemu-timer.h"
>   #include "virtio-net.h"
> +#include "vhost_net.h"
>
>   #define VIRTIO_NET_VM_VERSION    11
>
> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>       uint8_t nomulti;
>       uint8_t nouni;
>       uint8_t nobcast;
> +    uint8_t vhost_started;
>       struct {
>           int in_use;
>           int first_multi;
> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>       n->nomulti = 0;
>       n->nouni = 0;
>       n->nobcast = 0;
> +    if (n->vhost_started) {
> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
> +        n->vhost_started = 0;
> +    }
>
>       /* Flush any MAC and VLAN filter table state */
>       n->mac_table.in_use = 0;
> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>       .link_status_changed = virtio_net_set_link_status,
>   };
>
> +static void virtio_net_set_status(struct VirtIODevice *vdev)
> +{
> +    VirtIONet *n = to_virtio_net(vdev);
> +    if (!n->nic->nc.peer) {
> +        return;
> +    }
> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
> +        return;
> +    }
> +
> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
> +        return;
> +    }
> +    if (!!n->vhost_started == !!(vdev->status&  VIRTIO_CONFIG_S_DRIVER_OK)) {
> +        return;
> +    }
> +    if (vdev->status&  VIRTIO_CONFIG_S_DRIVER_OK) {
> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
> +        if (r<  0) {
> +            fprintf(stderr, "unable to start vhost net: "
> +                    "falling back on userspace virtio\n");
> +        } else {
> +            n->vhost_started = 1;
> +        }
> +    } else {
> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
> +        n->vhost_started = 0;
> +    }
> +}
> +
>    

This function does a number of bad things.  It makes virtio-net have 
specific knowledge of backends (like tap) and then has virtio-net pass 
device specific state (vdev) to a network backend.

Ultimately, the following things need to happen:

1) when a driver is ready to begin operating:
   a) virtio-net needs to tell vhost the location of the ring in 
physical memory
   b) virtio-net needs to tell vhost about any notification it receives 
(allowing kvm to shortcut userspace)
   c) virtio-net needs to tell vhost about which irq is being used 
(allowing kvm to shortcut userspace)

What this suggests is that we need an API for the network backends to do 
the following:

  1) probe whether ring passthrough is supported
  2) set the *virtual* address of the ring elements
  3) hand the backend some sort of notification object for sending and 
receiving notifications
  4) stop ring passthrough

vhost should not need any direct knowledge of the device.  This 
interface should be enough to communicating the required data.  I think 
the only bit that is a little fuzzy and perhaps non-obvious for the 
current patches is the notification object.  I would expect it look 
something like:

typedef struct IOEvent {
   int type;
   void (*notify)(IOEvent *);
   void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
} IOEvent;

And then we would have

typedef struct KVMIOEvent {
   IOEvent event = {.type = KVM_IO_EVENT};
   int fd;
} KVMIOEvent;

if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the 
PIO notification and hand that to vhost.  vhost would check event.type 
and if it's KVM_IOEVENT, downcast and use that to get at the file 
descriptor.

The KVMIOEvent should be created, not in the set status callback, but in 
the PCI map callback.  And in the PCI map callback, cpu_single_env 
should be passed to a kvm specific function to create this KVMIOEvent 
object that contains the created eventfd() that's handed to kvm via ioctl.

It doesn't have to be exactly like this, but the point of all of this is 
that these KVM specific mechanism (which are really implementation 
details) should not be pervasive throughout the QEMU interfaces.  There 
should also be strong separation between the vhost-net code and the 
virtio-net device.

Regards,

Anthony Liguori


>   VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>   {
>       VirtIONet *n;
> @@ -835,6 +871,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>       n->vdev.set_features = virtio_net_set_features;
>       n->vdev.bad_features = virtio_net_bad_features;
>       n->vdev.reset = virtio_net_reset;
> +    n->vdev.set_status = virtio_net_set_status;
>       n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
>       n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx);
>       n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
> @@ -864,6 +901,9 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>   void virtio_net_exit(VirtIODevice *vdev)
>   {
>       VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev);
> +    if (n->vhost_started) {
> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
> +    }
>
>       qemu_purge_queued_packets(&n->nic->nc);
>
>
Michael S. Tsirkin - Jan. 25, 2010, 8:27 p.m.
On Mon, Jan 25, 2010 at 01:58:08PM -0600, Anthony Liguori wrote:
> On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
>> start/stop backend on driver start/stop
>>
>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>> ---
>>   hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>>   1 files changed, 40 insertions(+), 0 deletions(-)
>>
>> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
>> index c2a389f..99169e1 100644
>> --- a/hw/virtio-net.c
>> +++ b/hw/virtio-net.c
>> @@ -17,6 +17,7 @@
>>   #include "net/tap.h"
>>   #include "qemu-timer.h"
>>   #include "virtio-net.h"
>> +#include "vhost_net.h"
>>
>>   #define VIRTIO_NET_VM_VERSION    11
>>
>> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>>       uint8_t nomulti;
>>       uint8_t nouni;
>>       uint8_t nobcast;
>> +    uint8_t vhost_started;
>>       struct {
>>           int in_use;
>>           int first_multi;
>> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>>       n->nomulti = 0;
>>       n->nouni = 0;
>>       n->nobcast = 0;
>> +    if (n->vhost_started) {
>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>> +        n->vhost_started = 0;
>> +    }
>>
>>       /* Flush any MAC and VLAN filter table state */
>>       n->mac_table.in_use = 0;
>> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>>       .link_status_changed = virtio_net_set_link_status,
>>   };
>>
>> +static void virtio_net_set_status(struct VirtIODevice *vdev)
>> +{
>> +    VirtIONet *n = to_virtio_net(vdev);
>> +    if (!n->nic->nc.peer) {
>> +        return;
>> +    }
>> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
>> +        return;
>> +    }
>> +
>> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
>> +        return;
>> +    }
>> +    if (!!n->vhost_started == !!(vdev->status&  VIRTIO_CONFIG_S_DRIVER_OK)) {
>> +        return;
>> +    }
>> +    if (vdev->status&  VIRTIO_CONFIG_S_DRIVER_OK) {
>> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
>> +        if (r<  0) {
>> +            fprintf(stderr, "unable to start vhost net: "
>> +                    "falling back on userspace virtio\n");
>> +        } else {
>> +            n->vhost_started = 1;
>> +        }
>> +    } else {
>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>> +        n->vhost_started = 0;
>> +    }
>> +}
>> +
>>    
>
> This function does a number of bad things.  It makes virtio-net have  
> specific knowledge of backends (like tap) and then has virtio-net pass  
> device specific state (vdev) to a network backend.
>
> Ultimately, the following things need to happen:
>
> 1) when a driver is ready to begin operating:
>   a) virtio-net needs to tell vhost the location of the ring in physical 
> memory
>   b) virtio-net needs to tell vhost about any notification it receives  
> (allowing kvm to shortcut userspace)
>   c) virtio-net needs to tell vhost about which irq is being used  
> (allowing kvm to shortcut userspace)
>
> What this suggests is that we need an API for the network backends to do  
> the following:
>
>  1) probe whether ring passthrough is supported
>  2) set the *virtual* address of the ring elements
>  3) hand the backend some sort of notification object for sending and  
> receiving notifications
>  4) stop ring passthrough

Look at how vnet_hdr is setup: frontend probes backend, and has 'if
(backend has vnet header) blabla' vhost is really very similar, so I
would like to do it in the same way.

Generally I do not believe in abstractions that only have one
implementation behind it: you only know how to abstract interface after you
have more than one implementation.  So whoever writes another frontend
that can use vhost will be in a better position to add infrastructure to
abstract both that new thing and virtio.

> vhost should not need any direct knowledge of the device.  This  
> interface should be enough to communicating the required data.  I think  
> the only bit that is a little fuzzy and perhaps non-obvious for the  
> current patches is the notification object.  I would expect it look  
> something like:
>
> typedef struct IOEvent {
>   int type;
>   void (*notify)(IOEvent *);
>   void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
> } IOEvent;
> And then we would have
>
> typedef struct KVMIOEvent {
>   IOEvent event = {.type = KVM_IO_EVENT};
>   int fd;
> } KVMIOEvent;
>
> if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the  
> PIO notification and hand that to vhost.  vhost would check event.type  
> and if it's KVM_IOEVENT, downcast and use that to get at the file  
> descriptor.

Since we don't have any other IOEvents, I just put the fd
in the generic structure directly. If other types surface
we'll see how to generalize it.

> The KVMIOEvent should be created, not in the set status callback, but in  
> the PCI map callback.  And in the PCI map callback, cpu_single_env  
> should be passed to a kvm specific function to create this KVMIOEvent  
> object that contains the created eventfd() that's handed to kvm via 
> ioctl.

So this specific thing does not work very well because with irqchip, we
want to bypass notification and send irq directly from kernel.
So I created a structure but it does not have callbacks,
only the fd.

> It doesn't have to be exactly like this, but the point of all of this is  
> that these KVM specific mechanism (which are really implementation  
> details) should not be pervasive throughout the QEMU interfaces.


>  There  
> should also be strong separation between the vhost-net code and the  
> virtio-net device.
>
> Regards,
>
> Anthony Liguori
>

I don't see the point of this last idea.  vhost is virtio accelerator,
not a generic network backend.  Whoever wants to use vhost for
non-virtio frontends will have to add infrastructure for this
separation, but I do not believe it's practical or desirable.



>
>>   VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>>   {
>>       VirtIONet *n;
>> @@ -835,6 +871,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>>       n->vdev.set_features = virtio_net_set_features;
>>       n->vdev.bad_features = virtio_net_bad_features;
>>       n->vdev.reset = virtio_net_reset;
>> +    n->vdev.set_status = virtio_net_set_status;
>>       n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
>>       n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx);
>>       n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
>> @@ -864,6 +901,9 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>>   void virtio_net_exit(VirtIODevice *vdev)
>>   {
>>       VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev);
>> +    if (n->vhost_started) {
>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>> +    }
>>
>>       qemu_purge_queued_packets(&n->nic->nc);
>>
>>
Anthony Liguori - Jan. 25, 2010, 9 p.m.
On 01/25/2010 02:27 PM, Michael S. Tsirkin wrote:
> On Mon, Jan 25, 2010 at 01:58:08PM -0600, Anthony Liguori wrote:
>    
>> On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
>>      
>>> start/stop backend on driver start/stop
>>>
>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>> ---
>>>    hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>>>    1 files changed, 40 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
>>> index c2a389f..99169e1 100644
>>> --- a/hw/virtio-net.c
>>> +++ b/hw/virtio-net.c
>>> @@ -17,6 +17,7 @@
>>>    #include "net/tap.h"
>>>    #include "qemu-timer.h"
>>>    #include "virtio-net.h"
>>> +#include "vhost_net.h"
>>>
>>>    #define VIRTIO_NET_VM_VERSION    11
>>>
>>> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>>>        uint8_t nomulti;
>>>        uint8_t nouni;
>>>        uint8_t nobcast;
>>> +    uint8_t vhost_started;
>>>        struct {
>>>            int in_use;
>>>            int first_multi;
>>> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>>>        n->nomulti = 0;
>>>        n->nouni = 0;
>>>        n->nobcast = 0;
>>> +    if (n->vhost_started) {
>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>> +        n->vhost_started = 0;
>>> +    }
>>>
>>>        /* Flush any MAC and VLAN filter table state */
>>>        n->mac_table.in_use = 0;
>>> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>>>        .link_status_changed = virtio_net_set_link_status,
>>>    };
>>>
>>> +static void virtio_net_set_status(struct VirtIODevice *vdev)
>>> +{
>>> +    VirtIONet *n = to_virtio_net(vdev);
>>> +    if (!n->nic->nc.peer) {
>>> +        return;
>>> +    }
>>> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
>>> +        return;
>>> +    }
>>> +
>>> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
>>> +        return;
>>> +    }
>>> +    if (!!n->vhost_started == !!(vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK)) {
>>> +        return;
>>> +    }
>>> +    if (vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK) {
>>> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>> +        if (r<   0) {
>>> +            fprintf(stderr, "unable to start vhost net: "
>>> +                    "falling back on userspace virtio\n");
>>> +        } else {
>>> +            n->vhost_started = 1;
>>> +        }
>>> +    } else {
>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>> +        n->vhost_started = 0;
>>> +    }
>>> +}
>>> +
>>>
>>>        
>> This function does a number of bad things.  It makes virtio-net have
>> specific knowledge of backends (like tap) and then has virtio-net pass
>> device specific state (vdev) to a network backend.
>>
>> Ultimately, the following things need to happen:
>>
>> 1) when a driver is ready to begin operating:
>>    a) virtio-net needs to tell vhost the location of the ring in physical
>> memory
>>    b) virtio-net needs to tell vhost about any notification it receives
>> (allowing kvm to shortcut userspace)
>>    c) virtio-net needs to tell vhost about which irq is being used
>> (allowing kvm to shortcut userspace)
>>
>> What this suggests is that we need an API for the network backends to do
>> the following:
>>
>>   1) probe whether ring passthrough is supported
>>   2) set the *virtual* address of the ring elements
>>   3) hand the backend some sort of notification object for sending and
>> receiving notifications
>>   4) stop ring passthrough
>>      
> Look at how vnet_hdr is setup: frontend probes backend, and has 'if
> (backend has vnet header) blabla' vhost is really very similar, so I
> would like to do it in the same way.
>    

vnet_hdr is IMHO a really bad example to copy from.

vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this 
means is that if you want to migrate from -net tap -net nic,model=virtio 
to -net user -net nic,model=virtio, it will fail.

This is a hard problem to solve in qemu though because it would require 
that we implement software GSO which so far, no one has stepped up to do.

We don't have to repeat this with vhost-net though because unlike 
vnet_hdr, we don't have to expose anything to the guest.

> Generally I do not believe in abstractions that only have one
> implementation behind it: you only know how to abstract interface after you
> have more than one implementation.  So whoever writes another frontend
> that can use vhost will be in a better position to add infrastructure to
> abstract both that new thing and virtio.
>    

I agree with you, but at the same time, I also believe that layering 
violations should be avoided.  I'm not suggesting that you need to make 
anything other than the vhost-net + virtio-net case work.  Just make the 
interfaces abstract enough that you don't need to hand a vdev to 
vhost-net and such that you don't have to pass kvm specific data 
structure (ioeventfd) in what are supposed to be generic interfaces.

>> vhost should not need any direct knowledge of the device.  This
>> interface should be enough to communicating the required data.  I think
>> the only bit that is a little fuzzy and perhaps non-obvious for the
>> current patches is the notification object.  I would expect it look
>> something like:
>>
>> typedef struct IOEvent {
>>    int type;
>>    void (*notify)(IOEvent *);
>>    void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
>> } IOEvent;
>> And then we would have
>>
>> typedef struct KVMIOEvent {
>>    IOEvent event = {.type = KVM_IO_EVENT};
>>    int fd;
>> } KVMIOEvent;
>>
>> if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the
>> PIO notification and hand that to vhost.  vhost would check event.type
>> and if it's KVM_IOEVENT, downcast and use that to get at the file
>> descriptor.
>>      
> Since we don't have any other IOEvents, I just put the fd
> in the generic structure directly. If other types surface
> we'll see how to generalize it.
>    

I'd feel a whole lot better if we didn't pass the fd around and instead 
passed around a structure.  With just a tiny bit of layering, we can 
even avoid making that structure KVM specific :-)

>> The KVMIOEvent should be created, not in the set status callback, but in
>> the PCI map callback.  And in the PCI map callback, cpu_single_env
>> should be passed to a kvm specific function to create this KVMIOEvent
>> object that contains the created eventfd() that's handed to kvm via
>> ioctl.
>>      
> So this specific thing does not work very well because with irqchip, we
> want to bypass notification and send irq directly from kernel.
> So I created a structure but it does not have callbacks,
> only the fd.
>    

Your structure is an int, right?  I understand the limits due to lack of 
in-kernel irqchip but I still think passing around an fd is a mistake.

>>   There
>> should also be strong separation between the vhost-net code and the
>> virtio-net device.
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>      
> I don't see the point of this last idea.  vhost is virtio accelerator,
> not a generic network backend.  Whoever wants to use vhost for
> non-virtio frontends will have to add infrastructure for this
> separation, but I do not believe it's practical or desirable.
>    

vhost is a userspace interface to inject packets into a network device 
just like a raw socket or a tun/tap device is.  It happens to have some 
interesting features like it supports remapping of physical addresses 
and it implements interfaces for notifications that can conveniently be 
used by KVM to bypass userspace in the fast paths.

We should think of it this way for the same reason that vhost-net 
doesn't live in kvm.ko.  If it's easy to isolate something and make it 
more generic, it's almost always the right thing to do.  In this case, 
isolating vhost-net from virtio-net in qemu just involves passing some 
information in a function call verses passing a non-public data 
structure that is then accessed directly.  Regardless of whether you 
agree with my view of vhost-net, the argument alone to avoid making a 
non-public structure public should be enough of an argument.

Regards,

Anthony Liguori
Michael S. Tsirkin - Jan. 25, 2010, 9:01 p.m.
On Mon, Jan 25, 2010 at 03:00:16PM -0600, Anthony Liguori wrote:
> On 01/25/2010 02:27 PM, Michael S. Tsirkin wrote:
>> On Mon, Jan 25, 2010 at 01:58:08PM -0600, Anthony Liguori wrote:
>>    
>>> On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
>>>      
>>>> start/stop backend on driver start/stop
>>>>
>>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>>> ---
>>>>    hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>>>>    1 files changed, 40 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
>>>> index c2a389f..99169e1 100644
>>>> --- a/hw/virtio-net.c
>>>> +++ b/hw/virtio-net.c
>>>> @@ -17,6 +17,7 @@
>>>>    #include "net/tap.h"
>>>>    #include "qemu-timer.h"
>>>>    #include "virtio-net.h"
>>>> +#include "vhost_net.h"
>>>>
>>>>    #define VIRTIO_NET_VM_VERSION    11
>>>>
>>>> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>>>>        uint8_t nomulti;
>>>>        uint8_t nouni;
>>>>        uint8_t nobcast;
>>>> +    uint8_t vhost_started;
>>>>        struct {
>>>>            int in_use;
>>>>            int first_multi;
>>>> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>>>>        n->nomulti = 0;
>>>>        n->nouni = 0;
>>>>        n->nobcast = 0;
>>>> +    if (n->vhost_started) {
>>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        n->vhost_started = 0;
>>>> +    }
>>>>
>>>>        /* Flush any MAC and VLAN filter table state */
>>>>        n->mac_table.in_use = 0;
>>>> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>>>>        .link_status_changed = virtio_net_set_link_status,
>>>>    };
>>>>
>>>> +static void virtio_net_set_status(struct VirtIODevice *vdev)
>>>> +{
>>>> +    VirtIONet *n = to_virtio_net(vdev);
>>>> +    if (!n->nic->nc.peer) {
>>>> +        return;
>>>> +    }
>>>> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
>>>> +        return;
>>>> +    }
>>>> +    if (!!n->vhost_started == !!(vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK)) {
>>>> +        return;
>>>> +    }
>>>> +    if (vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK) {
>>>> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        if (r<   0) {
>>>> +            fprintf(stderr, "unable to start vhost net: "
>>>> +                    "falling back on userspace virtio\n");
>>>> +        } else {
>>>> +            n->vhost_started = 1;
>>>> +        }
>>>> +    } else {
>>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        n->vhost_started = 0;
>>>> +    }
>>>> +}
>>>> +
>>>>
>>>>        
>>> This function does a number of bad things.  It makes virtio-net have
>>> specific knowledge of backends (like tap) and then has virtio-net pass
>>> device specific state (vdev) to a network backend.
>>>
>>> Ultimately, the following things need to happen:
>>>
>>> 1) when a driver is ready to begin operating:
>>>    a) virtio-net needs to tell vhost the location of the ring in physical
>>> memory
>>>    b) virtio-net needs to tell vhost about any notification it receives
>>> (allowing kvm to shortcut userspace)
>>>    c) virtio-net needs to tell vhost about which irq is being used
>>> (allowing kvm to shortcut userspace)
>>>
>>> What this suggests is that we need an API for the network backends to do
>>> the following:
>>>
>>>   1) probe whether ring passthrough is supported
>>>   2) set the *virtual* address of the ring elements
>>>   3) hand the backend some sort of notification object for sending and
>>> receiving notifications
>>>   4) stop ring passthrough
>>>      
>> Look at how vnet_hdr is setup: frontend probes backend, and has 'if
>> (backend has vnet header) blabla' vhost is really very similar, so I
>> would like to do it in the same way.
>>    
>
> vnet_hdr is IMHO a really bad example to copy from.
>
> vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this  
> means is that if you want to migrate from -net tap -net nic,model=virtio  
> to -net user -net nic,model=virtio, it will fail.
>
> This is a hard problem to solve in qemu though because it would require  
> that we implement software GSO which so far, no one has stepped up to do.
>
> We don't have to repeat this with vhost-net though because unlike  
> vnet_hdr, we don't have to expose anything to the guest.

We do, capabilities depend on what kernel supports.

>> Generally I do not believe in abstractions that only have one
>> implementation behind it: you only know how to abstract interface after you
>> have more than one implementation.  So whoever writes another frontend
>> that can use vhost will be in a better position to add infrastructure to
>> abstract both that new thing and virtio.
>>    
>
> I agree with you, but at the same time, I also believe that layering  
> violations should be avoided.  I'm not suggesting that you need to make  
> anything other than the vhost-net + virtio-net case work.  Just make the  
> interfaces abstract enough that you don't need to hand a vdev to  
> vhost-net and such that you don't have to pass kvm specific data  
> structure (ioeventfd) in what are supposed to be generic interfaces.
>
>>> vhost should not need any direct knowledge of the device.  This
>>> interface should be enough to communicating the required data.  I think
>>> the only bit that is a little fuzzy and perhaps non-obvious for the
>>> current patches is the notification object.  I would expect it look
>>> something like:
>>>
>>> typedef struct IOEvent {
>>>    int type;
>>>    void (*notify)(IOEvent *);
>>>    void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
>>> } IOEvent;
>>> And then we would have
>>>
>>> typedef struct KVMIOEvent {
>>>    IOEvent event = {.type = KVM_IO_EVENT};
>>>    int fd;
>>> } KVMIOEvent;
>>>
>>> if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the
>>> PIO notification and hand that to vhost.  vhost would check event.type
>>> and if it's KVM_IOEVENT, downcast and use that to get at the file
>>> descriptor.
>>>      
>> Since we don't have any other IOEvents, I just put the fd
>> in the generic structure directly. If other types surface
>> we'll see how to generalize it.
>>    
>
> I'd feel a whole lot better if we didn't pass the fd around and instead  
> passed around a structure.  With just a tiny bit of layering, we can  
> even avoid making that structure KVM specific :-)
>
>>> The KVMIOEvent should be created, not in the set status callback, but in
>>> the PCI map callback.  And in the PCI map callback, cpu_single_env
>>> should be passed to a kvm specific function to create this KVMIOEvent
>>> object that contains the created eventfd() that's handed to kvm via
>>> ioctl.
>>>      
>> So this specific thing does not work very well because with irqchip, we
>> want to bypass notification and send irq directly from kernel.
>> So I created a structure but it does not have callbacks,
>> only the fd.
>>    
>
> Your structure is an int, right?  I understand the limits due to lack of  
> in-kernel irqchip but I still think passing around an fd is a mistake.
>
>>>   There
>>> should also be strong separation between the vhost-net code and the
>>> virtio-net device.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>      
>> I don't see the point of this last idea.  vhost is virtio accelerator,
>> not a generic network backend.  Whoever wants to use vhost for
>> non-virtio frontends will have to add infrastructure for this
>> separation, but I do not believe it's practical or desirable.
>>    
>
> vhost is a userspace interface to inject packets into a network device  
> just like a raw socket or a tun/tap device is.  It happens to have some  
> interesting features like it supports remapping of physical addresses  
> and it implements interfaces for notifications that can conveniently be  
> used by KVM to bypass userspace in the fast paths.
>
> We should think of it this way for the same reason that vhost-net  
> doesn't live in kvm.ko.  If it's easy to isolate something and make it  
> more generic, it's almost always the right thing to do.  In this case,  
> isolating vhost-net from virtio-net in qemu just involves passing some  
> information in a function call verses passing a non-public data  
> structure that is then accessed directly.  Regardless of whether you  
> agree with my view of vhost-net, the argument alone to avoid making a  
> non-public structure public should be enough of an argument.
>
> Regards,
>
> Anthony Liguori
Michael S. Tsirkin - Jan. 25, 2010, 9:05 p.m.
On Mon, Jan 25, 2010 at 03:00:16PM -0600, Anthony Liguori wrote:
> On 01/25/2010 02:27 PM, Michael S. Tsirkin wrote:
>> On Mon, Jan 25, 2010 at 01:58:08PM -0600, Anthony Liguori wrote:
>>    
>>> On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
>>>      
>>>> start/stop backend on driver start/stop
>>>>
>>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>>> ---
>>>>    hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>>>>    1 files changed, 40 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
>>>> index c2a389f..99169e1 100644
>>>> --- a/hw/virtio-net.c
>>>> +++ b/hw/virtio-net.c
>>>> @@ -17,6 +17,7 @@
>>>>    #include "net/tap.h"
>>>>    #include "qemu-timer.h"
>>>>    #include "virtio-net.h"
>>>> +#include "vhost_net.h"
>>>>
>>>>    #define VIRTIO_NET_VM_VERSION    11
>>>>
>>>> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>>>>        uint8_t nomulti;
>>>>        uint8_t nouni;
>>>>        uint8_t nobcast;
>>>> +    uint8_t vhost_started;
>>>>        struct {
>>>>            int in_use;
>>>>            int first_multi;
>>>> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>>>>        n->nomulti = 0;
>>>>        n->nouni = 0;
>>>>        n->nobcast = 0;
>>>> +    if (n->vhost_started) {
>>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        n->vhost_started = 0;
>>>> +    }
>>>>
>>>>        /* Flush any MAC and VLAN filter table state */
>>>>        n->mac_table.in_use = 0;
>>>> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>>>>        .link_status_changed = virtio_net_set_link_status,
>>>>    };
>>>>
>>>> +static void virtio_net_set_status(struct VirtIODevice *vdev)
>>>> +{
>>>> +    VirtIONet *n = to_virtio_net(vdev);
>>>> +    if (!n->nic->nc.peer) {
>>>> +        return;
>>>> +    }
>>>> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
>>>> +        return;
>>>> +    }
>>>> +    if (!!n->vhost_started == !!(vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK)) {
>>>> +        return;
>>>> +    }
>>>> +    if (vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK) {
>>>> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        if (r<   0) {
>>>> +            fprintf(stderr, "unable to start vhost net: "
>>>> +                    "falling back on userspace virtio\n");
>>>> +        } else {
>>>> +            n->vhost_started = 1;
>>>> +        }
>>>> +    } else {
>>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        n->vhost_started = 0;
>>>> +    }
>>>> +}
>>>> +
>>>>
>>>>        
>>> This function does a number of bad things.  It makes virtio-net have
>>> specific knowledge of backends (like tap) and then has virtio-net pass
>>> device specific state (vdev) to a network backend.
>>>
>>> Ultimately, the following things need to happen:
>>>
>>> 1) when a driver is ready to begin operating:
>>>    a) virtio-net needs to tell vhost the location of the ring in physical
>>> memory
>>>    b) virtio-net needs to tell vhost about any notification it receives
>>> (allowing kvm to shortcut userspace)
>>>    c) virtio-net needs to tell vhost about which irq is being used
>>> (allowing kvm to shortcut userspace)
>>>
>>> What this suggests is that we need an API for the network backends to do
>>> the following:
>>>
>>>   1) probe whether ring passthrough is supported
>>>   2) set the *virtual* address of the ring elements
>>>   3) hand the backend some sort of notification object for sending and
>>> receiving notifications
>>>   4) stop ring passthrough
>>>      
>> Look at how vnet_hdr is setup: frontend probes backend, and has 'if
>> (backend has vnet header) blabla' vhost is really very similar, so I
>> would like to do it in the same way.
>>    
>
> vnet_hdr is IMHO a really bad example to copy from.
>
> vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this  
> means is that if you want to migrate from -net tap -net nic,model=virtio  
> to -net user -net nic,model=virtio, it will fail.
>
> This is a hard problem to solve in qemu though because it would require  
> that we implement software GSO which so far, no one has stepped up to do.
>
> We don't have to repeat this with vhost-net though because unlike  
> vnet_hdr, we don't have to expose anything to the guest.
>
>> Generally I do not believe in abstractions that only have one
>> implementation behind it: you only know how to abstract interface after you
>> have more than one implementation.  So whoever writes another frontend
>> that can use vhost will be in a better position to add infrastructure to
>> abstract both that new thing and virtio.
>>    
>
> I agree with you, but at the same time, I also believe that layering  
> violations should be avoided.  I'm not suggesting that you need to make  
> anything other than the vhost-net + virtio-net case work.  Just make the  
> interfaces abstract enough that you don't need to hand a vdev to  
> vhost-net and such that you don't have to pass kvm specific data  
> structure (ioeventfd) in what are supposed to be generic interfaces.
>
>>> vhost should not need any direct knowledge of the device.  This
>>> interface should be enough to communicating the required data.  I think
>>> the only bit that is a little fuzzy and perhaps non-obvious for the
>>> current patches is the notification object.  I would expect it look
>>> something like:
>>>
>>> typedef struct IOEvent {
>>>    int type;
>>>    void (*notify)(IOEvent *);
>>>    void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
>>> } IOEvent;
>>> And then we would have
>>>
>>> typedef struct KVMIOEvent {
>>>    IOEvent event = {.type = KVM_IO_EVENT};
>>>    int fd;
>>> } KVMIOEvent;
>>>
>>> if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the
>>> PIO notification and hand that to vhost.  vhost would check event.type
>>> and if it's KVM_IOEVENT, downcast and use that to get at the file
>>> descriptor.
>>>      
>> Since we don't have any other IOEvents, I just put the fd
>> in the generic structure directly. If other types surface
>> we'll see how to generalize it.
>>    
>
> I'd feel a whole lot better if we didn't pass the fd around and instead  
> passed around a structure.  With just a tiny bit of layering, we can  
> even avoid making that structure KVM specific :-)
>
>>> The KVMIOEvent should be created, not in the set status callback, but in
>>> the PCI map callback.  And in the PCI map callback, cpu_single_env
>>> should be passed to a kvm specific function to create this KVMIOEvent
>>> object that contains the created eventfd() that's handed to kvm via
>>> ioctl.
>>>      
>> So this specific thing does not work very well because with irqchip, we
>> want to bypass notification and send irq directly from kernel.
>> So I created a structure but it does not have callbacks,
>> only the fd.
>>    
>
> Your structure is an int, right?

it *has* an int:
	struct EventNotifier {
		int fd;
	};

> I understand the limits due to lack of  
> in-kernel irqchip but I still think passing around an fd is a mistake.

So API's will all use EventNotifier *.
We'll be able to add downcasting etc if/when we need it.

>>> There
>>> should also be strong separation between the vhost-net code and the
>>> virtio-net device.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>      
>> I don't see the point of this last idea.  vhost is virtio accelerator,
>> not a generic network backend.  Whoever wants to use vhost for
>> non-virtio frontends will have to add infrastructure for this
>> separation, but I do not believe it's practical or desirable.
>>    
>
> vhost is a userspace interface to inject packets into a network device  
> just like a raw socket or a tun/tap device is.  It happens to have some  
> interesting features like it supports remapping of physical addresses  
> and it implements interfaces for notifications that can conveniently be  
> used by KVM to bypass userspace in the fast paths.
>
> We should think of it this way for the same reason that vhost-net  
> doesn't live in kvm.ko.  If it's easy to isolate something and make it  
> more generic, it's almost always the right thing to do.  In this case,  
> isolating vhost-net from virtio-net in qemu just involves passing some  
> information in a function call verses passing a non-public data  
> structure that is then accessed directly.  Regardless of whether you  
> agree with my view of vhost-net, the argument alone to avoid making a  
> non-public structure public should be enough of an argument.
>
> Regards,
>
> Anthony Liguori

I'll add accessors, it's not a big deal.
Anthony Liguori - Jan. 25, 2010, 9:11 p.m.
On 01/25/2010 03:05 PM, Michael S. Tsirkin wrote:
> it *has* an int:
> 	struct EventNotifier {
> 		int fd;
> 	};
>    

Yes, this would be a lot better.

Regards,

Anthony Liguori
Paul Brook - Feb. 24, 2010, 3:14 a.m.
> vnet_hdr is IMHO a really bad example to copy from.
> 
> vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
> means is that if you want to migrate from -net tap -net nic,model=virtio
> to -net user -net nic,model=virtio, it will fail.
> 
> This is a hard problem to solve in qemu though because it would require
> that we implement software GSO which so far, no one has stepped up to do.

Or make virtio-net pass this on to the guest, and have that deal with the 
problem. If you implement software GSO, then devices can assume it's always 
present and don't need to probe.

Paul
Michael S. Tsirkin - Feb. 24, 2010, 5:29 a.m.
On Wed, Feb 24, 2010 at 03:14:25AM +0000, Paul Brook wrote:
> > vnet_hdr is IMHO a really bad example to copy from.
> > 
> > vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
> > means is that if you want to migrate from -net tap -net nic,model=virtio
> > to -net user -net nic,model=virtio, it will fail.
> > 
> > This is a hard problem to solve in qemu though because it would require
> > that we implement software GSO which so far, no one has stepped up to do.
> 
> Or make virtio-net pass this on to the guest, and have that deal with the 
> problem.

This is exacly what we do, via feature bits.

> If you implement software GSO, then devices can assume it's always 
> present and don't need to probe.
> 
> Paul
Paul Brook - Feb. 24, 2010, 11:30 a.m.
> On Wed, Feb 24, 2010 at 03:14:25AM +0000, Paul Brook wrote:
> > > vnet_hdr is IMHO a really bad example to copy from.
> > >
> > > vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
> > > means is that if you want to migrate from -net tap -net
> > > nic,model=virtio to -net user -net nic,model=virtio, it will fail.
> > >
> > > This is a hard problem to solve in qemu though because it would require
> > > that we implement software GSO which so far, no one has stepped up to
> > > do.
> >
> > Or make virtio-net pass this on to the guest, and have that deal with the
> > problem.
> 
> This is exacly what we do, via feature bits.

AFAIK we only have static feature bits. There aren't useful for anything that 
the user may change on the fly (or via migration).

If you don't have a fallback implementation then the guest must be able to 
cope with this feature disappearing without warning. If we do have a software 
fallback then the feature bit is just for backwards compatibility, and should 
be enabled unconditionally (on current machine types).

Paul
Michael S. Tsirkin - Feb. 24, 2010, 11:46 a.m.
On Wed, Feb 24, 2010 at 11:30:18AM +0000, Paul Brook wrote:
> > On Wed, Feb 24, 2010 at 03:14:25AM +0000, Paul Brook wrote:
> > > > vnet_hdr is IMHO a really bad example to copy from.
> > > >
> > > > vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
> > > > means is that if you want to migrate from -net tap -net
> > > > nic,model=virtio to -net user -net nic,model=virtio, it will fail.
> > > >
> > > > This is a hard problem to solve in qemu though because it would require
> > > > that we implement software GSO which so far, no one has stepped up to
> > > > do.
> > >
> > > Or make virtio-net pass this on to the guest, and have that deal with the
> > > problem.
> > 
> > This is exacly what we do, via feature bits.
> 
> AFAIK we only have static feature bits. There aren't useful for anything that 
> the user may change on the fly (or via migration).

Ah, you mean telling the guest to switch features on and off: natureally
this would require guest driver changes.  This might be also non-trivial
to implement. Consider a request that has been posted without checksum,
suddenly we disable checksum support.  Guest will need a way to handle
that.  Guest OSes might also not be prepared to handle device features
going away.

> If you don't have a fallback implementation then the guest must be able to 
> cope with this feature disappearing without warning.

Instead, we simply fail migration at the moment. We also use machine
type to let users force some level of homogenuity in the cluster.

> If we do have a software 
> fallback then the feature bit is just for backwards compatibility, and should 
> be enabled unconditionally (on current machine types).
> 
> Paul

Software fallback might turn out to be slower than disabling the feature
in the guest. For example, and extra pass over packet might cause extra CPU
cache thrashing. In that case, it's not obvious whether enabling it
unconditionally will turn out to be a good idea. But we'll have to have
that code to be able to tell.
Paul Brook - Feb. 24, 2010, 12:26 p.m.
> > If we do have a software
> > fallback then the feature bit is just for backwards compatibility, and
> > should be enabled unconditionally (on current machine types).
>
> Software fallback might turn out to be slower than disabling the feature
> in the guest. For example, and extra pass over packet might cause extra CPU
> cache thrashing. In that case, it's not obvious whether enabling it
> unconditionally will turn out to be a good idea. But we'll have to have
> that code to be able to tell.

IMO once you accept that these things can change, consistency is more 
important than trying to guess what the "best" option may be.
Starting qemu on machine A and migrating to machine B should give the same 
guest environment as starting on machine B.

Paul
Michael S. Tsirkin - Feb. 24, 2010, 12:40 p.m.
On Wed, Feb 24, 2010 at 12:26:53PM +0000, Paul Brook wrote:
> > > If we do have a software
> > > fallback then the feature bit is just for backwards compatibility, and
> > > should be enabled unconditionally (on current machine types).
> >
> > Software fallback might turn out to be slower than disabling the feature
> > in the guest. For example, and extra pass over packet might cause extra CPU
> > cache thrashing. In that case, it's not obvious whether enabling it
> > unconditionally will turn out to be a good idea. But we'll have to have
> > that code to be able to tell.
> 
> IMO once you accept that these things can change, consistency is more 
> important than trying to guess what the "best" option may be.

Yes, SW fallback might be nice to have. What's important is likely to
depend on specific user.

> Starting qemu on machine A and migrating to machine B should give the same 
> guest environment as starting on machine B.
> 
> Paul

So currently, the way we try to ensure this is by checking feature bits
against the list supported by backend, and failing migration if there's
a discrepancy.
Anthony Liguori - Feb. 24, 2010, 2:51 p.m.
On 02/23/2010 09:14 PM, Paul Brook wrote:
>> vnet_hdr is IMHO a really bad example to copy from.
>>
>> vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
>> means is that if you want to migrate from -net tap -net nic,model=virtio
>> to -net user -net nic,model=virtio, it will fail.
>>
>> This is a hard problem to solve in qemu though because it would require
>> that we implement software GSO which so far, no one has stepped up to do.
>>      
> Or make virtio-net pass this on to the guest, and have that deal with the
> problem. If you implement software GSO, then devices can assume it's always
> present and don't need to probe.
>    

That would make migration guest cooperative.  This implies that the 
guests become an important part of the test matrix with respect to 
migration.  The result is a combinatorial expansion of the test matrix.

There's a lot of value in having transparent migration.  Maybe it makes 
sense to have some sort of guest notification for the purposes of 
optimization, but we should be very careful about that because the 
practical cost is huge.

Regards,

Anthony Liguori

> Paul
>
Anthony Liguori - Feb. 24, 2010, 3:16 p.m.
On 02/24/2010 05:46 AM, Michael S. Tsirkin wrote:
>
> Ah, you mean telling the guest to switch features on and off: natureally
> this would require guest driver changes.  This might be also non-trivial
> to implement. Consider a request that has been posted without checksum,
> suddenly we disable checksum support.  Guest will need a way to handle
> that.  Guest OSes might also not be prepared to handle device features
> going away.
>    

It's the same as cpuid flags.

Management tools should plan appropriately and not advertise virtio 
features that cannot be supported throughout the migration pool.  
Fortunately, someone had enough foresight to allow management software 
to disable virtio features on a per-feature basis :-)

Regards,

Anthony Liguori

Patch

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index c2a389f..99169e1 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -17,6 +17,7 @@ 
 #include "net/tap.h"
 #include "qemu-timer.h"
 #include "virtio-net.h"
+#include "vhost_net.h"
 
 #define VIRTIO_NET_VM_VERSION    11
 
@@ -47,6 +48,7 @@  typedef struct VirtIONet
     uint8_t nomulti;
     uint8_t nouni;
     uint8_t nobcast;
+    uint8_t vhost_started;
     struct {
         int in_use;
         int first_multi;
@@ -114,6 +116,10 @@  static void virtio_net_reset(VirtIODevice *vdev)
     n->nomulti = 0;
     n->nouni = 0;
     n->nobcast = 0;
+    if (n->vhost_started) {
+        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
+        n->vhost_started = 0;
+    }
 
     /* Flush any MAC and VLAN filter table state */
     n->mac_table.in_use = 0;
@@ -820,6 +826,36 @@  static NetClientInfo net_virtio_info = {
     .link_status_changed = virtio_net_set_link_status,
 };
 
+static void virtio_net_set_status(struct VirtIODevice *vdev)
+{
+    VirtIONet *n = to_virtio_net(vdev);
+    if (!n->nic->nc.peer) {
+        return;
+    }
+    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
+        return;
+    }
+
+    if (!tap_get_vhost_net(n->nic->nc.peer)) {
+        return;
+    }
+    if (!!n->vhost_started == !!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+        return;
+    }
+    if (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) {
+        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
+        if (r < 0) {
+            fprintf(stderr, "unable to start vhost net: "
+                    "falling back on userspace virtio\n");
+        } else {
+            n->vhost_started = 1;
+        }
+    } else {
+        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
+        n->vhost_started = 0;
+    }
+}
+
 VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
 {
     VirtIONet *n;
@@ -835,6 +871,7 @@  VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
     n->vdev.set_features = virtio_net_set_features;
     n->vdev.bad_features = virtio_net_bad_features;
     n->vdev.reset = virtio_net_reset;
+    n->vdev.set_status = virtio_net_set_status;
     n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
     n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx);
     n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
@@ -864,6 +901,9 @@  VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
 void virtio_net_exit(VirtIODevice *vdev)
 {
     VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev);
+    if (n->vhost_started) {
+        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
+    }
 
     qemu_purge_queued_packets(&n->nic->nc);